Mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git (synced 2025-02-20 15:54:20 +01:00)

Commit d5a0858f90 (parent ed4be64962): 565.57.01

README.md: 13 lines changed
@@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source

This is the source release of the NVIDIA Linux open GPU kernel modules,
version 560.35.03.
version 565.57.01.

## How to Build

@@ -17,7 +17,7 @@ as root:
Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
560.35.03 driver release. This can be achieved by installing
565.57.01 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,

@@ -185,7 +185,7 @@ table below).
For details on feature support and limitations, see the NVIDIA GPU driver
end user README here:

https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/565.57.01/README/kernel_open.html

For vGPU support, please refer to the README.vgpu packaged in the vGPU Host
Package for more details.

@@ -199,6 +199,7 @@ Subsystem Device ID.
| NVIDIA TITAN RTX | 1E02 |
| NVIDIA GeForce RTX 2080 Ti | 1E04 |
| NVIDIA GeForce RTX 2080 Ti | 1E07 |
| NVIDIA CMP 50HX | 1E09 |
| Quadro RTX 6000 | 1E30 |
| Quadro RTX 8000 | 1E30 1028 129E |
| Quadro RTX 8000 | 1E30 103C 129E |

@@ -391,6 +392,7 @@ Subsystem Device ID.
| NVIDIA GeForce RTX 2070 | 1F07 |
| NVIDIA GeForce RTX 2060 | 1F08 |
| NVIDIA GeForce GTX 1650 | 1F0A |
| NVIDIA CMP 40HX | 1F0B |
| NVIDIA GeForce RTX 2070 | 1F10 |
| NVIDIA GeForce RTX 2070 with Max-Q Design | 1F10 1025 132D |
| NVIDIA GeForce RTX 2070 with Max-Q Design | 1F10 1025 1342 |

@@ -691,6 +693,7 @@ Subsystem Device ID.
| NVIDIA GeForce GTX 1660 | 2184 |
| NVIDIA GeForce GTX 1650 SUPER | 2187 |
| NVIDIA GeForce GTX 1650 | 2188 |
| NVIDIA CMP 30HX | 2189 |
| NVIDIA GeForce GTX 1660 Ti | 2191 |
| NVIDIA GeForce GTX 1660 Ti with Max-Q Design | 2191 1028 0949 |
| NVIDIA GeForce GTX 1660 Ti with Max-Q Design | 2191 103C 85FB |

@@ -758,9 +761,11 @@ Subsystem Device ID.
| NVIDIA H200 | 2335 10DE 18BF |
| NVIDIA H100 | 2339 10DE 17FC |
| NVIDIA H800 NVL | 233A 10DE 183A |
| NVIDIA H200 NVL | 233B 10DE 1996 |
| NVIDIA GH200 120GB | 2342 10DE 16EB |
| NVIDIA GH200 120GB | 2342 10DE 1805 |
| NVIDIA GH200 480GB | 2342 10DE 1809 |
| NVIDIA GH200 144G HBM3e | 2348 10DE 18D2 |
| NVIDIA GeForce RTX 3060 Ti | 2414 |
| NVIDIA GeForce RTX 3080 Ti Laptop GPU | 2420 |
| NVIDIA RTX A5500 Laptop GPU | 2438 |

@@ -831,12 +836,10 @@ Subsystem Device ID.
| NVIDIA GeForce RTX 2050 | 25AD |
| NVIDIA RTX A1000 | 25B0 1028 1878 |
| NVIDIA RTX A1000 | 25B0 103C 1878 |
| NVIDIA RTX A1000 | 25B0 103C 8D96 |
| NVIDIA RTX A1000 | 25B0 10DE 1878 |
| NVIDIA RTX A1000 | 25B0 17AA 1878 |
| NVIDIA RTX A400 | 25B2 1028 1879 |
| NVIDIA RTX A400 | 25B2 103C 1879 |
| NVIDIA RTX A400 | 25B2 103C 8D95 |
| NVIDIA RTX A400 | 25B2 10DE 1879 |
| NVIDIA RTX A400 | 25B2 17AA 1879 |
| NVIDIA A16 | 25B6 10DE 14A9 |
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"560.35.03\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"565.57.01\"

ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)

@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -43,6 +43,8 @@ struct nv_kthread_q
atomic_t main_loop_should_exit;

struct task_struct *q_kthread;

bool is_unload_flush_ongoing;
};

struct nv_kthread_q_item

@@ -724,6 +724,7 @@ static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
#endif
}

#define NV_GET_OFFSET_IN_PAGE(phys_page) offset_in_page(phys_page)
#define NV_GET_PAGE_STRUCT(phys_page) virt_to_page(__va(phys_page))
#define NV_VMA_PGOFF(vma) ((vma)->vm_pgoff)
#define NV_VMA_SIZE(vma) ((vma)->vm_end - (vma)->vm_start)

@@ -951,14 +952,14 @@ static inline int nv_remap_page_range(struct vm_area_struct *vma,
}

static inline int nv_io_remap_page_range(struct vm_area_struct *vma,
NvU64 phys_addr, NvU64 size, NvU32 extra_prot)
NvU64 phys_addr, NvU64 size, NvU32 extra_prot, NvU64 start)
{
int ret = -1;
#if !defined(NV_XEN_SUPPORT_FULLY_VIRTUALIZED_KERNEL)
ret = nv_remap_page_range(vma, vma->vm_start, phys_addr, size,
ret = nv_remap_page_range(vma, start, phys_addr, size,
nv_adjust_pgprot(vma->vm_page_prot, extra_prot));
#else
ret = io_remap_pfn_range(vma, vma->vm_start, (phys_addr >> PAGE_SHIFT),
ret = io_remap_pfn_range(vma, start, (phys_addr >> PAGE_SHIFT),
size, nv_adjust_pgprot(vma->vm_page_prot, extra_prot));
#endif
return ret;
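The new `start` parameter lets callers place the mapping at an arbitrary offset inside the VMA rather than always at `vma->vm_start`. A minimal sketch of the two call patterns (the physical address, size and offset values are illustrative assumptions):

    /* previous behaviour: map the region at the very start of the VMA */
    ret = nv_io_remap_page_range(vma, phys_addr, size, extra_prot, vma->vm_start);

    /* now also possible: map a single page one page into the VMA */
    ret = nv_io_remap_page_range(vma, phys_addr + PAGE_SIZE, PAGE_SIZE, extra_prot,
                                 vma->vm_start + PAGE_SIZE);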
@@ -1207,6 +1208,7 @@ typedef struct nv_alloc_s {
NvBool physical : 1;
NvBool unencrypted : 1;
NvBool coherent : 1;
NvBool carveout : 1;
} flags;
unsigned int cache_type;
unsigned int num_pages;

@@ -1840,20 +1842,6 @@ static inline int nv_is_control_device(struct inode *inode)
#endif
#endif

static inline NvU64 nv_pci_bus_address(struct pci_dev *dev, NvU8 bar_index)
{
NvU64 bus_addr = 0;
#if defined(NV_PCI_BUS_ADDRESS_PRESENT)
bus_addr = pci_bus_address(dev, bar_index);
#elif defined(CONFIG_PCI)
struct pci_bus_region region;

pcibios_resource_to_bus(dev, &region, &dev->resource[bar_index]);
bus_addr = region.start;
#endif
return bus_addr;
}

/*
* Decrements the usage count of the allocation, and moves the allocation to
* the given nvlfp's free list if the usage count drops to zero.
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -59,6 +59,8 @@ NV_STATUS nv_uvm_resume (void);
void nv_uvm_notify_start_device (const NvU8 *uuid);
void nv_uvm_notify_stop_device (const NvU8 *uuid);
NV_STATUS nv_uvm_event_interrupt (const NvU8 *uuid);
NV_STATUS nv_uvm_drain_P2P (const NvU8 *uuid);
NV_STATUS nv_uvm_resume_P2P (const NvU8 *uuid);

/* Move these to nv.h once implemented by other UNIX platforms */
NvBool nvidia_get_gpuid_list (NvU32 *gpu_ids, NvU32 *gpu_count);

@@ -44,6 +44,7 @@
#include <nv-ioctl.h>
#include <nv-ioctl-numa.h>
#include <nvmisc.h>
#include <os/nv_memory_area.h>

extern nv_cap_t *nvidia_caps_root;

@@ -279,8 +280,7 @@ typedef struct nv_usermap_access_params_s
NvU64 offset;
NvU64 *page_array;
NvU64 num_pages;
NvU64 mmap_start;
NvU64 mmap_size;
MemoryArea memArea;
NvU64 access_start;
NvU64 access_size;
NvU64 remap_prot_extra;

@@ -296,8 +296,7 @@ typedef struct nv_alloc_mapping_context_s {
NvU64 page_index;
NvU64 *page_array;
NvU64 num_pages;
NvU64 mmap_start;
NvU64 mmap_size;
MemoryArea memArea;
NvU64 access_start;
NvU64 access_size;
NvU64 remap_prot_extra;

@@ -330,7 +329,7 @@ typedef struct nv_soc_irq_info_s {
NvS32 ref_count;
} nv_soc_irq_info_t;

#define NV_MAX_SOC_IRQS 6
#define NV_MAX_SOC_IRQS 10
#define NV_MAX_DPAUX_NUM_DEVICES 4

#define NV_MAX_SOC_DPAUX_NUM_DEVICES 2

@@ -535,6 +534,7 @@ typedef struct UvmGpuAddressSpaceInfo_tag *nvgpuAddressSpaceInfo_t;
typedef struct UvmGpuAllocInfo_tag *nvgpuAllocInfo_t;
typedef struct UvmGpuP2PCapsParams_tag *nvgpuP2PCapsParams_t;
typedef struct UvmGpuFbInfo_tag *nvgpuFbInfo_t;
typedef struct UvmGpuNvlinkInfo_tag *nvgpuNvlinkInfo_t;
typedef struct UvmGpuEccInfo_tag *nvgpuEccInfo_t;
typedef struct UvmGpuFaultInfo_tag *nvgpuFaultInfo_t;
typedef struct UvmGpuAccessCntrInfo_tag *nvgpuAccessCntrInfo_t;

@@ -545,6 +545,7 @@ typedef struct UvmPmaAllocationOptions_tag *nvgpuPmaAllocationOptions_t
typedef struct UvmPmaStatistics_tag *nvgpuPmaStatistics_t;
typedef struct UvmGpuMemoryInfo_tag *nvgpuMemoryInfo_t;
typedef struct UvmGpuExternalMappingInfo_tag *nvgpuExternalMappingInfo_t;
typedef struct UvmGpuExternalPhysAddrInfo_tag *nvgpuExternalPhysAddrInfo_t;
typedef struct UvmGpuChannelResourceInfo_tag *nvgpuChannelResourceInfo_t;
typedef struct UvmGpuChannelInstanceInfo_tag *nvgpuChannelInstanceInfo_t;
typedef struct UvmGpuChannelResourceBindParams_tag *nvgpuChannelResourceBindParams_t;

@@ -783,7 +784,7 @@ nv_state_t* NV_API_CALL nv_get_ctl_state (void);

void NV_API_CALL nv_set_dma_address_size (nv_state_t *, NvU32 );

NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU64, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU64, NvU32, NvU32, NvU64, NvU64 *, NvBool, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvU64, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_free_pages (nv_state_t *, NvU32, NvBool, NvU32, void *);

@@ -904,6 +905,9 @@ void NV_API_CALL nv_dma_release_dma_buf (nv_dma_buf_t *);

void NV_API_CALL nv_schedule_uvm_isr (nv_state_t *);

NV_STATUS NV_API_CALL nv_schedule_uvm_drain_p2p (NvU8 *);
void NV_API_CALL nv_schedule_uvm_resume_p2p (NvU8 *);

NvBool NV_API_CALL nv_platform_supports_s0ix (void);
NvBool NV_API_CALL nv_s2idle_pm_configured (void);

@@ -1001,8 +1005,8 @@ NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, vo
NV_STATUS NV_API_CALL rm_p2p_dma_map_pages (nvidia_stack_t *, nv_dma_device_t *, NvU8 *, NvU64, NvU32, NvU64 *, void **);
NV_STATUS NV_API_CALL rm_dma_buf_dup_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle, NvHandle, void *, NvHandle, NvU64, NvU64, NvHandle *, void **);
void NV_API_CALL rm_dma_buf_undup_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle);
NV_STATUS NV_API_CALL rm_dma_buf_map_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvU64, NvU64, void *, nv_phys_addr_range_t **, NvU32 *);
void NV_API_CALL rm_dma_buf_unmap_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvU64, nv_phys_addr_range_t **, NvU32);
NV_STATUS NV_API_CALL rm_dma_buf_map_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, MemoryRange, void *, NvBool, MemoryArea *);
void NV_API_CALL rm_dma_buf_unmap_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, void *, NvBool, MemoryArea);
NV_STATUS NV_API_CALL rm_dma_buf_get_client_and_device(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle *, NvHandle *, NvHandle *, void **, NvBool *);
void NV_API_CALL rm_dma_buf_put_client_and_device(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle, void *);
NV_STATUS NV_API_CALL rm_log_gpu_crash (nv_stack_t *, nv_state_t *);
@@ -1085,6 +1085,22 @@ NV_STATUS nvUvmInterfaceRegisterUvmCallbacks(struct UvmOpsUvmEvents *importedUvm
//
void nvUvmInterfaceDeRegisterUvmOps(void);

/*******************************************************************************
nvUvmInterfaceGetNvlinkInfo

Gets NVLINK information from RM.

Arguments:
device[IN] - GPU device handle
nvlinkInfo [OUT] - Pointer to NvlinkInfo structure

Error codes:
NV_ERROR
NV_ERR_INVALID_ARGUMENT
*/
NV_STATUS nvUvmInterfaceGetNvlinkInfo(uvmGpuDeviceHandle device,
UvmGpuNvlinkInfo *nvlinkInfo);

/*******************************************************************************
nvUvmInterfaceP2pObjectCreate

@@ -1161,6 +1177,48 @@ NV_STATUS nvUvmInterfaceGetExternalAllocPtes(uvmGpuAddressSpaceHandle vaSpace,
NvU64 size,
UvmGpuExternalMappingInfo *gpuExternalMappingInfo);

/*******************************************************************************
nvUvmInterfaceGetExternalAllocPhysAddrs

The interface builds the RM physical addrs using the provided input parameters.

Arguments:
vaSpace[IN] - vaSpace handle.
hMemory[IN] - Memory handle.
offset [IN] - Offset from the beginning of the allocation
where PTE mappings should begin.
Should be aligned with mappingPagesize
in gpuExternalMappingInfo associated
with the allocation.
size [IN] - Length of the allocation for which PhysAddrs
should be built.
Should be aligned with mappingPagesize
in gpuExternalMappingInfo associated
with the allocation.
size = 0 will be interpreted as the total size
of the allocation.
gpuExternalMappingInfo[IN/OUT] - See nv_uvm_types.h for more information.

Error codes:
NV_ERR_INVALID_ARGUMENT - Invalid parameter/s is passed.
NV_ERR_INVALID_OBJECT_HANDLE - Invalid memory handle is passed.
NV_ERR_NOT_SUPPORTED - Functionality is not supported (see comments in nv_gpu_ops.c)
NV_ERR_INVALID_BASE - offset is beyond the allocation size
NV_ERR_INVALID_LIMIT - (offset + size) is beyond the allocation size.
NV_ERR_BUFFER_TOO_SMALL - gpuExternalMappingInfo.physAddrBufferSize is insufficient to
store single physAddr.
NV_ERR_NOT_READY - Returned when querying the physAddrs requires a deferred setup
which has not yet completed. It is expected that the caller
will reattempt the call until a different code is returned.
As an example, multi-node systems which require querying
physAddrs from the Fabric Manager may return this code.
*/
NV_STATUS nvUvmInterfaceGetExternalAllocPhysAddrs(uvmGpuAddressSpaceHandle vaSpace,
NvHandle hMemory,
NvU64 offset,
NvU64 size,
UvmGpuExternalPhysAddrInfo *gpuExternalPhysAddrsInfo);
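A minimal usage sketch for this new query, treating NV_ERR_NOT_READY as a retry as documented above; the fixed-size buffer, the zero-initialization, and the retry loop are illustrative assumptions, and mappingType is left at its default value for brevity:

    UvmGpuExternalPhysAddrInfo physAddrInfo = {0};
    NvU64 buffer[64];
    NV_STATUS status;

    physAddrInfo.physAddrBuffer     = buffer;
    physAddrInfo.physAddrBufferSize = sizeof(buffer);
    physAddrInfo.mappingPageSize    = 0;   /* 0: use the allocation's page size */

    do
    {
        status = nvUvmInterfaceGetExternalAllocPhysAddrs(vaSpace, hMemory,
                                                         0 /* offset */,
                                                         0 /* size: whole allocation */,
                                                         &physAddrInfo);
    } while (status == NV_ERR_NOT_READY);

    if (status == NV_OK)
    {
        /* buffer[0 .. numWrittenPhysAddrs-1] holds physical addresses;
         * numRemainingPhysAddrs is non-zero if the buffer was too small. */
    }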
/*******************************************************************************
nvUvmInterfaceRetainChannel

@@ -1462,6 +1520,16 @@ NV_STATUS nvUvmInterfacePagingChannelPushStream(UvmGpuPagingChannelHandle channe
char *methodStream,
NvU32 methodStreamSize);

/*******************************************************************************
nvUvmInterfaceReportFatalError

Reports a global fatal error so RM can inform the clients that a node reboot
is necessary to recover from this error. This function can be called from
any lock environment, bottom half or non-interrupt context.

*/
void nvUvmInterfaceReportFatalError(NV_STATUS error);

/*******************************************************************************
Cryptography Services Library (CSL) Interface
*/
@@ -543,6 +543,36 @@ typedef struct UvmGpuExternalMappingInfo_tag
NvU32 pteSize;
} UvmGpuExternalMappingInfo;

typedef struct UvmGpuExternalPhysAddrInfo_tag
{
// In: Virtual permissions. Returns
// NV_ERR_INVALID_ACCESS_TYPE if input is
// inaccurate
UvmRmGpuMappingType mappingType;

// In: Size of the buffer to store PhysAddrs (in bytes).
NvU64 physAddrBufferSize;

// In: Page size for mapping
// If this field is passed as 0, the page size
// of the allocation is used for mapping.
// nvUvmInterfaceGetExternalAllocPtes must pass
// this field as zero.
NvU64 mappingPageSize;

// In: Pointer to a buffer to store PhysAddrs.
// Out: The interface will fill the buffer with PhysAddrs
NvU64 *physAddrBuffer;

// Out: Number of PhysAddrs filled in to the buffer.
NvU64 numWrittenPhysAddrs;

// Out: Number of PhysAddrs remaining to be filled
// if the buffer is not sufficient to accommodate
// requested PhysAddrs.
NvU64 numRemainingPhysAddrs;
} UvmGpuExternalPhysAddrInfo;

typedef struct UvmGpuP2PCapsParams_tag
{
// Out: peerId[i] contains gpu[i]'s peer id of gpu[1 - i]. Only defined if

@@ -660,6 +690,9 @@ typedef struct UvmGpuInfo_tag
// Maximum number of TPCs per GPC
NvU32 maxTpcPerGpcCount;

// Number of access counter buffers.
NvU32 accessCntrBufferCount;

// NV_TRUE if SMC is enabled on this GPU.
NvBool smcEnabled;

@@ -721,10 +754,12 @@ typedef struct UvmGpuFbInfo_tag
// RM regions that are not registered with PMA either.
NvU64 maxAllocatableAddress;

NvU32 heapSize; // RAM in KB available for user allocations
NvU32 reservedHeapSize; // RAM in KB reserved for internal RM allocation
NvBool bZeroFb; // Zero FB mode enabled.
NvU64 maxVidmemPageSize; // Largest GPU page size to access vidmem.
NvU32 heapSize; // RAM in KB available for user allocations
NvU32 reservedHeapSize; // RAM in KB reserved for internal RM allocation
NvBool bZeroFb; // Zero FB mode enabled.
NvU64 maxVidmemPageSize; // Largest GPU page size to access vidmem.
NvBool bStaticBar1Enabled; // Static BAR1 mode is enabled
NvU64 staticBar1Size; // The size of the static mapping
} UvmGpuFbInfo;

typedef struct UvmGpuEccInfo_tag

@@ -736,6 +771,15 @@ typedef struct UvmGpuEccInfo_tag
NvBool bEccEnabled;
} UvmGpuEccInfo;

typedef struct UvmGpuNvlinkInfo_tag
{
unsigned nvlinkMask;
unsigned nvlinkOffset;
void *nvlinkReadLocation;
NvBool *nvlinkErrorNotifier;
NvBool bNvlinkRecoveryEnabled;
} UvmGpuNvlinkInfo;

typedef struct UvmPmaAllocationOptions_tag
{
NvU32 flags;

@@ -852,6 +896,41 @@ typedef NV_STATUS (*uvmEventIsrTopHalf_t) (const NvProcessorUuid *pGpuUuidStruct
typedef void (*uvmEventIsrTopHalf_t) (void);
#endif

/*******************************************************************************
uvmEventDrainP2P
This function will be called by the GPU driver to signal to UVM that the
GPU has encountered an uncontained error, and all peer work must be drained
to recover. When it is called, the following assumptions/guarantees are
valid/made:

* Impacted user channels have been preempted and disabled
* UVM channels are still running normally and will continue to do
so unless an unrecoverable error is hit on said channels
* UVM must not return from this function until all enqueued work on
peer channels has drained
* In the context of this function call, RM will still service faults
* UVM must prevent new peer work from being enqueued until the
uvmEventResumeP2P callback is issued

Returns:
NV_OK if UVM has idled peer work and will prevent new peer workloads.
NV_ERR_TIMEOUT if peer work was unable to be drained within a timeout
XXX NV_ERR_* for any other failure (TBD)

*/
typedef NV_STATUS (*uvmEventDrainP2P_t) (const NvProcessorUuid *pGpuUuidStruct);

/*******************************************************************************
uvmEventResumeP2P
This function will be called by the GPU driver to signal to UVM that the
GPU has recovered from the previously reported uncontained NVLINK error.
When it is called, the following assumptions/guarantees are valid/made:

* UVM is again allowed to enqueue peer work
* UVM channels are still running normally
*/
typedef NV_STATUS (*uvmEventResumeP2P_t) (const NvProcessorUuid *pGpuUuidStruct);

struct UvmOpsUvmEvents
{
uvmEventSuspend_t suspend;

@@ -864,6 +943,8 @@ struct UvmOpsUvmEvents
uvmEventWddmRestartAfterTimeout_t wddmRestartAfterTimeout;
uvmEventServiceInterrupt_t serviceInterrupt;
#endif
uvmEventDrainP2P_t drainP2P;
uvmEventResumeP2P_t resumeP2P;
};
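A sketch of how the UVM side might plug the new callbacks into its UvmOpsUvmEvents registration; the handler names and bodies are illustrative assumptions, only nvUvmInterfaceRegisterUvmCallbacks and the struct fields come from these headers:

    static NV_STATUS uvmDrainP2PCallback(const NvProcessorUuid *pGpuUuid)
    {
        /* Block new peer work for this GPU and wait until all enqueued
         * peer work has drained, per the contract documented above. */
        return NV_OK;
    }

    static NV_STATUS uvmResumeP2PCallback(const NvProcessorUuid *pGpuUuid)
    {
        /* Peer work may be enqueued again for this GPU. */
        return NV_OK;
    }

    static struct UvmOpsUvmEvents g_exportedUvmOps =
    {
        /* ...existing callbacks (suspend, resume, ISR top half, ...)... */
        .drainP2P  = uvmDrainP2PCallback,
        .resumeP2P = uvmResumeP2PCallback,
    };

    /* registered once against RM, as before */
    status = nvUvmInterfaceRegisterUvmCallbacks(&g_exportedUvmOps);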
#define UVM_CSL_SIGN_AUTH_TAG_SIZE_BYTES 32

@@ -1071,11 +1152,13 @@ typedef UvmGpuAccessCntrConfig gpuAccessCntrConfig;
typedef UvmGpuFaultInfo gpuFaultInfo;
typedef UvmGpuMemoryInfo gpuMemoryInfo;
typedef UvmGpuExternalMappingInfo gpuExternalMappingInfo;
typedef UvmGpuExternalPhysAddrInfo gpuExternalPhysAddrInfo;
typedef UvmGpuChannelResourceInfo gpuChannelResourceInfo;
typedef UvmGpuChannelInstanceInfo gpuChannelInstanceInfo;
typedef UvmGpuChannelResourceBindParams gpuChannelResourceBindParams;
typedef UvmGpuFbInfo gpuFbInfo;
typedef UvmGpuEccInfo gpuEccInfo;
typedef UvmGpuNvlinkInfo gpuNvlinkInfo;
typedef UvmGpuPagingChannel *gpuPagingChannelHandle;
typedef UvmGpuPagingChannelInfo gpuPagingChannelInfo;
typedef UvmGpuPagingChannelAllocParams gpuPagingChannelAllocParams;

@@ -50,6 +50,8 @@
#define NVKMS_LOG2_LUT_ARRAY_SIZE 10
#define NVKMS_LUT_ARRAY_SIZE (1 << NVKMS_LOG2_LUT_ARRAY_SIZE)

#define NVKMS_OLUT_FP_NORM_SCALE_DEFAULT 0xffffffff

typedef NvU32 NvKmsDeviceHandle;
typedef NvU32 NvKmsDispHandle;
typedef NvU32 NvKmsConnectorHandle;

@@ -245,6 +247,80 @@ struct NvKmsLutRamps {
NvU16 blue[NVKMS_LUT_ARRAY_SIZE]; /*! in */
};

/* Datatypes for LUT capabilities */
enum NvKmsLUTFormat {
/*
* Normalized fixed-point format mapping [0, 1] to [0x0, 0xFFFF].
*/
NVKMS_LUT_FORMAT_UNORM16,

/*
* Half-precision floating point.
*/
NVKMS_LUT_FORMAT_FP16,

/*
* 14-bit fixed-point format required to work around hardware bug 813188.
*
* To convert from UNORM16 to UNORM14_WAR_813188:
* unorm14_war_813188 = ((unorm16 >> 2) & ~7) + 0x6000
*/
NVKMS_LUT_FORMAT_UNORM14_WAR_813188
};
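A small helper making the UNORM16 → UNORM14_WAR_813188 conversion above explicit (a sketch; the function name is illustrative):

    static inline NvU16 NvKmsUnorm16ToUnorm14War813188(NvU16 unorm16)
    {
        /* Drop to 14 bits, clear the low 3 bits, and add the 0x6000 bias,
         * exactly as described in the comment above. */
        return (NvU16)(((unorm16 >> 2) & ~7) + 0x6000);
    }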
enum NvKmsLUTVssSupport {
NVKMS_LUT_VSS_NOT_SUPPORTED,
NVKMS_LUT_VSS_SUPPORTED,
NVKMS_LUT_VSS_REQUIRED,
};

enum NvKmsLUTVssType {
NVKMS_LUT_VSS_TYPE_NONE,
NVKMS_LUT_VSS_TYPE_LINEAR,
NVKMS_LUT_VSS_TYPE_LOGARITHMIC,
};

struct NvKmsLUTCaps {
/*! Whether this layer or head on this device supports this LUT stage. */
NvBool supported;

/*! Whether this LUT supports VSS. */
enum NvKmsLUTVssSupport vssSupport;

/*!
* The type of VSS segmenting this LUT uses.
*/
enum NvKmsLUTVssType vssType;

/*!
* Expected number of VSS segments.
*/
NvU32 vssSegments;

/*!
* Expected number of LUT entries.
*/
NvU32 lutEntries;

/*!
* Format for each of the LUT entries.
*/
enum NvKmsLUTFormat entryFormat;
};

/* each LUT entry uses this many bytes */
#define NVKMS_LUT_CAPS_LUT_ENTRY_SIZE (4 * sizeof(NvU16))

/* if the LUT surface uses VSS, size of the VSS header */
#define NVKMS_LUT_VSS_HEADER_SIZE (4 * NVKMS_LUT_CAPS_LUT_ENTRY_SIZE)

struct NvKmsLUTSurfaceParams {
NvKmsSurfaceHandle surfaceHandle;
NvU64 offset NV_ALIGN_BYTES(8);
NvU32 vssSegments;
NvU32 lutEntries;
};
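A sketch of how the capability fields and size macros above might be combined to size a LUT surface; the policy of adding a VSS header only when VSS is required is an illustrative assumption:

    /* Compute the minimum LUT surface size for a given NvKmsLUTCaps. */
    static NvU64 LutSurfaceMinSize(const struct NvKmsLUTCaps *caps)
    {
        NvU64 size = 0;

        if (!caps->supported) {
            return 0;
        }

        if (caps->vssSupport == NVKMS_LUT_VSS_REQUIRED) {
            /* VSS LUTs carry a VSS header before the entries. */
            size += NVKMS_LUT_VSS_HEADER_SIZE;
        }

        /* One entry of caps->entryFormat per LUT entry. */
        size += (NvU64)caps->lutEntries * NVKMS_LUT_CAPS_LUT_ENTRY_SIZE;

        return size;
    }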
/*
* A 3x4 row-major colorspace conversion matrix.
*

@@ -463,6 +539,10 @@ struct NvKmsLayerCapabilities {
* still expected to honor the NvKmsUsageBounds for each head.
*/
NvU64 supportedSurfaceMemoryFormats NV_ALIGN_BYTES(8);

/* Capabilities for each LUT stage in the EVO3 precomp pipeline. */
struct NvKmsLUTCaps ilut;
struct NvKmsLUTCaps tmo;
};

/*!

@@ -683,4 +763,20 @@ struct NvKmsSuperframeInfo {
} view[NVKMS_MAX_SUPERFRAME_VIEWS];
};

/* Fields within NvKmsVblankSemControlDataOneHead::flags */
#define NVKMS_VBLANK_SEM_CONTROL_SWAP_INTERVAL 15:0

struct NvKmsVblankSemControlDataOneHead {
NvU32 requestCounterAccel;
NvU32 requestCounter;
NvU32 flags;

NvU32 semaphore;
NvU64 vblankCount NV_ALIGN_BYTES(8);
};

struct NvKmsVblankSemControlData {
struct NvKmsVblankSemControlDataOneHead head[NV_MAX_HEADS];
};
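The `15:0` in NVKMS_VBLANK_SEM_CONTROL_SWAP_INTERVAL is NVIDIA's bit-range notation: the swap interval occupies bits 15..0 of `flags`. Extracting it with a plain mask (shown for illustration; real callers would normally use the driver's DRF-style field helpers, and the variable names are assumptions):

    NvU32 flags = data.head[headIndex].flags;
    NvU32 swapInterval = flags & 0xFFFF;   /* bits 15:0 */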
#endif /* NVKMS_API_TYPES_H */

@@ -124,6 +124,14 @@ struct NvKmsKapiDisplayMode {
#define NVKMS_KAPI_LAYER_INVALID_IDX 0xff
#define NVKMS_KAPI_LAYER_PRIMARY_IDX 0

struct NvKmsKapiLutCaps {
struct {
struct NvKmsLUTCaps ilut;
struct NvKmsLUTCaps tmo;
} layer[NVKMS_KAPI_LAYER_MAX];
struct NvKmsLUTCaps olut;
};

struct NvKmsKapiDeviceResourcesInfo {

NvU32 numHeads;

@@ -169,6 +177,8 @@ struct NvKmsKapiDeviceResourcesInfo {

NvU64 supportedSurfaceMemoryFormats[NVKMS_KAPI_LAYER_MAX];
NvBool supportsICtCp[NVKMS_KAPI_LAYER_MAX];

struct NvKmsKapiLutCaps lutCaps;
};

#define NVKMS_KAPI_LAYER_MASK(layerType) (1 << (layerType))

@@ -262,21 +272,54 @@ struct NvKmsKapiLayerConfig {
NvU16 dstWidth, dstHeight;

enum NvKmsInputColorSpace inputColorSpace;

struct {
NvBool enabled;
struct NvKmsKapiSurface *lutSurface;
NvU64 offset;
NvU32 vssSegments;
NvU32 lutEntries;
} ilut;

struct {
NvBool enabled;
struct NvKmsKapiSurface *lutSurface;
NvU64 offset;
NvU32 vssSegments;
NvU32 lutEntries;
} tmo;

struct NvKmsCscMatrix csc;
NvBool cscUseMain;

struct {
struct NvKmsCscMatrix lmsCtm;
struct NvKmsCscMatrix lmsToItpCtm;
struct NvKmsCscMatrix itpToLmsCtm;
struct NvKmsCscMatrix blendCtm;
struct {
NvBool lmsCtm : 1;
NvBool lmsToItpCtm : 1;
NvBool itpToLmsCtm : 1;
NvBool blendCtm : 1;
} enabled;
} matrixOverrides;
};

struct NvKmsKapiLayerRequestedConfig {
struct NvKmsKapiLayerConfig config;
struct {
NvBool surfaceChanged : 1;
NvBool srcXYChanged : 1;
NvBool srcWHChanged : 1;
NvBool dstXYChanged : 1;
NvBool dstWHChanged : 1;
NvBool cscChanged : 1;
NvBool tfChanged : 1;
NvBool hdrMetadataChanged : 1;
NvBool surfaceChanged : 1;
NvBool srcXYChanged : 1;
NvBool srcWHChanged : 1;
NvBool dstXYChanged : 1;
NvBool dstWHChanged : 1;
NvBool cscChanged : 1;
NvBool tfChanged : 1;
NvBool hdrMetadataChanged : 1;
NvBool matrixOverridesChanged : 1;
NvBool ilutChanged : 1;
NvBool tmoChanged : 1;
} flags;
};
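A sketch of how a NVKMS KAPI client might program the new per-layer ILUT, assuming lutSurface was created earlier through the KAPI surface calls and already holds the expected number of entries in the advertised format (the variable names are illustrative):

    struct NvKmsKapiLayerRequestedConfig *pReq = &layerRequestedConfig;

    pReq->config.ilut.enabled     = NV_TRUE;
    pReq->config.ilut.lutSurface  = lutSurface;
    pReq->config.ilut.offset      = 0;
    pReq->config.ilut.vssSegments = 0;               /* no VSS segmenting */
    pReq->config.ilut.lutEntries  = ilutCaps->lutEntries;
    pReq->flags.ilutChanged       = NV_TRUE;         /* tell NVKMS the ILUT changed */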
@@ -342,18 +385,30 @@ struct NvKmsKapiHeadModeSetConfig {
struct NvKmsLutRamps *pRamps;
} output;
} lut;

struct {
NvBool enabled;
struct NvKmsKapiSurface *lutSurface;
NvU64 offset;
NvU32 vssSegments;
NvU32 lutEntries;
} olut;

NvU32 olutFpNormScale;
};

struct NvKmsKapiHeadRequestedConfig {
struct NvKmsKapiHeadModeSetConfig modeSetConfig;
struct {
NvBool activeChanged : 1;
NvBool displaysChanged : 1;
NvBool modeChanged : 1;
NvBool hdrInfoFrameChanged : 1;
NvBool colorimetryChanged : 1;
NvBool ilutChanged : 1;
NvBool olutChanged : 1;
NvBool activeChanged : 1;
NvBool displaysChanged : 1;
NvBool modeChanged : 1;
NvBool hdrInfoFrameChanged : 1;
NvBool colorimetryChanged : 1;
NvBool legacyIlutChanged : 1;
NvBool legacyOlutChanged : 1;
NvBool olutChanged : 1;
NvBool olutFpNormScaleChanged : 1;
} flags;

struct NvKmsKapiCursorRequestedConfig cursorRequestedConfig;

@@ -1172,21 +1227,6 @@ struct NvKmsKapiFunctionsTable {
NvU64 *pPages
);

/*!
* Check if this memory object can be scanned out for display.
*
* \param [in] device A device allocated using allocateDevice().
*
* \param [in] memory The memory object to check for display support.
*
* \return NV_TRUE if this memory can be displayed, NV_FALSE if not.
*/
NvBool (*isMemoryValidForDisplay)
(
const struct NvKmsKapiDevice *device,
const struct NvKmsKapiMemory *memory
);

/*
* Import SGT as a memory handle.
*

@@ -1504,6 +1544,16 @@ struct NvKmsKapiFunctionsTable {
struct NvKmsKapiDevice *device,
NvS32 index
);

/*
* Notify NVKMS that the system's framebuffer console has been disabled and
* the reserved allocation for the old framebuffer console can be unmapped.
*/
void
(*framebufferConsoleDisabled)
(
struct NvKmsKapiDevice *device
);
};

/** @} */

@@ -1518,6 +1568,20 @@ NvBool nvKmsKapiGetFunctionsTable
struct NvKmsKapiFunctionsTable *funcsTable
);

NvU32 nvKmsKapiF16ToF32(NvU16 a);

NvU16 nvKmsKapiF32ToF16(NvU32 a);

NvU32 nvKmsKapiF32Mul(NvU32 a, NvU32 b);

NvU32 nvKmsKapiF32Div(NvU32 a, NvU32 b);

NvU32 nvKmsKapiF32Add(NvU32 a, NvU32 b);

NvU32 nvKmsKapiF32ToUI32RMinMag(NvU32 a, NvBool exact);

NvU32 nvKmsKapiUI32ToF32(NvU32 a);

/** @} */

#endif /* defined(__NVKMS_KAPI_H__) */
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2017-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a

@@ -34,19 +34,25 @@
/*
* This is the maximum number of GPUs supported in a single system.
*/
#define NV_MAX_DEVICES 32
#define NV_MAX_DEVICES 32

/*
* This is the maximum number of subdevices within a single device.
*/
#define NV_MAX_SUBDEVICES 8
#define NV_MAX_SUBDEVICES 8

/*
* This is the maximum length of the process name string.
*/
#define NV_PROC_NAME_MAX_LENGTH 100U
#define NV_PROC_NAME_MAX_LENGTH 100U

/*
* This is the maximum number of heads per GPU.
*/
#define NV_MAX_HEADS 4
#define NV_MAX_HEADS 4

/*
* Maximum length of a MIG device UUID. It is a 36-byte UUID string plus a
* 4-byte prefix and NUL terminator: 'M' 'I' 'G' '-' UUID '\0x0'
*/
#define NV_MIG_DEVICE_UUID_STR_LENGTH 41U

@@ -721,6 +721,42 @@ nvPrevPow2_U64(const NvU64 x )
} \
}

//
// Bug 4851259: Newly added functions must be hidden from certain HS-signed
// ucode compilers to avoid signature mismatch.
//
#ifndef NVDEC_1_0
/*!
* Returns the position of nth set bit in the given mask.
*
* Returns -1 if mask has fewer than n bits set.
*
* n is 0 indexed and has valid values 0..31 inclusive, so "zeroth" set bit is
* the first set LSB.
*
* Example, if mask = 0x000000F0u and n = 1, the return value will be 5.
* Example, if mask = 0x000000F0u and n = 4, the return value will be -1.
*/
static NV_FORCEINLINE NvS32
nvGetNthSetBitIndex32(NvU32 mask, NvU32 n)
{
NvU32 seenSetBitsCount = 0;
NvS32 index;
FOR_EACH_INDEX_IN_MASK(32, index, mask)
{
if (seenSetBitsCount == n)
{
return index;
}
++seenSetBitsCount;
}
FOR_EACH_INDEX_IN_MASK_END;

return -1;
}

#endif // NVDEC_1_0
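For example, restating the documented cases as a usage sketch (the mask value is arbitrary):

    NvU32 mask = 0x000000F0u;                         /* bits 4..7 set */
    NvS32 second = nvGetNthSetBitIndex32(mask, 1);    /* 5: the second set bit */
    NvS32 fifth  = nvGetNthSetBitIndex32(mask, 4);    /* -1: only four bits are set */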
//
// Size to use when declaring variable-sized arrays
//

@@ -40,8 +40,11 @@
#include "nv_stdarg.h"
#include <nv-kernel-interface-api.h>
#include <os/nv_memory_type.h>
#include <os/nv_memory_area.h>
#include <nv-caps.h>

#include "rs_access.h"

typedef struct

@@ -102,8 +105,10 @@ NvBool NV_API_CALL os_pci_remove_supported (void);
void NV_API_CALL os_pci_remove (void *);
void* NV_API_CALL os_map_kernel_space (NvU64, NvU64, NvU32);
void NV_API_CALL os_unmap_kernel_space (void *, NvU64);
void* NV_API_CALL os_map_user_space (NvU64, NvU64, NvU32, NvU32, void **);
#if defined(NV_VMWARE)
void* NV_API_CALL os_map_user_space (MemoryArea *, NvU32, NvU32, void **);
void NV_API_CALL os_unmap_user_space (void *, NvU64, void *);
#endif
NV_STATUS NV_API_CALL os_flush_cpu_cache_all (void);
NV_STATUS NV_API_CALL os_flush_user_cache (void);
void NV_API_CALL os_flush_cpu_write_combine_buffer(void);

@@ -114,7 +119,7 @@ void NV_API_CALL os_io_write_byte (NvU32, NvU8);
void NV_API_CALL os_io_write_word (NvU32, NvU16);
void NV_API_CALL os_io_write_dword (NvU32, NvU32);
NvBool NV_API_CALL os_is_administrator (void);
NvBool NV_API_CALL os_allow_priority_override (void);
NvBool NV_API_CALL os_check_access (RsAccessRight accessRight);
void NV_API_CALL os_dbg_init (void);
void NV_API_CALL os_dbg_breakpoint (void);
void NV_API_CALL os_dbg_set_level (NvU32);

@@ -130,7 +135,8 @@ void NV_API_CALL os_free_spinlock (void *);
NvU64 NV_API_CALL os_acquire_spinlock (void *);
void NV_API_CALL os_release_spinlock (void *, NvU64);
NV_STATUS NV_API_CALL os_queue_work_item (struct os_work_queue *, void *);
NV_STATUS NV_API_CALL os_flush_work_queue (struct os_work_queue *);
NV_STATUS NV_API_CALL os_flush_work_queue (struct os_work_queue *, NvBool);
NvBool NV_API_CALL os_is_queue_flush_ongoing (struct os_work_queue *);
NV_STATUS NV_API_CALL os_alloc_mutex (void **);
void NV_API_CALL os_free_mutex (void *);
NV_STATUS NV_API_CALL os_acquire_mutex (void *);

@@ -219,6 +225,8 @@ extern NvU32 os_page_size;
extern NvU64 os_page_mask;
extern NvU8 os_page_shift;
extern NvBool os_cc_enabled;
extern NvBool os_cc_sev_snp_enabled;
extern NvBool os_cc_snp_vtom_enabled;
extern NvBool os_cc_tdx_enabled;
extern NvBool os_dma_buf_enabled;
extern NvBool os_imex_channel_is_supported;
@@ -36,4 +36,69 @@ typedef struct MemoryArea
NvU64 numRanges;
} MemoryArea;

static inline NvU64 memareaSize(MemoryArea memArea)
{
NvU64 size = 0;
NvU64 idx = 0;
for (idx = 0; idx < memArea.numRanges; idx++)
{
size += memArea.pRanges[idx].size;
}
return size;
}

static inline MemoryRange
mrangeMake
(
NvU64 start,
NvU64 size
)
{
MemoryRange range;
range.start = start;
range.size = size;
return range;
}

static inline NvU64
mrangeLimit
(
MemoryRange a
)
{
return a.start + a.size;
}

static inline NvBool
mrangeIntersects
(
MemoryRange a,
MemoryRange b
)
{
return ((a.start >= b.start) && (a.start < mrangeLimit(b))) ||
((b.start >= a.start) && (b.start < mrangeLimit(a)));
}

static inline NvBool
mrangeContains
(
MemoryRange outer,
MemoryRange inner
)
{
return (inner.start >= outer.start) && (mrangeLimit(inner) <= mrangeLimit(outer));
}

static inline MemoryRange
mrangeOffset
(
MemoryRange range,
NvU64 amt
)
{
range.start += amt;
return range;
}

#endif /* NV_MEMORY_AREA_H */
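A short usage sketch of the range helpers above (the addresses and sizes are arbitrary illustrative values):

    MemoryRange ranges[2];
    MemoryArea  area;

    ranges[0] = mrangeMake(0x00000, 0x10000);
    ranges[1] = mrangeMake(0x40000, 0x08000);
    area.pRanges   = ranges;
    area.numRanges = 2;

    NvU64  total   = memareaSize(area);                                      /* 0x18000  */
    NvBool inside  = mrangeContains(ranges[0], mrangeMake(0x1000, 0x2000));  /* NV_TRUE  */
    NvBool overlap = mrangeIntersects(ranges[0], ranges[1]);                 /* NV_FALSE */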
@@ -85,9 +85,11 @@ NV_STATUS NV_API_CALL rm_gpu_ops_enable_access_cntr(nvidia_stack_t *, nvgpuDevi
NV_STATUS NV_API_CALL rm_gpu_ops_disable_access_cntr(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuAccessCntrInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_set_page_directory (nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvU64, unsigned, NvBool, NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_unset_page_directory (nvidia_stack_t *, nvgpuAddressSpaceHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_get_nvlink_info(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuNvlinkInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_p2p_object_create(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuDeviceHandle_t, NvHandle *);
void NV_API_CALL rm_gpu_ops_p2p_object_destroy(nvidia_stack_t *, nvgpuSessionHandle_t, NvHandle);
NV_STATUS NV_API_CALL rm_gpu_ops_get_external_alloc_ptes(nvidia_stack_t*, nvgpuAddressSpaceHandle_t, NvHandle, NvU64, NvU64, nvgpuExternalMappingInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_get_external_alloc_phys_addrs(nvidia_stack_t*, nvgpuAddressSpaceHandle_t, NvHandle, NvU64, NvU64, nvgpuExternalPhysAddrInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_retain_channel(nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvHandle, NvHandle, void **, nvgpuChannelInstanceInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_bind_channel_resources(nvidia_stack_t *, void *, nvgpuChannelResourceBindParams_t);
void NV_API_CALL rm_gpu_ops_release_channel(nvidia_stack_t *, void *);

@@ -100,6 +102,7 @@ void NV_API_CALL rm_gpu_ops_paging_channel_destroy(nvidia_stack_t *, nvgpu
NV_STATUS NV_API_CALL rm_gpu_ops_paging_channels_map(nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvU64, nvgpuDeviceHandle_t, NvU64 *);
void NV_API_CALL rm_gpu_ops_paging_channels_unmap(nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvU64, nvgpuDeviceHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_paging_channel_push_stream(nvidia_stack_t *, nvgpuPagingChannelHandle_t, char *, NvU32);
void NV_API_CALL rm_gpu_ops_report_fatal_error(nvidia_stack_t *, NV_STATUS error);

NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_init(nvidia_stack_t *, struct ccslContext_t **, nvgpuChannelHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_clear(nvidia_stack_t *, struct ccslContext_t *);
kernel-open/common/inc/rs_access.h (new file, 276 lines)

@@ -0,0 +1,276 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#pragma once

#include <nvtypes.h>
#if defined(_MSC_VER)
#pragma warning(disable:4324)
#endif

//
// This file was generated with FINN, an NVIDIA coding tool.
// Source file: rs_access.finn
//

#include "nvtypes.h"
#include "nvmisc.h"

/****************************************************************************/
/* Access right definitions */
/****************************************************************************/

//
// The meaning of each access right is documented in
// resman/docs/rmapi/resource_server/rm_capabilities.adoc
//
// RS_ACCESS_COUNT is the number of access rights that have been defined
// and are in use. All integers in the range [0, RS_ACCESS_COUNT) should
// represent valid access rights.
//
// When adding a new access right, don't forget to update
// 1) The descriptions in the resman/docs/rmapi/resource_server/rm_capabilities.adoc
// 2) RS_ACCESS_COUNT, defined below
// 3) The declaration of g_rsAccessMetadata in rs_access_rights.c
// 4) The list of access rights in drivers/common/chip-config/Chipcontrols.pm
// 5) Any relevant access right callbacks
//

#define RS_ACCESS_DUP_OBJECT 0U
#define RS_ACCESS_NICE 1U
#define RS_ACCESS_DEBUG 2U
#define RS_ACCESS_PERFMON 3U
#define RS_ACCESS_COUNT 4U

/****************************************************************************/
/* Access right data structures */
/****************************************************************************/

/*!
* @brief A type that can be used to represent any access right.
*/
typedef NvU16 RsAccessRight;

/*!
* @brief An internal type used to represent one limb in an access right mask.
*/
typedef NvU32 RsAccessLimb;
#define SDK_RS_ACCESS_LIMB_BITS 32

/*!
* @brief The number of limbs in the RS_ACCESS_MASK struct.
*/
#define SDK_RS_ACCESS_MAX_LIMBS 1

/*!
* @brief The maximum number of possible access rights supported by the
* current data structure definition.
*
* You probably want RS_ACCESS_COUNT instead, which is the number of actual
* access rights defined.
*/
#define SDK_RS_ACCESS_MAX_COUNT (0x20) /* finn: Evaluated from "(SDK_RS_ACCESS_LIMB_BITS * SDK_RS_ACCESS_MAX_LIMBS)" */

/**
* @brief A struct representing a set of access rights.
*
* Note that the values of bit positions larger than RS_ACCESS_COUNT is
* undefined, and should not be assumed to be 0 (see RS_ACCESS_MASK_FILL).
*/
typedef struct RS_ACCESS_MASK {
RsAccessLimb limbs[SDK_RS_ACCESS_MAX_LIMBS];
} RS_ACCESS_MASK;

/**
* @brief A struct representing auxiliary information about each access right.
*/
typedef struct RS_ACCESS_INFO {
NvU32 flags;
} RS_ACCESS_INFO;

/****************************************************************************/
/* Access right macros */
/****************************************************************************/

#define SDK_RS_ACCESS_LIMB_INDEX(index) ((index) / SDK_RS_ACCESS_LIMB_BITS)
#define SDK_RS_ACCESS_LIMB_POS(index) ((index) % SDK_RS_ACCESS_LIMB_BITS)

#define SDK_RS_ACCESS_LIMB_ELT(pAccessMask, index) \
((pAccessMask)->limbs[SDK_RS_ACCESS_LIMB_INDEX(index)])
#define SDK_RS_ACCESS_OFFSET_MASK(index) \
NVBIT_TYPE(SDK_RS_ACCESS_LIMB_POS(index), RsAccessLimb)

/*!
* @brief Checks that accessRight represents a valid access right.
*
* The valid range of access rights is [0, RS_ACCESS_COUNT).
*
* @param[in] accessRight The access right value to check
*
* @return true if accessRight is valid
* @return false otherwise
*/
#define RS_ACCESS_BOUNDS_CHECK(accessRight) \
(accessRight < RS_ACCESS_COUNT)

/*!
* @brief Test whether an access right is present in a set
*
* @param[in] pAccessMask The set of access rights to read
* @param[in] index The access right to examine
*
* @return NV_TRUE if the access right specified by index was present in the set,
* and NV_FALSE otherwise
*/
#define RS_ACCESS_MASK_TEST(pAccessMask, index) \
(RS_ACCESS_BOUNDS_CHECK(index) && \
(SDK_RS_ACCESS_LIMB_ELT(pAccessMask, index) & SDK_RS_ACCESS_OFFSET_MASK(index)) != 0)

/*!
* @brief Add an access right to a mask
*
* @param[in] pAccessMask The set of access rights to modify
* @param[in] index The access right to set
*/
#define RS_ACCESS_MASK_ADD(pAccessMask, index) \
do \
{ \
if (RS_ACCESS_BOUNDS_CHECK(index)) { \
SDK_RS_ACCESS_LIMB_ELT(pAccessMask, index) |= SDK_RS_ACCESS_OFFSET_MASK(index); \
} \
} while (NV_FALSE)

/*!
* @brief Remove an access right from a mask
*
* @param[in] pAccessMask The set of access rights to modify
* @param[in] index The access right to unset
*/
#define RS_ACCESS_MASK_REMOVE(pAccessMask, index) \
do \
{ \
if (RS_ACCESS_BOUNDS_CHECK(index)) { \
SDK_RS_ACCESS_LIMB_ELT(pAccessMask, index) &= ~SDK_RS_ACCESS_OFFSET_MASK(index); \
} \
} while (NV_FALSE)

/*!
* @brief Performs an in-place union between two access right masks
*
* @param[in,out] pMaskOut The access rights mask to be updated
* @param[in] pMaskIn The set of access rights to be added to pMaskOut
*/
#define RS_ACCESS_MASK_UNION(pMaskOut, pMaskIn) \
do \
{ \
NvLength limb; \
for (limb = 0; limb < SDK_RS_ACCESS_MAX_LIMBS; limb++) \
{ \
SDK_RS_ACCESS_LIMB_ELT(pMaskOut, limb) |= SDK_RS_ACCESS_LIMB_ELT(pMaskIn, limb); \
} \
} while (NV_FALSE)

/*!
* @brief Performs an in-place subtract of one mask's rights from another
*
* @param[in,out] pMaskOut The access rights mask to be updated
* @param[in] pMaskIn The set of access rights to be removed from pMaskOut
*/
#define RS_ACCESS_MASK_SUBTRACT(pMaskOut, pMaskIn) \
do \
{ \
NvLength limb; \
for (limb = 0; limb < SDK_RS_ACCESS_MAX_LIMBS; limb++) \
{ \
SDK_RS_ACCESS_LIMB_ELT(pMaskOut, limb) &= ~SDK_RS_ACCESS_LIMB_ELT(pMaskIn, limb); \
} \
} while (NV_FALSE)

/*!
* @brief Removes all rights from an access rights mask
*
* @param[in,out] pAccessMask The access rights mask to be updated
*/
#define RS_ACCESS_MASK_CLEAR(pAccessMask) \
do \
{ \
portMemSet(pAccessMask, 0, sizeof(*pAccessMask)); \
} while (NV_FALSE)

/*!
* @brief Adds all rights to an access rights mask
*
* @param[in,out] pAccessMask The access rights mask to be updated
*/
#define RS_ACCESS_MASK_FILL(pAccessMask) \
do \
{ \
portMemSet(pAccessMask, 0xff, sizeof(*pAccessMask)); \
} while (NV_FALSE)
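A short sketch of the mask macros in use (RS_ACCESS_MASK_CLEAR expands to portMemSet, so this assumes an environment where that helper is available):

    RS_ACCESS_MASK rightsMask;

    RS_ACCESS_MASK_CLEAR(&rightsMask);                    /* start with no rights */
    RS_ACCESS_MASK_ADD(&rightsMask, RS_ACCESS_DUP_OBJECT);
    RS_ACCESS_MASK_ADD(&rightsMask, RS_ACCESS_DEBUG);

    if (RS_ACCESS_MASK_TEST(&rightsMask, RS_ACCESS_DEBUG))
    {
        /* the DEBUG right is present in the mask */
    }

    RS_ACCESS_MASK_REMOVE(&rightsMask, RS_ACCESS_DUP_OBJECT);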
/****************************************************************************/
/* Share definitions */
/****************************************************************************/

//
// The usage of Share Policy and the meaning of each share type is documented in
// resman/docs/rmapi/resource_server/rm_capabilities.adoc
//
#define RS_SHARE_TYPE_NONE (0U)
#define RS_SHARE_TYPE_ALL (1U)
#define RS_SHARE_TYPE_OS_SECURITY_TOKEN (2U)
#define RS_SHARE_TYPE_CLIENT (3U)
#define RS_SHARE_TYPE_PID (4U)
#define RS_SHARE_TYPE_SMC_PARTITION (5U)
#define RS_SHARE_TYPE_GPU (6U)
#define RS_SHARE_TYPE_FM_CLIENT (7U)
// Must be last. Update when a new SHARE_TYPE is added
#define RS_SHARE_TYPE_MAX (8U)

//
// Use Revoke to remove an existing policy from the list.
// Allow is based on OR logic, Require is based on AND logic.
// To share a right, at least one Allow (non-Require) must match, and all Require must pass.
// If Compose is specified, policies will be added to the list. Otherwise, they will replace the list.
//
#define RS_SHARE_ACTION_FLAG_REVOKE NVBIT(0)
#define RS_SHARE_ACTION_FLAG_REQUIRE NVBIT(1)
#define RS_SHARE_ACTION_FLAG_COMPOSE NVBIT(2)

/****************************************************************************/
/* Share flag data structures */
/****************************************************************************/

typedef struct RS_SHARE_POLICY {
NvU32 target;
RS_ACCESS_MASK accessMask;
NvU16 type; ///< RS_SHARE_TYPE_
NvU8 action; ///< RS_SHARE_ACTION_
} RS_SHARE_POLICY;
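A sketch of filling in a share policy that grants the DUP_OBJECT right to clients of a given PID and composes with (rather than replaces) the existing share list; the PID value is an illustrative assumption:

    RS_SHARE_POLICY sharePolicy;

    RS_ACCESS_MASK_CLEAR(&sharePolicy.accessMask);
    RS_ACCESS_MASK_ADD(&sharePolicy.accessMask, RS_ACCESS_DUP_OBJECT);
    sharePolicy.type   = RS_SHARE_TYPE_PID;
    sharePolicy.target = 1234;                        /* PID to share with (hypothetical) */
    sharePolicy.action = RS_SHARE_ACTION_FLAG_COMPOSE;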
|
@ -661,23 +661,6 @@ compile_test() {
|
||||
compile_check_conftest "$CODE" "NV_PCI_GET_DOMAIN_BUS_AND_SLOT_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
pci_bus_address)
|
||||
#
|
||||
# Determine if the pci_bus_address() function is
|
||||
# present.
|
||||
#
|
||||
# Added by commit 06cf56e497c8 ("PCI: Add pci_bus_address() to
|
||||
# get bus address of a BAR") in v3.14
|
||||
#
|
||||
CODE="
|
||||
#include <linux/pci.h>
|
||||
void conftest_pci_bus_address(void) {
|
||||
pci_bus_address();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_PCI_BUS_ADDRESS_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
hash__remap_4k_pfn)
|
||||
#
|
||||
# Determine if the hash__remap_4k_pfn() function is
|
||||
@ -1538,23 +1521,6 @@ compile_test() {
|
||||
compile_check_conftest "$CODE" "NV_GET_NUM_PHYSPAGES_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
backing_dev_info)
|
||||
#
|
||||
# Determine if the 'address_space' structure has
|
||||
# a 'backing_dev_info' field.
|
||||
#
|
||||
# Removed by commit b83ae6d42143 ("fs: remove
|
||||
# mapping->backing_dev_info") in v4.0
|
||||
#
|
||||
CODE="
|
||||
#include <linux/fs.h>
|
||||
int conftest_backing_dev_info(void) {
|
||||
return offsetof(struct address_space, backing_dev_info);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_ADDRESS_SPACE_HAS_BACKING_DEV_INFO" "" "types"
|
||||
;;
|
||||
|
||||
xen_ioemu_inject_msi)
|
||||
# Determine if the xen_ioemu_inject_msi() function is present.
|
||||
CODE="
|
||||
@ -2409,45 +2375,6 @@ compile_test() {
|
||||
compile_check_conftest "$CODE" "NV_DRM_ATOMIC_HELPER_LEGACY_GAMMA_SET_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
wait_on_bit_lock_argument_count)
|
||||
#
|
||||
# Determine how many arguments wait_on_bit_lock takes.
|
||||
#
|
||||
# Changed by commit 743162013d40 ("sched: Remove proliferation
|
||||
# of wait_on_bit() action functions") in v3.17 (2014-07-07)
|
||||
#
|
||||
echo "$CONFTEST_PREAMBLE
|
||||
#include <linux/wait.h>
|
||||
void conftest_wait_on_bit_lock(void) {
|
||||
wait_on_bit_lock(NULL, 0, 0);
|
||||
}" > conftest$$.c
|
||||
|
||||
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
|
||||
rm -f conftest$$.c
|
||||
|
||||
if [ -f conftest$$.o ]; then
|
||||
rm -f conftest$$.o
|
||||
echo "#define NV_WAIT_ON_BIT_LOCK_ARGUMENT_COUNT 3" | append_conftest "functions"
|
||||
return
|
||||
fi
|
||||
|
||||
echo "$CONFTEST_PREAMBLE
|
||||
#include <linux/wait.h>
|
||||
void conftest_wait_on_bit_lock(void) {
|
||||
wait_on_bit_lock(NULL, 0, NULL, 0);
|
||||
}" > conftest$$.c
|
||||
|
||||
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
|
||||
rm -f conftest$$.c
|
||||
|
||||
if [ -f conftest$$.o ]; then
|
||||
rm -f conftest$$.o
|
||||
echo "#define NV_WAIT_ON_BIT_LOCK_ARGUMENT_COUNT 4" | append_conftest "functions"
|
||||
return
|
||||
fi
|
||||
echo "#error wait_on_bit_lock() conftest failed!" | append_conftest "functions"
|
||||
;;
|
||||
|
||||
pci_stop_and_remove_bus_device)
|
||||
#
|
||||
# Determine if the pci_stop_and_remove_bus_device() function is present.
|
||||
@ -2523,31 +2450,6 @@ compile_test() {
|
||||
fi
|
||||
;;
|
||||
|
||||
mm_context_t)
|
||||
#
|
||||
# Determine if the 'mm_context_t' data type is present
|
||||
# and if it has an 'id' member.
|
||||
# It does not exist on all architectures.
|
||||
#
|
||||
echo "$CONFTEST_PREAMBLE
|
||||
#include <linux/mm.h>
|
||||
int conftest_mm_context_t(void) {
|
||||
return offsetof(mm_context_t, id);
|
||||
}" > conftest$$.c
|
||||
|
||||
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
|
||||
rm -f conftest$$.c
|
||||
|
||||
if [ -f conftest$$.o ]; then
|
||||
echo "#define NV_MM_CONTEXT_T_HAS_ID" | append_conftest "types"
|
||||
rm -f conftest$$.o
|
||||
return
|
||||
else
|
||||
echo "#undef NV_MM_CONTEXT_T_HAS_ID" | append_conftest "types"
|
||||
return
|
||||
fi
|
||||
;;
|
||||
|
||||
pci_dev_has_ats_enabled)
|
||||
#
|
||||
# Determine if the 'pci_dev' data type has a 'ats_enabled' member.
|
||||
@ -5102,6 +5004,42 @@ compile_test() {
|
||||
compile_check_conftest "$CODE" "NV_CC_PLATFORM_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
cc_attr_guest_sev_snp)
|
||||
#
|
||||
# Determine if 'CC_ATTR_GUEST_SEV_SNP' is present.
|
||||
#
|
||||
# Added by commit aa5a461171f9 ("x86/mm: Extend cc_attr to
|
||||
# include AMD SEV-SNP") in v5.19.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_LINUX_CC_PLATFORM_H_PRESENT)
|
||||
#include <linux/cc_platform.h>
|
||||
#endif
|
||||
|
||||
enum cc_attr cc_attributes = CC_ATTR_GUEST_SEV_SNP;
|
||||
"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_CC_ATTR_SEV_SNP" "" "types"
|
||||
;;
|
||||
|
||||
hv_get_isolation_type)
|
||||
#
|
||||
# Determine if 'hv_get_isolation_type()' is present.
|
||||
# Added by commit faff44069ff5 ("x86/hyperv: Add Write/Read MSR
|
||||
# registers via ghcb page") in v5.16.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_ASM_MSHYPERV_H_PRESENT)
|
||||
#include <asm/mshyperv.h>
|
||||
#endif
|
||||
void conftest_hv_get_isolation_type(void) {
|
||||
int i;
|
||||
hv_get_isolation_type(i);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_HV_GET_ISOLATION_TYPE" "" "functions"
|
||||
;;
|
||||
|
||||
drm_prime_pages_to_sg_has_drm_device_arg)
|
||||
#
|
||||
# Determine if drm_prime_pages_to_sg() has 'dev' argument.
|
||||
@ -6596,7 +6534,9 @@ compile_test() {
|
||||
# Determine whether drm_fbdev_generic_setup is present.
|
||||
#
|
||||
# Added by commit 9060d7f49376 ("drm/fb-helper: Finish the
|
||||
# generic fbdev emulation") in v4.19.
|
||||
# generic fbdev emulation") in v4.19. Removed by commit
|
||||
# aae4682e5d66 ("drm/fbdev-generic: Convert to fbdev-ttm")
|
||||
# in v6.11.
|
||||
#
|
||||
CODE="
|
||||
#include <drm/drm_fb_helper.h>
|
||||
@ -6608,6 +6548,48 @@ compile_test() {
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_FBDEV_GENERIC_SETUP_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
drm_fbdev_ttm_setup)
|
||||
#
|
||||
# Determine whether drm_fbdev_ttm_setup is present.
|
||||
#
|
||||
# Added by commit aae4682e5d66 ("drm/fbdev-generic:
|
||||
# Convert to fbdev-ttm") in v6.11.
|
||||
#
|
||||
CODE="
|
||||
#include <drm/drm_fb_helper.h>
|
||||
#if defined(NV_DRM_DRM_FBDEV_TTM_H_PRESENT)
|
||||
#include <drm/drm_fbdev_ttm.h>
|
||||
#endif
|
||||
void conftest_drm_fbdev_ttm_setup(void) {
|
||||
drm_fbdev_ttm_setup();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_FBDEV_TTM_SETUP_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
drm_output_poll_changed)
|
||||
#
|
||||
# Determine whether drm_mode_config_funcs.output_poll_changed
|
||||
# callback is present
|
||||
#
|
||||
# Removed by commit 446d0f4849b1 ("drm: Remove struct
|
||||
# drm_mode_config_funcs.output_poll_changed") in v6.12. Hotplug
|
||||
# event support is handled through the fbdev emulation interface
|
||||
# going forward.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_DRM_DRM_MODE_CONFIG_H_PRESENT)
|
||||
#include <drm/drm_mode_config.h>
|
||||
#else
|
||||
#include <drm/drm_crtc.h>
|
||||
#endif
|
||||
int conftest_drm_output_poll_changed_available(void) {
|
||||
return offsetof(struct drm_mode_config_funcs, output_poll_changed);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_OUTPUT_POLL_CHANGED_PRESENT" "" "types"
|
||||
;;
|
||||
|
||||
drm_aperture_remove_conflicting_pci_framebuffers)
|
||||
@@ -6990,6 +6972,192 @@ compile_test() {
|
||||
compile_check_conftest "$CODE" "NV_DRM_PROPERTY_BLOB_PUT_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
drm_driver_has_gem_prime_mmap)
|
||||
#
|
||||
# Determine if the 'drm_driver' structure has a 'gem_prime_mmap'
|
||||
# function pointer.
|
||||
#
|
||||
# Removed by commit 0adec22702d4 ("drm: Remove struct
|
||||
# drm_driver.gem_prime_mmap") in v6.6.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_DRM_DRMP_H_PRESENT)
|
||||
#include <drm/drmP.h>
|
||||
#endif
|
||||
|
||||
#if defined(NV_DRM_DRM_DRV_H_PRESENT)
|
||||
#include <drm/drm_drv.h>
|
||||
#endif
|
||||
|
||||
int conftest_drm_driver_has_gem_prime_mmap(void) {
|
||||
return offsetof(struct drm_driver, gem_prime_mmap);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_DRIVER_HAS_GEM_PRIME_MMAP" "" "types"
|
||||
;;
|
||||
|
||||
drm_gem_prime_mmap)
|
||||
#
|
||||
# Determine if the function drm_gem_prime_mmap() is present.
|
||||
#
|
||||
# Added by commit 7698799f95 ("drm/prime: Add drm_gem_prime_mmap()")
# in v5.0.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_DRM_DRMP_H_PRESENT)
|
||||
#include <drm/drmP.h>
|
||||
#endif
|
||||
#if defined(NV_DRM_DRM_PRIME_H_PRESENT)
|
||||
#include <drm/drm_prime.h>
|
||||
#endif
|
||||
void conftest_drm_gem_prime_mmap(void) {
|
||||
drm_gem_prime_mmap();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_GEM_PRIME_MMAP_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
vmf_insert_mixed)
|
||||
#
|
||||
# Determine if the function vmf_insert_mixed() is present.
|
||||
#
|
||||
# Added by commit 1c8f422059ae ("mm: change return type to
|
||||
# vm_fault_t") in v4.17.
|
||||
#
|
||||
CODE="
|
||||
#include <linux/mm.h>
|
||||
void conftest_vmf_insert_mixed() {
|
||||
vmf_insert_mixed();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_VMF_INSERT_MIXED_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
pfn_to_pfn_t)
|
||||
#
|
||||
# Determine if the function pfn_to_pfn_t() is present.
|
||||
#
|
||||
# Added by commit 34c0fd540e79 ("mm, dax, pmem: introduce pfn_t") in
|
||||
# v4.5.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_LINUX_PFN_T_H_PRESENT)
|
||||
#include <linux/pfn_t.h>
|
||||
#endif
|
||||
void conftest_pfn_to_pfn_t() {
|
||||
pfn_to_pfn_t();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_PFN_TO_PFN_T_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
drm_gem_dmabuf_mmap)
|
||||
#
|
||||
# Determine if the drm_gem_dmabuf_mmap() function is present.
|
||||
#
|
||||
# drm_gem_dmabuf_mmap() was exported by commit c308279f8798 ("drm:
|
||||
# export gem dmabuf_ops for drivers to reuse") in v4.17.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_DRM_DRM_PRIME_H_PRESENT)
|
||||
#include <drm/drm_prime.h>
|
||||
#endif
|
||||
void conftest_drm_gem_dmabuf_mmap(void) {
|
||||
drm_gem_dmabuf_mmap();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_GEM_DMABUF_MMAP_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
drm_gem_prime_export_has_dev_arg)
|
||||
#
|
||||
# Determine if drm_gem_prime_export() function has a 'dev' argument.
|
||||
#
|
||||
# This argument was removed by commit e4fa8457b219 ("drm/prime:
|
||||
# Align gem_prime_export with obj_funcs.export") in v5.4.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_DRM_DRMP_H_PRESENT)
|
||||
#include <drm/drmP.h>
|
||||
#endif
|
||||
#if defined(NV_DRM_DRM_PRIME_H_PRESENT)
|
||||
#include <drm/drm_prime.h>
|
||||
#endif
|
||||
|
||||
void conftest_drm_gem_prime_export_has_dev_arg(
|
||||
struct drm_device *dev,
|
||||
struct drm_gem_object *obj) {
|
||||
(void) drm_gem_prime_export(dev, obj, 0);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_GEM_PRIME_EXPORT_HAS_DEV_ARG" "" "types"
|
||||
;;
|
||||
|
||||
dma_buf_ops_has_cache_sgt_mapping)
|
||||
#
|
||||
# Determine if dma_buf_ops structure has a 'cache_sgt_mapping'
|
||||
# member.
|
||||
#
|
||||
# dma_buf_ops::cache_sgt_mapping was added by commit f13e143e7444
|
||||
# ("dma-buf: start caching of sg_table objects v2") in v5.3.
|
||||
#
|
||||
CODE="
|
||||
#include <linux/dma-buf.h>
|
||||
int conftest_dma_ops_has_cache_sgt_mapping(void) {
|
||||
return offsetof(struct dma_buf_ops, cache_sgt_mapping);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DMA_BUF_OPS_HAS_CACHE_SGT_MAPPING" "" "types"
|
||||
;;
|
||||
|
||||
drm_gem_object_funcs)
|
||||
#
|
||||
# Determine if the 'struct drm_gem_object_funcs' type is present.
|
||||
#
|
||||
# Added by commit b39b5394fabc ("drm/gem: Add drm_gem_object_funcs")
|
||||
# in v5.0.
|
||||
#
|
||||
CODE="
|
||||
#if defined(NV_DRM_DRM_GEM_H_PRESENT)
|
||||
#include <drm/drm_gem.h>
|
||||
#endif
|
||||
struct drm_gem_object_funcs funcs;"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_DRM_GEM_OBJECT_FUNCS_PRESENT" "" "types"
|
||||
;;
|
||||
|
||||
struct_page_has_zone_device_data)
|
||||
#
|
||||
# Determine if struct page has a 'zone_device_data' field.
|
||||
#
|
||||
# Added by commit 8a164fef9c4c ("mm: simplify ZONE_DEVICE page
|
||||
# private data") in v5.3.
|
||||
#
|
||||
CODE="
|
||||
#include <linux/mm_types.h>
|
||||
int conftest_struct_page_has_zone_device_data(void) {
|
||||
return offsetof(struct page, zone_device_data);
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_STRUCT_PAGE_HAS_ZONE_DEVICE_DATA" "" "types"
|
||||
;;
|
||||
|
||||
folio_test_swapcache)
|
||||
#
|
||||
# Determine if the folio_test_swapcache() function is present.
|
||||
#
|
||||
# folio_test_swapcache() was exported by commit d389a4a811551 ("mm:
|
||||
# Add folio flag manipulation functions") in v5.16.
|
||||
#
|
||||
CODE="
|
||||
#include <linux/page-flags.h>
|
||||
void conftest_folio_test_swapcache(void) {
|
||||
folio_test_swapcache();
|
||||
}"
|
||||
|
||||
compile_check_conftest "$CODE" "NV_FOLIO_TEST_SWAPCACHE_PRESENT" "" "functions"
|
||||
;;
|
||||
|
||||
# When adding a new conftest entry, please use the correct format for
|
||||
# specifying the relevant upstream Linux kernel commit. Please
|
||||
# avoid specifying -rc kernels, and only use SHAs that actually exist
|
||||
|
@@ -15,6 +15,7 @@ NV_HEADER_PRESENCE_TESTS = \
  drm/drm_atomic_uapi.h \
  drm/drm_drv.h \
  drm/drm_fbdev_generic.h \
  drm/drm_fbdev_ttm.h \
  drm/drm_framebuffer.h \
  drm/drm_connector.h \
  drm/drm_probe_helper.h \
@@ -99,5 +100,7 @@ NV_HEADER_PRESENCE_TESTS = \
  linux/sync_file.h \
  linux/cc_platform.h \
  asm/cpufeature.h \
  linux/mpi.h
  linux/mpi.h \
  asm/mshyperv.h \
  linux/pfn_t.h

@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

        // Ran out of attempts - return thread even if its stack may not be
        // allocated on the preferred node
        if ((i == (attempts - 1)))
        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If

120
kernel-open/nvidia-drm/nv_common_utils.h
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2015 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __NV_COMMON_UTILS_H__
|
||||
#define __NV_COMMON_UTILS_H__
|
||||
|
||||
#include "nvtypes.h"
|
||||
#include "nvmisc.h"
|
||||
|
||||
#if !defined(TRUE)
|
||||
#define TRUE NV_TRUE
|
||||
#endif
|
||||
|
||||
#if !defined(FALSE)
|
||||
#define FALSE NV_FALSE
|
||||
#endif
|
||||
|
||||
#define NV_IS_UNSIGNED(x) ((__typeof__(x))-1 > 0)
|
||||
|
||||
/* Get the length of a statically-sized array. */
|
||||
#define ARRAY_LEN(_arr) (sizeof(_arr) / sizeof(_arr[0]))
|
||||
|
||||
#define NV_INVALID_HEAD 0xFFFFFFFF
|
||||
|
||||
#define NV_INVALID_CONNECTOR_PHYSICAL_INFORMATION (~0)
|
||||
|
||||
#if !defined(NV_MIN)
|
||||
# define NV_MIN(a,b) (((a)<(b))?(a):(b))
|
||||
#endif
|
||||
|
||||
#define NV_MIN3(a,b,c) NV_MIN(NV_MIN(a, b), c)
|
||||
#define NV_MIN4(a,b,c,d) NV_MIN3(NV_MIN(a,b),c,d)
|
||||
|
||||
#if !defined(NV_MAX)
|
||||
# define NV_MAX(a,b) (((a)>(b))?(a):(b))
|
||||
#endif
|
||||
|
||||
#define NV_MAX3(a,b,c) NV_MAX(NV_MAX(a, b), c)
|
||||
#define NV_MAX4(a,b,c,d) NV_MAX3(NV_MAX(a,b),c,d)
|
||||
|
||||
static inline int NV_LIMIT_VAL_TO_MIN_MAX(int val, int min, int max)
|
||||
{
|
||||
if (val < min) {
|
||||
return min;
|
||||
}
|
||||
if (val > max) {
|
||||
return max;
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
#define NV_ROUNDUP_DIV(x,y) ((x) / (y) + (((x) % (y)) ? 1 : 0))
|
||||
|
||||
/*
|
||||
* Macros used for computing palette entries:
|
||||
*
|
||||
* NV_UNDER_REPLICATE(val, source_size, result_size) expands a value
|
||||
* of source_size bits into a value of target_size bits by shifting
|
||||
* the source value into the high bits and replicating the high bits
|
||||
* of the value into the low bits of the result.
|
||||
*
|
||||
* PALETTE_DEPTH_SHIFT(val, w) maps a colormap entry for a component
|
||||
* that has w bits to an appropriate entry in a LUT of 256 entries.
|
||||
*/
|
||||
static inline unsigned int NV_UNDER_REPLICATE(unsigned short val,
|
||||
int source_size,
|
||||
int result_size)
|
||||
{
|
||||
return (val << (result_size - source_size)) |
|
||||
(val >> ((source_size << 1) - result_size));
|
||||
}
|
||||
|
||||
|
||||
static inline unsigned short PALETTE_DEPTH_SHIFT(unsigned short val, int depth)
|
||||
{
|
||||
return NV_UNDER_REPLICATE(val, depth, 8);
|
||||
}
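/*
 * For example, NV_UNDER_REPLICATE(0x2A, 6, 8) expands the 6-bit value
 * 0b101010 to (0x2A << 2) | (0x2A >> 4) = 0xA8 | 0x02 = 0xAA, replicating
 * the two high source bits into the low bits of the result; and
 * PALETTE_DEPTH_SHIFT(val, 6) is just that expansion with a fixed 8-bit
 * result, mapping a 6-bit colormap entry onto the 0..255 LUT index range.
 */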
|
||||
|
||||
/*
|
||||
* Use __builtin_ffs where it is supported, or provide an equivalent
|
||||
* implementation for platforms like riscv where it is not.
|
||||
*/
|
||||
#if defined(__GNUC__) && !NVCPU_IS_RISCV64
|
||||
static inline int nv_ffs(int x)
|
||||
{
|
||||
return __builtin_ffs(x);
|
||||
}
|
||||
#else
|
||||
static inline int nv_ffs(int x)
|
||||
{
|
||||
if (x == 0)
|
||||
return 0;
|
||||
|
||||
LOWESTBITIDX_32(x);
|
||||
|
||||
return 1 + x;
|
||||
}
|
||||
#endif
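/*
 * Both variants follow the usual ffs() convention: nv_ffs(0) == 0 and
 * nv_ffs(0x8) == 4, i.e. bit positions are reported 1-based from the least
 * significant bit. This assumes LOWESTBITIDX_32() leaves the 0-based index
 * of the lowest set bit in x, as the `1 + x` return implies.
 */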
|
||||
|
||||
#endif /* __NV_COMMON_UTILS_H__ */
|
@@ -85,7 +85,11 @@
|
||||
|
||||
/* For nv_drm_gem_prime_force_fence_signal */
|
||||
#ifndef spin_is_locked
|
||||
#if ((__FreeBSD_version >= 1500000) && (__FreeBSD_version < 1500018)) || (__FreeBSD_version < 1401501)
|
||||
#define spin_is_locked(lock) mtx_owned(lock.m)
|
||||
#else
|
||||
#define spin_is_locked(lock) mtx_owned(lock)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef rwsem_is_locked
|
||||
|
@@ -35,6 +35,7 @@
|
||||
#include "nvidia-drm-format.h"
|
||||
|
||||
#include "nvmisc.h"
|
||||
#include "nv_common_utils.h"
|
||||
|
||||
#include <drm/drm_crtc_helper.h>
|
||||
#include <drm/drm_plane_helper.h>
|
||||
@@ -46,14 +47,99 @@
|
||||
#include <drm/drm_color_mgmt.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The two arrays below specify the PQ EOTF transfer function that's used to
|
||||
* convert from PQ encoded L'M'S' fixed-point to linear LMS FP16. This transfer
|
||||
* function is the inverse of the OETF curve.
|
||||
*
|
||||
* TODO: Generate table with max number of entries for ILUT.
|
||||
*/
|
||||
static const NvU32 __eotf_pq_512_seg_sizes_log2[] = {
|
||||
6, 6, 4, 4, 4, 3, 4, 3, 3, 3, 2, 2, 2, 3, 3, 2,
|
||||
2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
6, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2,
|
||||
2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 4, 2, 2,
|
||||
};
|
||||
static const NvU16 __eotf_pq_512_entries[] = {
|
||||
0x0000, 0x0001, 0x0003, 0x0005, 0x0008, 0x000C, 0x0011, 0x0016, 0x001B, 0x0022, 0x0028, 0x002F, 0x0037, 0x003F, 0x0048, 0x0051,
|
||||
0x005A, 0x0064, 0x006F, 0x007A, 0x0085, 0x0091, 0x009E, 0x00AB, 0x00B8, 0x00C6, 0x00D4, 0x00E3, 0x00F3, 0x0102, 0x0113, 0x0123,
|
||||
0x0135, 0x0146, 0x0158, 0x016B, 0x017E, 0x0192, 0x01A6, 0x01BB, 0x01D0, 0x01E5, 0x01FC, 0x0212, 0x0229, 0x0241, 0x0259, 0x0272,
|
||||
0x028B, 0x02A4, 0x02BE, 0x02D9, 0x02F4, 0x0310, 0x032C, 0x0349, 0x0366, 0x0384, 0x03A2, 0x03C1, 0x03E0, 0x0400, 0x0421, 0x0442,
|
||||
0x0463, 0x0485, 0x04A8, 0x04CB, 0x04EF, 0x0513, 0x0538, 0x055D, 0x0583, 0x05AA, 0x05D1, 0x05F9, 0x0621, 0x064A, 0x0673, 0x069D,
|
||||
0x06C7, 0x06F3, 0x071E, 0x074B, 0x0777, 0x07A5, 0x07D3, 0x0801, 0x0819, 0x0830, 0x0849, 0x0861, 0x087A, 0x0893, 0x08AD, 0x08C7,
|
||||
0x08E1, 0x08FB, 0x0916, 0x0931, 0x094C, 0x0968, 0x0984, 0x09A0, 0x09BD, 0x09DA, 0x09F7, 0x0A15, 0x0A33, 0x0A51, 0x0A70, 0x0A8F,
|
||||
0x0AAE, 0x0ACE, 0x0AEE, 0x0B0E, 0x0B2F, 0x0B50, 0x0B71, 0x0B93, 0x0BB5, 0x0BD7, 0x0BFA, 0x0C0F, 0x0C20, 0x0C32, 0x0C44, 0x0C56,
|
||||
0x0C69, 0x0CB5, 0x0D03, 0x0D55, 0x0DA9, 0x0E01, 0x0E5B, 0x0EB9, 0x0F1B, 0x0F7F, 0x0FE7, 0x1029, 0x1061, 0x109A, 0x10D5, 0x1111,
|
||||
0x1150, 0x1190, 0x11D3, 0x1217, 0x125E, 0x12A6, 0x12F0, 0x133D, 0x138B, 0x13DC, 0x1417, 0x1442, 0x146D, 0x149A, 0x14C8, 0x14F7,
|
||||
0x1527, 0x1558, 0x158B, 0x15BF, 0x15F4, 0x162A, 0x1662, 0x169B, 0x16D5, 0x1711, 0x174E, 0x178C, 0x17CC, 0x1806, 0x1828, 0x184A,
|
||||
0x186D, 0x18B4, 0x18FF, 0x194D, 0x199E, 0x19F3, 0x1A4B, 0x1AA7, 0x1B06, 0x1B37, 0x1B69, 0x1B9B, 0x1BCF, 0x1C02, 0x1C1D, 0x1C38,
|
||||
0x1C54, 0x1C70, 0x1C8D, 0x1CAB, 0x1CC9, 0x1CE7, 0x1D06, 0x1D26, 0x1D46, 0x1D88, 0x1DCC, 0x1E13, 0x1E5C, 0x1EA8, 0x1EF6, 0x1F47,
|
||||
0x1F9A, 0x1FF1, 0x2025, 0x2053, 0x2082, 0x20B3, 0x20E6, 0x211A, 0x214F, 0x2187, 0x21C0, 0x21FA, 0x2237, 0x2275, 0x22B5, 0x22F7,
|
||||
0x233B, 0x23C9, 0x2430, 0x247F, 0x24D3, 0x252B, 0x2589, 0x25EB, 0x2653, 0x26C1, 0x2734, 0x27AD, 0x2817, 0x2838, 0x285A, 0x287C,
|
||||
0x28A0, 0x28C5, 0x28EA, 0x2911, 0x2938, 0x2960, 0x298A, 0x29B4, 0x29DF, 0x2A0C, 0x2A39, 0x2A68, 0x2A98, 0x2AFA, 0x2B62, 0x2BCE,
|
||||
0x2C20, 0x2C5B, 0x2C99, 0x2CDA, 0x2D1E, 0x2D65, 0x2DB0, 0x2DFD, 0x2E4E, 0x2EA3, 0x2EFC, 0x2F58, 0x2FB8, 0x300E, 0x3043, 0x307A,
|
||||
0x30B3, 0x30D0, 0x30EE, 0x310D, 0x312C, 0x314C, 0x316D, 0x318E, 0x31B0, 0x31D3, 0x31F6, 0x321A, 0x323F, 0x3265, 0x328B, 0x32B2,
|
||||
0x32DA, 0x332D, 0x3383, 0x33DC, 0x341D, 0x344D, 0x347F, 0x34B4, 0x34EA, 0x3523, 0x355E, 0x359B, 0x35DB, 0x361D, 0x3662, 0x36A9,
|
||||
0x36F3, 0x3740, 0x3791, 0x37E4, 0x381D, 0x384A, 0x3879, 0x38A9, 0x38DB, 0x3910, 0x3946, 0x397E, 0x39B8, 0x39F5, 0x3A34, 0x3A75,
|
||||
0x3AB9, 0x3AFF, 0x3B48, 0x3B94, 0x3BE2, 0x3C1A, 0x3C44, 0x3C70, 0x3C9D, 0x3CA0, 0x3CA3, 0x3CA6, 0x3CA9, 0x3CAC, 0x3CAF, 0x3CB1,
|
||||
0x3CB4, 0x3CB7, 0x3CBA, 0x3CBD, 0x3CC0, 0x3CC3, 0x3CC6, 0x3CC9, 0x3CCC, 0x3CCF, 0x3CD2, 0x3CD5, 0x3CD8, 0x3CDB, 0x3CDE, 0x3CE1,
|
||||
0x3CE4, 0x3CE7, 0x3CEA, 0x3CEE, 0x3CF1, 0x3CF4, 0x3CF7, 0x3CFA, 0x3CFD, 0x3D00, 0x3D03, 0x3D06, 0x3D09, 0x3D0D, 0x3D10, 0x3D13,
|
||||
0x3D16, 0x3D19, 0x3D1C, 0x3D20, 0x3D23, 0x3D26, 0x3D29, 0x3D2C, 0x3D30, 0x3D33, 0x3D36, 0x3D39, 0x3D3D, 0x3D40, 0x3D43, 0x3D46,
|
||||
0x3D4A, 0x3D4D, 0x3D50, 0x3D54, 0x3D57, 0x3D5A, 0x3D5D, 0x3D61, 0x3D64, 0x3D9B, 0x3DD3, 0x3E0D, 0x3E4A, 0x3E89, 0x3ECA, 0x3F0E,
|
||||
0x3F54, 0x3F9C, 0x3FE8, 0x401B, 0x4043, 0x406D, 0x4099, 0x40C6, 0x40F4, 0x4124, 0x4156, 0x418A, 0x41C0, 0x41F8, 0x4232, 0x426D,
|
||||
0x42AB, 0x42EB, 0x432E, 0x4373, 0x43BA, 0x4428, 0x4479, 0x44D0, 0x452D, 0x4591, 0x45FC, 0x466F, 0x46EB, 0x472C, 0x476F, 0x47B5,
|
||||
0x47FE, 0x4824, 0x484B, 0x4874, 0x489D, 0x48F5, 0x4954, 0x4986, 0x49B9, 0x49EF, 0x4A26, 0x4A5F, 0x4A9B, 0x4AD9, 0x4B19, 0x4B9F,
|
||||
0x4C18, 0x4C66, 0x4CBA, 0x4CE6, 0x4D13, 0x4D43, 0x4D74, 0x4DA7, 0x4DDC, 0x4E12, 0x4E4B, 0x4E86, 0x4EC3, 0x4F02, 0x4F44, 0x4F88,
|
||||
0x4FCE, 0x500C, 0x5032, 0x5082, 0x50D8, 0x5106, 0x5135, 0x5166, 0x5199, 0x5205, 0x5278, 0x52F5, 0x537C, 0x53C3, 0x5406, 0x542D,
|
||||
0x5454, 0x54A9, 0x5503, 0x550F, 0x551B, 0x5527, 0x5533, 0x5540, 0x554C, 0x5559, 0x5565, 0x5572, 0x557F, 0x558C, 0x5599, 0x55A7,
|
||||
0x55B4, 0x55C1, 0x55CF, 0x5607, 0x5641, 0x567E, 0x56BC, 0x56FE, 0x5741, 0x5788, 0x57D0,
|
||||
};
|
||||
|
||||
/*
|
||||
* The two arrays below specify the PQ OETF transfer function that's used to
|
||||
* convert from linear LMS FP16 to PQ encoded L'M'S' fixed-point.
|
||||
*
|
||||
* TODO: Generate table with max number of entries for ILUT.
|
||||
*/
|
||||
static const NvU32 __oetf_pq_512_seg_sizes_log2[] = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 3, 3,
|
||||
3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5,
|
||||
5,
|
||||
};
|
||||
static const NvU16 __oetf_pq_512_entries[] = {
|
||||
0x0000, 0x000C, 0x0014, 0x001C, 0x0028, 0x003C, 0x005C, 0x008C, 0x00D0, 0x0134, 0x0184, 0x01C8, 0x0238, 0x029C, 0x033C, 0x03C4,
|
||||
0x043C, 0x04A4, 0x0504, 0x0560, 0x0600, 0x0690, 0x0714, 0x078C, 0x07FC, 0x0864, 0x08C8, 0x0924, 0x0980, 0x09D4, 0x0A24, 0x0A70,
|
||||
0x0B04, 0x0B90, 0x0C10, 0x0C88, 0x0CFC, 0x0D68, 0x0DD4, 0x0E38, 0x0EF4, 0x0FA4, 0x1048, 0x10E4, 0x1174, 0x1200, 0x1284, 0x1304,
|
||||
0x13F4, 0x14D0, 0x159C, 0x165C, 0x1714, 0x17C0, 0x1864, 0x1900, 0x1A28, 0x1B34, 0x1C30, 0x1D1C, 0x1DFC, 0x1ECC, 0x1F94, 0x2050,
|
||||
0x2104, 0x21B0, 0x2258, 0x22F8, 0x2390, 0x2424, 0x24B4, 0x2540, 0x25C4, 0x2648, 0x26C4, 0x2740, 0x27B8, 0x282C, 0x289C, 0x290C,
|
||||
0x29E0, 0x2AAC, 0x2B70, 0x2C2C, 0x2CE0, 0x2D90, 0x2E38, 0x2ED8, 0x2F74, 0x300C, 0x30A0, 0x3130, 0x31BC, 0x3244, 0x32C8, 0x3348,
|
||||
0x3440, 0x352C, 0x360C, 0x36E4, 0x37B4, 0x387C, 0x393C, 0x39F8, 0x3AA8, 0x3B58, 0x3C00, 0x3CA4, 0x3D44, 0x3DDC, 0x3E74, 0x3F04,
|
||||
0x401C, 0x4128, 0x4228, 0x431C, 0x4408, 0x44E8, 0x45C4, 0x4694, 0x475C, 0x4820, 0x48DC, 0x4994, 0x4A48, 0x4AF4, 0x4B9C, 0x4C3C,
|
||||
0x4D78, 0x4EA0, 0x4FBC, 0x50CC, 0x51D0, 0x52CC, 0x53BC, 0x54A0, 0x5580, 0x5658, 0x5728, 0x57F0, 0x58B4, 0x5974, 0x5A2C, 0x5ADC,
|
||||
0x5C34, 0x5D7C, 0x5EB4, 0x5FDC, 0x60F4, 0x6204, 0x630C, 0x6404, 0x64F8, 0x65E0, 0x66C4, 0x679C, 0x6870, 0x693C, 0x6A04, 0x6AC4,
|
||||
0x6C38, 0x6D94, 0x6EE4, 0x7020, 0x7150, 0x7274, 0x738C, 0x7498, 0x7598, 0x7694, 0x7784, 0x786C, 0x794C, 0x7A24, 0x7AF8, 0x7BC4,
|
||||
0x7D50, 0x7EC4, 0x8024, 0x8174, 0x82B4, 0x83E8, 0x850C, 0x8628, 0x8738, 0x883C, 0x8938, 0x8A2C, 0x8B18, 0x8BFC, 0x8CD8, 0x8DB0,
|
||||
0x8F4C, 0x90D0, 0x9240, 0x939C, 0x94EC, 0x962C, 0x975C, 0x9880, 0x999C, 0x9AAC, 0x9BB0, 0x9CAC, 0x9DA0, 0x9E8C, 0x9F70, 0xA04C,
|
||||
0xA1F4, 0xA384, 0xA500, 0xA664, 0xA7BC, 0xA904, 0xAA3C, 0xAB6C, 0xAC8C, 0xADA0, 0xAEAC, 0xAFAC, 0xB0A4, 0xB194, 0xB27C, 0xB360,
|
||||
0xB510, 0xB6A4, 0xB824, 0xB994, 0xBAF0, 0xBC3C, 0xBD78, 0xBEA8, 0xBFCC, 0xC0E4, 0xC1F0, 0xC2F4, 0xC3F0, 0xC4E4, 0xC5CC, 0xC6B0,
|
||||
0xC78C, 0xC860, 0xC930, 0xC9F8, 0xCABC, 0xCB7C, 0xCC38, 0xCCEC, 0xCD9C, 0xCE48, 0xCEF0, 0xCF94, 0xD034, 0xD0D4, 0xD16C, 0xD200,
|
||||
0xD294, 0xD324, 0xD3B4, 0xD43C, 0xD4C4, 0xD54C, 0xD5CC, 0xD650, 0xD6CC, 0xD748, 0xD7C4, 0xD83C, 0xD8B0, 0xD924, 0xD994, 0xDA08,
|
||||
0xDAE0, 0xDBB4, 0xDC84, 0xDD4C, 0xDE10, 0xDECC, 0xDF84, 0xE038, 0xE0E8, 0xE194, 0xE238, 0xE2DC, 0xE37C, 0xE418, 0xE4B0, 0xE544,
|
||||
0xE5D4, 0xE664, 0xE6F0, 0xE778, 0xE800, 0xE884, 0xE904, 0xE984, 0xEA00, 0xEA7C, 0xEAF4, 0xEB68, 0xEBDC, 0xEC50, 0xECC0, 0xED30,
|
||||
0xEE08, 0xEED8, 0xEFA4, 0xF068, 0xF128, 0xF1E4, 0xF298, 0xF348, 0xF3F4, 0xF49C, 0xF540, 0xF5E0, 0xF67C, 0xF714, 0xF7A8, 0xF83C,
|
||||
0xF8CC, 0xF958, 0xF9E0, 0xFA68, 0xFAEC, 0xFB6C, 0xFBE8, 0xFC64, 0xFCE0, 0xFD58, 0xFDCC, 0xFE40, 0xFEB4, 0xFF24, 0xFF90, 0xFFFF,
|
||||
};
|
||||
|
||||
#define NUM_VSS_HEADER_ENTRIES (NVKMS_LUT_VSS_HEADER_SIZE / NVKMS_LUT_CAPS_LUT_ENTRY_SIZE)
|
||||
|
||||
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
|
||||
static int
|
||||
nv_drm_atomic_replace_property_blob_from_id(struct drm_device *dev,
|
||||
struct drm_property_blob **blob,
|
||||
uint64_t blob_id,
|
||||
ssize_t expected_size)
|
||||
ssize_t expected_size,
|
||||
NvBool *replaced)
|
||||
{
|
||||
struct drm_property_blob *old_blob = *blob;
|
||||
struct drm_property_blob *new_blob = NULL;
|
||||
|
||||
if (blob_id != 0) {
|
||||
@@ -64,17 +150,26 @@ nv_drm_atomic_replace_property_blob_from_id(struct drm_device *dev,
|
||||
|
||||
if ((expected_size > 0) &&
|
||||
(new_blob->length != expected_size)) {
|
||||
drm_property_blob_put(new_blob);
|
||||
nv_drm_property_blob_put(new_blob);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
drm_property_replace_blob(blob, new_blob);
|
||||
drm_property_blob_put(new_blob);
|
||||
if (old_blob != new_blob) {
|
||||
nv_drm_property_blob_put(old_blob);
|
||||
if (new_blob) {
|
||||
nv_drm_property_blob_get(new_blob);
|
||||
}
|
||||
*blob = new_blob;
|
||||
*replaced = true;
|
||||
} else {
|
||||
*replaced = false;
|
||||
}
|
||||
|
||||
nv_drm_property_blob_put(new_blob);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void nv_drm_plane_destroy(struct drm_plane *plane)
|
||||
{
|
||||
@@ -118,9 +213,40 @@ cursor_req_config_disable(struct NvKmsKapiCursorRequestedConfig *req_config)
|
||||
req_config->flags.surfaceChanged = NV_TRUE;
|
||||
}
|
||||
|
||||
static NvU64 ctm_val_to_csc_val(NvU64 ctm_val)
|
||||
{
|
||||
/*
|
||||
* Values in the CTM are encoded in S31.32 sign-magnitude fixed-
|
||||
* point format, while NvKms CSC values are signed 2's-complement
|
||||
* S15.16 (Ssign-extend12-3.16?) fixed-point format.
|
||||
*/
|
||||
NvU64 sign_bit = ctm_val & (1ULL << 63);
|
||||
NvU64 magnitude = ctm_val & ~sign_bit;
|
||||
|
||||
/*
|
||||
* Drop the low 16 bits of the fractional part and the high 17 bits
|
||||
* of the integral part. Drop 17 bits to avoid corner cases where
|
||||
* the highest resulting bit is a 1, causing the `cscVal = -cscVal`
|
||||
* line to result in a positive number.
|
||||
*
|
||||
* NOTE: Upstream precedent is to clamp to the range supported by hardware.
|
||||
* Here, we truncate the integral part to 14 bits, and will later truncate
|
||||
* further to the 3-5 bits supported by hardware within the display HAL.
|
||||
*
|
||||
* TODO: Clamping would be better, in the rare event that we receive
|
||||
* coefficients that are large enough for it to matter.
|
||||
*/
|
||||
NvS32 csc_val = (magnitude >> 16) & ((1ULL << 31) - 1);
|
||||
if (sign_bit) {
|
||||
csc_val = -csc_val;
|
||||
}
|
||||
|
||||
return csc_val;
|
||||
}
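/*
 * Worked example: a CTM coefficient of 1.0 is 0x0000000100000000 in S31.32
 * sign-magnitude; (magnitude >> 16) keeps 0x10000, which is 1.0 in S15.16.
 * A coefficient of -0.5 has the sign bit set and magnitude 0x0000000080000000,
 * giving 0x8000 and, after negation, -0.5 in S15.16.
 */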
|
||||
|
||||
#if defined(NV_DRM_COLOR_MGMT_AVAILABLE)
|
||||
static void color_mgmt_config_ctm_to_csc(struct NvKmsCscMatrix *nvkms_csc,
|
||||
struct drm_color_ctm *drm_ctm)
|
||||
static void ctm_to_csc(struct NvKmsCscMatrix *nvkms_csc,
|
||||
struct drm_color_ctm *drm_ctm)
|
||||
{
|
||||
int y;
|
||||
|
||||
@@ -131,32 +257,27 @@ static void color_mgmt_config_ctm_to_csc(struct NvKmsCscMatrix *nvkms_csc,
|
||||
int x;
|
||||
|
||||
for (x = 0; x < 3; x++) {
|
||||
/*
|
||||
* Values in the CTM are encoded in S31.32 sign-magnitude fixed-
|
||||
* point format, while NvKms CSC values are signed 2's-complement
|
||||
* S15.16 (Ssign-extend12-3.16?) fixed-point format.
|
||||
*/
|
||||
NvU64 ctmVal = drm_ctm->matrix[y*3 + x];
|
||||
NvU64 signBit = ctmVal & (1ULL << 63);
|
||||
NvU64 magnitude = ctmVal & ~signBit;
|
||||
|
||||
/*
|
||||
* Drop the low 16 bits of the fractional part and the high 17 bits
|
||||
* of the integral part. Drop 17 bits to avoid corner cases where
|
||||
* the highest resulting bit is a 1, causing the `cscVal = -cscVal`
|
||||
* line to result in a positive number.
|
||||
*/
|
||||
NvS32 cscVal = (magnitude >> 16) & ((1ULL << 31) - 1);
|
||||
if (signBit) {
|
||||
cscVal = -cscVal;
|
||||
}
|
||||
|
||||
nvkms_csc->m[y][x] = cscVal;
|
||||
nvkms_csc->m[y][x] = ctm_val_to_csc_val(drm_ctm->matrix[y*3 + x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* NV_DRM_COLOR_MGMT_AVAILABLE */
|
||||
|
||||
static void ctm_3x4_to_csc(struct NvKmsCscMatrix *nvkms_csc,
|
||||
struct drm_color_ctm_3x4 *drm_ctm_3x4)
|
||||
{
|
||||
int y;
|
||||
|
||||
for (y = 0; y < 3; y++) {
|
||||
int x;
|
||||
|
||||
for (x = 0; x < 4; x++) {
|
||||
nvkms_csc->m[y][x] =
|
||||
ctm_val_to_csc_val(drm_ctm_3x4->matrix[y*4 + x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
cursor_plane_req_config_update(struct drm_plane *plane,
|
||||
struct drm_plane_state *plane_state,
|
||||
@@ -251,15 +372,702 @@ cursor_plane_req_config_update(struct drm_plane *plane,
|
||||
old_config.dstY != req_config->dstY;
|
||||
}
|
||||
|
||||
static void free_drm_lut_surface(struct kref *ref)
|
||||
{
|
||||
struct nv_drm_lut_surface *drm_lut_surface =
|
||||
container_of(ref, struct nv_drm_lut_surface, refcount);
|
||||
struct NvKmsKapiDevice *pDevice = drm_lut_surface->pDevice;
|
||||
|
||||
BUG_ON(drm_lut_surface->nvkms_surface == NULL);
|
||||
BUG_ON(drm_lut_surface->nvkms_memory == NULL);
|
||||
BUG_ON(drm_lut_surface->buffer == NULL);
|
||||
|
||||
nvKms->destroySurface(pDevice, drm_lut_surface->nvkms_surface);
|
||||
nvKms->unmapMemory(pDevice, drm_lut_surface->nvkms_memory,
|
||||
NVKMS_KAPI_MAPPING_TYPE_KERNEL,
|
||||
drm_lut_surface->buffer);
|
||||
nvKms->freeMemory(pDevice, drm_lut_surface->nvkms_memory);
|
||||
|
||||
nv_drm_free(drm_lut_surface);
|
||||
}
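/*
 * Callers release a LUT surface with
 * kref_put(&drm_lut_surface->refcount, free_drm_lut_surface), as done in
 * plane_req_config_update() below; the NvKms surface, its kernel mapping and
 * the backing memory are torn down only when the last reference is dropped.
 */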
|
||||
|
||||
static struct nv_drm_lut_surface *alloc_drm_lut_surface(
|
||||
struct nv_drm_device *nv_dev,
|
||||
enum NvKmsLUTFormat entry_format,
|
||||
enum NvKmsLUTVssType vss_type,
|
||||
NvU32 num_vss_header_segments,
|
||||
NvU32 num_vss_header_entries,
|
||||
NvU32 num_entries)
|
||||
{
|
||||
struct NvKmsKapiDevice *pDevice = nv_dev->pDevice;
|
||||
struct nv_drm_lut_surface *drm_lut_surface;
|
||||
NvU8 compressible = 0; // No compression
|
||||
size_t size =
|
||||
(((num_vss_header_entries + num_entries) *
|
||||
NVKMS_LUT_CAPS_LUT_ENTRY_SIZE) + 255) & ~255; // 256-byte aligned
|
||||
|
||||
struct NvKmsKapiMemory *surface_mem;
|
||||
struct NvKmsKapiSurface *surface;
|
||||
struct NvKmsKapiCreateSurfaceParams params = {};
|
||||
NvU16 *lut_data;
|
||||
|
||||
/* Allocate displayable memory. */
|
||||
if (nv_dev->hasVideoMemory) {
|
||||
surface_mem =
|
||||
nvKms->allocateVideoMemory(pDevice,
|
||||
NvKmsSurfaceMemoryLayoutPitch,
|
||||
NVKMS_KAPI_ALLOCATION_TYPE_SCANOUT,
|
||||
size,
|
||||
&compressible);
|
||||
} else {
|
||||
surface_mem =
|
||||
nvKms->allocateSystemMemory(pDevice,
|
||||
NvKmsSurfaceMemoryLayoutPitch,
|
||||
NVKMS_KAPI_ALLOCATION_TYPE_SCANOUT,
|
||||
size,
|
||||
&compressible);
|
||||
}
|
||||
if (surface_mem == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Map memory in order to populate it. */
|
||||
if (!nvKms->mapMemory(pDevice, surface_mem,
|
||||
NVKMS_KAPI_MAPPING_TYPE_KERNEL,
|
||||
(void **) &lut_data)) {
|
||||
nvKms->freeMemory(pDevice, surface_mem);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Create surface. */
|
||||
params.format = NvKmsSurfaceMemoryFormatR16G16B16A16;
|
||||
params.width = num_vss_header_entries + num_entries;
|
||||
params.height = 1;
|
||||
params.planes[0].memory = surface_mem;
|
||||
params.planes[0].offset = 0;
|
||||
params.planes[0].pitch = size;
|
||||
|
||||
surface = nvKms->createSurface(pDevice, ¶ms);
|
||||
if (surface == NULL) {
|
||||
nvKms->unmapMemory(pDevice, surface_mem,
|
||||
NVKMS_KAPI_MAPPING_TYPE_KERNEL, (void *) lut_data);
|
||||
nvKms->freeMemory(pDevice, surface_mem);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Pack into struct nv_drm_lut_surface. */
|
||||
drm_lut_surface = nv_drm_calloc(1, sizeof(struct nv_drm_lut_surface));
|
||||
if (drm_lut_surface == NULL) {
|
||||
nvKms->destroySurface(pDevice, surface);
|
||||
nvKms->unmapMemory(pDevice, surface_mem,
|
||||
NVKMS_KAPI_MAPPING_TYPE_KERNEL, (void *) lut_data);
|
||||
nvKms->freeMemory(pDevice, surface_mem);
|
||||
return NULL;
|
||||
}
|
||||
drm_lut_surface->pDevice = pDevice;
|
||||
drm_lut_surface->nvkms_memory = surface_mem;
|
||||
drm_lut_surface->nvkms_surface = surface;
|
||||
drm_lut_surface->buffer = lut_data;
|
||||
drm_lut_surface->properties.vssSegments = num_vss_header_segments;
|
||||
drm_lut_surface->properties.vssType = vss_type;
|
||||
drm_lut_surface->properties.lutEntries = num_entries;
|
||||
drm_lut_surface->properties.entryFormat = entry_format;
|
||||
|
||||
/* Init refcount. */
|
||||
kref_init(&drm_lut_surface->refcount);
|
||||
|
||||
return drm_lut_surface;
|
||||
}
|
||||
|
||||
static NvU32 fp32_lut_interp(
|
||||
NvU16 entry0,
|
||||
NvU16 entry1,
|
||||
NvU32 interp,
|
||||
NvU32 interp_max)
|
||||
{
|
||||
NvU32 fp32_entry0 = nvKmsKapiUI32ToF32((NvU32) entry0);
|
||||
NvU32 fp32_entry1 = nvKmsKapiUI32ToF32((NvU32) entry1);
|
||||
|
||||
NvU32 fp32_num0 = nvKmsKapiUI32ToF32(interp_max - interp);
|
||||
NvU32 fp32_num1 = nvKmsKapiUI32ToF32(interp);
|
||||
NvU32 fp32_denom = nvKmsKapiUI32ToF32(interp_max);
|
||||
|
||||
fp32_entry0 = nvKmsKapiF32Mul(fp32_entry0, fp32_num0);
|
||||
fp32_entry0 = nvKmsKapiF32Div(fp32_entry0, fp32_denom);
|
||||
|
||||
fp32_entry1 = nvKmsKapiF32Mul(fp32_entry1, fp32_num1);
|
||||
fp32_entry1 = nvKmsKapiF32Div(fp32_entry1, fp32_denom);
|
||||
|
||||
return nvKmsKapiF32Add(fp32_entry0, fp32_entry1);
|
||||
}
|
||||
|
||||
static struct nv_drm_lut_surface *create_drm_ilut_surface_vss(
|
||||
struct nv_drm_device *nv_dev,
|
||||
struct nv_drm_plane *nv_plane,
|
||||
struct nv_drm_plane_state *nv_drm_plane_state)
|
||||
{
|
||||
static const NvU32 fp_norm = 0x42FA0000; // FP32 125.0
|
||||
static const NvU32 u10_norm = 0x447FC000; // FP32 1023.0
|
||||
static const NvU32 u16_norm = 0x477FFF00; // FP32 UINT16_MAX
|
||||
// FP32 UINT32_MAX (Precision limited to 2^32)
|
||||
static const NvU32 u32_norm = 0x4F800000;
|
||||
|
||||
struct nv_drm_lut_surface *drm_lut_surface;
|
||||
|
||||
NvU32 entry_idx;
|
||||
NvU32 num_entries;
|
||||
NvU16 *lut_data;
|
||||
|
||||
const NvU32 *vss_header_seg_sizes = NULL;
|
||||
NvU32 num_vss_header_segments = 0;
|
||||
const NvU16 *vss_entries = NULL;
|
||||
enum NvKmsLUTVssType vss_type = NVKMS_LUT_VSS_TYPE_NONE;
|
||||
|
||||
NvBool multiply = false;
|
||||
NvU32 fp32_multiplier;
|
||||
|
||||
WARN_ON(!nv_plane->ilut_caps.supported);
|
||||
WARN_ON(nv_plane->ilut_caps.entryFormat != NVKMS_LUT_FORMAT_FP16);
|
||||
WARN_ON(nv_plane->ilut_caps.vssSupport != NVKMS_LUT_VSS_SUPPORTED);
|
||||
WARN_ON(nv_plane->ilut_caps.vssType != NVKMS_LUT_VSS_TYPE_LINEAR);
|
||||
|
||||
/* Convert multiplier from S31.32 Sign-Magnitude format to FP32. */
|
||||
if (nv_drm_plane_state->degamma_multiplier != (((NvU64) 1) << 32)) {
|
||||
NvU32 upper = (NvU32) (nv_drm_plane_state->degamma_multiplier >> 32);
|
||||
NvU32 lower = (NvU32) nv_drm_plane_state->degamma_multiplier;
|
||||
|
||||
/* Range property is configured to ensure sign bit = 0. */
|
||||
WARN_ON(nv_drm_plane_state->degamma_multiplier & (((NvU64) 1) << 63));
|
||||
|
||||
fp32_multiplier =
|
||||
nvKmsKapiF32Add(
|
||||
nvKmsKapiUI32ToF32(upper),
|
||||
nvKmsKapiF32Div(nvKmsKapiUI32ToF32(lower), u32_norm));
|
||||
|
||||
multiply = true;
|
||||
}
|
||||
|
||||
/* Determine configuration based on specified EOTF. */
|
||||
if (nv_drm_plane_state->degamma_tf == NV_DRM_TRANSFER_FUNCTION_PQ) {
|
||||
/* Need VSS for PQ. */
|
||||
vss_header_seg_sizes = __eotf_pq_512_seg_sizes_log2;
|
||||
num_vss_header_segments = ARRAY_LEN(__eotf_pq_512_seg_sizes_log2);
|
||||
vss_type = NVKMS_LUT_VSS_TYPE_LINEAR;
|
||||
|
||||
vss_entries = __eotf_pq_512_entries;
|
||||
num_entries = ARRAY_LEN(__eotf_pq_512_entries) + 1;
|
||||
} else {
|
||||
WARN_ON((nv_drm_plane_state->degamma_tf != NV_DRM_TRANSFER_FUNCTION_DEFAULT) &&
|
||||
(nv_drm_plane_state->degamma_tf != NV_DRM_TRANSFER_FUNCTION_LINEAR));
|
||||
|
||||
num_entries = NVKMS_LUT_ARRAY_SIZE + 1;
|
||||
}
|
||||
WARN_ON((vss_entries != NULL) &&
|
||||
(num_vss_header_segments != nv_plane->ilut_caps.vssSegments));
|
||||
WARN_ON((vss_entries != NULL) && (num_entries > nv_plane->ilut_caps.lutEntries));
|
||||
WARN_ON((vss_entries == NULL) && (num_entries != nv_plane->ilut_caps.lutEntries));
|
||||
|
||||
/*
|
||||
* Allocate displayable LUT surface.
|
||||
* Space for the VSS header must be included even for non-VSS LUTs.
|
||||
*/
|
||||
drm_lut_surface =
|
||||
alloc_drm_lut_surface(nv_dev,
|
||||
NVKMS_LUT_FORMAT_FP16,
|
||||
vss_type,
|
||||
num_vss_header_segments,
|
||||
NUM_VSS_HEADER_ENTRIES,
|
||||
num_entries);
|
||||
if (!drm_lut_surface) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lut_data = (NvU16 *) drm_lut_surface->buffer;
|
||||
|
||||
/* Calculate VSS header. */
|
||||
if (vss_header_seg_sizes != NULL) {
|
||||
for (entry_idx = 0; entry_idx < NUM_VSS_HEADER_ENTRIES; entry_idx++) {
|
||||
int i;
|
||||
NvU64 vss_header_entry = 0;
|
||||
for (i = 0; (i < 16) &&
|
||||
(((entry_idx * 16) + i) < num_vss_header_segments); i++) {
|
||||
vss_header_entry |=
|
||||
((NvU64) vss_header_seg_sizes[(entry_idx * 16) + i]) << (i * 3);
|
||||
}
|
||||
((NvU64 *) lut_data)[entry_idx] = vss_header_entry;
|
||||
}
|
||||
}
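        /*
         * Each 64-bit VSS header entry packs 16 log2 segment sizes at 3 bits
         * apiece (the upper 16 bits stay zero). The 64-segment PQ EOTF header
         * above therefore occupies the first 4 header entries; e.g. its first
         * two segment sizes (6, 6) land in bits 0-2 and 3-5 of entry 0 as
         * 0b110110.
         */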
|
||||
|
||||
/* Calculate LUT content. */
|
||||
for (entry_idx = 0;
|
||||
entry_idx < num_entries - 1; entry_idx++) {
|
||||
NvU32 fp32_r, fp32_g, fp32_b;
|
||||
NvU32 data_idx = entry_idx + NUM_VSS_HEADER_ENTRIES;
|
||||
|
||||
if (nv_drm_plane_state->degamma_lut != NULL) {
|
||||
/* Use provided Degamma LUT. */
|
||||
static const NvU32 interp_max = (((NvU32) 1) << (32 - 10)) - 1;
|
||||
|
||||
const struct drm_color_lut *degamma_lut =
|
||||
(struct drm_color_lut *) nv_drm_plane_state->degamma_lut->data;
|
||||
|
||||
NvU16 lut_idx;
|
||||
NvU32 interp = 0;
|
||||
|
||||
if (vss_entries != NULL) {
|
||||
/* Merge with provided VSS LUT. */
|
||||
NvU16 fp16_entry = vss_entries[entry_idx];
|
||||
|
||||
/* Convert from FP16 to UNORM32. */
|
||||
// TODO: Use pre-UNORM32-normalized VSS LUT table?
|
||||
NvU32 unorm32_entry =
|
||||
nvKmsKapiF32ToUI32RMinMag(
|
||||
nvKmsKapiF32Mul(
|
||||
nvKmsKapiF32Div(nvKmsKapiF16ToF32(fp16_entry),
|
||||
fp_norm),
|
||||
u32_norm),
|
||||
false);
|
||||
|
||||
/* Index using upper 10 bits from UNORM32 VSS LUT. */
|
||||
lut_idx = unorm32_entry >> (32 - 10);
|
||||
/* Interpolate using lower 22 bits from UNORM32 VSS LUT. */
|
||||
interp = unorm32_entry & interp_max;
|
||||
} else {
|
||||
/* Direct index. */
|
||||
lut_idx = entry_idx;
|
||||
}
|
||||
|
||||
BUG_ON(lut_idx >= NVKMS_LUT_ARRAY_SIZE);
|
||||
|
||||
/* Perform interpolation or direct indexing. */
|
||||
if (interp > 0 && ((lut_idx + 1) < NVKMS_LUT_ARRAY_SIZE)) {
|
||||
fp32_r =
|
||||
fp32_lut_interp(degamma_lut[lut_idx].red,
|
||||
degamma_lut[lut_idx + 1].red,
|
||||
interp,
|
||||
interp_max);
|
||||
fp32_g =
|
||||
fp32_lut_interp(degamma_lut[lut_idx].green,
|
||||
degamma_lut[lut_idx + 1].green,
|
||||
interp,
|
||||
interp_max);
|
||||
fp32_b =
|
||||
fp32_lut_interp(degamma_lut[lut_idx].blue,
|
||||
degamma_lut[lut_idx + 1].blue,
|
||||
interp,
|
||||
interp_max);
|
||||
} else {
|
||||
fp32_r = nvKmsKapiUI32ToF32((NvU32) degamma_lut[lut_idx].red);
|
||||
fp32_g = nvKmsKapiUI32ToF32((NvU32) degamma_lut[lut_idx].green);
|
||||
fp32_b = nvKmsKapiUI32ToF32((NvU32) degamma_lut[lut_idx].blue);
|
||||
}
|
||||
|
||||
/* Convert UNORM16 to 1.0-normalized FP32. */
|
||||
fp32_r = nvKmsKapiF32Div(fp32_r, u16_norm);
|
||||
fp32_g = nvKmsKapiF32Div(fp32_g, u16_norm);
|
||||
fp32_b = nvKmsKapiF32Div(fp32_b, u16_norm);
|
||||
} else if (vss_entries != NULL) {
|
||||
/* Use VSS LUT directly, but normalized to 1.0. */
|
||||
// TODO: Use pre-1.0-normalized VSS LUT table?
|
||||
NvU16 fp16_entry = vss_entries[entry_idx];
|
||||
NvU32 fp32_entry = nvKmsKapiF16ToF32(fp16_entry);
|
||||
|
||||
fp32_r = fp32_g = fp32_b = nvKmsKapiF32Div(fp32_entry, fp_norm);
|
||||
} else {
|
||||
/* Use implicit identity. */
|
||||
// TODO: Use LUT table?
|
||||
fp32_r = fp32_g = fp32_b =
|
||||
nvKmsKapiF32Div(nvKmsKapiUI32ToF32(entry_idx), u10_norm);
|
||||
}
|
||||
|
||||
/* Apply multiplier. */
|
||||
if (multiply) {
|
||||
fp32_r = nvKmsKapiF32Mul(fp32_r, fp32_multiplier);
|
||||
fp32_g = nvKmsKapiF32Mul(fp32_g, fp32_multiplier);
|
||||
fp32_b = nvKmsKapiF32Mul(fp32_b, fp32_multiplier);
|
||||
}
|
||||
|
||||
/* Convert from FP32 to FP16 to populate LUT. */
|
||||
lut_data[(data_idx * 4) + 0] = nvKmsKapiF32ToF16(fp32_r);
|
||||
lut_data[(data_idx * 4) + 1] = nvKmsKapiF32ToF16(fp32_g);
|
||||
lut_data[(data_idx * 4) + 2] = nvKmsKapiF32ToF16(fp32_b);
|
||||
}
|
||||
((NvU64 *) lut_data)[NUM_VSS_HEADER_ENTRIES + num_entries - 1] =
|
||||
((NvU64 *) lut_data)[NUM_VSS_HEADER_ENTRIES + num_entries - 2];
|
||||
|
||||
return drm_lut_surface;
|
||||
}
|
||||
|
||||
#define UNORM16_TO_UNORM14_WAR_813188(u16) ((u16 >> 2) & ~7) + 0x6000
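/*
 * E.g. UNORM16_TO_UNORM14_WAR_813188(0xFFFF) == ((0xFFFF >> 2) & ~7) + 0x6000
 * == 0x3FF8 + 0x6000 == 0x9FF8: the UNORM16 value is reduced to 14 bits with
 * the low 3 bits cleared and offset by 0x6000, which appears to be the
 * encoding expected by the NVKMS_LUT_FORMAT_UNORM14_WAR_813188 entry format.
 */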
|
||||
|
||||
static struct nv_drm_lut_surface *create_drm_ilut_surface_legacy(
|
||||
struct nv_drm_device *nv_dev,
|
||||
struct nv_drm_plane *nv_plane,
|
||||
struct nv_drm_plane_state *nv_drm_plane_state)
|
||||
|
||||
{
|
||||
struct nv_drm_lut_surface *drm_lut_surface;
|
||||
NvU16 *lut_data;
|
||||
NvU32 entry_idx;
|
||||
|
||||
const struct drm_color_lut *degamma_lut;
|
||||
|
||||
WARN_ON(!nv_plane->ilut_caps.supported);
|
||||
WARN_ON(nv_plane->ilut_caps.entryFormat != NVKMS_LUT_FORMAT_UNORM14_WAR_813188);
|
||||
WARN_ON(nv_plane->ilut_caps.vssSupport == NVKMS_LUT_VSS_REQUIRED);
|
||||
WARN_ON((NVKMS_LUT_ARRAY_SIZE + 1) > nv_plane->ilut_caps.lutEntries);
|
||||
|
||||
BUG_ON(nv_drm_plane_state->degamma_lut == NULL);
|
||||
|
||||
degamma_lut =
|
||||
(struct drm_color_lut *) nv_drm_plane_state->degamma_lut->data;
|
||||
|
||||
/* Allocate displayable LUT surface. */
|
||||
drm_lut_surface =
|
||||
alloc_drm_lut_surface(nv_dev,
|
||||
NVKMS_LUT_FORMAT_UNORM14_WAR_813188,
|
||||
NVKMS_LUT_VSS_TYPE_NONE,
|
||||
0, 0,
|
||||
NVKMS_LUT_ARRAY_SIZE + 1);
|
||||
if (drm_lut_surface == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lut_data = (NvU16 *) drm_lut_surface->buffer;
|
||||
|
||||
/* Fill LUT surface. */
|
||||
for (entry_idx = 0; entry_idx < NVKMS_LUT_ARRAY_SIZE; entry_idx++) {
|
||||
lut_data[(entry_idx * 4) + 0] =
|
||||
UNORM16_TO_UNORM14_WAR_813188(degamma_lut[entry_idx].red);
|
||||
lut_data[(entry_idx * 4) + 1] =
|
||||
UNORM16_TO_UNORM14_WAR_813188(degamma_lut[entry_idx].green);
|
||||
lut_data[(entry_idx * 4) + 2] =
|
||||
UNORM16_TO_UNORM14_WAR_813188(degamma_lut[entry_idx].blue);
|
||||
}
|
||||
((NvU64 *) lut_data)[NVKMS_LUT_ARRAY_SIZE] =
|
||||
((NvU64 *) lut_data)[NVKMS_LUT_ARRAY_SIZE - 1];
|
||||
|
||||
return drm_lut_surface;
|
||||
}
|
||||
|
||||
static struct nv_drm_lut_surface *create_drm_tmo_surface(
|
||||
struct nv_drm_device *nv_dev,
|
||||
struct nv_drm_plane *nv_plane,
|
||||
struct nv_drm_plane_state *nv_drm_plane_state)
|
||||
|
||||
{
|
||||
struct nv_drm_lut_surface *drm_lut_surface;
|
||||
NvU16 *lut_data;
|
||||
NvU32 entry_idx;
|
||||
|
||||
const struct drm_color_lut *tmo_lut;
|
||||
|
||||
const NvU32 num_vss_header_segments = 64;
|
||||
const NvU32 tmo_seg_size_log2 = 4;
|
||||
|
||||
WARN_ON(!nv_plane->tmo_caps.supported);
|
||||
WARN_ON(nv_plane->tmo_caps.entryFormat != NVKMS_LUT_FORMAT_UNORM16);
|
||||
WARN_ON(nv_plane->tmo_caps.vssSupport != NVKMS_LUT_VSS_REQUIRED);
|
||||
WARN_ON(nv_plane->tmo_caps.vssType != NVKMS_LUT_VSS_TYPE_LINEAR);
|
||||
WARN_ON(num_vss_header_segments != nv_plane->tmo_caps.vssSegments);
|
||||
WARN_ON((NVKMS_LUT_ARRAY_SIZE + 1) > nv_plane->tmo_caps.lutEntries);
|
||||
|
||||
BUG_ON(nv_drm_plane_state->tmo_lut == NULL);
|
||||
|
||||
tmo_lut = (struct drm_color_lut *) nv_drm_plane_state->tmo_lut->data;
|
||||
|
||||
/* Verify that all channels are equal. */
|
||||
for (entry_idx = 0; entry_idx < NVKMS_LUT_ARRAY_SIZE; entry_idx++) {
|
||||
if ((tmo_lut[entry_idx].red != tmo_lut[entry_idx].green) ||
|
||||
(tmo_lut[entry_idx].red != tmo_lut[entry_idx].blue)) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate displayable LUT surface.
|
||||
* The TMO LUT always uses VSS.
|
||||
*/
|
||||
drm_lut_surface =
|
||||
alloc_drm_lut_surface(nv_dev,
|
||||
NVKMS_LUT_FORMAT_UNORM16,
|
||||
NVKMS_LUT_VSS_TYPE_LINEAR,
|
||||
num_vss_header_segments,
|
||||
NUM_VSS_HEADER_ENTRIES,
|
||||
NVKMS_LUT_ARRAY_SIZE + 1);
|
||||
if (drm_lut_surface == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lut_data = (NvU16 *) drm_lut_surface->buffer;
|
||||
|
||||
/* Calculate linear VSS header. */
|
||||
for (entry_idx = 0; entry_idx < NUM_VSS_HEADER_ENTRIES; entry_idx++) {
|
||||
int i;
|
||||
NvU64 vss_header_entry = 0;
|
||||
for (i = 0; (i < 16) &&
|
||||
(((entry_idx * 16) + i) < num_vss_header_segments); i++) {
|
||||
vss_header_entry |=
|
||||
((NvU64) tmo_seg_size_log2) << (i * 3);
|
||||
}
|
||||
((NvU64 *) lut_data)[entry_idx] = vss_header_entry;
|
||||
}
|
||||
|
||||
/* Fill LUT surface. */
|
||||
for (entry_idx = 0; entry_idx < NVKMS_LUT_ARRAY_SIZE; entry_idx++) {
|
||||
NvU32 data_idx = entry_idx + NUM_VSS_HEADER_ENTRIES;
|
||||
|
||||
lut_data[(data_idx * 4) + 0] = tmo_lut[entry_idx].red;
|
||||
lut_data[(data_idx * 4) + 1] = tmo_lut[entry_idx].green;
|
||||
lut_data[(data_idx * 4) + 2] = tmo_lut[entry_idx].blue;
|
||||
}
|
||||
((NvU64 *) lut_data)[NUM_VSS_HEADER_ENTRIES + NVKMS_LUT_ARRAY_SIZE] =
|
||||
((NvU64 *) lut_data)[NUM_VSS_HEADER_ENTRIES + NVKMS_LUT_ARRAY_SIZE - 1];
|
||||
|
||||
return drm_lut_surface;
|
||||
}
|
||||
|
||||
static NvU16 unorm16_lut_interp(
|
||||
NvU16 entry0,
|
||||
NvU16 entry1,
|
||||
NvU16 interp,
|
||||
NvU16 interp_max)
|
||||
{
|
||||
NvU64 u64_entry0 = (NvU64) entry0;
|
||||
NvU64 u64_entry1 = (NvU64) entry1;
|
||||
|
||||
u64_entry0 *= (NvU64) (interp_max - interp);
|
||||
u64_entry0 /= (NvU64) interp_max;
|
||||
|
||||
u64_entry1 *= (NvU64) interp;
|
||||
u64_entry1 /= (NvU64) interp_max;
|
||||
|
||||
return (NvU16) (u64_entry0 + u64_entry1);
|
||||
}
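/*
 * This is plain linear interpolation, entry0 * (1 - t) + entry1 * t with
 * t = interp / interp_max, carried out in 64-bit integer math so the UNORM16
 * products cannot overflow. For the 6-bit case used by the OLUT path
 * (interp_max == 63), entry0 = 0x1000, entry1 = 0x2000, interp = 16 gives
 * 0x1000 * 47 / 63 + 0x2000 * 16 / 63 = 0xBEF + 0x820 = 0x140F.
 */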
|
||||
|
||||
static struct nv_drm_lut_surface *create_drm_olut_surface_vss(
|
||||
struct nv_drm_device *nv_dev,
|
||||
struct nv_drm_crtc *nv_crtc,
|
||||
struct nv_drm_crtc_state *nv_drm_crtc_state)
|
||||
{
|
||||
struct nv_drm_lut_surface *drm_lut_surface;
|
||||
|
||||
NvU32 entry_idx;
|
||||
NvU32 num_entries;
|
||||
NvU16 *lut_data;
|
||||
|
||||
const NvU32 *vss_header_seg_sizes = NULL;
|
||||
NvU32 num_vss_header_segments = 0;
|
||||
const NvU16 *vss_entries = NULL;
|
||||
enum NvKmsLUTVssType vss_type = NVKMS_LUT_VSS_TYPE_NONE;
|
||||
|
||||
WARN_ON(!nv_crtc->olut_caps.supported);
|
||||
WARN_ON(nv_crtc->olut_caps.entryFormat != NVKMS_LUT_FORMAT_UNORM16);
|
||||
WARN_ON(nv_crtc->olut_caps.vssSupport != NVKMS_LUT_VSS_SUPPORTED);
|
||||
WARN_ON(nv_crtc->olut_caps.vssType != NVKMS_LUT_VSS_TYPE_LOGARITHMIC);
|
||||
|
||||
/* Determine configuration based on specified OETF. */
|
||||
if (nv_drm_crtc_state->regamma_tf == NV_DRM_TRANSFER_FUNCTION_PQ) {
|
||||
/* Need VSS for PQ. */
|
||||
vss_header_seg_sizes = __oetf_pq_512_seg_sizes_log2;
|
||||
num_vss_header_segments = ARRAY_LEN(__oetf_pq_512_seg_sizes_log2);
|
||||
vss_type = NVKMS_LUT_VSS_TYPE_LOGARITHMIC;
|
||||
|
||||
vss_entries = __oetf_pq_512_entries;
|
||||
num_entries = ARRAY_LEN(__oetf_pq_512_entries) + 1;
|
||||
} else {
|
||||
WARN_ON((nv_drm_crtc_state->regamma_tf != NV_DRM_TRANSFER_FUNCTION_DEFAULT) &&
|
||||
(nv_drm_crtc_state->regamma_tf != NV_DRM_TRANSFER_FUNCTION_LINEAR));
|
||||
|
||||
num_entries = NVKMS_LUT_ARRAY_SIZE + 1;
|
||||
}
|
||||
WARN_ON((vss_entries != NULL) &&
|
||||
(num_vss_header_segments != nv_crtc->olut_caps.vssSegments));
|
||||
WARN_ON((vss_entries != NULL) && (num_entries > nv_crtc->olut_caps.lutEntries));
|
||||
WARN_ON((vss_entries == NULL) && (num_entries != nv_crtc->olut_caps.lutEntries));
|
||||
|
||||
/*
|
||||
* Allocate displayable LUT surface.
|
||||
* Space for the VSS header must be included even for non-VSS LUTs.
|
||||
*/
|
||||
drm_lut_surface =
|
||||
alloc_drm_lut_surface(nv_dev,
|
||||
NVKMS_LUT_FORMAT_UNORM16,
|
||||
vss_type,
|
||||
num_vss_header_segments,
|
||||
NUM_VSS_HEADER_ENTRIES,
|
||||
num_entries);
|
||||
if (!drm_lut_surface) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lut_data = (NvU16 *) drm_lut_surface->buffer;
|
||||
|
||||
/* Calculate VSS header. */
|
||||
if (vss_header_seg_sizes != NULL) {
|
||||
for (entry_idx = 0; entry_idx < NUM_VSS_HEADER_ENTRIES; entry_idx++) {
|
||||
int i;
|
||||
NvU64 vss_header_entry = 0;
|
||||
for (i = 0; (i < 16) &&
|
||||
(((entry_idx * 16) + i) < num_vss_header_segments); i++) {
|
||||
vss_header_entry |=
|
||||
((NvU64) vss_header_seg_sizes[(entry_idx * 16) + i]) << (i * 3);
|
||||
}
|
||||
((NvU64 *) lut_data)[entry_idx] = vss_header_entry;
|
||||
}
|
||||
}
|
||||
|
||||
/* Calculate LUT content. */
|
||||
for (entry_idx = 0;
|
||||
entry_idx < num_entries - 1; entry_idx++) {
|
||||
NvU32 data_idx = entry_idx + NUM_VSS_HEADER_ENTRIES;
|
||||
|
||||
NvU16 r, g, b = 0;
|
||||
|
||||
if (nv_drm_crtc_state->regamma_lut != NULL) {
|
||||
/* Use provided Regamma LUT. */
|
||||
static const NvU16 interp_max = (((NvU16) 1) << (16 - 10)) - 1;
|
||||
|
||||
const struct drm_color_lut *regamma_lut =
|
||||
(struct drm_color_lut *) nv_drm_crtc_state->regamma_lut->data;
|
||||
|
||||
NvU16 lut_idx;
|
||||
NvU16 interp = 0;
|
||||
|
||||
if (vss_entries != NULL) {
|
||||
/* Merge with provided VSS LUT. */
|
||||
NvU16 unorm16_entry = vss_entries[entry_idx];
|
||||
|
||||
/* Index using upper 10 bits from UNORM16 VSS LUT. */
|
||||
lut_idx = unorm16_entry >> (16 - 10);
|
||||
/* Interpolate using lower 6 bits from UNORM16 VSS LUT. */
|
||||
interp = unorm16_entry & interp_max;
|
||||
} else {
|
||||
/* Direct index. */
|
||||
lut_idx = entry_idx;
|
||||
}
|
||||
|
||||
BUG_ON(lut_idx >= NVKMS_LUT_ARRAY_SIZE);
|
||||
|
||||
/* Perform interpolation or direct indexing. */
|
||||
if (interp > 0 && ((lut_idx + 1) < NVKMS_LUT_ARRAY_SIZE)) {
|
||||
r = unorm16_lut_interp(regamma_lut[lut_idx].red,
|
||||
regamma_lut[lut_idx + 1].red,
|
||||
interp,
|
||||
interp_max);
|
||||
g = unorm16_lut_interp(regamma_lut[lut_idx].green,
|
||||
regamma_lut[lut_idx + 1].green,
|
||||
interp,
|
||||
interp_max);
|
||||
b = unorm16_lut_interp(regamma_lut[lut_idx].blue,
|
||||
regamma_lut[lut_idx + 1].blue,
|
||||
interp,
|
||||
interp_max);
|
||||
} else {
|
||||
r = regamma_lut[lut_idx].red;
|
||||
g = regamma_lut[lut_idx].green;
|
||||
b = regamma_lut[lut_idx].blue;
|
||||
}
|
||||
} else if (vss_entries != NULL) {
|
||||
/* Use VSS LUT directly. */
|
||||
r = g = b = vss_entries[entry_idx];
|
||||
} else {
|
||||
/* Use implicit identity. */
|
||||
WARN_ON_ONCE(num_entries != (NVKMS_LUT_ARRAY_SIZE + 1));
|
||||
r = g = b = entry_idx << (16 - 10);
|
||||
}
|
||||
|
||||
/* Populate LUT. */
|
||||
lut_data[(data_idx * 4) + 0] = r;
|
||||
lut_data[(data_idx * 4) + 1] = g;
|
||||
lut_data[(data_idx * 4) + 2] = b;
|
||||
}
|
||||
((NvU64 *) lut_data)[NUM_VSS_HEADER_ENTRIES + num_entries - 1] =
|
||||
((NvU64 *) lut_data)[NUM_VSS_HEADER_ENTRIES + num_entries - 2];
|
||||
|
||||
return drm_lut_surface;
|
||||
}
|
||||
|
||||
static struct nv_drm_lut_surface *create_drm_olut_surface_legacy(
|
||||
struct nv_drm_device *nv_dev,
|
||||
struct nv_drm_crtc *nv_crtc,
|
||||
struct nv_drm_crtc_state *nv_drm_crtc_state)
|
||||
|
||||
{
|
||||
struct nv_drm_lut_surface *drm_lut_surface;
|
||||
NvU16 *lut_data;
|
||||
NvU32 entry_idx;
|
||||
|
||||
const struct drm_color_lut *regamma_lut;
|
||||
|
||||
WARN_ON(!nv_crtc->olut_caps.supported);
|
||||
WARN_ON(nv_crtc->olut_caps.entryFormat != NVKMS_LUT_FORMAT_UNORM14_WAR_813188);
|
||||
WARN_ON(nv_crtc->olut_caps.vssSupport == NVKMS_LUT_VSS_REQUIRED);
|
||||
WARN_ON((NVKMS_LUT_ARRAY_SIZE + 1) > nv_crtc->olut_caps.lutEntries);
|
||||
|
||||
BUG_ON(nv_drm_crtc_state->regamma_lut == NULL);
|
||||
|
||||
regamma_lut =
|
||||
(struct drm_color_lut *) nv_drm_crtc_state->regamma_lut->data;
|
||||
|
||||
/* Allocate displayable LUT surface. */
|
||||
drm_lut_surface =
|
||||
alloc_drm_lut_surface(nv_dev,
|
||||
NVKMS_LUT_FORMAT_UNORM14_WAR_813188,
|
||||
NVKMS_LUT_VSS_TYPE_NONE,
|
||||
0, 0,
|
||||
NVKMS_LUT_ARRAY_SIZE + 1);
|
||||
if (drm_lut_surface == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lut_data = (NvU16 *) drm_lut_surface->buffer;
|
||||
|
||||
/* Fill LUT surface. */
|
||||
for (entry_idx = 0; entry_idx < NVKMS_LUT_ARRAY_SIZE; entry_idx++) {
|
||||
lut_data[(entry_idx * 4) + 0] =
|
||||
UNORM16_TO_UNORM14_WAR_813188(regamma_lut[entry_idx].red);
|
||||
lut_data[(entry_idx * 4) + 1] =
|
||||
UNORM16_TO_UNORM14_WAR_813188(regamma_lut[entry_idx].green);
|
||||
lut_data[(entry_idx * 4) + 2] =
|
||||
UNORM16_TO_UNORM14_WAR_813188(regamma_lut[entry_idx].blue);
|
||||
}
|
||||
((NvU64 *) lut_data)[NVKMS_LUT_ARRAY_SIZE] =
|
||||
((NvU64 *) lut_data)[NVKMS_LUT_ARRAY_SIZE - 1];
|
||||
|
||||
return drm_lut_surface;
|
||||
}
|
||||
|
||||
static bool
|
||||
update_matrix_override(struct drm_property_blob *blob,
|
||||
struct NvKmsCscMatrix *new_matrix,
|
||||
const struct NvKmsCscMatrix *old_matrix,
|
||||
bool old_enabled,
|
||||
bool *changed)
|
||||
{
|
||||
bool enabled;
|
||||
if (blob != NULL) {
|
||||
ctm_3x4_to_csc(new_matrix, (struct drm_color_ctm_3x4 *) blob->data);
|
||||
enabled = true;
|
||||
} else {
|
||||
enabled = false;
|
||||
}
|
||||
*changed |= (enabled != old_enabled) ||
|
||||
memcmp(new_matrix, old_matrix, sizeof(*old_matrix));
|
||||
return enabled;
|
||||
}
|
||||
|
||||
static int
|
||||
plane_req_config_update(struct drm_plane *plane,
|
||||
struct drm_plane_state *plane_state,
|
||||
struct NvKmsKapiLayerRequestedConfig *req_config)
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
|
||||
struct nv_drm_plane *nv_plane = to_nv_plane(plane);
|
||||
struct NvKmsKapiLayerConfig old_config = req_config->config;
|
||||
struct nv_drm_plane_state *nv_drm_plane_state =
|
||||
to_nv_drm_plane_state(plane_state);
|
||||
bool matrix_overrides_changed = 0;
|
||||
|
||||
if (plane_state->fb == NULL) {
|
||||
plane_req_config_disable(req_config);
|
||||
@@ -390,7 +1198,7 @@ plane_req_config_update(struct drm_plane *plane,
|
||||
req_config->config.syncParams.semaphoreSpecified = false;
|
||||
|
||||
if (nv_drm_plane_state->fd_user_ptr) {
|
||||
if (to_nv_device(plane->dev)->supportsSyncpts) {
|
||||
if (nv_dev->supportsSyncpts) {
|
||||
req_config->config.syncParams.postSyncptRequested = true;
|
||||
} else {
|
||||
return -1;
|
||||
@@ -403,7 +1211,6 @@ plane_req_config_update(struct drm_plane *plane,
|
||||
nv_drm_plane_state->hdr_output_metadata->data;
|
||||
struct hdr_metadata_infoframe *info_frame =
|
||||
&hdr_metadata->hdmi_metadata_type1;
|
||||
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
|
||||
uint32_t i;
|
||||
|
||||
if (hdr_metadata->metadata_type != HDMI_STATIC_METADATA_TYPE1) {
|
||||
@@ -460,6 +1267,118 @@ plane_req_config_update(struct drm_plane *plane,
|
||||
req_config->flags.tfChanged = (old_config.tf != req_config->config.tf);
|
||||
#endif
|
||||
|
||||
req_config->config.matrixOverrides.enabled.lmsCtm =
|
||||
update_matrix_override(nv_drm_plane_state->lms_ctm,
|
||||
&req_config->config.matrixOverrides.lmsCtm,
|
||||
&old_config.matrixOverrides.lmsCtm,
|
||||
old_config.matrixOverrides.enabled.lmsCtm,
|
||||
&matrix_overrides_changed);
|
||||
req_config->config.matrixOverrides.enabled.lmsToItpCtm =
|
||||
update_matrix_override(nv_drm_plane_state->lms_to_itp_ctm,
|
||||
&req_config->config.matrixOverrides.lmsToItpCtm,
|
||||
&old_config.matrixOverrides.lmsToItpCtm,
|
||||
old_config.matrixOverrides.enabled.lmsToItpCtm,
|
||||
&matrix_overrides_changed);
|
||||
req_config->config.matrixOverrides.enabled.itpToLmsCtm =
|
||||
update_matrix_override(nv_drm_plane_state->itp_to_lms_ctm,
|
||||
&req_config->config.matrixOverrides.itpToLmsCtm,
|
||||
&old_config.matrixOverrides.itpToLmsCtm,
|
||||
old_config.matrixOverrides.enabled.itpToLmsCtm,
|
||||
&matrix_overrides_changed);
|
||||
req_config->config.matrixOverrides.enabled.blendCtm =
|
||||
update_matrix_override(nv_drm_plane_state->blend_ctm,
|
||||
&req_config->config.matrixOverrides.blendCtm,
|
||||
&old_config.matrixOverrides.blendCtm,
|
||||
old_config.matrixOverrides.enabled.blendCtm,
|
||||
&matrix_overrides_changed);
|
||||
req_config->flags.matrixOverridesChanged = matrix_overrides_changed;
|
||||
|
||||
if (nv_drm_plane_state->degamma_changed) {
|
||||
if (nv_drm_plane_state->degamma_drm_lut_surface != NULL) {
|
||||
kref_put(&nv_drm_plane_state->degamma_drm_lut_surface->refcount,
|
||||
free_drm_lut_surface);
|
||||
nv_drm_plane_state->degamma_drm_lut_surface = NULL;
|
||||
}
|
||||
|
||||
if (nv_plane->ilut_caps.vssSupport == NVKMS_LUT_VSS_SUPPORTED) {
|
||||
if ((nv_drm_plane_state->degamma_tf != NV_DRM_TRANSFER_FUNCTION_DEFAULT) ||
|
||||
(nv_drm_plane_state->degamma_lut != NULL) ||
|
||||
(nv_drm_plane_state->degamma_multiplier != ((uint64_t) 1) << 32)) {
|
||||
|
||||
nv_drm_plane_state->degamma_drm_lut_surface =
|
||||
create_drm_ilut_surface_vss(nv_dev, nv_plane,
|
||||
nv_drm_plane_state);
|
||||
if (nv_drm_plane_state->degamma_drm_lut_surface == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
WARN_ON(nv_plane->ilut_caps.vssSupport != NVKMS_LUT_VSS_NOT_SUPPORTED);
|
||||
if (nv_drm_plane_state->degamma_lut != NULL) {
|
||||
nv_drm_plane_state->degamma_drm_lut_surface =
|
||||
create_drm_ilut_surface_legacy(nv_dev, nv_plane,
|
||||
nv_drm_plane_state);
|
||||
if (nv_drm_plane_state->degamma_drm_lut_surface == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nv_drm_plane_state->degamma_drm_lut_surface != NULL) {
|
||||
req_config->config.ilut.enabled = NV_TRUE;
|
||||
req_config->config.ilut.lutSurface =
|
||||
nv_drm_plane_state->degamma_drm_lut_surface->nvkms_surface;
|
||||
req_config->config.ilut.offset = 0;
|
||||
req_config->config.ilut.vssSegments =
|
||||
nv_drm_plane_state->degamma_drm_lut_surface->properties.vssSegments;
|
||||
req_config->config.ilut.lutEntries =
|
||||
nv_drm_plane_state->degamma_drm_lut_surface->properties.lutEntries;
|
||||
} else {
|
||||
req_config->config.ilut.enabled = NV_FALSE;
|
||||
req_config->config.ilut.lutSurface = NULL;
|
||||
req_config->config.ilut.offset = 0;
|
||||
req_config->config.ilut.vssSegments = 0;
|
||||
req_config->config.ilut.lutEntries = 0;
|
||||
|
||||
}
|
||||
req_config->flags.ilutChanged = NV_TRUE;
|
||||
}
|
||||
|
||||
if (nv_drm_plane_state->tmo_changed) {
|
||||
if (nv_drm_plane_state->tmo_drm_lut_surface != NULL) {
|
||||
kref_put(&nv_drm_plane_state->tmo_drm_lut_surface->refcount,
|
||||
free_drm_lut_surface);
|
||||
nv_drm_plane_state->tmo_drm_lut_surface = NULL;
|
||||
}
|
||||
|
||||
if (nv_drm_plane_state->tmo_lut != NULL) {
|
||||
nv_drm_plane_state->tmo_drm_lut_surface =
|
||||
create_drm_tmo_surface(nv_dev, nv_plane,
|
||||
nv_drm_plane_state);
|
||||
if (nv_drm_plane_state->tmo_drm_lut_surface == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (nv_drm_plane_state->tmo_drm_lut_surface != NULL) {
|
||||
req_config->config.tmo.enabled = NV_TRUE;
|
||||
req_config->config.tmo.lutSurface =
|
||||
nv_drm_plane_state->tmo_drm_lut_surface->nvkms_surface;
|
||||
req_config->config.tmo.offset = 0;
|
||||
req_config->config.tmo.vssSegments =
|
||||
nv_drm_plane_state->tmo_drm_lut_surface->properties.vssSegments;
|
||||
req_config->config.tmo.lutEntries =
|
||||
nv_drm_plane_state->tmo_drm_lut_surface->properties.lutEntries;
|
||||
} else {
|
||||
req_config->config.tmo.enabled = NV_FALSE;
|
||||
req_config->config.tmo.lutSurface = NULL;
|
||||
req_config->config.tmo.offset = 0;
|
||||
req_config->config.tmo.vssSegments = 0;
|
||||
req_config->config.tmo.lutEntries = 0;
|
||||
}
|
||||
req_config->flags.tmoChanged = NV_TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unconditionally mark the surface as changed, even if nothing changed,
|
||||
* so that we always get a flip event: a DRM client may flip with
|
||||
@ -594,8 +1513,8 @@ static int nv_drm_plane_atomic_check(struct drm_plane *plane,
|
||||
* the CTM needs to be changed to the identity matrix
|
||||
*/
|
||||
if (crtc_state->ctm) {
|
||||
color_mgmt_config_ctm_to_csc(&plane_requested_config->config.csc,
|
||||
(struct drm_color_ctm *)crtc_state->ctm->data);
|
||||
ctm_to_csc(&plane_requested_config->config.csc,
|
||||
(struct drm_color_ctm *)crtc_state->ctm->data);
|
||||
} else {
|
||||
plane_requested_config->config.csc = NVKMS_IDENTITY_CSC_MATRIX;
|
||||
}
|
||||
@ -632,6 +1551,77 @@ static bool nv_drm_plane_format_mod_supported(struct drm_plane *plane,
|
||||
}
|
||||
#endif
|
||||
|
||||
static int nv_drm_atomic_crtc_get_property(
|
||||
struct drm_crtc *crtc,
|
||||
const struct drm_crtc_state *state,
|
||||
struct drm_property *property,
|
||||
uint64_t *val)
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(crtc->dev);
|
||||
const struct nv_drm_crtc_state *nv_drm_crtc_state =
|
||||
to_nv_crtc_state_const(state);
|
||||
|
||||
if (property == nv_dev->nv_crtc_regamma_tf_property) {
|
||||
*val = nv_drm_crtc_state->regamma_tf;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_crtc_regamma_lut_property) {
|
||||
*val = nv_drm_crtc_state->regamma_lut ?
|
||||
nv_drm_crtc_state->regamma_lut->base.id : 0;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_crtc_regamma_divisor_property) {
|
||||
*val = nv_drm_crtc_state->regamma_divisor;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_crtc_regamma_lut_size_property) {
|
||||
/*
|
||||
* This shouldn't be necessary, because read-only properties are stored
|
||||
* in obj->properties->values[]. To be safe, check for it anyway.
|
||||
*/
|
||||
*val = NVKMS_LUT_ARRAY_SIZE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
|
||||
}
|
||||
|
||||
static int nv_drm_atomic_crtc_set_property(
|
||||
struct drm_crtc *crtc,
|
||||
struct drm_crtc_state *state,
|
||||
struct drm_property *property,
|
||||
uint64_t val)
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(crtc->dev);
|
||||
struct nv_drm_crtc_state *nv_drm_crtc_state =
|
||||
to_nv_crtc_state(state);
|
||||
NvBool replaced = false;
|
||||
|
||||
if (property == nv_dev->nv_crtc_regamma_tf_property) {
|
||||
if (val != nv_drm_crtc_state->regamma_tf) {
|
||||
nv_drm_crtc_state->regamma_tf = val;
|
||||
nv_drm_crtc_state->regamma_changed = true;
|
||||
}
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_crtc_regamma_lut_property) {
|
||||
int ret = nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_crtc_state->regamma_lut,
|
||||
val,
|
||||
sizeof(struct drm_color_lut) * NVKMS_LUT_ARRAY_SIZE,
|
||||
&replaced);
|
||||
if (replaced) {
|
||||
nv_drm_crtc_state->regamma_changed = true;
|
||||
}
|
||||
return ret;
|
||||
} else if (property == nv_dev->nv_crtc_regamma_divisor_property) {
|
||||
if (val != nv_drm_crtc_state->regamma_divisor) {
|
||||
nv_drm_crtc_state->regamma_divisor = val;
|
||||
nv_drm_crtc_state->regamma_changed = true;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int nv_drm_plane_atomic_set_property(
|
||||
struct drm_plane *plane,
|
||||
@ -642,6 +1632,7 @@ static int nv_drm_plane_atomic_set_property(
|
||||
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
|
||||
struct nv_drm_plane_state *nv_drm_plane_state =
|
||||
to_nv_drm_plane_state(state);
|
||||
NvBool replaced = false;
|
||||
|
||||
if (property == nv_dev->nv_out_fence_property) {
|
||||
nv_drm_plane_state->fd_user_ptr = (void __user *)(uintptr_t)(val);
|
||||
@ -656,9 +1647,73 @@ static int nv_drm_plane_atomic_set_property(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->hdr_output_metadata,
|
||||
val,
|
||||
sizeof(struct hdr_output_metadata));
|
||||
sizeof(struct hdr_output_metadata),
|
||||
&replaced);
|
||||
}
|
||||
#endif
|
||||
else if (property == nv_dev->nv_plane_lms_ctm_property) {
|
||||
return nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->lms_ctm,
|
||||
val,
|
||||
sizeof(struct drm_color_ctm_3x4),
|
||||
&replaced);
|
||||
} else if (property == nv_dev->nv_plane_lms_to_itp_ctm_property) {
|
||||
return nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->lms_to_itp_ctm,
|
||||
val,
|
||||
sizeof(struct drm_color_ctm_3x4),
|
||||
&replaced);
|
||||
} else if (property == nv_dev->nv_plane_itp_to_lms_ctm_property) {
|
||||
return nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->itp_to_lms_ctm,
|
||||
val,
|
||||
sizeof(struct drm_color_ctm_3x4),
|
||||
&replaced);
|
||||
} else if (property == nv_dev->nv_plane_blend_ctm_property) {
|
||||
return nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->blend_ctm,
|
||||
val,
|
||||
sizeof(struct drm_color_ctm_3x4),
|
||||
&replaced);
|
||||
} else if (property == nv_dev->nv_plane_degamma_tf_property) {
|
||||
if (val != nv_drm_plane_state->degamma_tf) {
|
||||
nv_drm_plane_state->degamma_tf = val;
|
||||
nv_drm_plane_state->degamma_changed = true;
|
||||
}
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_degamma_lut_property) {
|
||||
int ret = nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->degamma_lut,
|
||||
val,
|
||||
sizeof(struct drm_color_lut) * NVKMS_LUT_ARRAY_SIZE,
|
||||
&replaced);
|
||||
if (replaced) {
|
||||
nv_drm_plane_state->degamma_changed = true;
|
||||
}
|
||||
return ret;
|
||||
} else if (property == nv_dev->nv_plane_degamma_multiplier_property) {
|
||||
if (val != nv_drm_plane_state->degamma_multiplier) {
|
||||
nv_drm_plane_state->degamma_multiplier = val;
|
||||
nv_drm_plane_state->degamma_changed = true;
|
||||
}
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_tmo_lut_property) {
|
||||
int ret = nv_drm_atomic_replace_property_blob_from_id(
|
||||
nv_dev->dev,
|
||||
&nv_drm_plane_state->tmo_lut,
|
||||
val,
|
||||
sizeof(struct drm_color_lut) * NVKMS_LUT_ARRAY_SIZE,
|
||||
&replaced);
|
||||
if (replaced) {
|
||||
nv_drm_plane_state->tmo_changed = true;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -681,13 +1736,50 @@ static int nv_drm_plane_atomic_get_property(
|
||||
}
|
||||
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
|
||||
else if (property == nv_dev->nv_hdr_output_metadata_property) {
|
||||
const struct nv_drm_plane_state *nv_drm_plane_state =
|
||||
to_nv_drm_plane_state_const(state);
|
||||
*val = nv_drm_plane_state->hdr_output_metadata ?
|
||||
nv_drm_plane_state->hdr_output_metadata->base.id : 0;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
else if (property == nv_dev->nv_plane_lms_ctm_property) {
|
||||
*val = nv_drm_plane_state->lms_ctm ?
|
||||
nv_drm_plane_state->lms_ctm->base.id : 0;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_lms_to_itp_ctm_property) {
|
||||
*val = nv_drm_plane_state->lms_to_itp_ctm ?
|
||||
nv_drm_plane_state->lms_to_itp_ctm->base.id : 0;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_itp_to_lms_ctm_property) {
|
||||
*val = nv_drm_plane_state->itp_to_lms_ctm ?
|
||||
nv_drm_plane_state->itp_to_lms_ctm->base.id : 0;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_blend_ctm_property) {
|
||||
*val = nv_drm_plane_state->blend_ctm ?
|
||||
nv_drm_plane_state->blend_ctm->base.id : 0;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_degamma_tf_property) {
|
||||
*val = nv_drm_plane_state->degamma_tf;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_degamma_lut_property) {
|
||||
*val = nv_drm_plane_state->degamma_lut ?
|
||||
nv_drm_plane_state->degamma_lut->base.id : 0;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_degamma_multiplier_property) {
|
||||
*val = nv_drm_plane_state->degamma_multiplier;
|
||||
return 0;
|
||||
} else if (property == nv_dev->nv_plane_tmo_lut_property) {
|
||||
*val = nv_drm_plane_state->tmo_lut ?
|
||||
nv_drm_plane_state->tmo_lut->base.id : 0;
|
||||
return 0;
|
||||
} else if ((property == nv_dev->nv_plane_degamma_lut_size_property) ||
|
||||
(property == nv_dev->nv_plane_tmo_lut_size_property)) {
|
||||
/*
|
||||
* This shouldn't be necessary, because read-only properties are stored
|
||||
* in obj->properties->values[]. To be safe, check for it anyway.
|
||||
*/
|
||||
*val = NVKMS_LUT_ARRAY_SIZE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -744,10 +1836,54 @@ nv_drm_plane_atomic_duplicate_state(struct drm_plane *plane)
|
||||
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
|
||||
nv_plane_state->hdr_output_metadata = nv_old_plane_state->hdr_output_metadata;
|
||||
if (nv_plane_state->hdr_output_metadata) {
|
||||
drm_property_blob_get(nv_plane_state->hdr_output_metadata);
|
||||
nv_drm_property_blob_get(nv_plane_state->hdr_output_metadata);
|
||||
}
|
||||
#endif
|
||||
|
||||
nv_plane_state->lms_ctm = nv_old_plane_state->lms_ctm;
|
||||
if (nv_plane_state->lms_ctm) {
|
||||
nv_drm_property_blob_get(nv_plane_state->lms_ctm);
|
||||
}
|
||||
|
||||
nv_plane_state->lms_to_itp_ctm = nv_old_plane_state->lms_to_itp_ctm;
|
||||
if (nv_plane_state->lms_to_itp_ctm) {
|
||||
nv_drm_property_blob_get(nv_plane_state->lms_to_itp_ctm);
|
||||
}
|
||||
|
||||
nv_plane_state->itp_to_lms_ctm = nv_old_plane_state->itp_to_lms_ctm;
|
||||
if (nv_plane_state->itp_to_lms_ctm) {
|
||||
nv_drm_property_blob_get(nv_plane_state->itp_to_lms_ctm);
|
||||
}
|
||||
|
||||
nv_plane_state->blend_ctm = nv_old_plane_state->blend_ctm;
|
||||
if (nv_plane_state->blend_ctm) {
|
||||
nv_drm_property_blob_get(nv_plane_state->blend_ctm);
|
||||
}
|
||||
|
||||
nv_plane_state->degamma_tf = nv_old_plane_state->degamma_tf;
|
||||
nv_plane_state->degamma_lut = nv_old_plane_state->degamma_lut;
|
||||
if (nv_plane_state->degamma_lut) {
|
||||
nv_drm_property_blob_get(nv_plane_state->degamma_lut);
|
||||
}
|
||||
nv_plane_state->degamma_multiplier = nv_old_plane_state->degamma_multiplier;
|
||||
nv_plane_state->degamma_changed = false;
|
||||
nv_plane_state->degamma_drm_lut_surface =
|
||||
nv_old_plane_state->degamma_drm_lut_surface;
|
||||
if (nv_plane_state->degamma_drm_lut_surface) {
|
||||
kref_get(&nv_plane_state->degamma_drm_lut_surface->refcount);
|
||||
}
|
||||
|
||||
nv_plane_state->tmo_lut = nv_old_plane_state->tmo_lut;
|
||||
if (nv_plane_state->tmo_lut) {
|
||||
nv_drm_property_blob_get(nv_plane_state->tmo_lut);
|
||||
}
|
||||
nv_plane_state->tmo_changed = false;
|
||||
nv_plane_state->tmo_drm_lut_surface =
|
||||
nv_old_plane_state->tmo_drm_lut_surface;
|
||||
if (nv_plane_state->tmo_drm_lut_surface) {
|
||||
kref_get(&nv_plane_state->tmo_drm_lut_surface->refcount);
|
||||
}
|
||||
|
||||
return &nv_plane_state->base;
|
||||
}
|
||||
|
||||
@ -755,6 +1891,8 @@ static inline void __nv_drm_plane_atomic_destroy_state(
|
||||
struct drm_plane *plane,
|
||||
struct drm_plane_state *state)
|
||||
{
|
||||
struct nv_drm_plane_state *nv_drm_plane_state =
|
||||
to_nv_drm_plane_state(state);
|
||||
#if defined(NV_DRM_ATOMIC_HELPER_PLANE_DESTROY_STATE_HAS_PLANE_ARG)
|
||||
__drm_atomic_helper_plane_destroy_state(plane, state);
|
||||
#else
|
||||
@ -762,12 +1900,24 @@ static inline void __nv_drm_plane_atomic_destroy_state(
|
||||
#endif
|
||||
|
||||
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
|
||||
{
|
||||
struct nv_drm_plane_state *nv_drm_plane_state =
|
||||
to_nv_drm_plane_state(state);
|
||||
drm_property_blob_put(nv_drm_plane_state->hdr_output_metadata);
|
||||
}
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->hdr_output_metadata);
|
||||
#endif
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->lms_ctm);
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->lms_to_itp_ctm);
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->itp_to_lms_ctm);
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->blend_ctm);
|
||||
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->degamma_lut);
|
||||
if (nv_drm_plane_state->degamma_drm_lut_surface != NULL) {
|
||||
kref_put(&nv_drm_plane_state->degamma_drm_lut_surface->refcount,
|
||||
free_drm_lut_surface);
|
||||
}
|
||||
|
||||
nv_drm_property_blob_put(nv_drm_plane_state->tmo_lut);
|
||||
if (nv_drm_plane_state->tmo_drm_lut_surface != NULL) {
|
||||
kref_put(&nv_drm_plane_state->tmo_drm_lut_surface->refcount,
|
||||
free_drm_lut_surface);
|
||||
}
|
||||
}
|
||||
|
||||
static void nv_drm_plane_atomic_destroy_state(
|
||||
@ -875,6 +2025,7 @@ static inline struct nv_drm_crtc_state *nv_drm_crtc_state_alloc(void)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
nv_state->req_config.modeSetConfig.olutFpNormScale = NVKMS_OLUT_FP_NORM_SCALE_DEFAULT;
|
||||
for (i = 0; i < ARRAY_SIZE(nv_state->req_config.layerRequestedConfig); i++) {
|
||||
plane_config_clear(&nv_state->req_config.layerRequestedConfig[i].config);
|
||||
}
|
||||
@ -923,6 +2074,7 @@ static void nv_drm_atomic_crtc_reset(struct drm_crtc *crtc)
|
||||
static struct drm_crtc_state*
|
||||
nv_drm_atomic_crtc_duplicate_state(struct drm_crtc *crtc)
|
||||
{
|
||||
struct nv_drm_crtc_state *nv_old_state = to_nv_crtc_state(crtc->state);
|
||||
struct nv_drm_crtc_state *nv_state = nv_drm_crtc_state_alloc();
|
||||
|
||||
if (nv_state == NULL) {
|
||||
@ -944,7 +2096,7 @@ nv_drm_atomic_crtc_duplicate_state(struct drm_crtc *crtc)
|
||||
* be freed in any following failure paths.
|
||||
*/
|
||||
if (!nv_drm_crtc_duplicate_req_head_modeset_config(
|
||||
&(to_nv_crtc_state(crtc->state)->req_config),
|
||||
&nv_old_state->req_config,
|
||||
&nv_state->req_config)) {
|
||||
|
||||
nv_drm_free(nv_state->nv_flip);
|
||||
@ -954,6 +2106,17 @@ nv_drm_atomic_crtc_duplicate_state(struct drm_crtc *crtc)
|
||||
|
||||
__drm_atomic_helper_crtc_duplicate_state(crtc, &nv_state->base);
|
||||
|
||||
nv_state->regamma_tf = nv_old_state->regamma_tf;
|
||||
nv_state->regamma_lut = nv_old_state->regamma_lut;
|
||||
if (nv_state->regamma_lut) {
|
||||
nv_drm_property_blob_get(nv_state->regamma_lut);
|
||||
}
|
||||
nv_state->regamma_divisor = nv_old_state->regamma_divisor;
|
||||
if (nv_state->regamma_drm_lut_surface) {
|
||||
kref_get(&nv_state->regamma_drm_lut_surface->refcount);
|
||||
}
|
||||
nv_state->regamma_changed = false;
|
||||
|
||||
return &nv_state->base;
|
||||
}
|
||||
|
||||
@ -977,6 +2140,12 @@ static void nv_drm_atomic_crtc_destroy_state(struct drm_crtc *crtc,
|
||||
|
||||
__nv_drm_atomic_helper_crtc_destroy_state(crtc, &nv_state->base);
|
||||
|
||||
nv_drm_property_blob_put(nv_state->regamma_lut);
|
||||
if (nv_state->regamma_drm_lut_surface != NULL) {
|
||||
kref_put(&nv_state->regamma_drm_lut_surface->refcount,
|
||||
free_drm_lut_surface);
|
||||
}
|
||||
|
||||
nv_drm_free(nv_state->req_config.modeSetConfig.lut.input.pRamps);
|
||||
nv_drm_free(nv_state->req_config.modeSetConfig.lut.output.pRamps);
|
||||
|
||||
@ -988,6 +2157,8 @@ static struct drm_crtc_funcs nv_crtc_funcs = {
|
||||
.page_flip = drm_atomic_helper_page_flip,
|
||||
.reset = nv_drm_atomic_crtc_reset,
|
||||
.destroy = nv_drm_crtc_destroy,
|
||||
.atomic_get_property = nv_drm_atomic_crtc_get_property,
|
||||
.atomic_set_property = nv_drm_atomic_crtc_set_property,
|
||||
.atomic_duplicate_state = nv_drm_atomic_crtc_duplicate_state,
|
||||
.atomic_destroy_state = nv_drm_atomic_crtc_destroy_state,
|
||||
#if defined(NV_DRM_ATOMIC_HELPER_LEGACY_GAMMA_SET_PRESENT)
|
||||
@ -1101,7 +2272,7 @@ static int color_mgmt_config_set_luts(struct nv_drm_crtc_state *nv_crtc_state,
|
||||
nv_drm_free(modeset_config->lut.input.pRamps);
|
||||
modeset_config->lut.input.pRamps = NULL;
|
||||
}
|
||||
req_config->flags.ilutChanged = NV_TRUE;
|
||||
req_config->flags.legacyIlutChanged = NV_TRUE;
|
||||
|
||||
if (crtc_state->gamma_lut) {
|
||||
struct drm_color_lut *gamma_lut = NULL;
|
||||
@ -1134,7 +2305,7 @@ static int color_mgmt_config_set_luts(struct nv_drm_crtc_state *nv_crtc_state,
|
||||
nv_drm_free(modeset_config->lut.output.pRamps);
|
||||
modeset_config->lut.output.pRamps = NULL;
|
||||
}
|
||||
req_config->flags.olutChanged = NV_TRUE;
|
||||
req_config->flags.legacyOlutChanged = NV_TRUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1157,6 +2328,8 @@ static int nv_drm_crtc_atomic_check(struct drm_crtc *crtc,
|
||||
struct drm_crtc_state *crtc_state =
|
||||
drm_atomic_get_new_crtc_state(state, crtc);
|
||||
#endif
|
||||
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
|
||||
struct nv_drm_device *nv_dev = to_nv_device(crtc->dev);
|
||||
struct nv_drm_crtc_state *nv_crtc_state = to_nv_crtc_state(crtc_state);
|
||||
struct NvKmsKapiHeadRequestedConfig *req_config =
|
||||
&nv_crtc_state->req_config;
|
||||
@ -1211,6 +2384,76 @@ static int nv_drm_crtc_atomic_check(struct drm_crtc *crtc,
|
||||
}
|
||||
#endif
|
||||
|
||||
if (nv_crtc_state->regamma_changed) {
|
||||
if (nv_crtc_state->regamma_drm_lut_surface != NULL) {
|
||||
kref_put(&nv_crtc_state->regamma_drm_lut_surface->refcount,
|
||||
free_drm_lut_surface);
|
||||
nv_crtc_state->regamma_drm_lut_surface = NULL;
|
||||
}
|
||||
|
||||
if (nv_crtc->olut_caps.vssSupport == NVKMS_LUT_VSS_SUPPORTED) {
|
||||
if ((nv_crtc_state->regamma_tf != NV_DRM_TRANSFER_FUNCTION_DEFAULT) ||
|
||||
(nv_crtc_state->regamma_lut != NULL)) {
|
||||
|
||||
nv_crtc_state->regamma_drm_lut_surface =
|
||||
create_drm_olut_surface_vss(nv_dev, nv_crtc,
|
||||
nv_crtc_state);
|
||||
if (nv_crtc_state->regamma_drm_lut_surface == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
WARN_ON(nv_crtc->olut_caps.vssSupport != NVKMS_LUT_VSS_NOT_SUPPORTED);
|
||||
if (nv_crtc_state->regamma_lut != NULL) {
|
||||
nv_crtc_state->regamma_drm_lut_surface =
|
||||
create_drm_olut_surface_legacy(nv_dev, nv_crtc,
|
||||
nv_crtc_state);
|
||||
if (nv_crtc_state->regamma_drm_lut_surface == NULL) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nv_crtc_state->regamma_drm_lut_surface != NULL) {
|
||||
req_config->modeSetConfig.olut.enabled = NV_TRUE;
|
||||
req_config->modeSetConfig.olut.lutSurface =
|
||||
nv_crtc_state->regamma_drm_lut_surface->nvkms_surface;
|
||||
req_config->modeSetConfig.olut.offset = 0;
|
||||
req_config->modeSetConfig.olut.vssSegments =
|
||||
nv_crtc_state->regamma_drm_lut_surface->properties.vssSegments;
|
||||
req_config->modeSetConfig.olut.lutEntries =
|
||||
nv_crtc_state->regamma_drm_lut_surface->properties.lutEntries;
|
||||
} else {
|
||||
req_config->modeSetConfig.olut.enabled = NV_FALSE;
|
||||
req_config->modeSetConfig.olut.lutSurface = NULL;
|
||||
req_config->modeSetConfig.olut.offset = 0;
|
||||
req_config->modeSetConfig.olut.vssSegments = 0;
|
||||
req_config->modeSetConfig.olut.lutEntries = 0;
|
||||
}
|
||||
req_config->flags.olutChanged = NV_TRUE;
|
||||
|
||||
/*
|
||||
* Range property is configured to ensure sign bit = 0 and
|
||||
* value is >= 1, but it may still default to 0 if it's unsupported.
|
||||
*/
|
||||
WARN_ON(nv_crtc_state->regamma_divisor & (((NvU64) 1) << 63));
|
||||
|
||||
req_config->flags.olutFpNormScaleChanged = NV_TRUE;
|
||||
if (nv_crtc_state->regamma_divisor < (((NvU64) 1) << 32)) {
|
||||
req_config->modeSetConfig.olutFpNormScale =
|
||||
NVKMS_OLUT_FP_NORM_SCALE_DEFAULT;
|
||||
} else {
|
||||
/*
|
||||
* Since the sign bit of the regamma_divisor is unset, we treat it as
|
||||
* unsigned and do 32.32 unsigned fixed-point division to get the
|
||||
* fpNormScale.
|
||||
*/
|
||||
req_config->modeSetConfig.olutFpNormScale =
|
||||
(NvU32)(((NvU64)NVKMS_OLUT_FP_NORM_SCALE_DEFAULT << 32) /
|
||||
nv_crtc_state->regamma_divisor);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1227,11 +2470,48 @@ static const struct drm_crtc_helper_funcs nv_crtc_helper_funcs = {
|
||||
.mode_fixup = nv_drm_crtc_mode_fixup,
|
||||
};
|
||||
|
||||
static void nv_drm_crtc_install_properties(
|
||||
struct drm_crtc *crtc)
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(crtc->dev);
|
||||
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
|
||||
struct nv_drm_crtc_state *nv_crtc_state = to_nv_crtc_state(crtc->state);
|
||||
|
||||
if (nv_crtc->olut_caps.supported) {
|
||||
if (nv_crtc->olut_caps.vssSupport == NVKMS_LUT_VSS_SUPPORTED) {
|
||||
if (nv_dev->nv_crtc_regamma_tf_property) {
|
||||
drm_object_attach_property(
|
||||
&crtc->base, nv_dev->nv_crtc_regamma_tf_property,
|
||||
NV_DRM_TRANSFER_FUNCTION_DEFAULT);
|
||||
}
|
||||
if (nv_dev->nv_crtc_regamma_divisor_property) {
|
||||
/* Default to 1 */
|
||||
nv_crtc_state->regamma_divisor = (((NvU64) 1) << 32);
|
||||
drm_object_attach_property(
|
||||
&crtc->base, nv_dev->nv_crtc_regamma_divisor_property,
|
||||
nv_crtc_state->regamma_divisor);
|
||||
}
|
||||
}
|
||||
if (nv_dev->nv_crtc_regamma_lut_property) {
|
||||
drm_object_attach_property(
|
||||
&crtc->base, nv_dev->nv_crtc_regamma_lut_property, 0);
|
||||
}
|
||||
if (nv_dev->nv_crtc_regamma_lut_size_property) {
|
||||
drm_object_attach_property(
|
||||
&crtc->base, nv_dev->nv_crtc_regamma_lut_size_property,
|
||||
NVKMS_LUT_ARRAY_SIZE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void nv_drm_plane_install_properties(
|
||||
struct drm_plane *plane,
|
||||
NvBool supportsICtCp)
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
|
||||
struct nv_drm_plane *nv_plane = to_nv_plane(plane);
|
||||
struct nv_drm_plane_state *nv_plane_state =
|
||||
to_nv_drm_plane_state(plane->state);
|
||||
|
||||
if (nv_dev->nv_out_fence_property) {
|
||||
drm_object_attach_property(
|
||||
@ -1244,12 +2524,82 @@ static void nv_drm_plane_install_properties(
|
||||
NVKMS_INPUT_COLORSPACE_NONE);
|
||||
}
|
||||
|
||||
if (supportsICtCp) {
|
||||
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
|
||||
if (supportsICtCp && nv_dev->nv_hdr_output_metadata_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_hdr_output_metadata_property, 0);
|
||||
}
|
||||
if (nv_dev->nv_hdr_output_metadata_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_hdr_output_metadata_property, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The old DRM_OBJECT_MAX_PROPERTY limit of 24 is too small to
|
||||
* accomodate all of the properties for the ICtCp pipeline.
|
||||
*
|
||||
* Commit 1e13c5644c44 ("drm/drm_mode_object: increase max objects to
|
||||
* accommodate new color props") in Linux v6.8 increased the limit to
|
||||
* 64. To be safe, require this before attaching any properties for the
|
||||
* ICtCp pipeline.
|
||||
*/
|
||||
if (DRM_OBJECT_MAX_PROPERTY >= 64) {
|
||||
if (nv_dev->nv_plane_lms_ctm_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_lms_ctm_property, 0);
|
||||
}
|
||||
|
||||
if (nv_dev->nv_plane_lms_to_itp_ctm_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_lms_to_itp_ctm_property, 0);
|
||||
}
|
||||
|
||||
if (nv_dev->nv_plane_itp_to_lms_ctm_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_itp_to_lms_ctm_property, 0);
|
||||
}
|
||||
|
||||
WARN_ON(!nv_plane->tmo_caps.supported);
|
||||
if (nv_dev->nv_plane_tmo_lut_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_tmo_lut_property, 0);
|
||||
}
|
||||
if (nv_dev->nv_plane_tmo_lut_size_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_tmo_lut_size_property,
|
||||
NVKMS_LUT_ARRAY_SIZE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nv_dev->nv_plane_blend_ctm_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_blend_ctm_property, 0);
|
||||
}
|
||||
|
||||
if (nv_plane->ilut_caps.supported) {
|
||||
if (nv_plane->ilut_caps.vssSupport == NVKMS_LUT_VSS_SUPPORTED) {
|
||||
if (nv_dev->nv_plane_degamma_tf_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_degamma_tf_property,
|
||||
NV_DRM_TRANSFER_FUNCTION_DEFAULT);
|
||||
}
|
||||
if (nv_dev->nv_plane_degamma_multiplier_property) {
|
||||
/* Default to 1 in S31.32 Sign-Magnitude Format */
|
||||
nv_plane_state->degamma_multiplier = ((uint64_t) 1) << 32;
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_degamma_multiplier_property,
|
||||
nv_plane_state->degamma_multiplier);
|
||||
}
|
||||
}
|
||||
if (nv_dev->nv_plane_degamma_lut_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_degamma_lut_property, 0);
|
||||
}
|
||||
if (nv_dev->nv_plane_degamma_lut_size_property) {
|
||||
drm_object_attach_property(
|
||||
&plane->base, nv_dev->nv_plane_degamma_lut_size_property,
|
||||
NVKMS_LUT_ARRAY_SIZE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@ -1429,6 +2779,9 @@ nv_drm_plane_create(struct drm_device *dev,
|
||||
drm_plane_helper_add(plane, &nv_plane_helper_funcs);
|
||||
|
||||
if (plane_type != DRM_PLANE_TYPE_CURSOR) {
|
||||
nv_plane->ilut_caps = pResInfo->lutCaps.layer[layer_idx].ilut;
|
||||
nv_plane->tmo_caps = pResInfo->lutCaps.layer[layer_idx].tmo;
|
||||
|
||||
nv_drm_plane_install_properties(
|
||||
plane,
|
||||
pResInfo->supportsICtCp[layer_idx]);
|
||||
@ -1465,7 +2818,8 @@ failed:
|
||||
static struct drm_crtc *__nv_drm_crtc_create(struct nv_drm_device *nv_dev,
|
||||
struct drm_plane *primary_plane,
|
||||
struct drm_plane *cursor_plane,
|
||||
unsigned int head)
|
||||
unsigned int head,
|
||||
const struct NvKmsKapiDeviceResourcesInfo *pResInfo)
|
||||
{
|
||||
struct nv_drm_crtc *nv_crtc = NULL;
|
||||
struct nv_drm_crtc_state *nv_state = NULL;
|
||||
@ -1508,6 +2862,10 @@ static struct drm_crtc *__nv_drm_crtc_create(struct nv_drm_device *nv_dev,
|
||||
|
||||
drm_crtc_helper_add(&nv_crtc->base, &nv_crtc_helper_funcs);
|
||||
|
||||
nv_crtc->olut_caps = pResInfo->lutCaps.olut;
|
||||
|
||||
nv_drm_crtc_install_properties(&nv_crtc->base);
|
||||
|
||||
#if defined(NV_DRM_COLOR_MGMT_AVAILABLE)
|
||||
#if defined(NV_DRM_CRTC_ENABLE_COLOR_MGMT_PRESENT)
|
||||
drm_crtc_enable_color_mgmt(&nv_crtc->base, NVKMS_LUT_ARRAY_SIZE, true,
|
||||
@ -1584,7 +2942,7 @@ void nv_drm_enumerate_crtcs_and_planes(
|
||||
struct drm_crtc *crtc =
|
||||
__nv_drm_crtc_create(nv_dev,
|
||||
primary_plane, cursor_plane,
|
||||
i);
|
||||
i, pResInfo);
|
||||
if (IS_ERR(crtc)) {
|
||||
nv_drm_plane_destroy(primary_plane);
|
||||
|
||||
|
@ -38,6 +38,13 @@
|
||||
#include "nvtypes.h"
|
||||
#include "nvkms-kapi.h"
|
||||
|
||||
enum nv_drm_transfer_function {
|
||||
NV_DRM_TRANSFER_FUNCTION_DEFAULT,
|
||||
NV_DRM_TRANSFER_FUNCTION_LINEAR,
|
||||
NV_DRM_TRANSFER_FUNCTION_PQ,
|
||||
NV_DRM_TRANSFER_FUNCTION_MAX,
|
||||
};
|
||||
|
||||
struct nv_drm_crtc {
|
||||
NvU32 head;
|
||||
|
||||
@ -63,6 +70,8 @@ struct nv_drm_crtc {
|
||||
*/
|
||||
struct drm_file *modeset_permission_filep;
|
||||
|
||||
struct NvKmsLUTCaps olut_caps;
|
||||
|
||||
struct drm_crtc base;
|
||||
};
|
||||
|
||||
@ -142,6 +151,12 @@ struct nv_drm_crtc_state {
|
||||
* nv_drm_atomic_crtc_destroy_state().
|
||||
*/
|
||||
struct nv_drm_flip *nv_flip;
|
||||
|
||||
enum nv_drm_transfer_function regamma_tf;
|
||||
struct drm_property_blob *regamma_lut;
|
||||
uint64_t regamma_divisor;
|
||||
struct nv_drm_lut_surface *regamma_drm_lut_surface;
|
||||
NvBool regamma_changed;
|
||||
};
|
||||
|
||||
static inline struct nv_drm_crtc_state *to_nv_crtc_state(struct drm_crtc_state *state)
|
||||
@ -149,6 +164,11 @@ static inline struct nv_drm_crtc_state *to_nv_crtc_state(struct drm_crtc_state *
|
||||
return container_of(state, struct nv_drm_crtc_state, base);
|
||||
}
|
||||
|
||||
static inline const struct nv_drm_crtc_state *to_nv_crtc_state_const(const struct drm_crtc_state *state)
|
||||
{
|
||||
return container_of(state, struct nv_drm_crtc_state, base);
|
||||
}
|
||||
|
||||
struct nv_drm_plane {
|
||||
/**
|
||||
* @base:
|
||||
@ -170,6 +190,9 @@ struct nv_drm_plane {
|
||||
* Index of this plane in the per head array of layers.
|
||||
*/
|
||||
uint32_t layer_idx;
|
||||
|
||||
struct NvKmsLUTCaps ilut_caps;
|
||||
struct NvKmsLUTCaps tmo_caps;
|
||||
};
|
||||
|
||||
static inline struct nv_drm_plane *to_nv_plane(struct drm_plane *plane)
|
||||
@ -180,6 +203,22 @@ static inline struct nv_drm_plane *to_nv_plane(struct drm_plane *plane)
|
||||
return container_of(plane, struct nv_drm_plane, base);
|
||||
}
|
||||
|
||||
struct nv_drm_lut_surface {
|
||||
struct NvKmsKapiDevice *pDevice;
|
||||
struct NvKmsKapiMemory *nvkms_memory;
|
||||
struct NvKmsKapiSurface *nvkms_surface;
|
||||
struct {
|
||||
NvU32 vssSegments;
|
||||
enum NvKmsLUTVssType vssType;
|
||||
|
||||
NvU32 lutEntries;
|
||||
enum NvKmsLUTFormat entryFormat;
|
||||
|
||||
} properties;
|
||||
void *buffer;
|
||||
struct kref refcount;
|
||||
};
|
||||
|
||||
struct nv_drm_plane_state {
|
||||
struct drm_plane_state base;
|
||||
s32 __user *fd_user_ptr;
|
||||
@ -187,6 +226,20 @@ struct nv_drm_plane_state {
|
||||
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
|
||||
struct drm_property_blob *hdr_output_metadata;
|
||||
#endif
|
||||
struct drm_property_blob *lms_ctm;
|
||||
struct drm_property_blob *lms_to_itp_ctm;
|
||||
struct drm_property_blob *itp_to_lms_ctm;
|
||||
struct drm_property_blob *blend_ctm;
|
||||
|
||||
enum nv_drm_transfer_function degamma_tf;
|
||||
struct drm_property_blob *degamma_lut;
|
||||
uint64_t degamma_multiplier; /* S31.32 Sign-Magnitude Format */
|
||||
struct nv_drm_lut_surface *degamma_drm_lut_surface;
|
||||
NvBool degamma_changed;
|
||||
|
||||
struct drm_property_blob *tmo_lut;
|
||||
struct nv_drm_lut_surface *tmo_drm_lut_surface;
|
||||
NvBool tmo_changed;
|
||||
};
|
||||
|
||||
static inline struct nv_drm_plane_state *to_nv_drm_plane_state(struct drm_plane_state *state)
|
||||
|
@ -64,12 +64,14 @@
|
||||
#include <drm/drm_ioctl.h>
|
||||
#endif
|
||||
|
||||
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
#include <drm/drm_aperture.h>
|
||||
#include <drm/drm_fb_helper.h>
|
||||
#endif
|
||||
|
||||
#if defined(NV_DRM_DRM_FBDEV_GENERIC_H_PRESENT)
|
||||
#if defined(NV_DRM_DRM_FBDEV_TTM_H_PRESENT)
|
||||
#include <drm/drm_fbdev_ttm.h>
|
||||
#elif defined(NV_DRM_DRM_FBDEV_GENERIC_H_PRESENT)
|
||||
#include <drm/drm_fbdev_generic.h>
|
||||
#endif
|
||||
|
||||
@ -105,16 +107,16 @@ static int nv_drm_revoke_sub_ownership(struct drm_device *dev);
|
||||
|
||||
static struct nv_drm_device *dev_list = NULL;
|
||||
|
||||
static const char* nv_get_input_colorspace_name(
|
||||
static char* nv_get_input_colorspace_name(
|
||||
enum NvKmsInputColorSpace colorSpace)
|
||||
{
|
||||
switch (colorSpace) {
|
||||
case NVKMS_INPUT_COLORSPACE_NONE:
|
||||
return "None";
|
||||
case NVKMS_INPUT_COLORSPACE_SCRGB_LINEAR:
|
||||
return "IEC 61966-2-2 linear FP";
|
||||
return "scRGB Linear FP16";
|
||||
case NVKMS_INPUT_COLORSPACE_BT2100_PQ:
|
||||
return "ITU-R BT.2100-PQ YCbCr";
|
||||
return "BT.2100 PQ";
|
||||
default:
|
||||
/* We shoudn't hit this */
|
||||
WARN_ON("Unsupported input colorspace");
|
||||
@ -122,8 +124,30 @@ static const char* nv_get_input_colorspace_name(
|
||||
}
|
||||
};
|
||||
|
||||
static char* nv_get_transfer_function_name(
|
||||
enum nv_drm_transfer_function tf)
|
||||
{
|
||||
switch (tf) {
|
||||
case NV_DRM_TRANSFER_FUNCTION_LINEAR:
|
||||
return "Linear";
|
||||
case NV_DRM_TRANSFER_FUNCTION_PQ:
|
||||
return "PQ (Perceptual Quantizer)";
|
||||
default:
|
||||
/* We shoudn't hit this */
|
||||
WARN_ON("Unsupported transfer function");
|
||||
#if defined(fallthrough)
|
||||
fallthrough;
|
||||
#else
|
||||
/* Fallthrough */
|
||||
#endif
|
||||
case NV_DRM_TRANSFER_FUNCTION_DEFAULT:
|
||||
return "Default";
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
|
||||
|
||||
#if defined(NV_DRM_OUTPUT_POLL_CHANGED_PRESENT)
|
||||
static void nv_drm_output_poll_changed(struct drm_device *dev)
|
||||
{
|
||||
struct drm_connector *connector = NULL;
|
||||
@ -167,6 +191,7 @@ static void nv_drm_output_poll_changed(struct drm_device *dev)
|
||||
nv_drm_connector_list_iter_end(&conn_iter);
|
||||
#endif
|
||||
}
|
||||
#endif /* NV_DRM_OUTPUT_POLL_CHANGED_PRESENT */
|
||||
|
||||
static struct drm_framebuffer *nv_drm_framebuffer_create(
|
||||
struct drm_device *dev,
|
||||
@ -204,7 +229,9 @@ static const struct drm_mode_config_funcs nv_mode_config_funcs = {
|
||||
.atomic_check = nv_drm_atomic_check,
|
||||
.atomic_commit = nv_drm_atomic_commit,
|
||||
|
||||
#if defined(NV_DRM_OUTPUT_POLL_CHANGED_PRESENT)
|
||||
.output_poll_changed = nv_drm_output_poll_changed,
|
||||
#endif
|
||||
};
|
||||
|
||||
static void nv_drm_event_callback(const struct NvKmsKapiEvent *event)
|
||||
@ -364,15 +391,21 @@ static void nv_drm_enumerate_encoders_and_connectors
|
||||
*/
|
||||
static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
|
||||
{
|
||||
struct drm_prop_enum_list enum_list[3] = { };
|
||||
struct drm_prop_enum_list colorspace_enum_list[3] = { };
|
||||
struct drm_prop_enum_list tf_enum_list[NV_DRM_TRANSFER_FUNCTION_MAX] = { };
|
||||
int i, len = 0;
|
||||
|
||||
for (i = 0; i < 3; i++) {
|
||||
enum_list[len].type = i;
|
||||
enum_list[len].name = nv_get_input_colorspace_name(i);
|
||||
colorspace_enum_list[len].type = i;
|
||||
colorspace_enum_list[len].name = nv_get_input_colorspace_name(i);
|
||||
len++;
|
||||
}
|
||||
|
||||
for (i = 0; i < NV_DRM_TRANSFER_FUNCTION_MAX; i++) {
|
||||
tf_enum_list[i].type = i;
|
||||
tf_enum_list[i].name = nv_get_transfer_function_name(i);
|
||||
}
|
||||
|
||||
if (nv_dev->supportsSyncpts) {
|
||||
nv_dev->nv_out_fence_property =
|
||||
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_ATOMIC,
|
||||
@ -384,7 +417,7 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
|
||||
|
||||
nv_dev->nv_input_colorspace_property =
|
||||
drm_property_create_enum(nv_dev->dev, 0, "NV_INPUT_COLORSPACE",
|
||||
enum_list, len);
|
||||
colorspace_enum_list, len);
|
||||
if (nv_dev->nv_input_colorspace_property == NULL) {
|
||||
NV_DRM_LOG_ERR("Failed to create NV_INPUT_COLORSPACE property");
|
||||
return -ENOMEM;
|
||||
@ -399,6 +432,109 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
|
||||
}
|
||||
#endif
|
||||
|
||||
nv_dev->nv_plane_lms_ctm_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_PLANE_LMS_CTM", 0);
|
||||
if (nv_dev->nv_plane_lms_ctm_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
nv_dev->nv_plane_lms_to_itp_ctm_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_PLANE_LMS_TO_ITP_CTM", 0);
|
||||
if (nv_dev->nv_plane_lms_to_itp_ctm_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
nv_dev->nv_plane_itp_to_lms_ctm_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_PLANE_ITP_TO_LMS_CTM", 0);
|
||||
if (nv_dev->nv_plane_itp_to_lms_ctm_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
nv_dev->nv_plane_blend_ctm_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_PLANE_BLEND_CTM", 0);
|
||||
if (nv_dev->nv_plane_blend_ctm_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
// Degamma TF + LUT + LUT Size + Multiplier
|
||||
|
||||
nv_dev->nv_plane_degamma_tf_property =
|
||||
drm_property_create_enum(nv_dev->dev, 0,
|
||||
"NV_PLANE_DEGAMMA_TF", tf_enum_list,
|
||||
NV_DRM_TRANSFER_FUNCTION_MAX);
|
||||
if (nv_dev->nv_plane_degamma_tf_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
nv_dev->nv_plane_degamma_lut_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_PLANE_DEGAMMA_LUT", 0);
|
||||
if (nv_dev->nv_plane_degamma_lut_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
nv_dev->nv_plane_degamma_lut_size_property =
|
||||
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_IMMUTABLE,
|
||||
"NV_PLANE_DEGAMMA_LUT_SIZE", 0, UINT_MAX);
|
||||
if (nv_dev->nv_plane_degamma_lut_size_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
nv_dev->nv_plane_degamma_multiplier_property =
|
||||
drm_property_create_range(nv_dev->dev, 0,
|
||||
"NV_PLANE_DEGAMMA_MULTIPLIER", 0,
|
||||
U64_MAX & ~(((NvU64) 1) << 63)); // No negative values
|
||||
if (nv_dev->nv_plane_degamma_multiplier_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
// TMO LUT + LUT Size
|
||||
|
||||
nv_dev->nv_plane_tmo_lut_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_PLANE_TMO_LUT", 0);
|
||||
if (nv_dev->nv_plane_tmo_lut_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
nv_dev->nv_plane_tmo_lut_size_property =
|
||||
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_IMMUTABLE,
|
||||
"NV_PLANE_TMO_LUT_SIZE", 0, UINT_MAX);
|
||||
if (nv_dev->nv_plane_tmo_lut_size_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
// REGAMMA TF + LUT + LUT Size + Divisor
|
||||
|
||||
nv_dev->nv_crtc_regamma_tf_property =
|
||||
drm_property_create_enum(nv_dev->dev, 0,
|
||||
"NV_CRTC_REGAMMA_TF", tf_enum_list,
|
||||
NV_DRM_TRANSFER_FUNCTION_MAX);
|
||||
if (nv_dev->nv_crtc_regamma_tf_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
nv_dev->nv_crtc_regamma_lut_property =
|
||||
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
|
||||
"NV_CRTC_REGAMMA_LUT", 0);
|
||||
if (nv_dev->nv_crtc_regamma_lut_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
nv_dev->nv_crtc_regamma_lut_size_property =
|
||||
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_IMMUTABLE,
|
||||
"NV_CRTC_REGAMMA_LUT_SIZE", 0, UINT_MAX);
|
||||
if (nv_dev->nv_crtc_regamma_lut_size_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
// S31.32
|
||||
nv_dev->nv_crtc_regamma_divisor_property =
|
||||
drm_property_create_range(nv_dev->dev, 0,
|
||||
"NV_CRTC_REGAMMA_DIVISOR",
|
||||
(((NvU64) 1) << 32), // No values between 0 and 1
|
||||
U64_MAX & ~(((NvU64) 1) << 63)); // No negative values
|
||||
if (nv_dev->nv_crtc_regamma_divisor_property == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -476,7 +612,7 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags)
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
/*
|
||||
* If fbdev is enabled, take modeset ownership now before other DRM clients
|
||||
* can take master (and thus NVKMS ownership).
|
||||
@ -548,6 +684,13 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags)
|
||||
|
||||
ret = nv_drm_create_properties(nv_dev);
|
||||
if (ret < 0) {
|
||||
drm_mode_config_cleanup(dev);
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
if (nv_dev->hasFramebufferConsole) {
|
||||
nvKms->releaseOwnership(nv_dev->pDevice);
|
||||
}
|
||||
#endif
|
||||
nvKms->freeDevice(nv_dev->pDevice);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
@ -610,7 +753,7 @@ static void __nv_drm_unload(struct drm_device *dev)
|
||||
|
||||
/* Release modeset ownership if fbdev is enabled */
|
||||
|
||||
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
if (nv_dev->hasFramebufferConsole) {
|
||||
drm_atomic_helper_shutdown(dev);
|
||||
nvKms->releaseOwnership(nv_dev->pDevice);
|
||||
@ -710,36 +853,37 @@ void nv_drm_master_drop(struct drm_device *dev, struct drm_file *file_priv)
|
||||
#endif
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(dev);
|
||||
int err;
|
||||
|
||||
nv_drm_revoke_modeset_permission(dev, file_priv, 0);
|
||||
nv_drm_revoke_sub_ownership(dev);
|
||||
|
||||
/*
|
||||
* After dropping nvkms modeset onwership, it is not guaranteed that
|
||||
* drm and nvkms modeset state will remain in sync. Therefore, disable
|
||||
* all outputs and crtcs before dropping nvkms modeset ownership.
|
||||
*
|
||||
* First disable all active outputs atomically and then disable each crtc one
|
||||
* by one, there is not helper function available to disable all crtcs
|
||||
* atomically.
|
||||
*/
|
||||
|
||||
drm_modeset_lock_all(dev);
|
||||
|
||||
if ((err = nv_drm_atomic_helper_disable_all(
|
||||
dev,
|
||||
dev->mode_config.acquire_ctx)) != 0) {
|
||||
|
||||
NV_DRM_DEV_LOG_ERR(
|
||||
nv_dev,
|
||||
"nv_drm_atomic_helper_disable_all failed with error code %d !",
|
||||
err);
|
||||
}
|
||||
|
||||
drm_modeset_unlock_all(dev);
|
||||
|
||||
if (!nv_dev->hasFramebufferConsole) {
|
||||
int err;
|
||||
|
||||
/*
|
||||
* After dropping nvkms modeset onwership, it is not guaranteed that drm
|
||||
* and nvkms modeset state will remain in sync. Therefore, disable all
|
||||
* outputs and crtcs before dropping nvkms modeset ownership.
|
||||
*
|
||||
* First disable all active outputs atomically and then disable each
|
||||
* crtc one by one, there is not helper function available to disable
|
||||
* all crtcs atomically.
|
||||
*/
|
||||
|
||||
drm_modeset_lock_all(dev);
|
||||
|
||||
if ((err = nv_drm_atomic_helper_disable_all(
|
||||
dev,
|
||||
dev->mode_config.acquire_ctx)) != 0) {
|
||||
|
||||
NV_DRM_DEV_LOG_ERR(
|
||||
nv_dev,
|
||||
"nv_drm_atomic_helper_disable_all failed with error code %d !",
|
||||
err);
|
||||
}
|
||||
|
||||
drm_modeset_unlock_all(dev);
|
||||
|
||||
nvKms->releaseOwnership(nv_dev->pDevice);
|
||||
}
|
||||
}
|
||||
@ -1684,14 +1828,19 @@ static struct drm_driver nv_drm_driver = {
|
||||
.num_ioctls = ARRAY_SIZE(nv_drm_ioctls),
|
||||
|
||||
/*
|
||||
* Linux kernel v6.6 commit 71a7974ac701 ("drm/prime: Unexport helpers
|
||||
* for fd/handle conversion") unexports drm_gem_prime_handle_to_fd() and
|
||||
* drm_gem_prime_fd_to_handle().
|
||||
* Linux kernel v6.6 commit 6b85aa68d9d5 ("drm: Enable PRIME import/export for
|
||||
* all drivers") made drm_gem_prime_handle_to_fd() /
|
||||
* drm_gem_prime_fd_to_handle() the default when .prime_handle_to_fd /
|
||||
* .prime_fd_to_handle are unspecified, respectively.
|
||||
*
|
||||
* Prior Linux kernel v6.6 commit 6b85aa68d9d5 ("drm: Enable PRIME
|
||||
* import/export for all drivers") made these helpers the default when
|
||||
* .prime_handle_to_fd / .prime_fd_to_handle are unspecified, so it's fine
|
||||
* to just skip specifying them if the helpers aren't present.
|
||||
* Linux kernel v6.6 commit 71a7974ac701 ("drm/prime: Unexport helpers for
|
||||
* fd/handle conversion") unexports drm_gem_prime_handle_to_fd() and
|
||||
* drm_gem_prime_fd_to_handle(). However, because of the aforementioned commit,
|
||||
* it's fine to just skip specifying them in this case.
|
||||
*
|
||||
* Linux kernel v6.7 commit 0514f63cfff3 ("Revert "drm/prime: Unexport helpers
|
||||
* for fd/handle conversion"") exported the helpers again, but left the default
|
||||
* behavior intact. Nonetheless, it does not hurt to specify them.
|
||||
*/
|
||||
#if NV_IS_EXPORT_SYMBOL_PRESENT_drm_gem_prime_handle_to_fd
|
||||
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
|
||||
@ -1703,6 +1852,21 @@ static struct drm_driver nv_drm_driver = {
|
||||
.gem_prime_import = nv_drm_gem_prime_import,
|
||||
.gem_prime_import_sg_table = nv_drm_gem_prime_import_sg_table,
|
||||
|
||||
/*
|
||||
* Linux kernel v5.0 commit 7698799f95 ("drm/prime: Add drm_gem_prime_mmap()")
|
||||
* added drm_gem_prime_mmap().
|
||||
*
|
||||
* Linux kernel v6.6 commit 0adec22702d4 ("drm: Remove struct
|
||||
* drm_driver.gem_prime_mmap") removed .gem_prime_mmap, but replaced it with a
|
||||
* direct call to drm_gem_prime_mmap().
|
||||
*
|
||||
* TODO: Support .gem_prime_mmap on Linux < v5.0 using internal implementation.
|
||||
*/
|
||||
#if defined(NV_DRM_GEM_PRIME_MMAP_PRESENT) && \
|
||||
defined(NV_DRM_DRIVER_HAS_GEM_PRIME_MMAP)
|
||||
.gem_prime_mmap = drm_gem_prime_mmap,
|
||||
#endif
|
||||
|
||||
#if defined(NV_DRM_DRIVER_HAS_GEM_PRIME_CALLBACKS)
|
||||
.gem_prime_export = drm_gem_prime_export,
|
||||
.gem_prime_get_sg_table = nv_drm_gem_prime_get_sg_table,
|
||||
@ -1838,7 +2002,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
|
||||
goto failed_drm_register;
|
||||
}
|
||||
|
||||
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
if (nv_drm_fbdev_module_param &&
|
||||
drm_core_check_feature(dev, DRIVER_MODESET)) {
|
||||
|
||||
@ -1850,10 +2014,15 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
|
||||
#else
|
||||
drm_aperture_remove_conflicting_pci_framebuffers(pdev, nv_drm_driver.name);
|
||||
#endif
|
||||
nvKms->framebufferConsoleDisabled(nv_dev->pDevice);
|
||||
}
|
||||
#if defined(NV_DRM_FBDEV_TTM_AVAILABLE)
|
||||
drm_fbdev_ttm_setup(dev, 32);
|
||||
#elif defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
drm_fbdev_generic_setup(dev, 32);
|
||||
#endif
|
||||
}
|
||||
#endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */
|
||||
#endif /* defined(NV_DRM_FBDEV_AVAILABLE) */
|
||||
|
||||
/* Add NVIDIA-DRM device into list */
|
||||
|
||||
@ -1995,12 +2164,12 @@ void nv_drm_suspend_resume(NvBool suspend)
|
||||
|
||||
if (suspend) {
|
||||
drm_kms_helper_poll_disable(dev);
|
||||
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 1);
|
||||
#endif
|
||||
drm_mode_config_reset(dev);
|
||||
} else {
|
||||
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
|
||||
#if defined(NV_DRM_FBDEV_AVAILABLE)
|
||||
drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 0);
|
||||
#endif
|
||||
drm_kms_helper_poll_enable(dev);
|
||||
|
@ -36,12 +36,15 @@
|
||||
|
||||
static void __nv_drm_framebuffer_free(struct nv_drm_framebuffer *nv_fb)
|
||||
{
|
||||
struct drm_framebuffer *fb = &nv_fb->base;
|
||||
uint32_t i;
|
||||
|
||||
/* Unreference gem object */
|
||||
for (i = 0; i < ARRAY_SIZE(nv_fb->nv_gem); i++) {
|
||||
if (nv_fb->nv_gem[i] != NULL) {
|
||||
nv_drm_gem_object_unreference_unlocked(nv_fb->nv_gem[i]);
|
||||
for (i = 0; i < NVKMS_MAX_PLANES_PER_SURFACE; i++) {
|
||||
struct drm_gem_object *gem = nv_fb_get_gem_obj(fb, i);
|
||||
if (gem != NULL) {
|
||||
struct nv_drm_gem_object *nv_gem = to_nv_gem_object(gem);
|
||||
nv_drm_gem_object_unreference_unlocked(nv_gem);
|
||||
}
|
||||
}
|
||||
|
||||
@ -69,10 +72,8 @@ static int
|
||||
nv_drm_framebuffer_create_handle(struct drm_framebuffer *fb,
|
||||
struct drm_file *file, unsigned int *handle)
|
||||
{
|
||||
struct nv_drm_framebuffer *nv_fb = to_nv_framebuffer(fb);
|
||||
|
||||
return nv_drm_gem_handle_create(file,
|
||||
nv_fb->nv_gem[0],
|
||||
to_nv_gem_object(nv_fb_get_gem_obj(fb, 0)),
|
||||
handle);
|
||||
}
|
||||
|
||||
@ -88,6 +89,7 @@ static struct nv_drm_framebuffer *nv_drm_framebuffer_alloc(
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(dev);
|
||||
struct nv_drm_framebuffer *nv_fb;
|
||||
struct nv_drm_gem_object *nv_gem;
|
||||
const int num_planes = nv_drm_format_num_planes(cmd->pixel_format);
|
||||
uint32_t i;
|
||||
|
||||
@ -101,21 +103,22 @@ static struct nv_drm_framebuffer *nv_drm_framebuffer_alloc(
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
if (num_planes > ARRAY_SIZE(nv_fb->nv_gem)) {
|
||||
if (num_planes > NVKMS_MAX_PLANES_PER_SURFACE) {
|
||||
NV_DRM_DEV_DEBUG_DRIVER(nv_dev, "Unsupported number of planes");
|
||||
goto failed;
|
||||
}
|
||||
|
||||
for (i = 0; i < num_planes; i++) {
|
||||
if ((nv_fb->nv_gem[i] = nv_drm_gem_object_lookup(
|
||||
dev,
|
||||
file,
|
||||
cmd->handles[i])) == NULL) {
|
||||
nv_gem = nv_drm_gem_object_lookup(dev, file, cmd->handles[i]);
|
||||
|
||||
if (nv_gem == NULL) {
|
||||
NV_DRM_DEV_DEBUG_DRIVER(
|
||||
nv_dev,
|
||||
"Failed to find gem object of type nvkms memory");
|
||||
goto failed;
|
||||
}
|
||||
|
||||
nv_fb_set_gem_obj(&nv_fb->base, i, &nv_gem->base);
|
||||
}
|
||||
|
||||
return nv_fb;
|
||||
@ -135,12 +138,14 @@ static int nv_drm_framebuffer_init(struct drm_device *dev,
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(dev);
|
||||
struct NvKmsKapiCreateSurfaceParams params = { };
|
||||
struct nv_drm_gem_object *nv_gem;
|
||||
struct drm_framebuffer *fb = &nv_fb->base;
|
||||
uint32_t i;
|
||||
int ret;
|
||||
|
||||
/* Initialize the base framebuffer object and add it to drm subsystem */
|
||||
|
||||
ret = drm_framebuffer_init(dev, &nv_fb->base, &nv_framebuffer_funcs);
|
||||
ret = drm_framebuffer_init(dev, fb, &nv_framebuffer_funcs);
|
||||
if (ret != 0) {
|
||||
NV_DRM_DEV_DEBUG_DRIVER(
|
||||
nv_dev,
|
||||
@ -148,23 +153,18 @@ static int nv_drm_framebuffer_init(struct drm_device *dev,
|
||||
return ret;
|
||||
}
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(nv_fb->nv_gem); i++) {
|
||||
if (nv_fb->nv_gem[i] != NULL) {
|
||||
if (!nvKms->isMemoryValidForDisplay(nv_dev->pDevice,
|
||||
nv_fb->nv_gem[i]->pMemory)) {
|
||||
NV_DRM_DEV_LOG_INFO(
|
||||
nv_dev,
|
||||
"Framebuffer memory not appropriate for scanout");
|
||||
goto fail;
|
||||
}
|
||||
for (i = 0; i < NVKMS_MAX_PLANES_PER_SURFACE; i++) {
|
||||
struct drm_gem_object *gem = nv_fb_get_gem_obj(fb, i);
|
||||
if (gem != NULL) {
|
||||
nv_gem = to_nv_gem_object(gem);
|
||||
|
||||
params.planes[i].memory = nv_fb->nv_gem[i]->pMemory;
|
||||
params.planes[i].offset = nv_fb->base.offsets[i];
|
||||
params.planes[i].pitch = nv_fb->base.pitches[i];
|
||||
params.planes[i].memory = nv_gem->pMemory;
|
||||
params.planes[i].offset = fb->offsets[i];
|
||||
params.planes[i].pitch = fb->pitches[i];
|
||||
}
|
||||
}
|
||||
params.height = nv_fb->base.height;
|
||||
params.width = nv_fb->base.width;
|
||||
params.height = fb->height;
|
||||
params.width = fb->width;
|
||||
params.format = format;
|
||||
|
||||
if (have_modifier) {
|
||||
@ -199,7 +199,7 @@ static int nv_drm_framebuffer_init(struct drm_device *dev,
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
drm_framebuffer_cleanup(&nv_fb->base);
|
||||
drm_framebuffer_cleanup(fb);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -41,8 +41,10 @@
|
||||
struct nv_drm_framebuffer {
|
||||
struct NvKmsKapiSurface *pSurface;
|
||||
|
||||
struct nv_drm_gem_object*
|
||||
nv_gem[NVKMS_MAX_PLANES_PER_SURFACE];
|
||||
#if !defined(NV_DRM_FRAMEBUFFER_OBJ_PRESENT)
|
||||
struct drm_gem_object*
|
||||
obj[NVKMS_MAX_PLANES_PER_SURFACE];
|
||||
#endif
|
||||
|
||||
struct drm_framebuffer base;
|
||||
};
|
||||
@ -56,6 +58,29 @@ static inline struct nv_drm_framebuffer *to_nv_framebuffer(
|
||||
return container_of(fb, struct nv_drm_framebuffer, base);
|
||||
}
|
||||
|
||||
static inline struct drm_gem_object *nv_fb_get_gem_obj(
|
||||
struct drm_framebuffer *fb,
|
||||
uint32_t plane)
|
||||
{
|
||||
#if defined(NV_DRM_FRAMEBUFFER_OBJ_PRESENT)
|
||||
return fb->obj[plane];
|
||||
#else
|
||||
return to_nv_framebuffer(fb)->obj[plane];
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void nv_fb_set_gem_obj(
|
||||
struct drm_framebuffer *fb,
|
||||
uint32_t plane,
|
||||
struct drm_gem_object *obj)
|
||||
{
|
||||
#if defined(NV_DRM_FRAMEBUFFER_OBJ_PRESENT)
|
||||
fb->obj[plane] = obj;
|
||||
#else
|
||||
to_nv_framebuffer(fb)->obj[plane] = obj;
|
||||
#endif
|
||||
}
|
||||
|
||||
struct drm_framebuffer *nv_drm_internal_framebuffer_create(
|
||||
struct drm_device *dev,
|
||||
struct drm_file *file,
|
||||
|
@ -71,9 +71,20 @@ static void __nv_drm_gem_nvkms_memory_free(struct nv_drm_gem_object *nv_gem)
|
||||
nv_drm_free(nv_nvkms_memory);
|
||||
}
|
||||
|
||||
static int __nv_drm_gem_nvkms_map(
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory);
|
||||
|
||||
static int __nv_drm_gem_nvkms_mmap(struct nv_drm_gem_object *nv_gem,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory =
|
||||
to_nv_nvkms_memory(nv_gem);
|
||||
|
||||
int ret = __nv_drm_gem_nvkms_map(nv_nvkms_memory);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return drm_gem_mmap_obj(&nv_gem->base,
|
||||
drm_vma_node_size(&nv_gem->base.vma_node) << PAGE_SHIFT, vma);
|
||||
}
|
||||
@ -146,11 +157,18 @@ static struct drm_gem_object *__nv_drm_gem_nvkms_prime_dup(
|
||||
static int __nv_drm_gem_nvkms_map(
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory)
|
||||
{
|
||||
int ret = 0;
|
||||
struct nv_drm_device *nv_dev = nv_nvkms_memory->base.nv_dev;
|
||||
struct NvKmsKapiMemory *pMemory = nv_nvkms_memory->base.pMemory;
|
||||
|
||||
mutex_lock(&nv_nvkms_memory->map_lock);
|
||||
|
||||
if (nv_nvkms_memory->physically_mapped) {
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (!nv_dev->hasVideoMemory) {
|
||||
return 0;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (!nvKms->mapMemory(nv_dev->pDevice,
|
||||
@ -161,7 +179,8 @@ static int __nv_drm_gem_nvkms_map(
|
||||
nv_dev,
|
||||
"Failed to map NvKmsKapiMemory 0x%p",
|
||||
pMemory);
|
||||
return -ENOMEM;
|
||||
ret = -ENOMEM;
|
||||
goto done;
|
||||
}
|
||||
|
||||
nv_nvkms_memory->pWriteCombinedIORemapAddress = ioremap_wc(
|
||||
@ -177,7 +196,9 @@ static int __nv_drm_gem_nvkms_map(
|
||||
|
||||
nv_nvkms_memory->physically_mapped = true;
|
||||
|
||||
return 0;
|
||||
done:
|
||||
mutex_unlock(&nv_nvkms_memory->map_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void *__nv_drm_gem_nvkms_prime_vmap(
|
||||
@ -186,14 +207,38 @@ static void *__nv_drm_gem_nvkms_prime_vmap(
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory =
|
||||
to_nv_nvkms_memory(nv_gem);
|
||||
|
||||
if (!nv_nvkms_memory->physically_mapped) {
|
||||
int ret = __nv_drm_gem_nvkms_map(nv_nvkms_memory);
|
||||
if (ret) {
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
int ret = __nv_drm_gem_nvkms_map(nv_nvkms_memory);
|
||||
if (ret) {
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
return nv_nvkms_memory->pWriteCombinedIORemapAddress;
|
||||
if (nv_nvkms_memory->physically_mapped) {
|
||||
return nv_nvkms_memory->pWriteCombinedIORemapAddress;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this buffer isn't physically mapped, it might be backed by struct
|
||||
* pages. Use vmap in that case.
|
||||
*/
|
||||
if (nv_nvkms_memory->pages_count > 0) {
|
||||
return nv_drm_vmap(nv_nvkms_memory->pages,
|
||||
nv_nvkms_memory->pages_count);
|
||||
}
|
||||
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
static void __nv_drm_gem_nvkms_prime_vunmap(
|
||||
struct nv_drm_gem_object *nv_gem,
|
||||
void *address)
|
||||
{
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory =
|
||||
to_nv_nvkms_memory(nv_gem);
|
||||
|
||||
if (!nv_nvkms_memory->physically_mapped &&
|
||||
nv_nvkms_memory->pages_count > 0) {
|
||||
nv_drm_vunmap(address);
|
||||
}
|
||||
}
|
||||
|
||||
static int __nv_drm_gem_map_nvkms_memory_offset(
|
||||
@ -201,17 +246,7 @@ static int __nv_drm_gem_map_nvkms_memory_offset(
|
||||
struct nv_drm_gem_object *nv_gem,
|
||||
uint64_t *offset)
|
||||
{
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory =
|
||||
to_nv_nvkms_memory(nv_gem);
|
||||
|
||||
if (!nv_nvkms_memory->physically_mapped) {
|
||||
int ret = __nv_drm_gem_nvkms_map(nv_nvkms_memory);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return nv_drm_gem_create_mmap_offset(&nv_nvkms_memory->base, offset);
|
||||
return nv_drm_gem_create_mmap_offset(nv_gem, offset);
|
||||
}
|
||||
|
||||
static struct sg_table *__nv_drm_gem_nvkms_memory_prime_get_sg_table(
|
||||
@ -223,7 +258,7 @@ static struct sg_table *__nv_drm_gem_nvkms_memory_prime_get_sg_table(
|
||||
struct sg_table *sg_table;
|
||||
|
||||
if (nv_nvkms_memory->pages_count == 0) {
|
||||
NV_DRM_DEV_LOG_ERR(
|
||||
NV_DRM_DEV_DEBUG_DRIVER(
|
||||
nv_dev,
|
||||
"Cannot create sg_table for NvKmsKapiMemory 0x%p",
|
||||
nv_gem->pMemory);
|
||||
@ -241,6 +276,7 @@ const struct nv_drm_gem_object_funcs nv_gem_nvkms_memory_ops = {
|
||||
.free = __nv_drm_gem_nvkms_memory_free,
|
||||
.prime_dup = __nv_drm_gem_nvkms_prime_dup,
|
||||
.prime_vmap = __nv_drm_gem_nvkms_prime_vmap,
|
||||
.prime_vunmap = __nv_drm_gem_nvkms_prime_vunmap,
|
||||
.mmap = __nv_drm_gem_nvkms_mmap,
|
||||
.handle_vma_fault = __nv_drm_gem_nvkms_handle_vma_fault,
|
||||
.create_mmap_offset = __nv_drm_gem_map_nvkms_memory_offset,
|
||||
@ -265,6 +301,7 @@ static int __nv_drm_nvkms_gem_obj_init(
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
mutex_init(&nv_nvkms_memory->map_lock);
|
||||
nv_nvkms_memory->pPhysicalAddress = NULL;
|
||||
nv_nvkms_memory->pWriteCombinedIORemapAddress = NULL;
|
||||
nv_nvkms_memory->physically_mapped = false;
|
||||
|
@ -32,8 +32,15 @@
struct nv_drm_gem_nvkms_memory {
struct nv_drm_gem_object base;

/*
* Lock to protect concurrent writes to physically_mapped, pPhysicalAddress,
* and pWriteCombinedIORemapAddress.
*
* __nv_drm_gem_nvkms_map(), the sole writer, is structured such that
* readers are not required to hold the lock.
*/
struct mutex map_lock;
bool physically_mapped;

void *pPhysicalAddress;
void *pWriteCombinedIORemapAddress;

@ -36,6 +36,10 @@
#include "linux/mm.h"
#include "nv-mm.h"

#if defined(NV_LINUX_PFN_T_H_PRESENT)
#include "linux/pfn_t.h"
#endif

#if defined(NV_BSD)
#include <vm/vm_pageout.h>
#endif

@ -103,6 +107,37 @@ static int __nv_drm_gem_user_memory_mmap(struct nv_drm_gem_object *nv_gem,
return 0;
}

#if defined(NV_LINUX) && !defined(NV_VMF_INSERT_MIXED_PRESENT)
static vm_fault_t __nv_vm_insert_mixed_helper(
struct vm_area_struct *vma,
unsigned long address,
unsigned long pfn)
{
int ret;

#if defined(NV_PFN_TO_PFN_T_PRESENT)
ret = vm_insert_mixed(vma, address, pfn_to_pfn_t(pfn));
#else
ret = vm_insert_mixed(vma, address, pfn);
#endif

switch (ret) {
case 0:
case -EBUSY:
/*
* EBUSY indicates that another thread already handled
* the faulted range.
*/
return VM_FAULT_NOPAGE;
case -ENOMEM:
return VM_FAULT_OOM;
default:
WARN_ONCE(1, "Unhandled error in %s: %d\n", __FUNCTION__, ret);
return VM_FAULT_SIGBUS;
}
}
#endif

static vm_fault_t __nv_drm_gem_user_memory_handle_vma_fault(
struct nv_drm_gem_object *nv_gem,
struct vm_area_struct *vma,

@ -112,36 +147,19 @@ static vm_fault_t __nv_drm_gem_user_memory_handle_vma_fault(
unsigned long address = nv_page_fault_va(vmf);
struct drm_gem_object *gem = vma->vm_private_data;
unsigned long page_offset;
vm_fault_t ret;
unsigned long pfn;

page_offset = vmf->pgoff - drm_vma_node_start(&gem->vma_node);

BUG_ON(page_offset >= nv_user_memory->pages_count);
pfn = page_to_pfn(nv_user_memory->pages[page_offset]);

#if !defined(NV_LINUX)
ret = vmf_insert_pfn(vma, address, page_to_pfn(nv_user_memory->pages[page_offset]));
#else /* !defined(NV_LINUX) */
ret = vm_insert_page(vma, address, nv_user_memory->pages[page_offset]);
switch (ret) {
case 0:
case -EBUSY:
/*
* EBUSY indicates that another thread already handled
* the faulted range.
*/
ret = VM_FAULT_NOPAGE;
break;
case -ENOMEM:
ret = VM_FAULT_OOM;
break;
default:
WARN_ONCE(1, "Unhandled error in %s: %d\n", __FUNCTION__, ret);
ret = VM_FAULT_SIGBUS;
break;
}
#endif /* !defined(NV_LINUX) */

return ret;
return vmf_insert_pfn(vma, address, pfn);
#elif defined(NV_VMF_INSERT_MIXED_PRESENT)
return vmf_insert_mixed(vma, address, pfn_to_pfn_t(pfn));
#else
return __nv_vm_insert_mixed_helper(vma, address, pfn);
#endif
}

static int __nv_drm_gem_user_create_mmap_offset(
@ -144,6 +144,12 @@ void nv_drm_gem_object_init(struct nv_drm_device *nv_dev,
#endif

drm_gem_private_object_init(dev, &nv_gem->base, size);

/* Create mmap offset early for drm_gem_prime_mmap(), if possible. */
if (nv_gem->ops->create_mmap_offset) {
uint64_t offset;
nv_gem->ops->create_mmap_offset(nv_dev, nv_gem, &offset);
}
}

struct drm_gem_object *nv_drm_gem_prime_import(struct drm_device *dev,

@ -232,6 +238,7 @@ int nv_drm_gem_map_offset_ioctl(struct drm_device *dev,
return -EINVAL;
}

/* mmap offset creation is idempotent, fetch it by creating it again. */
if (nv_gem->ops->create_mmap_offset) {
ret = nv_gem->ops->create_mmap_offset(nv_dev, nv_gem, &params->offset);
} else {

@ -40,8 +40,13 @@
#include <drm/drm_blend.h>
#endif

#if defined(NV_DRM_ROTATION_AVAILABLE)
/* For DRM_MODE_ROTATE_* and DRM_MODE_REFLECT_* */
#if defined(NV_DRM_ROTATION_AVAILABLE) || \
defined(NV_DRM_COLOR_CTM_3X4_PRESENT) || \
defined(NV_DRM_COLOR_LUT_PRESENT)
/*
* For DRM_MODE_ROTATE_*, DRM_MODE_REFLECT_*, struct drm_color_ctm_3x4, and
* struct drm_color_lut.
*/
#include <uapi/drm/drm_mode.h>
#endif

@ -358,6 +363,24 @@ static inline void nv_drm_connector_put(struct drm_connector *connector)
#endif
}

static inline void nv_drm_property_blob_put(struct drm_property_blob *blob)
{
#if defined(NV_DRM_PROPERTY_BLOB_PUT_PRESENT)
drm_property_blob_put(blob);
#else
drm_property_unreference_blob(blob);
#endif
}

static inline void nv_drm_property_blob_get(struct drm_property_blob *blob)
{
#if defined(NV_DRM_PROPERTY_BLOB_PUT_PRESENT)
drm_property_blob_get(blob);
#else
drm_property_reference_blob(blob);
#endif
}

static inline struct drm_crtc *
nv_drm_crtc_find(struct drm_device *dev, struct drm_file *filep, uint32_t id)
{

@ -625,6 +648,31 @@ static inline int nv_drm_format_num_planes(uint32_t format)
#define DRM_UNLOCKED 0
#endif

/*
* struct drm_color_ctm_3x4 was added by commit 6872a189be50 ("drm/amd/display:
* Add 3x4 CTM support for plane CTM") in v6.8. For backwards compatibility,
* define it when not present.
*/
#if !defined(NV_DRM_COLOR_CTM_3X4_PRESENT)
struct drm_color_ctm_3x4 {
__u64 matrix[12];
};
#endif

/*
* struct drm_color_lut was added by commit 5488dc16fde7 ("drm: introduce pipe
* color correction properties") in v4.6. For backwards compatibility, define it
* when not present.
*/
#if !defined(NV_DRM_COLOR_LUT_PRESENT)
struct drm_color_lut {
__u16 red;
__u16 green;
__u16 blue;
__u16 reserved;
};
#endif

/*
* drm_vma_offset_exact_lookup_locked() were added
* by kernel commit 2225cfe46bcc which was Signed-off-by:
@ -34,7 +34,7 @@ MODULE_PARM_DESC(
"Enable atomic kernel modesetting (1 = enable, 0 = disable (default))");
module_param_named(modeset, nv_drm_modeset_module_param, bool, 0400);

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
#if defined(NV_DRM_FBDEV_AVAILABLE)
MODULE_PARM_DESC(
fbdev,
"Create a framebuffer device (1 = enable, 0 = disable (default)) (EXPERIMENTAL)");

@ -414,6 +414,31 @@ nv_drm_atomic_apply_modeset_config(struct drm_device *dev,
return -EINVAL;
}

#if defined(NV_DRM_FRAMEBUFFER_OBJ_PRESENT)
if (commit) {
/*
* This function does what is necessary to prepare the framebuffers
* attached to each new plane in the state for scan out, mostly by
* calling back into driver callbacks the NVIDIA driver does not
* provide. The end result is that all it does on the NVIDIA driver
* is populate the plane state's dma fence pointers with any implicit
* sync fences attached to the GEM objects associated with those planes
* in the new state, preferring explicit sync fences when appropriate.
* This must be done prior to converting the per-plane fences to
* semaphore waits below.
*
* Note this only works when the drm_framebuffer:obj[] field is present
* and populated, so skip calling this function on kernels where that
* field is not present.
*/
ret = drm_atomic_helper_prepare_planes(dev, state);

if (ret) {
return ret;
}
}
#endif /* defined(NV_DRM_FRAMEBUFFER_OBJ_PRESENT) */

memset(requested_config, 0, sizeof(*requested_config));

/* Loop over affected crtcs and construct NvKmsKapiRequestedModeSetConfig */

@ -59,14 +59,20 @@ typedef struct nv_timer nv_drm_timer;
#endif

#if defined(NV_DRM_FBDEV_GENERIC_SETUP_PRESENT) && defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT)
#define NV_DRM_FBDEV_AVAILABLE
#define NV_DRM_FBDEV_GENERIC_AVAILABLE
#endif

#if defined(NV_DRM_FBDEV_TTM_SETUP_PRESENT) && defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT)
#define NV_DRM_FBDEV_AVAILABLE
#define NV_DRM_FBDEV_TTM_AVAILABLE
#endif

struct page;

/* Set to true when the atomic modeset feature is enabled. */
extern bool nv_drm_modeset_module_param;
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
#if defined(NV_DRM_FBDEV_AVAILABLE)
/* Set to true when the nvidia-drm driver should install a framebuffer device */
extern bool nv_drm_fbdev_module_param;
#endif

@ -163,6 +163,24 @@ struct nv_drm_device {
struct drm_property *nv_hdr_output_metadata_property;
#endif

struct drm_property *nv_plane_lms_ctm_property;
struct drm_property *nv_plane_lms_to_itp_ctm_property;
struct drm_property *nv_plane_itp_to_lms_ctm_property;
struct drm_property *nv_plane_blend_ctm_property;

struct drm_property *nv_plane_degamma_tf_property;
struct drm_property *nv_plane_degamma_lut_property;
struct drm_property *nv_plane_degamma_lut_size_property;
struct drm_property *nv_plane_degamma_multiplier_property;

struct drm_property *nv_plane_tmo_lut_property;
struct drm_property *nv_plane_tmo_lut_size_property;

struct drm_property *nv_crtc_regamma_tf_property;
struct drm_property *nv_crtc_regamma_lut_property;
struct drm_property *nv_crtc_regamma_lut_size_property;
struct drm_property *nv_crtc_regamma_divisor_property;

struct nv_drm_device *next;
};

@ -67,10 +67,14 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += fence_set_error
NV_CONFTEST_FUNCTION_COMPILE_TESTS += sync_file_get_fence
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_fbdev_generic_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_fbdev_ttm_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_connector_attach_hdr_output_metadata_property
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_helper_crtc_enable_color_mgmt
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_crtc_enable_color_mgmt
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_atomic_helper_legacy_gamma_set
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_mixed
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pfn_to_pfn_t
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_gem_prime_mmap

NV_CONFTEST_TYPE_COMPILE_TESTS += drm_bus_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_bus_has_bus_type

@ -130,3 +134,9 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffe
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_syncobj_features_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_framebuffer_obj_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_color_ctm_3x4_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_color_lut
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_property_blob_put
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_gem_prime_mmap
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_output_poll_changed

@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if ((i == (attempts - 1)))
if (i == (attempts - 1))
break;

// Get the NUMA node where the first page of the stack is resident. If
@ -86,6 +86,9 @@ module_param_named(vblank_sem_control, vblank_sem_control, bool, 0400);
static bool opportunistic_display_sync = true;
module_param_named(opportunistic_display_sync, opportunistic_display_sync, bool, 0400);

static enum NvKmsDebugForceColorSpace debug_force_color_space = NVKMS_DEBUG_FORCE_COLOR_SPACE_NONE;
module_param_named(debug_force_color_space, debug_force_color_space, uint, 0400);

/* These parameters are used for fault injection tests. Normally the defaults
* should be used. */
MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");

@ -139,6 +142,14 @@ NvBool nvkms_opportunistic_display_sync(void)
return opportunistic_display_sync;
}

enum NvKmsDebugForceColorSpace nvkms_debug_force_color_space(void)
{
if (debug_force_color_space >= NVKMS_DEBUG_FORCE_COLOR_SPACE_MAX) {
return NVKMS_DEBUG_FORCE_COLOR_SPACE_NONE;
}
return debug_force_color_space;
}

NvBool nvkms_kernel_supports_syncpts(void)
{
/*

@ -1084,7 +1095,7 @@ static void nvkms_kapi_event_kthread_q_callback(void *arg)
nvKmsKapiHandleEventQueueChange(device);
}

struct nvkms_per_open *nvkms_open_common(enum NvKmsClientType type,
static struct nvkms_per_open *nvkms_open_common(enum NvKmsClientType type,
struct NvKmsKapiDevice *device,
int *status)
{

@ -1136,7 +1147,7 @@ failed:
return NULL;
}

void nvkms_close_pm_locked(struct nvkms_per_open *popen)
static void nvkms_close_pm_locked(struct nvkms_per_open *popen)
{
/*
* Don't use down_interruptible(): we need to free resources

@ -1199,7 +1210,7 @@ static void nvkms_close_popen(struct nvkms_per_open *popen)
}
}

int nvkms_ioctl_common
static int nvkms_ioctl_common
(
struct nvkms_per_open *popen,
NvU32 cmd, NvU64 address, const size_t size

@ -1558,6 +1569,48 @@ NvBool nvKmsKapiGetFunctionsTable
}
EXPORT_SYMBOL(nvKmsKapiGetFunctionsTable);

NvU32 nvKmsKapiF16ToF32(NvU16 a)
{
return nvKmsKapiF16ToF32Internal(a);
}
EXPORT_SYMBOL(nvKmsKapiF16ToF32);

NvU16 nvKmsKapiF32ToF16(NvU32 a)
{
return nvKmsKapiF32ToF16Internal(a);
}
EXPORT_SYMBOL(nvKmsKapiF32ToF16);

NvU32 nvKmsKapiF32Mul(NvU32 a, NvU32 b)
{
return nvKmsKapiF32MulInternal(a, b);
}
EXPORT_SYMBOL(nvKmsKapiF32Mul);

NvU32 nvKmsKapiF32Div(NvU32 a, NvU32 b)
{
return nvKmsKapiF32DivInternal(a, b);
}
EXPORT_SYMBOL(nvKmsKapiF32Div);

NvU32 nvKmsKapiF32Add(NvU32 a, NvU32 b)
{
return nvKmsKapiF32AddInternal(a, b);
}
EXPORT_SYMBOL(nvKmsKapiF32Add);

NvU32 nvKmsKapiF32ToUI32RMinMag(NvU32 a, NvBool exact)
{
return nvKmsKapiF32ToUI32RMinMagInternal(a, exact);
}
EXPORT_SYMBOL(nvKmsKapiF32ToUI32RMinMag);

NvU32 nvKmsKapiUI32ToF32(NvU32 a)
{
return nvKmsKapiUI32ToF32Internal(a);
}
EXPORT_SYMBOL(nvKmsKapiUI32ToF32);

/*************************************************************************
* File operation callback functions.
*************************************************************************/
@ -67,6 +67,14 @@ enum NvKmsSyncPtOp {
NVKMS_SYNCPT_OP_READ_MINVAL,
};

enum NvKmsDebugForceColorSpace {
NVKMS_DEBUG_FORCE_COLOR_SPACE_NONE,
NVKMS_DEBUG_FORCE_COLOR_SPACE_RGB,
NVKMS_DEBUG_FORCE_COLOR_SPACE_YUV444,
NVKMS_DEBUG_FORCE_COLOR_SPACE_YUV422,
NVKMS_DEBUG_FORCE_COLOR_SPACE_MAX,
};

typedef struct {

struct {

@ -102,6 +110,7 @@ NvBool nvkms_disable_vrr_memclk_switch(void);
NvBool nvkms_hdmi_deepcolor(void);
NvBool nvkms_vblank_sem_control(void);
NvBool nvkms_opportunistic_display_sync(void);
enum NvKmsDebugForceColorSpace nvkms_debug_force_color_space(void);

void nvkms_call_rm (void *ops);
void* nvkms_alloc (size_t size,

@ -110,4 +110,18 @@ NvBool nvKmsSetBacklight(NvU32 display_id, void *drv_priv, NvU32 brightness);

NvBool nvKmsOpenDevHasSubOwnerPermissionOrBetter(const struct NvKmsPerOpenDev *pOpenDev);

NvU32 nvKmsKapiF16ToF32Internal(NvU16 a);

NvU16 nvKmsKapiF32ToF16Internal(NvU32 a);

NvU32 nvKmsKapiF32MulInternal(NvU32 a, NvU32 b);

NvU32 nvKmsKapiF32DivInternal(NvU32 a, NvU32 b);

NvU32 nvKmsKapiF32AddInternal(NvU32 a, NvU32 b);

NvU32 nvKmsKapiF32ToUI32RMinMagInternal(NvU32 a, NvBool exact);

NvU32 nvKmsKapiUI32ToF32Internal(NvU32 a);

#endif /* __NV_KMS_H__ */
@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2023 NVIDIA Corporation
Copyright (c) 2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2023 NVIDIA Corporation
Copyright (c) 2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2023 NVIDIA Corporation
Copyright (c) 2013-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -462,6 +462,7 @@
#define NV_PFAULT_CLIENT_HUB_SCC3 0x00000044 /* */
#define NV_PFAULT_CLIENT_HUB_SCC_NB3 0x00000045 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD1 0x00000046 /* */
#define NV_PFAULT_CLIENT_HUB_PTP_X8 0x00000046 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD2 0x00000047 /* */
#define NV_PFAULT_CLIENT_HUB_RASTERTWOD3 0x00000048 /* */
#define NV_PFAULT_CLIENT_HUB_GSPLITE1 0x00000049 /* */

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -81,7 +81,7 @@
#define NUM_Q_ITEMS_IN_MULTITHREAD_TEST (NUM_TEST_Q_ITEMS * NUM_TEST_KTHREADS)

// This exists in order to have a function to place a breakpoint on:
void on_nvq_assert(void)
static void on_nvq_assert(void)
{
(void)NULL;
}

@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if ((i == (attempts - 1)))
if (i == (attempts - 1))
break;

// Get the NUMA node where the first page of the stack is resident. If
@ -1,7 +1,6 @@
NVIDIA_UVM_SOURCES ?=
NVIDIA_UVM_SOURCES_CXX ?=

NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ats_sva.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_conf_computing.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_sec2_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_maxwell_sec2.c

@ -12,7 +11,6 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_blackwell_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_blackwell_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_common.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_linux.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_debug_optimized.c
NVIDIA_UVM_SOURCES += nvidia-uvm/nvstatus.c
NVIDIA_UVM_SOURCES += nvidia-uvm/nvCpuUuid.c
NVIDIA_UVM_SOURCES += nvidia-uvm/nv-kthread-q.c

@ -36,6 +34,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rb_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_allocator.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_va_range.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_va_range_device_p2p.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_va_policy.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_va_block.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_group.c

@ -101,6 +100,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_perf_prefetch.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ats.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ats_ibm.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ats_faults.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ats_sva.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_test_rng.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_tree_test.c
@ -13,19 +13,6 @@ NVIDIA_UVM_OBJECTS =
include $(src)/nvidia-uvm/nvidia-uvm-sources.Kbuild
NVIDIA_UVM_OBJECTS += $(patsubst %.c,%.o,$(NVIDIA_UVM_SOURCES))

# Some linux kernel functions rely on being built with optimizations on and
# to work around this we put wrappers for them in a separate file that's built
# with optimizations on in debug builds and skipped in other builds.
# Notably gcc 4.4 supports per function optimization attributes that would be
# easier to use, but is too recent to rely on for now.
NVIDIA_UVM_DEBUG_OPTIMIZED_SOURCE := nvidia-uvm/uvm_debug_optimized.c
NVIDIA_UVM_DEBUG_OPTIMIZED_OBJECT := $(patsubst %.c,%.o,$(NVIDIA_UVM_DEBUG_OPTIMIZED_SOURCE))

ifneq ($(UVM_BUILD_TYPE),debug)
# Only build the wrappers on debug builds
NVIDIA_UVM_OBJECTS := $(filter-out $(NVIDIA_UVM_DEBUG_OPTIMIZED_OBJECT), $(NVIDIA_UVM_OBJECTS))
endif

obj-m += nvidia-uvm.o
nvidia-uvm-y := $(NVIDIA_UVM_OBJECTS)

@ -36,15 +23,14 @@ NVIDIA_UVM_KO = nvidia-uvm/nvidia-uvm.ko
#

ifeq ($(UVM_BUILD_TYPE),debug)
NVIDIA_UVM_CFLAGS += -DDEBUG -O1 -g
else
ifeq ($(UVM_BUILD_TYPE),develop)
# -DDEBUG is required, in order to allow pr_devel() print statements to
# work:
NVIDIA_UVM_CFLAGS += -DDEBUG
NVIDIA_UVM_CFLAGS += -DNVIDIA_UVM_DEVELOP
endif
NVIDIA_UVM_CFLAGS += -O2
NVIDIA_UVM_CFLAGS += -DDEBUG -g
endif

ifeq ($(UVM_BUILD_TYPE),develop)
# -DDEBUG is required, in order to allow pr_devel() print statements to
# work:
NVIDIA_UVM_CFLAGS += -DDEBUG
NVIDIA_UVM_CFLAGS += -DNVIDIA_UVM_DEVELOP
endif

NVIDIA_UVM_CFLAGS += -DNVIDIA_UVM_ENABLED

@ -56,30 +42,17 @@ NVIDIA_UVM_CFLAGS += -I$(src)/nvidia-uvm

$(call ASSIGN_PER_OBJ_CFLAGS, $(NVIDIA_UVM_OBJECTS), $(NVIDIA_UVM_CFLAGS))

ifeq ($(UVM_BUILD_TYPE),debug)
# Force optimizations on for the wrappers
$(call ASSIGN_PER_OBJ_CFLAGS, $(NVIDIA_UVM_DEBUG_OPTIMIZED_OBJECT), $(NVIDIA_UVM_CFLAGS) -O2)
endif

#
# Register the conftests needed by nvidia-uvm.ko
#

NV_OBJECTS_DEPEND_ON_CONFTEST += $(NVIDIA_UVM_OBJECTS)

NV_CONFTEST_FUNCTION_COMPILE_TESTS += wait_on_bit_lock_argument_count
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pde_data
NV_CONFTEST_FUNCTION_COMPILE_TESTS += radix_tree_empty
NV_CONFTEST_FUNCTION_COMPILE_TESTS += radix_tree_replace_slot
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pnv_npu2_init_context
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_pfn
NV_CONFTEST_FUNCTION_COMPILE_TESTS += cpumask_of_node
NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += timer_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pci_bus_address
NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero

@ -88,26 +61,13 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vm_fault_to_errno
NV_CONFTEST_FUNCTION_COMPILE_TESTS += find_next_bit_wrap
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_is_dma_domain
NV_CONFTEST_FUNCTION_COMPILE_TESTS += folio_test_swapcache

NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
NV_CONFTEST_TYPE_COMPILE_TESTS += get_user_pages_remote
NV_CONFTEST_TYPE_COMPILE_TESTS += get_user_pages
NV_CONFTEST_TYPE_COMPILE_TESTS += pin_user_pages_remote
NV_CONFTEST_TYPE_COMPILE_TESTS += pin_user_pages
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_has_address
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_ops_fault_removed_vma_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
NV_CONFTEST_TYPE_COMPILE_TESTS += migrate_vma_added_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += migrate_device_range
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_mm_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_pt_regs_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_unified_nodes

@ -115,6 +75,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_home_node
NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_TYPE_COMPILE_TESTS += fault_flag_remote_present
NV_CONFTEST_TYPE_COMPILE_TESTS += struct_page_has_zone_device_data

NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup
@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -127,9 +127,9 @@ static NV_STATUS uvm_api_mm_initialize(UVM_MM_INITIALIZE_PARAMS *params, struct
goto err;
}

old_fd_type = nv_atomic_long_cmpxchg((atomic_long_t *)&filp->private_data,
UVM_FD_UNINITIALIZED,
UVM_FD_INITIALIZING);
old_fd_type = atomic_long_cmpxchg((atomic_long_t *)&filp->private_data,
UVM_FD_UNINITIALIZED,
UVM_FD_INITIALIZING);
old_fd_type &= UVM_FD_TYPE_MASK;
if (old_fd_type != UVM_FD_UNINITIALIZED) {
status = NV_ERR_IN_USE;

@ -222,10 +222,6 @@ static int uvm_open(struct inode *inode, struct file *filp)
// assigning f_mapping.
mapping->a_ops = inode->i_mapping->a_ops;

#if defined(NV_ADDRESS_SPACE_HAS_BACKING_DEV_INFO)
mapping->backing_dev_info = inode->i_mapping->backing_dev_info;
#endif

filp->private_data = NULL;
filp->f_mapping = mapping;

@ -325,21 +321,21 @@ static int uvm_release_entry(struct inode *inode, struct file *filp)

static void uvm_destroy_vma_managed(struct vm_area_struct *vma, bool make_zombie)
{
uvm_va_range_t *va_range, *va_range_next;
uvm_va_range_managed_t *managed_range, *managed_range_next;
NvU64 size = 0;

uvm_assert_rwsem_locked_write(&uvm_va_space_get(vma->vm_file)->lock);
uvm_for_each_va_range_in_vma_safe(va_range, va_range_next, vma) {
uvm_for_each_va_range_managed_in_vma_safe(managed_range, managed_range_next, vma) {
// On exit_mmap (process teardown), current->mm is cleared so
// uvm_va_range_vma_current would return NULL.
UVM_ASSERT(uvm_va_range_vma(va_range) == vma);
UVM_ASSERT(va_range->node.start >= vma->vm_start);
UVM_ASSERT(va_range->node.end < vma->vm_end);
size += uvm_va_range_size(va_range);
UVM_ASSERT(uvm_va_range_vma(managed_range) == vma);
UVM_ASSERT(managed_range->va_range.node.start >= vma->vm_start);
UVM_ASSERT(managed_range->va_range.node.end < vma->vm_end);
size += uvm_va_range_size(&managed_range->va_range);
if (make_zombie)
uvm_va_range_zombify(va_range);
uvm_va_range_zombify(managed_range);
else
uvm_va_range_destroy(va_range, NULL);
uvm_va_range_destroy(&managed_range->va_range, NULL);
}

if (vma->vm_private_data) {

@ -351,18 +347,17 @@ static void uvm_destroy_vma_managed(struct vm_area_struct *vma, bool make_zombie

static void uvm_destroy_vma_semaphore_pool(struct vm_area_struct *vma)
{
uvm_va_range_semaphore_pool_t *semaphore_pool_range;
uvm_va_space_t *va_space;
uvm_va_range_t *va_range;

va_space = uvm_va_space_get(vma->vm_file);
uvm_assert_rwsem_locked(&va_space->lock);
va_range = uvm_va_range_find(va_space, vma->vm_start);
UVM_ASSERT(va_range &&
va_range->node.start == vma->vm_start &&
va_range->node.end + 1 == vma->vm_end &&
va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);
semaphore_pool_range = uvm_va_range_semaphore_pool_find(va_space, vma->vm_start);
UVM_ASSERT(semaphore_pool_range &&
semaphore_pool_range->va_range.node.start == vma->vm_start &&
semaphore_pool_range->va_range.node.end + 1 == vma->vm_end);

uvm_mem_unmap_cpu_user(va_range->semaphore_pool.mem);
uvm_mem_unmap_cpu_user(semaphore_pool_range->mem);
}

// If a fault handler is not set, paths like handle_pte_fault in older kernels

@ -478,7 +473,7 @@ static void uvm_vm_open_failure(struct vm_area_struct *original,
static void uvm_vm_open_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_va_range_t *va_range;
uvm_va_range_managed_t *managed_range;
struct vm_area_struct *original;
NV_STATUS status;
NvU64 new_end;

@ -534,13 +529,13 @@ static void uvm_vm_open_managed(struct vm_area_struct *vma)
goto out;
}

// There can be multiple va_ranges under the vma already. Check if one spans
// There can be multiple ranges under the vma already. Check if one spans
// the new split boundary. If so, split it.
va_range = uvm_va_range_find(va_space, new_end);
UVM_ASSERT(va_range);
UVM_ASSERT(uvm_va_range_vma_current(va_range) == original);
if (va_range->node.end != new_end) {
status = uvm_va_range_split(va_range, new_end, NULL);
managed_range = uvm_va_range_managed_find(va_space, new_end);
UVM_ASSERT(managed_range);
UVM_ASSERT(uvm_va_range_vma_current(managed_range) == original);
if (managed_range->va_range.node.end != new_end) {
status = uvm_va_range_split(managed_range, new_end, NULL);
if (status != NV_OK) {
UVM_DBG_PRINT("Failed to split VA range, destroying both: %s. "
"original vma [0x%lx, 0x%lx) new vma [0x%lx, 0x%lx)\n",

@ -552,10 +547,10 @@ static void uvm_vm_open_managed(struct vm_area_struct *vma)
}
}

// Point va_ranges to the new vma
uvm_for_each_va_range_in_vma(va_range, vma) {
UVM_ASSERT(uvm_va_range_vma_current(va_range) == original);
va_range->managed.vma_wrapper = vma->vm_private_data;
// Point managed_ranges to the new vma
uvm_for_each_va_range_managed_in_vma(managed_range, vma) {
UVM_ASSERT(uvm_va_range_vma_current(managed_range) == original);
managed_range->vma_wrapper = vma->vm_private_data;
}

out:

@ -657,12 +652,12 @@ static struct vm_operations_struct uvm_vm_ops_managed =
};
// vm operations on semaphore pool allocations only control CPU mappings. Unmapping GPUs,
// freeing the allocation, and destroying the va_range are handled by UVM_FREE.
// freeing the allocation, and destroying the range are handled by UVM_FREE.
static void uvm_vm_open_semaphore_pool(struct vm_area_struct *vma)
{
struct vm_area_struct *origin_vma = (struct vm_area_struct *)vma->vm_private_data;
uvm_va_space_t *va_space = uvm_va_space_get(origin_vma->vm_file);
uvm_va_range_t *va_range;
uvm_va_range_semaphore_pool_t *semaphore_pool_range;
bool is_fork = (vma->vm_mm != origin_vma->vm_mm);
NV_STATUS status;

@ -670,14 +665,17 @@ static void uvm_vm_open_semaphore_pool(struct vm_area_struct *vma)

uvm_va_space_down_write(va_space);

va_range = uvm_va_range_find(va_space, origin_vma->vm_start);
UVM_ASSERT(va_range);
UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL &&
va_range->node.start == origin_vma->vm_start &&
va_range->node.end + 1 == origin_vma->vm_end,
semaphore_pool_range = uvm_va_range_semaphore_pool_find(va_space, origin_vma->vm_start);
UVM_ASSERT(semaphore_pool_range);
UVM_ASSERT_MSG(semaphore_pool_range &&
semaphore_pool_range->va_range.node.start == origin_vma->vm_start &&
semaphore_pool_range->va_range.node.end + 1 == origin_vma->vm_end,
"origin vma [0x%llx, 0x%llx); va_range [0x%llx, 0x%llx) type %d\n",
(NvU64)origin_vma->vm_start, (NvU64)origin_vma->vm_end, va_range->node.start,
va_range->node.end + 1, va_range->type);
(NvU64)origin_vma->vm_start,
(NvU64)origin_vma->vm_end,
semaphore_pool_range->va_range.node.start,
semaphore_pool_range->va_range.node.end + 1,
semaphore_pool_range->va_range.type);

// Semaphore pool vmas do not have vma wrappers, but some functions will
// assume vm_private_data is a wrapper.

@ -689,9 +687,9 @@ static void uvm_vm_open_semaphore_pool(struct vm_area_struct *vma)

// uvm_disable_vma unmaps in the parent as well; clear the uvm_mem CPU
// user mapping metadata and then remap.
uvm_mem_unmap_cpu_user(va_range->semaphore_pool.mem);
uvm_mem_unmap_cpu_user(semaphore_pool_range->mem);

status = uvm_mem_map_cpu_user(va_range->semaphore_pool.mem, va_range->va_space, origin_vma);
status = uvm_mem_map_cpu_user(semaphore_pool_range->mem, semaphore_pool_range->va_range.va_space, origin_vma);
if (status != NV_OK) {
UVM_DBG_PRINT("Failed to remap semaphore pool to CPU for parent after fork; status = %d (%s)",
status, nvstatusToString(status));

@ -702,7 +700,7 @@ static void uvm_vm_open_semaphore_pool(struct vm_area_struct *vma)
origin_vma->vm_private_data = NULL;
origin_vma->vm_ops = &uvm_vm_ops_disabled;
vma->vm_ops = &uvm_vm_ops_disabled;
uvm_mem_unmap_cpu_user(va_range->semaphore_pool.mem);
uvm_mem_unmap_cpu_user(semaphore_pool_range->mem);
}

uvm_va_space_up_write(va_space);

@ -751,10 +749,81 @@ static struct vm_operations_struct uvm_vm_ops_semaphore_pool =
#endif
};
static void uvm_vm_open_device_p2p(struct vm_area_struct *vma)
{
struct vm_area_struct *origin_vma = (struct vm_area_struct *)vma->vm_private_data;
uvm_va_space_t *va_space = uvm_va_space_get(origin_vma->vm_file);
uvm_va_range_t *va_range;
bool is_fork = (vma->vm_mm != origin_vma->vm_mm);

uvm_record_lock_mmap_lock_write(current->mm);

uvm_va_space_down_write(va_space);

va_range = uvm_va_range_find(va_space, origin_vma->vm_start);
UVM_ASSERT(va_range);
UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_DEVICE_P2P &&
va_range->node.start == origin_vma->vm_start &&
va_range->node.end + 1 == origin_vma->vm_end,
"origin vma [0x%llx, 0x%llx); va_range [0x%llx, 0x%llx) type %d\n",
(NvU64)origin_vma->vm_start, (NvU64)origin_vma->vm_end, va_range->node.start,
va_range->node.end + 1, va_range->type);

// Device P2P vmas do not have vma wrappers, but some functions will
// assume vm_private_data is a wrapper.
vma->vm_private_data = NULL;

if (is_fork) {
// If we forked, leave the parent vma alone.
uvm_disable_vma(vma);

// uvm_disable_vma unmaps in the parent as well so remap the parent
uvm_va_range_device_p2p_map_cpu(va_range->va_space, origin_vma, uvm_va_range_to_device_p2p(va_range));
}
else {
// mremap will free the backing pages via unmap so we can't support it.
origin_vma->vm_private_data = NULL;
origin_vma->vm_ops = &uvm_vm_ops_disabled;
vma->vm_ops = &uvm_vm_ops_disabled;
unmap_mapping_range(va_space->mapping, va_range->node.start, va_range->node.end - va_range->node.start + 1, 1);
}

uvm_va_space_up_write(va_space);

uvm_record_unlock_mmap_lock_write(current->mm);
}

static void uvm_vm_open_device_p2p_entry(struct vm_area_struct *vma)
{
UVM_ENTRY_VOID(uvm_vm_open_device_p2p(vma));
}

// Device P2P pages are only mapped on the CPU. Pages are allocated externally
// to UVM but destroying the range must unpin the RM object.
static void uvm_vm_close_device_p2p(struct vm_area_struct *vma)
{
}

static void uvm_vm_close_device_p2p_entry(struct vm_area_struct *vma)
{
UVM_ENTRY_VOID(uvm_vm_close_device_p2p(vma));
}

static struct vm_operations_struct uvm_vm_ops_device_p2p =
{
.open = uvm_vm_open_device_p2p_entry,
.close = uvm_vm_close_device_p2p_entry,

#if defined(NV_VM_OPS_FAULT_REMOVED_VMA_ARG)
.fault = uvm_vm_fault_sigbus_wrapper_entry,
#else
.fault = uvm_vm_fault_sigbus_entry,
#endif
};

static int uvm_mmap(struct file *filp, struct vm_area_struct *vma)
{
uvm_va_space_t *va_space;
uvm_va_range_t *va_range;
NV_STATUS status = uvm_global_get_status();
int ret = 0;
bool vma_wrapper_allocated = false;

@ -845,18 +914,28 @@ static int uvm_mmap(struct file *filp, struct vm_area_struct *vma)
status = uvm_va_range_create_mmap(va_space, current->mm, vma->vm_private_data, NULL);

if (status == NV_ERR_UVM_ADDRESS_IN_USE) {
uvm_va_range_semaphore_pool_t *semaphore_pool_range;
uvm_va_range_device_p2p_t *device_p2p_range;
// If the mmap is for a semaphore pool, the VA range will have been
// allocated by a previous ioctl, and the mmap just creates the CPU
// mapping.
va_range = uvm_va_range_find(va_space, vma->vm_start);
if (va_range && va_range->node.start == vma->vm_start &&
va_range->node.end + 1 == vma->vm_end &&
va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
semaphore_pool_range = uvm_va_range_semaphore_pool_find(va_space, vma->vm_start);
device_p2p_range = uvm_va_range_device_p2p_find(va_space, vma->vm_start);
if (semaphore_pool_range && semaphore_pool_range->va_range.node.start == vma->vm_start &&
semaphore_pool_range->va_range.node.end + 1 == vma->vm_end) {
uvm_vma_wrapper_destroy(vma->vm_private_data);
vma_wrapper_allocated = false;
vma->vm_private_data = vma;
vma->vm_ops = &uvm_vm_ops_semaphore_pool;
status = uvm_mem_map_cpu_user(va_range->semaphore_pool.mem, va_range->va_space, vma);
status = uvm_mem_map_cpu_user(semaphore_pool_range->mem, semaphore_pool_range->va_range.va_space, vma);
}
else if (device_p2p_range && device_p2p_range->va_range.node.start == vma->vm_start &&
device_p2p_range->va_range.node.end + 1 == vma->vm_end) {
uvm_vma_wrapper_destroy(vma->vm_private_data);
vma_wrapper_allocated = false;
vma->vm_private_data = vma;
vma->vm_ops = &uvm_vm_ops_device_p2p;
status = uvm_va_range_device_p2p_map_cpu(va_space, vma, device_p2p_range);
}
}

@ -914,8 +993,9 @@ static NV_STATUS uvm_api_initialize(UVM_INITIALIZE_PARAMS *params, struct file *
// attempt to be made. This is safe because other threads will have only had
// a chance to observe UVM_FD_INITIALIZING and not UVM_FD_VA_SPACE in this
// case.
old_fd_type = nv_atomic_long_cmpxchg((atomic_long_t *)&filp->private_data,
UVM_FD_UNINITIALIZED, UVM_FD_INITIALIZING);
old_fd_type = atomic_long_cmpxchg((atomic_long_t *)&filp->private_data,
UVM_FD_UNINITIALIZED,
UVM_FD_INITIALIZING);
old_fd_type &= UVM_FD_TYPE_MASK;
if (old_fd_type == UVM_FD_UNINITIALIZED) {
status = uvm_va_space_create(filp->f_mapping, &va_space, params->flags);

@ -1001,6 +1081,9 @@ static long uvm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_CLEAN_UP_ZOMBIE_RESOURCES, uvm_api_clean_up_zombie_resources);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_POPULATE_PAGEABLE, uvm_api_populate_pageable);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_VALIDATE_VA_RANGE, uvm_api_validate_va_range);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_V2,uvm_api_tools_get_processor_uuid_table_v2);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_ALLOC_DEVICE_P2P, uvm_api_alloc_device_p2p);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_CLEAR_ALL_ACCESS_COUNTERS, uvm_api_clear_all_access_counters);
}

// Try the test ioctls if none of the above matched
@ -45,20 +45,20 @@
// #endif
// 3) Do the same thing for the function definition, and for any structs that
// are taken as arguments to these functions.
// 4) Let this change propagate over to cuda_a and dev_a, so that the CUDA and
// nvidia-cfg libraries can start using the new API by bumping up the API
// 4) Let this change propagate over to cuda_a and bugfix_main, so that the CUDA
// and nvidia-cfg libraries can start using the new API by bumping up the API
// version number it's using.
// Places where UVM_API_REVISION is defined are:
// drivers/gpgpu/cuda/cuda.nvmk (cuda_a)
// drivers/setup/linux/nvidia-cfg/makefile.nvmk (dev_a)
// 5) Once the dev_a and cuda_a changes have made it back into chips_a,
// drivers/setup/linux/nvidia-cfg/makefile.nvmk (bugfix_main)
// 5) Once the bugfix_main and cuda_a changes have made it back into chips_a,
// remove the old API declaration, definition, and any old structs that were
// in use.

#ifndef _UVM_H_
#define _UVM_H_

#define UVM_API_LATEST_REVISION 12
#define UVM_API_LATEST_REVISION 13

#if !defined(UVM_API_REVISION)
#error "please define UVM_API_REVISION macro to a desired version number or UVM_API_LATEST_REVISION macro"

@ -384,36 +384,8 @@ NV_STATUS UvmIsPageableMemoryAccessSupportedOnGpu(const NvProcessorUuid *gpuUuid
// because it is not very informative.
//
//------------------------------------------------------------------------------
#if UVM_API_REV_IS_AT_MOST(8)
NV_STATUS UvmRegisterGpu(const NvProcessorUuid *gpuUuid);
#else
NV_STATUS UvmRegisterGpu(const NvProcessorUuid *gpuUuid,
const UvmGpuPlatformParams *platformParams);
#endif

#if UVM_API_REV_IS_AT_MOST(8)
//------------------------------------------------------------------------------
// UvmRegisterGpuSmc
//
// The same as UvmRegisterGpu, but takes additional parameters to specify the
// GPU partition being registered if SMC is enabled.
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the physical GPU of the SMC partition to register.
//
// platformParams: (INPUT)
// User handles identifying the partition to register.
//
// Error codes (see UvmRegisterGpu also):
//
// NV_ERR_INVALID_STATE:
// SMC was not enabled, or the partition identified by the user
// handles or its configuration changed.
//
NV_STATUS UvmRegisterGpuSmc(const NvProcessorUuid *gpuUuid,
const UvmGpuPlatformParams *platformParams);
#endif

//------------------------------------------------------------------------------
// UvmUnregisterGpu

@ -1364,6 +1336,86 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
const UvmGpuMappingAttributes *perGpuAttribs,
NvLength gpuAttribsCount);
//------------------------------------------------------------------------------
// UvmAllocDeviceP2P
//
// Create a VA range within the process's address space reserved for use by
// other devices to directly access GPU memory. The memory associated with the
// RM handle is mapped into the user address space associated with the range for
// direct access from the CPU.
//
// The VA range must not overlap with an existing VA range, irrespective of
// whether the existing range corresponds to a UVM allocation or an external
// allocation.
//
// Multiple VA ranges may be created mapping the same physical memory associated
// with the RM handle. The associated GPU memory will not be freed until all VA
// ranges have been destroyed either explicitly or implicitly and all non-UVM
// users (eg. third party device drivers) have stopped using the associated
// GPU memory.
//
// The VA range can be unmapped and freed by calling UvmFree.
//
// Destroying the final range mapping the RM handle may block until all third
// party device drivers and other kernel users have stopped using the memory.
//
// These VA ranges are only associated with a single GPU.
//
// Arguments:
// gpuUuid: (INPUT)
// UUID of the physical GPU if the GPU is not SMC capable or SMC
// enabled, or the GPU instance UUID of the partition containing the
// memory to be mapped on the CPU.
//
// base: (INPUT)
// Base address of the virtual address range.
//
// length: (INPUT)
// Length, in bytes, of the range.
//
// offset: (INPUT)
// Offset, in bytes, from the start of the externally allocated memory
// to map from.
//
// platformParams: (INPUT)
// Platform specific parameters that identify the allocation.
// On Linux: RM ctrl fd, hClient and the handle (hMemory) of the
// externally allocated memory to map.
//
// Errors:
//
// NV_ERR_INVALID_ADDRESS:
// base is NULL or length is zero or at least one of base and length is
// not aligned to 4K.
//
// NV_ERR_INVALID_DEVICE:
// The gpuUuid was either not registered or has no GPU VA space
// registered for it.
//
// NV_ERR_INVALID_ARGUMENT:
// base + offset + length exceeds the end of the externally allocated
// memory handle or the externally allocated handle is not valid.
//
// NV_ERR_UVM_ADDRESS_IN_USE:
// The requested virtual address range overlaps with an existing
// allocation.
//
// NV_ERR_NO_MEMORY:
// Internal memory allocation failed.
//
// NV_ERR_NOT_SUPPORTED:
// The device peer-to-peer feature is not supported by the current
// system configuration. This may be because the GPU doesn't support
// the peer-to-peer feature or the kernel was not built with the correct
// configuration options.
//
//------------------------------------------------------------------------------
NV_STATUS UvmAllocDeviceP2P(NvProcessorUuid gpuUuid,
void *base,
NvLength length,
NvLength offset,
const UvmDeviceP2PPlatformParams *platformParams);
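As a hedged illustration of the call sequence the comment above describes (not part of the commit; the helper name, error handling, and the way the platform parameters are obtained are assumptions, and the UvmDeviceP2PPlatformParams fields are left unfilled because they are defined in the UVM headers rather than here):

    // Sketch only: map an externally allocated RM memory handle for CPU access
    // through a device P2P VA range, then release it with UvmFree().
    static NV_STATUS map_rm_handle_for_cpu(NvProcessorUuid gpu_uuid,
                                           void *reserved_base,   // 4K-aligned, unused VA
                                           NvLength length,       // 4K-aligned size
                                           const UvmDeviceP2PPlatformParams *platform_params)
    {
        // base/length must be 4K aligned and must not overlap an existing range,
        // otherwise NV_ERR_INVALID_ADDRESS / NV_ERR_UVM_ADDRESS_IN_USE is returned.
        NV_STATUS status = UvmAllocDeviceP2P(gpu_uuid,
                                             reserved_base,
                                             length,
                                             0 /* offset into the RM allocation */,
                                             platform_params);
        if (status != NV_OK)
            return status;

        // ... the RM allocation is now directly accessible via reserved_base ...

        // Per the documentation above, the range is unmapped and freed with UvmFree.
        return UvmFree(reserved_base);
    }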
//------------------------------------------------------------------------------
// UvmMigrate
//

@ -3276,7 +3328,7 @@ NV_STATUS UvmEventGetGpuUuidTable(NvProcessorUuid *gpuUuidTable,
//------------------------------------------------------------------------------
NV_STATUS UvmEventFetch(UvmDebugSession sessionHandle,
UvmEventQueueHandle queueHandle,
UvmEventEntry_V1 *pBuffer,
UvmEventEntry *pBuffer,
NvU64 *nEntries);

//------------------------------------------------------------------------------

@ -3472,32 +3524,21 @@ NV_STATUS UvmToolsDestroySession(UvmToolsSessionHandle session);
// 4. Destroy event Queue using UvmToolsDestroyEventQueue
//

#if UVM_API_REV_IS_AT_MOST(10)
// This is deprecated and replaced by sizeof(UvmToolsEventControlData).
NvLength UvmToolsGetEventControlSize(void);

// This is deprecated and replaced by sizeof(UvmEventEntry_V1) or
// sizeof(UvmEventEntry_V2).
NvLength UvmToolsGetEventEntrySize(void);
#endif

NvLength UvmToolsGetNumberOfCounters(void);

//------------------------------------------------------------------------------
// UvmToolsCreateEventQueue
//
// This call creates an event queue that can hold the given number of events.
// All events are disabled by default. Event queue data persists lifetime of the
// target process.
// This function is deprecated. See UvmToolsCreateEventQueue_V2.
//
// This call creates an event queue that can hold the given number of
// UvmEventEntry events. All events are disabled by default. Event queue data
// persists lifetime of the target process.
//
// Arguments:
// session: (INPUT)
// Handle to the tools session.
//
// version: (INPUT)
// Requested version for events or counters.
// See UvmToolsEventQueueVersion.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold at least event_buffer_size events. Gets pinned until queue is

@ -3520,8 +3561,55 @@ NvLength UvmToolsGetNumberOfCounters(void);
// Session handle does not refer to a valid session
//
// NV_ERR_INVALID_ARGUMENT:
// The version is not UvmToolsEventQueueVersion_V1 or
// UvmToolsEventQueueVersion_V2.
// One of the parameters: event_buffer, event_buffer_size, event_control
// is not valid
//
// NV_ERR_INSUFFICIENT_RESOURCES:
// There could be multiple reasons for this error. One would be that
// it's not possible to allocate a queue of requested size. Another
// would be either event_buffer or event_control memory couldn't be
// pinned (e.g. because of OS limitation of pinnable memory). Also it
// could not have been possible to create UvmToolsEventQueueDescriptor.
//
NV_STATUS UvmToolsCreateEventQueue(UvmToolsSessionHandle session,
void *event_buffer,
NvLength event_buffer_size,
void *event_control,
UvmToolsEventQueueHandle *queue);

//------------------------------------------------------------------------------
// UvmToolsCreateEventQueue_V2
//
// This call creates an event queue that can hold the given number of
// UvmEventEntry_V2 events. All events are disabled by default. Event queue data
// persists beyond the lifetime of the target process.
//
// Arguments:
// session: (INPUT)
// Handle to the tools session.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold at least event_buffer_size events. Gets pinned until queue is
// destroyed.
//
// event_buffer_size: (INPUT)
// Size of the event queue buffer in units of UvmEventEntry_V2's. Must
// be a power of two, and greater than 1.
//
// event_control (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold UvmToolsEventControlData (although single page-size allocation
// should be more than enough). Gets pinned until queue is destroyed.
//
// queue: (OUTPUT)
// Handle to the created queue.
//
// Error codes:
// NV_ERR_INSUFFICIENT_PERMISSIONS:
// Session handle does not refer to a valid session
//
// NV_ERR_INVALID_ARGUMENT:
// One of the parameters: event_buffer, event_buffer_size, event_control
// is not valid
//

@ -3538,20 +3626,11 @@ NvLength UvmToolsGetNumberOfCounters(void);
// could not have been possible to create UvmToolsEventQueueDescriptor.
//
//------------------------------------------------------------------------------
#if UVM_API_REV_IS_AT_MOST(10)
NV_STATUS UvmToolsCreateEventQueue(UvmToolsSessionHandle session,
void *event_buffer,
NvLength event_buffer_size,
void *event_control,
UvmToolsEventQueueHandle *queue);
#else
NV_STATUS UvmToolsCreateEventQueue(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
void *event_buffer,
NvLength event_buffer_size,
void *event_control,
UvmToolsEventQueueHandle *queue);
#endif
NV_STATUS UvmToolsCreateEventQueue_V2(UvmToolsSessionHandle session,
void *event_buffer,
NvLength event_buffer_size,
void *event_control,
UvmToolsEventQueueHandle *queue);
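A minimal user-space sketch of the buffer requirements spelled out in the comment above (not part of the commit; it assumes the UVM tools headers are included, a valid UvmToolsSessionHandle is already available, and the 256-entry queue size is an illustrative choice):

    // Sketch only: allocate the page-aligned buffers UvmToolsCreateEventQueue_V2
    // expects and create a V2 event queue.
    #include <stdlib.h>
    #include <unistd.h>

    static NV_STATUS create_v2_queue(UvmToolsSessionHandle session,
                                     UvmToolsEventQueueHandle *queue)
    {
        const NvLength num_entries = 256;  // must be a power of two, greater than 1
        const size_t page_size = (size_t)sysconf(_SC_PAGESIZE);
        void *event_buffer = NULL;
        void *event_control = NULL;

        // Both buffers must be page-aligned and must stay allocated (they are
        // pinned) until the queue is destroyed with UvmToolsDestroyEventQueue.
        if (posix_memalign(&event_buffer, page_size,
                           num_entries * sizeof(UvmEventEntry_V2)) != 0)
            return NV_ERR_NO_MEMORY;

        if (posix_memalign(&event_control, page_size,
                           sizeof(UvmToolsEventControlData)) != 0) {
            free(event_buffer);
            return NV_ERR_NO_MEMORY;
        }

        // event_buffer_size is expressed in units of UvmEventEntry_V2 entries.
        return UvmToolsCreateEventQueue_V2(session, event_buffer, num_entries,
                                           event_control, queue);
    }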
UvmToolsEventQueueDescriptor UvmToolsGetEventQueueDescriptor(UvmToolsEventQueueHandle queue);
|
||||
|
||||
@ -3967,6 +4046,8 @@ NV_STATUS UvmToolsWriteProcessMemory(UvmToolsSessionHandle session,
//------------------------------------------------------------------------------
// UvmToolsGetProcessorUuidTable
//
// This function is deprecated. See UvmToolsGetProcessorUuidTable_V2.
//
// Populate a table with the UUIDs of all the currently registered processors
// in the target process. When a GPU is registered, it is added to the table.
// When a GPU is unregistered, it is removed. As long as a GPU remains
@ -3979,55 +4060,63 @@ NV_STATUS UvmToolsWriteProcessMemory(UvmToolsSessionHandle session,
//     session: (INPUT)
//         Handle to the tools session.
//
//     version: (INPUT)
//         Requested version for the UUID table returned. The version must
//         match the requested version of the event queue created with
//         UvmToolsCreateEventQueue(). See UvmToolsEventQueueVersion.
//         If the version of the event queue does not match the version of the
//         UUID table, the behavior is undefined.
//
//     table: (OUTPUT)
//         Array of processor UUIDs, including the CPU's UUID which is always
//         at index zero. The number of elements in the array must be greater
//         or equal to UVM_MAX_PROCESSORS_V1 if the version is
//         UvmToolsEventQueueVersion_V1 and UVM_MAX_PROCESSORS if the version is
//         UvmToolsEventQueueVersion_V2.
//         or equal to UVM_MAX_PROCESSORS_V1.
//         The srcIndex and dstIndex fields of the UvmEventMigrationInfo struct
//         index this array. Unused indices will have a UUID of zero.
//         If version is UvmToolsEventQueueVersion_V1 then the reported UUID
//         will be that of the corresponding physical GPU, even if multiple SMC
//         partitions are registered under that physical GPU. If version is
//         UvmToolsEventQueueVersion_V2 then the reported UUID will be the GPU
//         instance UUID if SMC is enabled, otherwise it will be the UUID of
//         the physical GPU.
//         The reported UUID will be that of the corresponding physical GPU,
//         even if multiple SMC partitions are registered under that physical
//         GPU.
//
// Error codes:
//     NV_ERR_INVALID_ADDRESS:
//         writing to table failed.
//
//     NV_ERR_INVALID_ARGUMENT:
//         The version is not UvmToolsEventQueueVersion_V1 or
//         UvmToolsEventQueueVersion_V2.
//     NV_ERR_NO_MEMORY:
//         Internal memory allocation failed.
//------------------------------------------------------------------------------
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
NvProcessorUuid *table);

//------------------------------------------------------------------------------
// UvmToolsGetProcessorUuidTable_V2
//
// Populate a table with the UUIDs of all the currently registered processors
// in the target process. When a GPU is registered, it is added to the table.
// When a GPU is unregistered, it is removed. As long as a GPU remains
// registered, its index in the table does not change.
// Note that the index in the table corresponds to the processor ID reported
// in UvmEventEntry event records and that the table is not contiguously packed
// with non-zero UUIDs even with no GPU unregistrations.
//
// Arguments:
//     session: (INPUT)
//         Handle to the tools session.
//
//     table: (OUTPUT)
//         Array of processor UUIDs, including the CPU's UUID which is always
//         at index zero. The number of elements in the array must be greater
//         or equal to UVM_MAX_PROCESSORS.
//         The srcIndex and dstIndex fields of the UvmEventMigrationInfo struct
//         index this array. Unused indices will have a UUID of zero.
//         The reported UUID will be the GPU instance UUID if SMC is enabled,
//         otherwise it will be the UUID of the physical GPU.
//
// Error codes:
//     NV_ERR_INVALID_ADDRESS:
//         writing to table failed.
//
//     NV_ERR_NOT_SUPPORTED:
//         The kernel is not able to support the requested version
//         (i.e., the UVM kernel driver is older and doesn't support
//         UvmToolsEventQueueVersion_V2).
//         The UVM kernel driver is older and doesn't support
//         UvmToolsGetProcessorUuidTable_V2.
//
//     NV_ERR_NO_MEMORY:
//         Internal memory allocation failed.
//------------------------------------------------------------------------------
#if UVM_API_REV_IS_AT_MOST(11)
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
NvProcessorUuid *table,
NvLength table_size,
NvLength *count);
#else
NV_STATUS UvmToolsGetProcessorUuidTable(UvmToolsSessionHandle session,
UvmToolsEventQueueVersion version,
NvProcessorUuid *table);
#endif
NV_STATUS UvmToolsGetProcessorUuidTable_V2(UvmToolsSessionHandle session,
NvProcessorUuid *table);
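Since UvmToolsGetProcessorUuidTable and UvmToolsGetProcessorUuidTable_V2 now share the same (session, table) shape, a caller can try the newer semantics and fall back to the deprecated call. A hedged sketch only, assuming UVM_MAX_PROCESSORS is at least as large as UVM_MAX_PROCESSORS_V1 so a single array serves both calls:

    // Index 0 is always the CPU; unused slots hold an all-zero UUID.
    NvProcessorUuid uuids[UVM_MAX_PROCESSORS];
    NV_STATUS status = UvmToolsGetProcessorUuidTable_V2(session, uuids);

    if (status == NV_ERR_NOT_SUPPORTED) {
        // Older UVM kernel driver: the deprecated call reports the physical
        // GPU UUID even when multiple SMC partitions are registered.
        status = UvmToolsGetProcessorUuidTable(session, uuids);
    }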
//------------------------------------------------------------------------------
// UvmToolsFlushEvents
@ -47,7 +47,7 @@
{ \
params_type params; \
BUILD_BUG_ON(sizeof(params) > UVM_MAX_IOCTL_PARAM_STACK_SIZE); \
if (nv_copy_from_user(&params, (void __user*)arg, sizeof(params))) \
if (copy_from_user(&params, (void __user*)arg, sizeof(params))) \
return -EFAULT; \
\
params.rmStatus = uvm_global_get_status(); \
@ -60,7 +60,7 @@
params.rmStatus = function_name(&params, filp); \
} \
\
if (nv_copy_to_user((void __user*)arg, &params, sizeof(params))) \
if (copy_to_user((void __user*)arg, &params, sizeof(params))) \
return -EFAULT; \
\
return 0; \
@ -84,7 +84,7 @@
if (!params) \
return -ENOMEM; \
BUILD_BUG_ON(sizeof(*params) <= UVM_MAX_IOCTL_PARAM_STACK_SIZE); \
if (nv_copy_from_user(params, (void __user*)arg, sizeof(*params))) { \
if (copy_from_user(params, (void __user*)arg, sizeof(*params))) { \
uvm_kvfree(params); \
return -EFAULT; \
} \
@ -99,7 +99,7 @@
params->rmStatus = function_name(params, filp); \
} \
\
if (nv_copy_to_user((void __user*)arg, params, sizeof(*params))) \
if (copy_to_user((void __user*)arg, params, sizeof(*params))) \
ret = -EFAULT; \
\
uvm_kvfree(params); \
@ -244,6 +244,7 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_enable_system_wide_atomics(UVM_ENABLE_SYSTEM_WIDE_ATOMICS_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_disable_system_wide_atomics(UVM_DISABLE_SYSTEM_WIDE_ATOMICS_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_tools_init_event_tracker_v2(UVM_TOOLS_INIT_EVENT_TRACKER_V2_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_THRESHOLD_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_tools_event_queue_enable_events(UVM_TOOLS_EVENT_QUEUE_ENABLE_EVENTS_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_tools_event_queue_disable_events(UVM_TOOLS_EVENT_QUEUE_DISABLE_EVENTS_PARAMS *params, struct file *filp);
@ -256,5 +257,7 @@ NV_STATUS uvm_api_unmap_external(UVM_UNMAP_EXTERNAL_PARAMS *params, struct file
NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_populate_pageable(const UVM_POPULATE_PAGEABLE_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_alloc_device_p2p(UVM_ALLOC_DEVICE_P2P_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_clear_all_access_counters(UVM_CLEAR_ALL_ACCESS_COUNTERS_PARAMS *params, struct file *filp);

#endif // __UVM_API_H__
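For readers following the nv_copy_from_user/nv_copy_to_user to copy_from_user/copy_to_user switch above: the stack-based wrapper macro boils down to a copy-in, dispatch, copy-out sequence. A rough expansion for one routed command is sketched below; the wrapper name is invented for illustration and the status check elided between the two hunks is approximated.

    static long uvm_ioctl_migrate_example(unsigned long arg, struct file *filp)
    {
        UVM_MIGRATE_PARAMS params;

        BUILD_BUG_ON(sizeof(params) > UVM_MAX_IOCTL_PARAM_STACK_SIZE);
        if (copy_from_user(&params, (void __user *)arg, sizeof(params)))
            return -EFAULT;

        params.rmStatus = uvm_global_get_status();
        if (params.rmStatus == NV_OK)
            params.rmStatus = uvm_api_migrate(&params, filp); // function_name(&params, filp)

        if (copy_to_user((void __user *)arg, &params, sizeof(params)))
            return -EFAULT;

        return 0;
    }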
@ -90,18 +90,19 @@ static NV_STATUS service_ats_requests(uvm_gpu_va_space_t *gpu_va_space,

uvm_migrate_args_t uvm_migrate_args =
{
.va_space = va_space,
.mm = mm,
.dst_id = ats_context->residency_id,
.dst_node_id = ats_context->residency_node,
.start = start,
.length = length,
.populate_permissions = populate_permissions,
.touch = fault_service_type,
.skip_mapped = fault_service_type,
.populate_on_cpu_alloc_failures = fault_service_type,
.user_space_start = &user_space_start,
.user_space_length = &user_space_length,
.va_space = va_space,
.mm = mm,
.dst_id = ats_context->residency_id,
.dst_node_id = ats_context->residency_node,
.start = start,
.length = length,
.populate_permissions = populate_permissions,
.touch = fault_service_type,
.skip_mapped = fault_service_type,
.populate_on_cpu_alloc_failures = fault_service_type,
.populate_on_migrate_vma_failures = fault_service_type,
.user_space_start = &user_space_start,
.user_space_length = &user_space_length,
};

UVM_ASSERT(uvm_ats_can_service_faults(gpu_va_space, mm));
@ -112,7 +113,7 @@ static NV_STATUS service_ats_requests(uvm_gpu_va_space_t *gpu_va_space,
// set skip_mapped to true. For pages already mapped, this will only handle
// PTE upgrades if needed.
status = uvm_migrate_pageable(&uvm_migrate_args);
if (status == NV_WARN_NOTHING_TO_DO)
if (fault_service_type && (status == NV_WARN_NOTHING_TO_DO))
status = NV_OK;

UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);
@ -379,14 +380,20 @@ static NV_STATUS ats_compute_residency_mask(uvm_gpu_va_space_t *gpu_va_space,

static void ats_compute_prefetch_mask(uvm_gpu_va_space_t *gpu_va_space,
struct vm_area_struct *vma,
uvm_ats_service_type_t service_type,
uvm_ats_fault_context_t *ats_context,
uvm_va_block_region_t max_prefetch_region)
{
uvm_page_mask_t *accessed_mask = &ats_context->accessed_mask;
uvm_page_mask_t *accessed_mask;
uvm_page_mask_t *residency_mask = &ats_context->prefetch_state.residency_mask;
uvm_page_mask_t *prefetch_mask = &ats_context->prefetch_state.prefetch_pages_mask;
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree = &ats_context->prefetch_state.bitmap_tree;

if (service_type == UVM_ATS_SERVICE_TYPE_FAULTS)
accessed_mask = &ats_context->faults.accessed_mask;
else
accessed_mask = &ats_context->access_counters.accessed_mask;

if (uvm_page_mask_empty(accessed_mask))
return;

@ -406,7 +413,7 @@ static NV_STATUS ats_compute_prefetch(uvm_gpu_va_space_t *gpu_va_space,
uvm_ats_fault_context_t *ats_context)
{
NV_STATUS status;
uvm_page_mask_t *accessed_mask = &ats_context->accessed_mask;
uvm_page_mask_t *accessed_mask;
uvm_page_mask_t *prefetch_mask = &ats_context->prefetch_state.prefetch_pages_mask;
uvm_va_block_region_t max_prefetch_region = uvm_ats_region_from_vma(vma, base);

@ -420,6 +427,11 @@ static NV_STATUS ats_compute_prefetch(uvm_gpu_va_space_t *gpu_va_space,
if (!uvm_perf_prefetch_enabled(gpu_va_space->va_space))
return status;

if (service_type == UVM_ATS_SERVICE_TYPE_FAULTS)
accessed_mask = &ats_context->faults.accessed_mask;
else
accessed_mask = &ats_context->access_counters.accessed_mask;

if (uvm_page_mask_empty(accessed_mask))
return status;

@ -432,12 +444,12 @@ static NV_STATUS ats_compute_prefetch(uvm_gpu_va_space_t *gpu_va_space,
uvm_page_mask_init_from_region(prefetch_mask, max_prefetch_region, NULL);
}
else {
ats_compute_prefetch_mask(gpu_va_space, vma, ats_context, max_prefetch_region);
ats_compute_prefetch_mask(gpu_va_space, vma, service_type, ats_context, max_prefetch_region);
}

if (service_type == UVM_ATS_SERVICE_TYPE_FAULTS) {
uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
uvm_page_mask_t *read_fault_mask = &ats_context->faults.read_fault_mask;
uvm_page_mask_t *write_fault_mask = &ats_context->faults.write_fault_mask;

uvm_page_mask_or(read_fault_mask, read_fault_mask, prefetch_mask);

@ -459,10 +471,10 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
NV_STATUS status = NV_OK;
uvm_va_block_region_t subregion;
uvm_va_block_region_t region = uvm_va_block_region(0, PAGES_PER_UVM_VA_BLOCK);
uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
uvm_page_mask_t *reads_serviced_mask = &ats_context->reads_serviced_mask;
uvm_page_mask_t *read_fault_mask = &ats_context->faults.read_fault_mask;
uvm_page_mask_t *write_fault_mask = &ats_context->faults.write_fault_mask;
uvm_page_mask_t *faults_serviced_mask = &ats_context->faults.faults_serviced_mask;
uvm_page_mask_t *reads_serviced_mask = &ats_context->faults.reads_serviced_mask;
uvm_fault_client_type_t client_type = ats_context->client_type;
uvm_ats_service_type_t service_type = UVM_ATS_SERVICE_TYPE_FAULTS;

@ -637,6 +649,8 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space,
UVM_ASSERT(gpu_va_space->ats.enabled);
UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);

uvm_page_mask_zero(&ats_context->access_counters.migrated_mask);

uvm_assert_mmap_lock_locked(vma->vm_mm);
uvm_assert_rwsem_locked(&gpu_va_space->va_space->lock);

@ -650,21 +664,24 @@ NV_STATUS uvm_ats_service_access_counters(uvm_gpu_va_space_t *gpu_va_space,

// Remove pages which are already resident at the intended destination from
// the accessed_mask.
uvm_page_mask_andnot(&ats_context->accessed_mask,
&ats_context->accessed_mask,
uvm_page_mask_andnot(&ats_context->access_counters.accessed_mask,
&ats_context->access_counters.accessed_mask,
&ats_context->prefetch_state.residency_mask);

for_each_va_block_subregion_in_mask(subregion, &ats_context->accessed_mask, region) {
for_each_va_block_subregion_in_mask(subregion, &ats_context->access_counters.accessed_mask, region) {
NV_STATUS status;
NvU64 start = base + (subregion.first * PAGE_SIZE);
size_t length = uvm_va_block_region_num_pages(subregion) * PAGE_SIZE;
uvm_fault_access_type_t access_type = UVM_FAULT_ACCESS_TYPE_COUNT;
uvm_page_mask_t *migrated_mask = &ats_context->access_counters.migrated_mask;

UVM_ASSERT(start >= vma->vm_start);
UVM_ASSERT((start + length) <= vma->vm_end);

status = service_ats_requests(gpu_va_space, vma, start, length, access_type, service_type, ats_context);
if (status != NV_OK)
if (status == NV_OK)
uvm_page_mask_region_fill(migrated_mask, subregion);
else if (status != NV_WARN_NOTHING_TO_DO)
return status;
}

@ -29,18 +29,18 @@

// Service ATS faults in the range (base, base + UVM_VA_BLOCK_SIZE) with service
// type for individual pages in the range requested by page masks set in
// ats_context->read_fault_mask/write_fault_mask. base must be aligned to
// ats_context->fault.read_fault_mask/write_fault_mask. base must be aligned to
// UVM_VA_BLOCK_SIZE. The caller is responsible for ensuring that faulting
// addresses fall completely within the VMA. The caller is also responsible for
// ensuring that the faulting addresses don't overlap a GMMU region. (See
// uvm_ats_check_in_gmmu_region). The caller is also responsible for handling
// any errors returned by this function (fault cancellations etc.).
//
// Returns the fault service status in ats_context->faults_serviced_mask. In
// addition, ats_context->reads_serviced_mask returns whether read servicing
// worked on write faults iff the read service was also requested in the
// corresponding bit in read_fault_mask. These returned masks are only valid if
// the return status is NV_OK. Status other than NV_OK indicate system global
// Returns the fault service status in ats_context->fault.faults_serviced_mask.
// In addition, ats_context->fault.reads_serviced_mask returns whether read
// servicing worked on write faults iff the read service was also requested in
// the corresponding bit in read_fault_mask. These returned masks are only valid
// if the return status is NV_OK. Status other than NV_OK indicate system global
// fault servicing failures.
//
// LOCKING: The caller must retain and hold the mmap_lock and hold the va_space
@ -52,9 +52,9 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,

// Service access counter notifications on ATS regions in the range (base, base
// + UVM_VA_BLOCK_SIZE) for individual pages in the range requested by page_mask
// set in ats_context->accessed_mask. base must be aligned to UVM_VA_BLOCK_SIZE.
// The caller is responsible for ensuring that the addresses in the
// accessed_mask is completely covered by the VMA. The caller is also
// set in ats_context->access_counters.accessed_mask. base must be aligned to
// UVM_VA_BLOCK_SIZE. The caller is responsible for ensuring that the addresses
// in the accessed_mask is completely covered by the VMA. The caller is also
// responsible for handling any errors returned by this function.
//
// Returns NV_OK if servicing was successful. Any other error indicates an error
@ -127,12 +127,12 @@ static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)

// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
static void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}

NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
static NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
@ -855,6 +855,7 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@ -869,6 +870,7 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@ -879,6 +881,7 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@ -896,6 +899,7 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@ -959,7 +963,7 @@ static void gpu_encrypt(uvm_push_t *push,
i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
dst_cipher);

uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);
uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);

if (i > 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@ -1020,6 +1024,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
UvmCslIv *decrypt_iv = NULL;
UvmCslIv *encrypt_iv = NULL;
NvU32 key_version;
uvm_tracker_t tracker;
size_t src_plain_size;

@ -1089,6 +1094,11 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,

gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);

// There shouldn't be any key rotation between the end of the push and the
// CPU decryption(s), but it is more robust against test changes to force
// decryption to use the saved key.
key_version = uvm_channel_pool_key_version(push.channel->pool);

TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);
@ -1101,6 +1111,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),
@ -1111,6 +1122,7 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),
@ -38,6 +38,32 @@
#include "clb06f.h"
#include "uvm_conf_computing.h"

// WLC push is decrypted by SEC2 or CE (in WLC schedule).
// In sysmem it's followed by auth tag.
#define WLC_PUSHBUFFER_ALIGNMENT max3(UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, \
UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT, \
UVM_CONF_COMPUTING_BUF_ALIGNMENT)
#define WLC_ALIGNED_MAX_PUSH_SIZE UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, WLC_PUSHBUFFER_ALIGNMENT)

// WLC uses the following structures in unprotected sysmem:
// * Encrypted pushbuffer location. This gets populated via cpu_encrypt to
//   launch work on a WLC channel.
// * Auth tag associated with the above encrypted (push)buffer
// * Another auth tag used to encrypt another channel's pushbuffer during
//   indirect work launch. This can be allocated with the launched work
//   but since WLC can only launch one pushbuffer at a time it's easier
//   to include it here.
#define WLC_SYSMEM_TOTAL_SIZE UVM_ALIGN_UP(WLC_ALIGNED_MAX_PUSH_SIZE + 2 * UVM_CONF_COMPUTING_AUTH_TAG_SIZE, \
WLC_PUSHBUFFER_ALIGNMENT)

#define WLC_SYSMEM_PUSHBUFFER_OFFSET 0
#define WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET (WLC_SYSMEM_PUSHBUFFER_OFFSET + WLC_ALIGNED_MAX_PUSH_SIZE)
#define WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET (WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET + UVM_CONF_COMPUTING_AUTH_TAG_SIZE)

// LCIC pushbuffer is populated by SEC2
#define LCIC_PUSHBUFFER_ALIGNMENT UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT
#define LCIC_ALIGNED_PUSH_SIZE UVM_ALIGN_UP(UVM_LCIC_PUSH_SIZE, LCIC_PUSHBUFFER_ALIGNMENT)

static unsigned uvm_channel_num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT;

#define UVM_CHANNEL_GPFIFO_LOC_DEFAULT "auto"
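As a reading aid for the WLC_SYSMEM_* offsets defined in the hunk above (not part of the change itself), each WLC channel owns one slot of WLC_SYSMEM_TOTAL_SIZE bytes in unprotected sysmem, laid out as follows:

    // Per-WLC-channel slot in unprotected sysmem; slot N starts at
    // N * WLC_SYSMEM_TOTAL_SIZE (see get_channel_unprotected_sysmem_cpu()
    // later in this change):
    //
    //   WLC_SYSMEM_PUSHBUFFER_OFFSET            encrypted WLC pushbuffer,
    //                                           WLC_ALIGNED_MAX_PUSH_SIZE bytes
    //   WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET   auth tag for that pushbuffer
    //   WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET       auth tag used when this WLC
    //                                           channel launches another
    //                                           channel's pushbuffer indirectly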
@ -132,6 +158,12 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,

NvU64 completed_value = uvm_channel_update_completed_value(channel);

// LCIC channels don't use gpfifo entries after the static schedule is up.
// They can only have one entry active at a time so use the state of the
// tracking semaphore to represent progress.
if (uvm_channel_is_lcic(channel) && uvm_channel_manager_is_wlc_ready(channel->pool->manager))
return uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem) ? 0 : 1;

channel_pool_lock(channel->pool);

// Completed value should never exceed the queued value
@ -280,16 +312,16 @@ static void unlock_channel_for_push(uvm_channel_t *channel)
index = uvm_channel_index_in_pool(channel);

uvm_channel_pool_assert_locked(channel->pool);
UVM_ASSERT(test_bit(index, channel->pool->push_locks));
UVM_ASSERT(test_bit(index, channel->pool->conf_computing.push_locks));

__clear_bit(index, channel->pool->push_locks);
uvm_up_out_of_order(&channel->pool->push_sem);
__clear_bit(index, channel->pool->conf_computing.push_locks);
uvm_up_out_of_order(&channel->pool->conf_computing.push_sem);
}

bool uvm_channel_is_locked_for_push(uvm_channel_t *channel)
{
if (g_uvm_global.conf_computing_enabled)
return test_bit(uvm_channel_index_in_pool(channel), channel->pool->push_locks);
return test_bit(uvm_channel_index_in_pool(channel), channel->pool->conf_computing.push_locks);

// For CE and proxy channels, we always return that the channel is locked,
// which has no functional impact in the UVM channel code-flow, this is only
@ -303,19 +335,21 @@ static void lock_channel_for_push(uvm_channel_t *channel)

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
uvm_channel_pool_assert_locked(channel->pool);
UVM_ASSERT(!test_bit(index, channel->pool->push_locks));
UVM_ASSERT(!test_bit(index, channel->pool->conf_computing.push_locks));

__set_bit(index, channel->pool->push_locks);
__set_bit(index, channel->pool->conf_computing.push_locks);
}

static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
{
NvU32 index = uvm_channel_index_in_pool(channel);

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
uvm_channel_pool_assert_locked(channel->pool);

if (!test_bit(index, channel->pool->push_locks) && try_claim_channel_locked(channel, num_gpfifo_entries)) {
// Already locked by someone else
if (uvm_channel_is_locked_for_push(channel))
return false;

if (try_claim_channel_locked(channel, num_gpfifo_entries)) {
lock_channel_for_push(channel);
return true;
}
@ -323,6 +357,109 @@ static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo
return false;
}

// Reserve, or release, all channels in the given pool.
//
// One scenario where reservation of the entire pool is useful is key rotation,
// because the reservation blocks addition of new work to the pool while
// rotation is in progress.
static void channel_pool_reserve_release_all_channels(uvm_channel_pool_t *pool, bool reserve)
{
NvU32 i;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);

// Disable lock tracking: a single thread is acquiring multiple locks of
// the same order
uvm_thread_context_lock_disable_tracking();

for (i = 0; i < pool->num_channels; i++) {
if (reserve)
uvm_down(&pool->conf_computing.push_sem);
else
uvm_up(&pool->conf_computing.push_sem);
}

uvm_thread_context_lock_enable_tracking();
}

static void channel_pool_reserve_all_channels(uvm_channel_pool_t *pool)
{
channel_pool_reserve_release_all_channels(pool, true);
}

static void channel_pool_release_all_channels(uvm_channel_pool_t *pool)
{
channel_pool_reserve_release_all_channels(pool, false);
}

static NV_STATUS channel_pool_rotate_key_locked(uvm_channel_pool_t *pool)
{
uvm_channel_t *channel;

// A rotation is not necessarily pending, because UVM can trigger rotations
// at will.
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));

uvm_assert_mutex_locked(&pool->conf_computing.key_rotation.mutex);

uvm_for_each_channel_in_pool(channel, pool) {
// WLC channels share CE with LCIC pushes and LCIC waits for
// WLC work to complete using WFI, so it's enough to wait
// for the latter one.
uvm_channel_t *wait_channel = uvm_channel_is_wlc(channel) ? uvm_channel_wlc_get_paired_lcic(channel) : channel;

NV_STATUS status = uvm_channel_wait(wait_channel);
if (status != NV_OK)
return status;

}

return uvm_conf_computing_rotate_pool_key(pool);
}

static NV_STATUS channel_pool_rotate_key(uvm_channel_pool_t *pool, bool force_rotation)
{
NV_STATUS status = NV_OK;

uvm_mutex_lock(&pool->conf_computing.key_rotation.mutex);

if (force_rotation || uvm_conf_computing_is_key_rotation_pending_in_pool(pool)) {
channel_pool_reserve_all_channels(pool);

status = channel_pool_rotate_key_locked(pool);

channel_pool_release_all_channels(pool);
}

uvm_mutex_unlock(&pool->conf_computing.key_rotation.mutex);

return status;
}

static NV_STATUS channel_pool_rotate_key_if_pending(uvm_channel_pool_t *pool)
{
NV_STATUS status;
bool force_rotation = false;

if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
return NV_OK;

status = channel_pool_rotate_key(pool, force_rotation);

// RM couldn't acquire the locks it needed, so UVM will try again later.
if (status == NV_ERR_STATE_IN_USE)
status = NV_OK;

return status;
}

NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool)
{
bool force_rotation = true;

return channel_pool_rotate_key(pool, force_rotation);
}

// Reserve a channel in the specified pool. The channel is locked until the push
// ends
static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out)
@ -330,20 +467,28 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
uvm_channel_t *channel;
uvm_spin_loop_t spin;
NvU32 index;
NV_STATUS status;

UVM_ASSERT(pool);
UVM_ASSERT(g_uvm_global.conf_computing_enabled);

// LCIC channels are reserved directly during GPU initialization.
UVM_ASSERT(!uvm_channel_pool_is_lcic(pool));

status = channel_pool_rotate_key_if_pending(pool);
if (status != NV_OK)
return status;

// This semaphore is uvm_up() in unlock_channel_for_push() as part of the
// uvm_channel_end_push() routine.
uvm_down(&pool->push_sem);
uvm_down(&pool->conf_computing.push_sem);

// At least one channel is unlocked. We check if any unlocked channel is
// available, i.e., if it has free GPFIFO entries.

channel_pool_lock(pool);

for_each_clear_bit(index, pool->push_locks, pool->num_channels) {
for_each_clear_bit(index, pool->conf_computing.push_locks, pool->num_channels) {
channel = &pool->channels[index];
if (try_claim_channel_locked(channel, 1)) {
lock_channel_for_push(channel);
@ -358,8 +503,6 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
uvm_spin_loop_init(&spin);
while (1) {
uvm_for_each_channel_in_pool(channel, pool) {
NV_STATUS status;

uvm_channel_update_progress(channel);

channel_pool_lock(pool);
@ -371,7 +514,7 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_

status = uvm_channel_check_errors(channel);
if (status != NV_OK) {
uvm_up(&pool->push_sem);
uvm_up(&pool->conf_computing.push_sem);
return status;
}

@ -489,6 +632,27 @@ static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel)
return push_info - channel->push_infos;
}

static unsigned channel_pool_num_gpfifo_entries(uvm_channel_pool_t *pool)
{
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));

// WLC benefits from larger number of entries since more available entries
// result in less frequent calls to uvm_channel_update_progress. 16 is the
// maximum size that can re-use static pb preallocated memory when uploading
// the WLC schedule.
if (uvm_channel_pool_is_wlc(pool))
return 16;

// Every channel needs at least 3 entries; 1 for sentinel and 2 for
// submitting GPFIFO control entries. The number also has to be power of 2,
// as the HW stores the size as log2 value. LCIC does not accept external
// pushes, uvm_channel_update_progress is not a concern.
if (uvm_channel_pool_is_lcic(pool))
return 4;

return pool->manager->conf.num_gpfifo_entries;
}

static void channel_semaphore_gpu_encrypt_payload(uvm_push_t *push, NvU64 semaphore_va)
{
NvU32 iv_index;
@ -501,14 +665,14 @@ static void channel_semaphore_gpu_encrypt_payload(uvm_push_t *push, NvU64 semaph
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);
UvmCslIv *iv_cpu_addr = semaphore->conf_computing.ivs;
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
NvU32 *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;
uvm_gpu_semaphore_notifier_t *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

iv_index = ((*last_pushed_notifier + 2) / 2) % channel->num_gpfifo_entries;

uvm_conf_computing_log_gpu_encryption(channel, &iv_cpu_addr[iv_index]);
uvm_conf_computing_log_gpu_encryption(channel, payload_size, &iv_cpu_addr[iv_index]);

gpu->parent->ce_hal->memset_4(push, notifier_gpu_va, ++(*last_pushed_notifier), sizeof(*last_pushed_notifier));
gpu->parent->ce_hal->encrypt(push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va);
@ -529,18 +693,69 @@ static void push_reserve_csl_sign_buf(uvm_push_t *push)
UVM_ASSERT((buf - UVM_METHOD_SIZE / sizeof(*buf)) == push->begin);
}

// Reserve space for a single authentication tag and return addresses to unprotected sysmem
static void *push_reserve_auth_tag(uvm_push_t *push, uvm_gpu_address_t *gpu_address_out)
{
void *cpu_address;
uvm_gpu_address_t gpu_address;

UVM_ASSERT(push->channel);

cpu_address = uvm_push_get_single_inline_buffer(push,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&gpu_address);

// SEC2 channel uses unprotected sysmem for pushes
if (uvm_channel_is_sec2(push->channel)) {
*gpu_address_out = gpu_address;
return cpu_address;
}
else {
uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(push->channel);
NvU64 unprotected_gpu_va_for_push = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
char *unprotected_cpu_va_for_push = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push);
const size_t offset = (char*)cpu_address - (char*)push->begin;

UVM_ASSERT(offset == gpu_address.address - uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push));
UVM_ASSERT(!uvm_channel_is_wlc(push->channel));
UVM_ASSERT(!uvm_channel_is_lcic(push->channel));
UVM_ASSERT(uvm_channel_is_ce(push->channel));

*gpu_address_out = uvm_gpu_address_virtual_unprotected(unprotected_gpu_va_for_push + offset);
return unprotected_cpu_va_for_push + offset;
}
}

static uvm_channel_pool_t *get_paired_pool(uvm_channel_pool_t *pool)
{
uvm_channel_type_t paired_channel_type;
uvm_channel_pool_t *paired_pool;

UVM_ASSERT(pool);
UVM_ASSERT(uvm_channel_pool_is_wlc(pool) || uvm_channel_pool_is_lcic(pool));

paired_channel_type = uvm_channel_pool_is_wlc(pool) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC;
paired_pool = pool->manager->pool_to_use.default_for_type[paired_channel_type];

// Prevent accessing a non-existing paired pool. This can happen if, for
// example, the function is invoked when the WLC pool exists, but the LCIC
// doesn't (it hasn't been created yet, or it has been already destroyed).
UVM_ASSERT(paired_pool);

return paired_pool;
}

static uvm_channel_t *get_paired_channel(uvm_channel_t *channel)
{
unsigned index;
uvm_channel_pool_t *paired_pool;
uvm_channel_type_t paired_channel_type;
unsigned index;

UVM_ASSERT(channel);
UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel));

paired_pool = get_paired_pool(channel->pool);
index = uvm_channel_index_in_pool(channel);
paired_channel_type = uvm_channel_is_wlc(channel) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC;
paired_pool = channel->pool->manager->pool_to_use.default_for_type[paired_channel_type];

return paired_pool->channels + index;
}

@ -560,6 +775,63 @@ uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel)
return get_paired_channel(wlc_channel);
}

NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel)
{
unsigned channel_index;
NvU64 pool_vidmem_base;

UVM_ASSERT(channel);
UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel));

channel_index = uvm_channel_index_in_pool(channel);
pool_vidmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_vidmem,
uvm_channel_get_gpu(channel));

if (uvm_channel_is_lcic(channel))
return pool_vidmem_base + channel_index * LCIC_ALIGNED_PUSH_SIZE;

return pool_vidmem_base + 2 * channel_index * WLC_ALIGNED_MAX_PUSH_SIZE;
}

static NvU64 get_channel_unprotected_sysmem_gpu_va(uvm_channel_t *channel)
{
unsigned channel_index;
NvU64 pool_sysmem_base;

UVM_ASSERT(channel);
UVM_ASSERT(uvm_channel_is_wlc(channel));

channel_index = uvm_channel_index_in_pool(channel);
pool_sysmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_sysmem,
uvm_channel_get_gpu(channel));

return pool_sysmem_base + (channel_index * WLC_SYSMEM_TOTAL_SIZE);
}

NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel)
{
return get_channel_unprotected_sysmem_gpu_va(channel) + WLC_SYSMEM_PUSHBUFFER_OFFSET;
}

static char* get_channel_unprotected_sysmem_cpu(uvm_channel_t *channel)
{
unsigned channel_index;
char* pool_sysmem_base;

UVM_ASSERT(channel);
UVM_ASSERT(uvm_channel_is_wlc(channel));

channel_index = uvm_channel_index_in_pool(channel);
pool_sysmem_base = uvm_rm_mem_get_cpu_va(channel->pool->conf_computing.pool_sysmem);

return pool_sysmem_base + (channel_index * WLC_SYSMEM_TOTAL_SIZE);
}

char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel)
{
return get_channel_unprotected_sysmem_cpu(channel) + WLC_SYSMEM_PUSHBUFFER_OFFSET;
}

static NV_STATUS channel_rotate_and_reserve_launch_channel(uvm_channel_t *channel, uvm_channel_t **launch_channel)
{
uvm_channel_manager_t *manager = channel->pool->manager;
@ -735,20 +1007,64 @@ static void uvm_channel_tracking_semaphore_release(uvm_push_t *push, NvU64 semap
channel_semaphore_gpu_encrypt_payload(push, semaphore_va);
}

static uvm_gpu_semaphore_notifier_t *lcic_static_entry_notifier_cpu_va(uvm_channel_t *lcic)
{
uvm_gpu_semaphore_notifier_t *notifier_base;

UVM_ASSERT(uvm_channel_is_lcic(lcic));

notifier_base = uvm_rm_mem_get_cpu_va(lcic->pool->conf_computing.pool_sysmem);
return notifier_base + uvm_channel_index_in_pool(lcic) * 2;
}

static uvm_gpu_semaphore_notifier_t *lcic_static_exit_notifier_cpu_va(uvm_channel_t *lcic)
{
return lcic_static_entry_notifier_cpu_va(lcic) + 1;
}

static uvm_gpu_address_t lcic_static_entry_notifier_gpu_va(uvm_channel_t *lcic)
{
NvU64 notifier_base;
const NvU64 offset = uvm_channel_index_in_pool(lcic) * 2 * sizeof(uvm_gpu_semaphore_notifier_t);

UVM_ASSERT(uvm_channel_is_lcic(lcic));

notifier_base = uvm_rm_mem_get_gpu_uvm_va(lcic->pool->conf_computing.pool_sysmem, uvm_channel_get_gpu(lcic));
return uvm_gpu_address_virtual_unprotected(notifier_base + offset);
}

static uvm_gpu_address_t lcic_static_exit_notifier_gpu_va(uvm_channel_t *lcic)
{
uvm_gpu_address_t notifier_address = lcic_static_entry_notifier_gpu_va(lcic);

notifier_address.address += sizeof(uvm_gpu_semaphore_notifier_t);
return notifier_address;
}

static void internal_channel_submit_work_wlc(uvm_push_t *push)
{
size_t payload_size;
uvm_channel_t *wlc_channel = push->channel;
uvm_channel_t *lcic_channel = uvm_channel_wlc_get_paired_lcic(wlc_channel);
UvmCslIv *iv_cpu_addr = lcic_channel->tracking_sem.semaphore.conf_computing.ivs;
NvU32 *last_pushed_notifier;
uvm_gpu_semaphore_t *lcic_semaphore = &lcic_channel->tracking_sem.semaphore;
UvmCslIv *iv_cpu_addr = lcic_semaphore->conf_computing.ivs;
uvm_gpu_semaphore_notifier_t *last_pushed_notifier;
NvU32 iv_index;
uvm_spin_loop_t spin;
NV_STATUS status;
void* auth_tag_cpu = get_channel_unprotected_sysmem_cpu(wlc_channel) + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET;

UVM_ASSERT(lcic_channel);

// Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2
// and a WLC doorbell ring is enough to start work.
UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin);
status = uvm_channel_wait(lcic_channel);
if (status != NV_OK) {
UVM_ASSERT(uvm_global_get_status() != NV_OK);

// If there's a global fatal error we can't communicate with the GPU
// and the below launch sequence doesn't work.
UVM_ERR_PRINT_NV_STATUS("Failed to wait for LCIC channel (%s) completion.", status, lcic_channel->name);
return;
}

// Executing WLC adds an extra job to LCIC
++lcic_channel->tracking_sem.queued_value;
@ -760,19 +1076,21 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push)

// Handles the CPU part of the setup for the LCIC to be able to do GPU
// encryption of its tracking semaphore value. See setup_lcic_schedule().
last_pushed_notifier = &lcic_channel->tracking_sem.semaphore.conf_computing.last_pushed_notifier;
*lcic_channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu = ++(*last_pushed_notifier);
*lcic_channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu = ++(*last_pushed_notifier);
last_pushed_notifier = &lcic_semaphore->conf_computing.last_pushed_notifier;
*lcic_static_entry_notifier_cpu_va(lcic_channel) = ++(*last_pushed_notifier);
*lcic_static_exit_notifier_cpu_va(lcic_channel) = ++(*last_pushed_notifier);
iv_index = (*last_pushed_notifier / 2) % lcic_channel->num_gpfifo_entries;
uvm_conf_computing_log_gpu_encryption(lcic_channel, &iv_cpu_addr[iv_index]);

payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
uvm_conf_computing_log_gpu_encryption(lcic_channel, payload_size, &iv_cpu_addr[iv_index]);

// Move push data
uvm_conf_computing_cpu_encrypt(wlc_channel,
wlc_channel->conf_computing.static_pb_unprotected_sysmem_cpu,
uvm_channel_get_static_pb_unprotected_sysmem_cpu(wlc_channel),
push->begin,
&push->launch_iv,
UVM_MAX_WLC_PUSH_SIZE,
wlc_channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu);
auth_tag_cpu);

// Make sure all encrypted data is observable before ringing the doorbell.
wmb();
@ -792,7 +1110,7 @@ static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 ol

void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push);
NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
void *push_enc_auth_tag;
void *push_enc_auth_tag_cpu;
uvm_gpu_address_t push_enc_auth_tag_gpu;
NvU64 gpfifo_gpu_va = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);

@ -816,15 +1134,16 @@ static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 ol

// Move over the pushbuffer data
// WLC channels use a static preallocated space for launch auth tags
push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);
push_enc_auth_tag_cpu = get_channel_unprotected_sysmem_cpu(indirect_push.channel) + WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET;
push_enc_auth_tag_gpu = uvm_gpu_address_virtual_unprotected(
get_channel_unprotected_sysmem_gpu_va(indirect_push.channel) + WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET);

uvm_conf_computing_cpu_encrypt(indirect_push.channel,
push_enc_cpu,
push->begin,
NULL,
uvm_push_get_size(push),
push_enc_auth_tag);
push_enc_auth_tag_cpu);

uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);

@ -872,10 +1191,7 @@ static void update_gpput_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel,
UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
&gpput_enc_gpu);
gpput_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&gpput_auth_tag_gpu);
gpput_auth_tag_cpu = push_reserve_auth_tag(sec2_push, &gpput_auth_tag_gpu);

// Update GPPUT. The update needs 4B write to specific offset,
// however we can only do 16B aligned decrypt writes.
@ -926,10 +1242,7 @@ static void set_gpfifo_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, N
sizeof(gpfifo_scratchpad),
UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT,
&gpfifo_enc_gpu);
gpfifo_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&gpfifo_auth_tag_gpu);
gpfifo_auth_tag_cpu = push_reserve_auth_tag(sec2_push, &gpfifo_auth_tag_gpu);

if (IS_ALIGNED(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT)) {
gpfifo_scratchpad[0] = value;
@ -1016,10 +1329,7 @@ static NV_STATUS internal_channel_submit_work_indirect_sec2(uvm_push_t *push, Nv

// Move over the pushbuffer data
push_auth_tag_cpu = uvm_push_get_single_inline_buffer(&indirect_push,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&push_auth_tag_gpu);
push_auth_tag_cpu = push_reserve_auth_tag(&indirect_push, &push_auth_tag_gpu);

uvm_conf_computing_cpu_encrypt(indirect_push.channel,
push_enc_cpu,
@ -1077,7 +1387,6 @@ static void encrypt_push(uvm_push_t *push)
NvU32 push_size = uvm_push_get_size(push);
uvm_push_info_t *push_info = uvm_push_info_from_push(push);
uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel);
unsigned auth_tag_offset = UVM_CONF_COMPUTING_AUTH_TAG_SIZE * push->push_info_index;

if (!g_uvm_global.conf_computing_enabled)
return;
@ -1096,14 +1405,20 @@ static void encrypt_push(uvm_push_t *push)
UVM_ASSERT(channel->conf_computing.push_crypto_bundles != NULL);

crypto_bundle = channel->conf_computing.push_crypto_bundles + push->push_info_index;
auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(channel->conf_computing.push_crypto_bundle_auth_tags, gpu, false);
auth_tag_gpu_va.address += auth_tag_offset;

// Auth tag is reserved after 'push_size' was queried above so it won't be
// overwritten during the encryption below. It will be overwritten by the
// launch encryption though. This is OK as it doesn't store any useful
// value at launch time.
crypto_bundle->auth_tag = push_reserve_auth_tag(push, &auth_tag_gpu_va);
crypto_bundle->push_size = push_size;

push_protected_gpu_va = uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push);
push_unprotected_gpu_va = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);

uvm_conf_computing_log_gpu_encryption(channel, &crypto_bundle->iv);
uvm_conf_computing_log_gpu_encryption(channel, push_size, &crypto_bundle->iv);
crypto_bundle->key_version = uvm_channel_pool_key_version(channel->pool);

gpu->parent->ce_hal->encrypt(push,
uvm_gpu_address_virtual_unprotected(push_unprotected_gpu_va),
uvm_gpu_address_virtual(push_protected_gpu_va),
@ -1123,7 +1438,6 @@ void uvm_channel_end_push(uvm_push_t *push)
NvU32 push_size;
NvU32 cpu_put;
NvU32 new_cpu_put;
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
bool needs_sec2_work_submit = false;

channel_pool_lock(channel->pool);
@ -1137,6 +1451,7 @@ void uvm_channel_end_push(uvm_push_t *push)
uvm_channel_tracking_semaphore_release(push, semaphore_va, new_payload);

if (uvm_channel_is_wlc(channel) && uvm_channel_manager_is_wlc_ready(channel_manager)) {
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
uvm_channel_t *paired_lcic = uvm_channel_wlc_get_paired_lcic(channel);

gpu->parent->ce_hal->semaphore_reduction_inc(push,
@ -1431,9 +1746,16 @@ NV_STATUS uvm_channel_write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_

static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
{
NV_STATUS status;
uvm_spin_loop_t spin;
uvm_channel_pool_t *pool = channel->pool;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);

status = channel_pool_rotate_key_if_pending(pool);
if (status != NV_OK)
return status;

// This semaphore is uvm_up() in unlock_channel_for_push() as part of the
// uvm_channel_end_push() routine. Note that different than in
// channel_reserve_and_lock_in_pool, we cannot pick an unlocked channel from
@ -1441,7 +1763,7 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
// Not a concern given that uvm_channel_reserve() is not the common-case for
// channel reservation, and only used for channel initialization, GPFIFO
// control work submission, and testing.
uvm_down(&pool->push_sem);
uvm_down(&pool->conf_computing.push_sem);

channel_pool_lock(pool);

@ -1452,8 +1774,6 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi

uvm_spin_loop_init(&spin);
while (1) {
NV_STATUS status;

uvm_channel_update_progress(channel);

channel_pool_lock(pool);
@ -1465,7 +1785,7 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi

status = uvm_channel_check_errors(channel);
if (status != NV_OK) {
uvm_up(&pool->push_sem);
uvm_up(&pool->conf_computing.push_sem);
return status;
}

@ -1655,6 +1975,8 @@ NV_STATUS uvm_channel_wait(uvm_channel_t *channel)
static NV_STATUS csl_init(uvm_channel_t *channel)
{
NV_STATUS status;
unsigned context_index = uvm_channel_index_in_pool(channel);
uvm_channel_pool_t *pool = channel->pool;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);

@ -1671,17 +1993,38 @@ static NV_STATUS csl_init(uvm_channel_t *channel)
uvm_mutex_init(&channel->csl.ctx_lock, UVM_LOCK_ORDER_CSL_CTX);
channel->csl.is_ctx_initialized = true;

if (uvm_channel_is_lcic(channel)) {
pool = get_paired_pool(pool);
context_index += pool->num_channels;
}

UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);

pool->conf_computing.key_rotation.csl_contexts[context_index] = &channel->csl.ctx;

return NV_OK;
}

static void csl_destroy(uvm_channel_t *channel)
{
uvm_channel_pool_t *pool = channel->pool;
unsigned context_index = uvm_channel_index_in_pool(channel);

if (!channel->csl.is_ctx_initialized)
return;

uvm_assert_mutex_unlocked(&channel->csl.ctx_lock);
UVM_ASSERT(!uvm_channel_is_locked_for_push(channel));

if (uvm_channel_is_lcic(channel)) {
pool = get_paired_pool(pool);
context_index += pool->num_channels;
}

UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);

pool->conf_computing.key_rotation.csl_contexts[context_index] = NULL;

uvm_rm_locked_call_void(nvUvmInterfaceDeinitCslContext(&channel->csl.ctx));
channel->csl.is_ctx_initialized = false;
}
@ -1691,152 +2034,45 @@ static void free_conf_computing_buffers(uvm_channel_t *channel)
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

uvm_rm_mem_free(channel->conf_computing.static_pb_protected_vidmem);
uvm_rm_mem_free(channel->conf_computing.static_pb_unprotected_sysmem);
uvm_rm_mem_free(channel->conf_computing.static_notifier_unprotected_sysmem);
uvm_rm_mem_free(channel->conf_computing.push_crypto_bundle_auth_tags);
uvm_kvfree(channel->conf_computing.static_pb_protected_sysmem);
uvm_kvfree(channel->conf_computing.push_crypto_bundles);
channel->conf_computing.static_pb_protected_vidmem = NULL;
channel->conf_computing.static_pb_unprotected_sysmem = NULL;
channel->conf_computing.static_notifier_unprotected_sysmem = NULL;
channel->conf_computing.push_crypto_bundle_auth_tags = NULL;
channel->conf_computing.static_pb_protected_sysmem = NULL;

uvm_kvfree(channel->conf_computing.push_crypto_bundles);
channel->conf_computing.push_crypto_bundles = NULL;

uvm_kvfree(channel->tracking_sem.semaphore.conf_computing.ivs);
channel->tracking_sem.semaphore.conf_computing.ivs = NULL;
}

static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel)
static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel)
{
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

semaphore->conf_computing.ivs = uvm_kvmalloc_zero(sizeof(*semaphore->conf_computing.ivs)
* channel->num_gpfifo_entries);
semaphore->conf_computing.ivs =
uvm_kvmalloc(sizeof(*semaphore->conf_computing.ivs) * channel->num_gpfifo_entries);

if (!semaphore->conf_computing.ivs)
return NV_ERR_NO_MEMORY;

return NV_OK;
}

static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
{
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
size_t aligned_wlc_push_size = UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
PAGE_SIZE,
&channel->conf_computing.static_pb_unprotected_sysmem);
if (status != NV_OK)
return status;

// Both pushes will be targets for SEC2 decrypt operations and have to
// be aligned for SEC2. The first push location will also be a target
// for CE decrypt operation and has to be aligned for CE decrypt.
status = uvm_rm_mem_alloc(gpu,
UVM_RM_MEM_TYPE_GPU,
UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&channel->conf_computing.static_pb_protected_vidmem);
if (status != NV_OK)
return status;

channel->conf_computing.static_pb_unprotected_sysmem_cpu =
uvm_rm_mem_get_cpu_va(channel->conf_computing.static_pb_unprotected_sysmem);
channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu =
(char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size;

// The location below is only used for launch pushes but reuses
// the same sysmem allocation
channel->conf_computing.launch_auth_tag_cpu =
(char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu +
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
channel->conf_computing.launch_auth_tag_gpu_va =
uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_unprotected_sysmem, gpu) +
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;

channel->conf_computing.static_pb_protected_sysmem = uvm_kvmalloc(UVM_MAX_WLC_PUSH_SIZE + UVM_PAGE_SIZE_4K);
if (!channel->conf_computing.static_pb_protected_sysmem)
return NV_ERR_NO_MEMORY;

return status;
}

static NV_STATUS alloc_conf_computing_buffers_lcic(uvm_channel_t *channel)
{
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
const size_t notifier_size = sizeof(*channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
notifier_size * 2,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&channel->conf_computing.static_notifier_unprotected_sysmem);
if (status != NV_OK)
return status;

status = uvm_rm_mem_alloc(gpu,
UVM_RM_MEM_TYPE_GPU,
UVM_LCIC_PUSH_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&channel->conf_computing.static_pb_protected_vidmem);
if (status != NV_OK)
return status;

channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu =
uvm_rm_mem_get_cpu_va(channel->conf_computing.static_notifier_unprotected_sysmem);
channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu =
channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu + 1;

channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va =
uvm_rm_mem_get_gpu_va(channel->conf_computing.static_notifier_unprotected_sysmem, gpu, false);
channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va =
channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va.address += notifier_size;

return status;
}

static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel)
{
NV_STATUS status;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

status = alloc_conf_computing_buffers_semaphore(channel);
if (status != NV_OK)
return status;

if (uvm_channel_is_wlc(channel)) {
|
||||
status = alloc_conf_computing_buffers_wlc(channel);
|
||||
}
|
||||
else if (uvm_channel_is_lcic(channel)) {
|
||||
status = alloc_conf_computing_buffers_lcic(channel);
|
||||
}
|
||||
else {
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
void *push_crypto_bundles = uvm_kvmalloc_zero(sizeof(*channel->conf_computing.push_crypto_bundles) *
|
||||
channel->num_gpfifo_entries);
|
||||
channel->conf_computing.static_pb_protected_sysmem =
|
||||
uvm_kvmalloc(UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_PAGE_SIZE_4K));
|
||||
|
||||
if (push_crypto_bundles == NULL)
|
||||
if (!channel->conf_computing.static_pb_protected_sysmem)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
else if (!uvm_channel_is_lcic(channel)) {
|
||||
channel->conf_computing.push_crypto_bundles =
|
||||
uvm_kvmalloc(sizeof(*channel->conf_computing.push_crypto_bundles) * channel->num_gpfifo_entries);
|
||||
|
||||
channel->conf_computing.push_crypto_bundles = push_crypto_bundles;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
channel->num_gpfifo_entries * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.push_crypto_bundle_auth_tags);
|
||||
if (!channel->conf_computing.push_crypto_bundles)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
return status;
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
@@ -1884,36 +2120,6 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
    pool->num_channels--;
}

static unsigned channel_pool_type_num_gpfifo_entries(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
{
    switch (pool_type) {
        case UVM_CHANNEL_POOL_TYPE_CE:
        case UVM_CHANNEL_POOL_TYPE_CE_PROXY:
            return manager->conf.num_gpfifo_entries;
        case UVM_CHANNEL_POOL_TYPE_SEC2:
            return manager->conf.num_gpfifo_entries;
        case UVM_CHANNEL_POOL_TYPE_WLC: {
            // WLC benefits from a larger number of entries since more available
            // entries result in less frequent calls to
            // uvm_channel_update_progress. 16 is the maximum size that can
            // re-use static pb preallocated memory when uploading the WLC
            // schedule.
            return 16;
        }
        case UVM_CHANNEL_POOL_TYPE_LCIC: {
            // Every channel needs at least 3 entries; 1 for sentinel and 2 more
            // for submitting GPFIFO control entries. The number also has to be
            // a power of 2, as the HW stores the size as a log2 value.
            // LCIC does not accept external pushes, so uvm_channel_update_progress
            // is not a concern.
            return 4;
        }
        default:
            UVM_ASSERT_MSG(0, "Unhandled pool type: %d", pool_type);
            return 0;
    }
}
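
// Illustrative sketch (not part of the change): why LCIC ends up with 4 GPFIFO
// entries. The requirement stated above is at least 3 entries (1 sentinel plus
// 2 control entries), rounded up to a power of two because HW stores the ring
// size as a log2 value. The helper below is hypothetical and just restates that
// rounding.
static unsigned min_gpfifo_entries_sketch(unsigned required_entries)
{
    unsigned n = 1;

    while (n < required_entries)
        n <<= 1;

    return n; // min_gpfifo_entries_sketch(3) == 4
}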
|
||||
|
||||
// Returns the TSG for a given channel.
|
||||
static uvmGpuTsgHandle channel_get_tsg(uvm_channel_t *channel)
|
||||
{
|
||||
@ -1941,7 +2147,7 @@ static NV_STATUS internal_channel_create(uvm_channel_t *channel)
|
||||
uvm_channel_manager_t *manager = channel->pool->manager;
|
||||
|
||||
memset(&channel_alloc_params, 0, sizeof(channel_alloc_params));
|
||||
channel_alloc_params.numGpFifoEntries = channel_pool_type_num_gpfifo_entries(manager, channel->pool->pool_type);
|
||||
channel_alloc_params.numGpFifoEntries = channel_pool_num_gpfifo_entries(channel->pool);
|
||||
channel_alloc_params.gpFifoLoc = manager->conf.gpfifo_loc;
|
||||
channel_alloc_params.gpPutLoc = manager->conf.gpput_loc;
|
||||
|
||||
@ -2045,7 +2251,7 @@ static NV_STATUS channel_create(uvm_channel_pool_t *pool, uvm_channel_t *channel
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
channel->num_gpfifo_entries = channel_pool_type_num_gpfifo_entries(manager, pool->pool_type);
|
||||
channel->num_gpfifo_entries = channel_pool_num_gpfifo_entries(pool);
|
||||
channel->gpfifo_entries = uvm_kvmalloc_zero(sizeof(*channel->gpfifo_entries) * channel->num_gpfifo_entries);
|
||||
if (channel->gpfifo_entries == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
@ -2125,8 +2331,8 @@ static NV_STATUS channel_init(uvm_channel_t *channel)
|
||||
|
||||
if (uvm_channel_is_sec2(channel))
|
||||
pb_base = uvm_pushbuffer_get_sec2_gpu_va_base(pushbuffer);
|
||||
else if (channel->conf_computing.static_pb_protected_vidmem)
|
||||
pb_base = uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
else if (uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel))
|
||||
pb_base = uvm_channel_get_static_pb_protected_vidmem_gpu_va(channel);
|
||||
|
||||
gpu->parent->host_hal->set_gpfifo_pushbuffer_segment_base(&gpfifo_entry, pb_base);
|
||||
write_ctrl_gpfifo(channel, gpfifo_entry);
|
||||
@@ -2166,34 +2372,68 @@ static bool channel_manager_uses_proxy_pool(uvm_channel_manager_t *manager)
}

// Number of channels to create in a pool of the given type.
//
// TODO: Bug 1764958: Tweak this function after benchmarking real workloads.
static unsigned channel_pool_type_num_channels(uvm_channel_pool_type_t pool_type)
static unsigned channel_manager_num_channels(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
{
    // TODO: Bug 3387454: The vGPU plugin implementation supports a single
    // proxy channel per GPU
    if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY)
        return 1;
    unsigned num_channels;

    // Not all GPU architectures support more than 1 channel per TSG. Since SEC2
    // is not in UVM critical path for performance, we conservatively create a
    // pool/TSG with a single channel.
    if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2)
        return 1;
    // In the common case, create two channels per pool.
    //
    // TODO: Bug 1764958: Tweak this number after benchmarking real workloads.
    const unsigned channel_pool_type_ce_num_channels = 2;

    if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC || pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)
        return UVM_PUSH_MAX_CONCURRENT_PUSHES;
    UVM_ASSERT(uvm_pool_type_is_valid(pool_type));

    return 2;
    if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY) {

        // TODO: Bug 3387454: The vGPU plugin implementation supports a single
        // proxy channel per GPU
        num_channels = 1;
    }
    else if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2) {

        // Not all GPU architectures support more than 1 channel per TSG. Since
        // SEC2 is not in UVM critical path for performance, conservatively
        // create a pool/TSG with a single channel.
        num_channels = 1;
    }
    else if ((pool_type == UVM_CHANNEL_POOL_TYPE_WLC) || (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)) {
        unsigned max_concurrent_ce_pushes;
        unsigned num_used_ces = bitmap_weight(manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX);

        // CE selection should happen before this function is invoked.
        UVM_ASSERT(num_used_ces > 0);

        // Create as many WLC and LCIC channels as concurrent, ongoing, pushes
        // of interest are allowed. In the general case, this number of pushes
        // is capped by UVM_PUSH_MAX_CONCURRENT_PUSHES. But in Confidential
        // Computing there is at most one ongoing push per channel, so the
        // number of WLC/LCIC channels is also limited by the number of CE
        // channels.
        //
        // The calculation only considers channels mapped to the
        // UVM_CHANNEL_POOL_TYPE_CE type, because WLC and LCIC channels are
        // created to enable work launch exclusively in those other channels.
        max_concurrent_ce_pushes = num_used_ces * channel_pool_type_ce_num_channels;
        num_channels = min(max_concurrent_ce_pushes, (unsigned) UVM_PUSH_MAX_CONCURRENT_PUSHES);
    }
    else {
        UVM_ASSERT(pool_type == UVM_CHANNEL_POOL_TYPE_CE);

        num_channels = channel_pool_type_ce_num_channels;
    }

    UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);

    return num_channels;
}
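
// Illustrative sketch (assumption): the WLC/LCIC pool size computed above,
// expressed as a standalone formula. With 2 CE channels per pool (the
// channel_pool_type_ce_num_channels value above) and a hypothetical
// UVM_PUSH_MAX_CONCURRENT_PUSHES of 32, a manager using 4 CEs would get
// min(4 * 2, 32) == 8 WLC channels, plus 8 paired LCIC channels.
static unsigned wlc_lcic_num_channels_sketch(unsigned num_used_ces,
                                             unsigned ce_channels_per_pool,
                                             unsigned max_concurrent_pushes)
{
    unsigned max_concurrent_ce_pushes = num_used_ces * ce_channels_per_pool;

    return max_concurrent_ce_pushes < max_concurrent_pushes ? max_concurrent_ce_pushes
                                                            : max_concurrent_pushes;
}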
|
||||
|
||||
// Number of TSGs to create in a pool of a given type.
|
||||
static unsigned channel_pool_type_num_tsgs(uvm_channel_pool_type_t pool_type)
|
||||
static unsigned channel_manager_num_tsgs(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
|
||||
{
|
||||
// For WLC and LCIC channels, we create one TSG per WLC/LCIC channel pair.
|
||||
// The TSG is stored in the WLC pool.
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC)
|
||||
return channel_pool_type_num_channels(pool_type);
|
||||
return channel_manager_num_channels(manager, pool_type);
|
||||
else if (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)
|
||||
return 0;
|
||||
|
||||
@ -2249,17 +2489,150 @@ static void channel_pool_destroy(uvm_channel_pool_t *pool)
|
||||
|
||||
while (pool->num_channels > 0)
|
||||
channel_destroy(pool, pool->channels + pool->num_channels - 1);
|
||||
|
||||
uvm_kvfree(pool->channels);
|
||||
pool->channels = NULL;
|
||||
|
||||
while (pool->num_tsgs > 0)
|
||||
tsg_destroy(pool, *(pool->tsg_handles + pool->num_tsgs - 1));
|
||||
|
||||
uvm_kvfree(pool->tsg_handles);
|
||||
pool->tsg_handles = NULL;
|
||||
|
||||
uvm_kvfree(pool->conf_computing.key_rotation.csl_contexts);
|
||||
pool->conf_computing.key_rotation.csl_contexts = NULL;
|
||||
|
||||
uvm_rm_mem_free(pool->conf_computing.pool_sysmem);
|
||||
uvm_rm_mem_free(pool->conf_computing.pool_vidmem);
|
||||
|
||||
pool->manager->num_channel_pools--;
|
||||
}
|
||||
|
||||
static void channel_pool_initialize_locks(uvm_channel_pool_t *pool, unsigned num_channels)
{
    uvm_lock_order_t order;

    channel_pool_lock_init(pool);

    if (!g_uvm_global.conf_computing_enabled)
        return;

    // Use a different lock order for SEC2 and WLC channels.
    // This allows reserving a SEC2 or WLC channel for indirect work
    // submission while holding a reservation for a channel.
    if (uvm_channel_pool_is_sec2(pool))
        order = UVM_LOCK_ORDER_CSL_SEC2_PUSH;
    else if (uvm_channel_pool_is_wlc(pool))
        order = UVM_LOCK_ORDER_CSL_WLC_PUSH;
    else
        order = UVM_LOCK_ORDER_CSL_PUSH;

    uvm_sema_init(&pool->conf_computing.push_sem, num_channels, order);

    if (uvm_channel_pool_is_wlc(pool))
        order = UVM_LOCK_ORDER_KEY_ROTATION_WLC;
    else
        order = UVM_LOCK_ORDER_KEY_ROTATION;

    uvm_mutex_init(&pool->conf_computing.key_rotation.mutex, order);
}
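
// Illustrative sketch (hypothetical values and ordering): the only property the
// code above relies on is that the three push semaphore orders are distinct, so
// that a thread already holding a reservation in a regular CE pool can still
// reserve a WLC channel, and then a SEC2 channel, for indirect work submission.
// A minimal model of that rule, with assumed relative values:
typedef enum {
    SKETCH_ORDER_CSL_PUSH = 1,      // generic CE pools
    SKETCH_ORDER_CSL_WLC_PUSH = 2,  // WLC pools
    SKETCH_ORDER_CSL_SEC2_PUSH = 3, // SEC2 pools
} sketch_lock_order_t;

// Nested acquisition is legal only when the order strictly increases.
static bool sketch_can_nest(sketch_lock_order_t held, sketch_lock_order_t next)
{
    return next > held;
}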
|
||||
|
||||
static NV_STATUS channel_pool_alloc_key_rotation_data(uvm_channel_pool_t *pool, unsigned num_channels)
{
    size_t csl_contexts_size;

    // uvm_conf_computing_is_key_rotation_enabled_in_pool cannot be used to
    // skip key rotation data initialization, because during GPU initialization
    // the function always returns false.
    if (!g_uvm_global.conf_computing_enabled)
        return NV_OK;

    // CSL contexts associated with LCIC channels are saved in the WLC context
    // array, not in the LCIC context array, so all the underlying engine
    // contexts are stored contiguously.
    if (uvm_channel_pool_is_lcic(pool))
        return NV_OK;

    if (uvm_channel_pool_is_wlc(pool)) {
        UVM_ASSERT(channel_manager_num_channels(pool->manager, UVM_CHANNEL_POOL_TYPE_WLC) == num_channels);
        UVM_ASSERT(channel_manager_num_channels(pool->manager, UVM_CHANNEL_POOL_TYPE_LCIC) == num_channels);

        num_channels *= 2;
    }

    csl_contexts_size = sizeof(*pool->conf_computing.key_rotation.csl_contexts) * num_channels;
    pool->conf_computing.key_rotation.csl_contexts = uvm_kvmalloc_zero(csl_contexts_size);

    if (pool->conf_computing.key_rotation.csl_contexts == NULL)
        return NV_ERR_NO_MEMORY;

    pool->conf_computing.key_rotation.num_csl_contexts = num_channels;

    return NV_OK;
}
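
// Illustrative sketch (not part of the change): number of CSL contexts each
// pool flavor registers for key rotation, under the layout described in the
// comments above (LCIC contexts live in the paired WLC pool's array).
static unsigned num_csl_contexts_sketch(bool pool_is_lcic, bool pool_is_wlc, unsigned num_channels)
{
    if (pool_is_lcic)
        return 0;                // stored in the paired WLC pool instead

    if (pool_is_wlc)
        return num_channels * 2; // WLC contexts followed by paired LCIC contexts

    return num_channels;         // CE, CE proxy, and SEC2 pools
}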
|
||||
|
||||
static NV_STATUS channel_pool_alloc_conf_computing_buffers(uvm_channel_pool_t *pool, unsigned num_channels)
|
||||
{
|
||||
uvm_gpu_t *gpu = pool->manager->gpu;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
if (uvm_channel_pool_is_wlc(pool)) {
|
||||
|
||||
// Allocate unprotected sysmem buffers for WLC channels.
|
||||
// The use/substructures are described by WLC_SYSMEM_TOTAL_SIZE
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
WLC_SYSMEM_TOTAL_SIZE * num_channels,
|
||||
WLC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// WLC stores two pushbuffers used by its static schedule in vidmem.
|
||||
// See setup_wlc_schedule for the expected use of each of the static
|
||||
// pushbuffers.
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
WLC_ALIGNED_MAX_PUSH_SIZE * 2 * num_channels,
|
||||
WLC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
else if (uvm_channel_pool_is_lcic(pool)) {
|
||||
|
||||
// LCIC uses only static schedule so in order to use dynamic values
|
||||
// for entry/exit notifiers for its tracking semaphore they need
|
||||
// to be populated in a pre-defined sysmem location, before invoking
|
||||
// the LCIC schedule.
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(uvm_gpu_semaphore_notifier_t) * 2 * num_channels,
|
||||
0,
|
||||
&pool->conf_computing.pool_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// LCIC static schedule pushbuffer is in vidmem
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
LCIC_ALIGNED_PUSH_SIZE * num_channels,
|
||||
LCIC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
status = channel_pool_alloc_key_rotation_data(pool, num_channels);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
return NV_OK;
|
||||
}
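
// Illustrative sketch (assumption): the pool-wide sysmem/vidmem allocations made
// above are later carved into fixed-size, per-channel slices. The helper below
// is hypothetical and only shows the offset arithmetic for such a slice; the
// real per-channel layout is defined by the WLC_SYSMEM_* and LCIC_* constants
// referenced in this file.
static size_t pool_buffer_channel_offset_sketch(unsigned channel_index_in_pool,
                                                size_t per_channel_size)
{
    return (size_t)channel_index_in_pool * per_channel_size;
}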
|
||||
|
||||
static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
uvm_channel_pool_type_t pool_type,
|
||||
unsigned engine_index,
|
||||
@ -2280,7 +2653,7 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
pool->engine_index = engine_index;
|
||||
pool->pool_type = pool_type;
|
||||
|
||||
num_tsgs = channel_pool_type_num_tsgs(pool_type);
|
||||
num_tsgs = channel_manager_num_tsgs(channel_manager, pool_type);
|
||||
if (num_tsgs != 0) {
|
||||
pool->tsg_handles = uvm_kvmalloc_zero(sizeof(*pool->tsg_handles) * num_tsgs);
|
||||
if (!pool->tsg_handles) {
|
||||
@ -2297,21 +2670,13 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
}
|
||||
}
|
||||
|
||||
channel_pool_lock_init(pool);
|
||||
num_channels = channel_manager_num_channels(channel_manager, pool_type);
|
||||
|
||||
num_channels = channel_pool_type_num_channels(pool_type);
|
||||
UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
channel_pool_initialize_locks(pool, num_channels);
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
// Use different order lock for SEC2 and WLC channels.
|
||||
// This allows reserving a SEC2 or WLC channel for indirect work
|
||||
// submission while holding a reservation for a channel.
|
||||
uvm_lock_order_t order = uvm_channel_pool_is_sec2(pool) ? UVM_LOCK_ORDER_CSL_SEC2_PUSH :
|
||||
(uvm_channel_pool_is_wlc(pool) ? UVM_LOCK_ORDER_CSL_WLC_PUSH :
|
||||
UVM_LOCK_ORDER_CSL_PUSH);
|
||||
|
||||
uvm_sema_init(&pool->push_sem, num_channels, order);
|
||||
}
|
||||
status = channel_pool_alloc_conf_computing_buffers(pool, num_channels);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
pool->channels = uvm_kvmalloc_zero(sizeof(*pool->channels) * num_channels);
|
||||
if (!pool->channels) {
|
||||
@ -2871,11 +3236,8 @@ static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager,
|
||||
static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(wlc);
|
||||
NvU64 protected_vidmem = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
NvU64 unprotected_sysmem_gpu = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
|
||||
void *unprotected_sysmem_cpu = wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
NvU64 tag_offset = (uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu -
|
||||
(uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
NvU64 protected_vidmem_gpu_va = uvm_channel_get_static_pb_protected_vidmem_gpu_va(wlc);
|
||||
NvU64 unprotected_sysmem_gpu_va = get_channel_unprotected_sysmem_gpu_va(wlc);
|
||||
|
||||
NvU64 *wlc_gpfifo_entries;
|
||||
uvm_push_t wlc_decrypt_push, sec2_push;
|
||||
@ -2883,31 +3245,39 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
int i;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
// "gpfifo" is the representation of GPFIFO copied to gpFifoGpu
|
||||
// "gpfifo" is the representation of GPFIFO copied to gpFifoGpuVa.
|
||||
// Reuse static pushbuffer sysmem location for uploading GPFIFO schedule
|
||||
const size_t gpfifo_size = wlc->num_gpfifo_entries * sizeof(*wlc_gpfifo_entries);
|
||||
void *gpfifo_unprotected_cpu = unprotected_sysmem_cpu;
|
||||
NvU64 gpfifo_unprotected_gpu = unprotected_sysmem_gpu;
|
||||
NvU64 gpfifo_unprotected_gpu_va = unprotected_sysmem_gpu_va;
|
||||
void *gpfifo_unprotected_cpu = get_channel_unprotected_sysmem_cpu(wlc);
|
||||
|
||||
// "run_push" represents mutable push location used by WLC
|
||||
uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem);
|
||||
uvm_gpu_address_t run_push_unprotected_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu);
|
||||
uvm_gpu_address_t run_push_unprotected_auth_tag_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu + tag_offset);
|
||||
// "run_push" represents mutable push location used by WLC. This is the
|
||||
// first part of the WLC schedule, commands are decrypted as part of the
|
||||
// launch sequence to protected_vidmem_gpu_va + 0.
|
||||
// These locations are used in the static part ("decrypt_push") of the WLC schedule.
|
||||
uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem_gpu_va);
|
||||
uvm_gpu_address_t run_push_unprotected_gpu =
|
||||
uvm_gpu_address_virtual_unprotected(unprotected_sysmem_gpu_va + WLC_SYSMEM_PUSHBUFFER_OFFSET);
|
||||
uvm_gpu_address_t run_push_unprotected_auth_tag_gpu =
|
||||
uvm_gpu_address_virtual_unprotected(unprotected_sysmem_gpu_va + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET);
|
||||
|
||||
// "decrypt_push" represents WLC decrypt push, constructed using fake_push.
|
||||
// Copied to wlc_pb_base + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
|
||||
// Copied to protected_vidmem_gpu_va + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
|
||||
// pushes that make the WLC fixed schedule.
|
||||
NvU64 decrypt_push_protected_gpu = UVM_ALIGN_UP(protected_vidmem + UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT);
|
||||
NvU64 decrypt_push_unprotected_gpu = unprotected_sysmem_gpu + gpfifo_size;
|
||||
NvU64 decrypt_push_protected_gpu_va = protected_vidmem_gpu_va + WLC_ALIGNED_MAX_PUSH_SIZE;
|
||||
|
||||
// Similar to gpfifo, uploading the "decrypt_push" reuses static sysmem
|
||||
// locations later used for "run_push" when the WLC/LCIC schedule is active
|
||||
NvU64 decrypt_push_unprotected_gpu_va = gpfifo_unprotected_gpu_va + gpfifo_size;
|
||||
void *decrypt_push_unprotected_cpu = (char*)gpfifo_unprotected_cpu + gpfifo_size;
|
||||
|
||||
// Tags for upload via SEC2
|
||||
void *decrypt_push_auth_tag, *gpfifo_auth_tag;
|
||||
void *decrypt_push_auth_tag_cpu, *gpfifo_auth_tag_cpu;
|
||||
uvm_gpu_address_t decrypt_push_auth_tag_gpu, gpfifo_auth_tag_gpu;
|
||||
|
||||
BUILD_BUG_ON(sizeof(*wlc_gpfifo_entries) != sizeof(*wlc->channel_info.gpFifoEntries));
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_wlc(wlc));
|
||||
UVM_ASSERT(tag_offset == UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
|
||||
|
||||
// WLC schedule consists of two parts, the number of entries needs to be even.
|
||||
// This also guarantees that the size is 16B aligned
|
||||
@ -2954,7 +3324,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
for (i = 0; i < wlc->num_gpfifo_entries; ++i) {
|
||||
if (i % 2 == wlc->cpu_put % 2) {
|
||||
gpu->parent->host_hal->set_gpfifo_entry(wlc_gpfifo_entries + i,
|
||||
decrypt_push_protected_gpu,
|
||||
decrypt_push_protected_gpu_va,
|
||||
decrypt_push_size,
|
||||
UVM_GPFIFO_SYNC_PROCEED);
|
||||
}
|
||||
@ -2974,15 +3344,8 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
if (status != NV_OK)
|
||||
goto end_wlc_dec_push;
|
||||
|
||||
decrypt_push_auth_tag = uvm_push_get_single_inline_buffer(&sec2_push,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
|
||||
&decrypt_push_auth_tag_gpu);
|
||||
|
||||
gpfifo_auth_tag = uvm_push_get_single_inline_buffer(&sec2_push,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
|
||||
&gpfifo_auth_tag_gpu);
|
||||
decrypt_push_auth_tag_cpu = push_reserve_auth_tag(&sec2_push, &decrypt_push_auth_tag_gpu);
|
||||
gpfifo_auth_tag_cpu = push_reserve_auth_tag(&sec2_push, &gpfifo_auth_tag_gpu);
|
||||
|
||||
// Upload WLC pushbuffer
|
||||
uvm_conf_computing_cpu_encrypt(sec2_push.channel,
|
||||
@ -2990,10 +3353,10 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
wlc_decrypt_push.begin,
|
||||
NULL,
|
||||
decrypt_push_size,
|
||||
decrypt_push_auth_tag);
|
||||
decrypt_push_auth_tag_cpu);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
decrypt_push_protected_gpu,
|
||||
decrypt_push_unprotected_gpu,
|
||||
decrypt_push_protected_gpu_va,
|
||||
decrypt_push_unprotected_gpu_va,
|
||||
decrypt_push_size,
|
||||
decrypt_push_auth_tag_gpu.address);
|
||||
|
||||
@ -3003,10 +3366,10 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
wlc_gpfifo_entries,
|
||||
NULL,
|
||||
gpfifo_size,
|
||||
gpfifo_auth_tag);
|
||||
gpfifo_auth_tag_cpu);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
wlc->channel_info.gpFifoGpuVa,
|
||||
gpfifo_unprotected_gpu,
|
||||
gpfifo_unprotected_gpu_va,
|
||||
gpfifo_size,
|
||||
gpfifo_auth_tag_gpu.address);
|
||||
|
||||
@ -3028,21 +3391,22 @@ free_gpfifo_entries:
|
||||
static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(lcic);
|
||||
NvU64 lcic_pb_base = uvm_rm_mem_get_gpu_uvm_va(lcic->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
NvU64 lcic_pb_base = uvm_channel_get_static_pb_protected_vidmem_gpu_va(lcic);
|
||||
|
||||
// Reuse WLC sysmem allocation
|
||||
NvU64 gpu_unprotected = uvm_rm_mem_get_gpu_uvm_va(paired_wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
|
||||
char *cpu_unprotected = paired_wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
NvU64 gpu_unprotected = get_channel_unprotected_sysmem_gpu_va(paired_wlc);
|
||||
char *cpu_unprotected = get_channel_unprotected_sysmem_cpu(paired_wlc);
|
||||
|
||||
uvm_gpu_semaphore_t *lcic_semaphore = &lcic->tracking_sem.semaphore;
|
||||
uvm_gpu_address_t notifier_src_entry_addr = lcic->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t notifier_src_exit_addr = lcic->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
|
||||
uvm_gpu_address_t notifier_src_entry_addr = lcic_static_entry_notifier_gpu_va(lcic);
|
||||
uvm_gpu_address_t notifier_src_exit_addr = lcic_static_exit_notifier_gpu_va(lcic);
|
||||
uvm_gpu_address_t notifier_dst_addr = uvm_gpu_semaphore_get_notifier_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(uvm_channel_tracking_semaphore_get_gpu_va(lcic));
|
||||
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
|
||||
NvU32 notifier_size = sizeof(*lcic->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
|
||||
NvU32 notifier_size = sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
|
||||
NvU64 *lcic_gpfifo_entries;
|
||||
uvm_push_t lcic_push, sec2_push;
|
||||
@ -3057,7 +3421,7 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
NvU64 lcic_push_unprotected_gpu = gpfifo_unprotected_gpu + gpfifo_size;
|
||||
NvU64 lcic_push_protected_gpu = lcic_pb_base;
|
||||
|
||||
char *lcic_push_enc_tag, *gpfifo_enc_tag;
|
||||
char *lcic_push_enc_tag_cpu, *gpfifo_enc_tag_cpu;
|
||||
uvm_gpu_address_t lcic_push_enc_tag_gpu, gpfifo_enc_tag_gpu;
|
||||
|
||||
BUILD_BUG_ON(sizeof(*lcic_gpfifo_entries) != sizeof(*lcic->channel_info.gpFifoEntries));
|
||||
@ -3098,7 +3462,11 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
0xffffffff);
|
||||
|
||||
gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_entry_addr, notifier_size);
|
||||
|
||||
// This CE encryption does not need to be logged, it will be logged on every
|
||||
// push_end instead
|
||||
gpu->parent->ce_hal->encrypt(&lcic_push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va);
|
||||
|
||||
gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_exit_addr, notifier_size);
|
||||
|
||||
// End LCIC push
|
||||
@ -3123,15 +3491,8 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
if (status != NV_OK)
|
||||
goto end_lcic_push;
|
||||
|
||||
lcic_push_enc_tag = uvm_push_get_single_inline_buffer(&sec2_push,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
|
||||
&lcic_push_enc_tag_gpu);
|
||||
|
||||
gpfifo_enc_tag = uvm_push_get_single_inline_buffer(&sec2_push,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
|
||||
&gpfifo_enc_tag_gpu);
|
||||
lcic_push_enc_tag_cpu = push_reserve_auth_tag(&sec2_push, &lcic_push_enc_tag_gpu);
|
||||
gpfifo_enc_tag_cpu = push_reserve_auth_tag(&sec2_push, &gpfifo_enc_tag_gpu);
|
||||
|
||||
// Upload LCIC pushbuffer
|
||||
uvm_conf_computing_cpu_encrypt(sec2_push.channel,
|
||||
@ -3139,7 +3500,7 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
lcic_push.begin,
|
||||
NULL,
|
||||
lcic_push_size,
|
||||
lcic_push_enc_tag);
|
||||
lcic_push_enc_tag_cpu);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
lcic_push_protected_gpu,
|
||||
lcic_push_unprotected_gpu,
|
||||
@ -3152,7 +3513,7 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
lcic_gpfifo_entries,
|
||||
NULL,
|
||||
gpfifo_size,
|
||||
gpfifo_enc_tag);
|
||||
gpfifo_enc_tag_cpu);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
lcic->channel_info.gpFifoGpuVa,
|
||||
gpfifo_unprotected_gpu,
|
||||
@ -3172,6 +3533,7 @@ static NV_STATUS channel_manager_setup_wlc_lcic(uvm_channel_pool_t *wlc_pool, uv
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(wlc_pool->manager == lcic_pool->manager);
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(wlc_pool->manager));
|
||||
UVM_ASSERT(wlc_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL);
|
||||
UVM_ASSERT(lcic_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] == NULL);
|
||||
UVM_ASSERT(wlc_pool->num_channels == lcic_pool->num_channels);
|
||||
@ -3244,6 +3606,9 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
|
||||
// are ready to be used for secure work submission.
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = lcic_pool;
|
||||
|
||||
// WLC and LCIC pools are ready
|
||||
manager->conf_computing.wlc_ready = true;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
@ -3296,6 +3661,8 @@ NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **cha
|
||||
if (!channel_manager)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
*channel_manager_out = channel_manager;
|
||||
|
||||
channel_manager->gpu = gpu;
|
||||
init_channel_manager_conf(channel_manager);
|
||||
status = uvm_pushbuffer_create(channel_manager, &channel_manager->pushbuffer);
|
||||
@ -3314,12 +3681,18 @@ NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **cha
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
*channel_manager_out = channel_manager;
|
||||
// Key rotation is enabled only after all the channels have been created:
|
||||
// RM does not support channel allocation on an engine if key rotation is
|
||||
// pending on that engine. This can become a problem during testing if
|
||||
// key rotation thresholds are very low.
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
|
||||
return status;
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
*channel_manager_out = NULL;
|
||||
uvm_channel_manager_destroy(channel_manager);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -3346,11 +3719,15 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_for_each_channel_in_pool(channel, lcic_pool) {
|
||||
uvm_spin_loop_t spin;
|
||||
|
||||
// Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2
|
||||
// and a WLC doorbell ring is enough to start work.
|
||||
UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem), &spin);
|
||||
status = uvm_channel_wait(channel);
|
||||
if (status != NV_OK)
|
||||
UVM_ERR_PRINT_NV_STATUS("Failed to wait for LCIC channel (%s) completion", status, channel->name);
|
||||
|
||||
// Continue on error and attempt to stop WLC below. This can lead to
|
||||
// channel destruction with mismatched GET and PUT pointers. RM will
|
||||
// print errors if that's the case, but channel destruction succeeds.
|
||||
}
|
||||
|
||||
status = uvm_push_begin(manager, UVM_CHANNEL_TYPE_SEC2, &push, "Stop WLC channels");
|
||||
@ -3370,8 +3747,7 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)
|
||||
if (status != NV_OK)
|
||||
UVM_ERR_PRINT_NV_STATUS("Failed to end stop push for WLC", status);
|
||||
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] = NULL;
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = NULL;
|
||||
manager->conf_computing.wlc_ready = false;
|
||||
}
|
||||
|
||||
void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
|
||||
@ -3393,6 +3769,14 @@ void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
|
||||
uvm_kvfree(channel_manager);
|
||||
}
|
||||
|
||||
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
pool = get_paired_pool(pool);
|
||||
|
||||
return pool->conf_computing.key_rotation.version;
|
||||
}
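
// Illustrative usage sketch (mirrors encrypted_memcopy_gpu_to_cpu in the channel
// test, not a new API): callers that decrypt on the CPU after the fact are
// expected to snapshot the key version at encryption time and pass that same
// version to uvm_conf_computing_cpu_decrypt(), so the decryption still succeeds
// even if the pool's keys have rotated in between:
//
//     NvU32 key_version = uvm_channel_pool_key_version(channel->pool);
//     // ... CE encryption, push end, possible key rotations ...
//     uvm_conf_computing_cpu_decrypt(channel, dst, src, iv, key_version, size, auth_tag);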
|
||||
|
||||
bool uvm_channel_is_privileged(uvm_channel_t *channel)
|
||||
{
|
||||
if (uvm_parent_gpu_is_virt_mode_sriov_heavy(uvm_channel_get_gpu(channel)->parent))
|
||||
|
@ -228,21 +228,65 @@ typedef struct
|
||||
// variant is required when the thread holding the pool lock must sleep
|
||||
// (ex: acquire another mutex) deeper in the call stack, either in UVM or
|
||||
// RM.
|
||||
union {
|
||||
union
|
||||
{
|
||||
uvm_spinlock_t spinlock;
|
||||
uvm_mutex_t mutex;
|
||||
};
|
||||
|
||||
// Secure operations require that uvm_push_begin order matches
|
||||
// uvm_push_end order, because the engine's state is used in its internal
|
||||
// operation and each push may modify this state. push_locks is protected by
|
||||
// the channel pool lock.
|
||||
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
struct
|
||||
{
|
||||
// Secure operations require that uvm_push_begin order matches
|
||||
// uvm_push_end order, because the engine's state is used in its
|
||||
// internal operation and each push may modify this state.
|
||||
// push_locks is protected by the channel pool lock.
|
||||
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
|
||||
// Counting semaphore for available and unlocked channels, it must be
|
||||
// acquired before submitting work to a channel when the Confidential
|
||||
// Computing feature is enabled.
|
||||
uvm_semaphore_t push_sem;
|
||||
// Counting semaphore for available and unlocked channels, it must be
|
||||
// acquired before submitting work to a channel when the Confidential
|
||||
// Computing feature is enabled.
|
||||
uvm_semaphore_t push_sem;
|
||||
|
||||
// Per channel buffers in unprotected sysmem.
|
||||
uvm_rm_mem_t *pool_sysmem;
|
||||
|
||||
// Per channel buffers in protected vidmem.
|
||||
uvm_rm_mem_t *pool_vidmem;
|
||||
|
||||
struct
|
||||
{
|
||||
// Current encryption key version, incremented upon key rotation.
|
||||
// While there are separate keys for encryption and decryption, the
|
||||
// two keys are rotated at once, so the versioning applies to both.
|
||||
NvU32 version;
|
||||
|
||||
// Lock used to ensure mutual exclusion during key rotation.
|
||||
uvm_mutex_t mutex;
|
||||
|
||||
// CSL contexts passed to RM for key rotation. This is usually an
|
||||
// array containing the CSL contexts associated with the channels in
|
||||
// the pool. In the case of the WLC pool, the array also includes
|
||||
// CSL contexts associated with LCIC channels.
|
||||
UvmCslContext **csl_contexts;
|
||||
|
||||
// Number of elements in the CSL context array.
|
||||
unsigned num_csl_contexts;
|
||||
|
||||
// Number of bytes encrypted, or decrypted, on the engine associated
|
||||
// with the pool since the last key rotation. Only used during
|
||||
// testing, to force key rotations after a certain encryption size,
|
||||
// see UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD.
|
||||
//
|
||||
// Encryptions on a LCIC pool are accounted for in the paired WLC
|
||||
// pool.
|
||||
//
|
||||
// TODO: Bug 4612912: these accounting variables can be removed once
|
||||
// RM exposes an API to set the key rotation lower threshold.
|
||||
atomic64_t encrypted;
|
||||
atomic64_t decrypted;
|
||||
} key_rotation;
|
||||
|
||||
} conf_computing;
|
||||
} uvm_channel_pool_t;
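
// Illustrative sketch (hypothetical helper and threshold handling): how the
// encrypted/decrypted byte counters above can be used during testing to decide
// when to force a rotation once UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD
// worth of traffic has accumulated on the engine.
static inline bool key_rotation_threshold_reached_sketch(NvU64 encrypted_bytes,
                                                         NvU64 decrypted_bytes,
                                                         NvU64 lower_threshold)
{
    return (encrypted_bytes >= lower_threshold) || (decrypted_bytes >= lower_threshold);
}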
|
||||
|
||||
struct uvm_channel_struct
|
||||
@ -322,43 +366,14 @@ struct uvm_channel_struct
|
||||
// work launches to match the order of push end-s that triggered them.
|
||||
volatile NvU32 gpu_put;
|
||||
|
||||
// Static pushbuffer for channels with static schedule (WLC/LCIC)
|
||||
uvm_rm_mem_t *static_pb_protected_vidmem;
|
||||
|
||||
// Static pushbuffer staging buffer for WLC
|
||||
uvm_rm_mem_t *static_pb_unprotected_sysmem;
|
||||
void *static_pb_unprotected_sysmem_cpu;
|
||||
void *static_pb_unprotected_sysmem_auth_tag_cpu;
|
||||
|
||||
// The above static locations are required by the WLC (and LCIC)
|
||||
// schedule. Protected sysmem location completes WLC's independence
|
||||
// from the pushbuffer allocator.
|
||||
// Protected sysmem location makes WLC independent from the pushbuffer
|
||||
// allocator. Unprotected sysmem and protected vidmem counterparts
|
||||
// are allocated from the channel pool (sysmem, vidmem).
|
||||
void *static_pb_protected_sysmem;
|
||||
|
||||
// Static tracking semaphore notifier values
|
||||
// Because of LCIC's fixed schedule, the secure semaphore release
|
||||
// mechanism uses two additional static locations for incrementing the
|
||||
// notifier values. See:
|
||||
// . channel_semaphore_secure_release()
|
||||
// . setup_lcic_schedule()
|
||||
// . internal_channel_submit_work_wlc()
|
||||
uvm_rm_mem_t *static_notifier_unprotected_sysmem;
|
||||
NvU32 *static_notifier_entry_unprotected_sysmem_cpu;
|
||||
NvU32 *static_notifier_exit_unprotected_sysmem_cpu;
|
||||
uvm_gpu_address_t static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
|
||||
// Explicit location for push launch tag used by WLC.
|
||||
// Encryption auth tags have to be located in unprotected sysmem.
|
||||
void *launch_auth_tag_cpu;
|
||||
NvU64 launch_auth_tag_gpu_va;
|
||||
|
||||
// Used to decrypt the push back to protected sysmem.
|
||||
// This happens when profilers register callbacks for migration data.
|
||||
uvm_push_crypto_bundle_t *push_crypto_bundles;
|
||||
|
||||
// Accompanying authentication tags for the crypto bundles
|
||||
uvm_rm_mem_t *push_crypto_bundle_auth_tags;
|
||||
} conf_computing;
|
||||
|
||||
// RM channel information
|
||||
@ -451,6 +466,16 @@ struct uvm_channel_manager_struct
|
||||
UVM_BUFFER_LOCATION gpput_loc;
|
||||
UVM_BUFFER_LOCATION pushbuffer_loc;
|
||||
} conf;
|
||||
|
||||
struct
|
||||
{
|
||||
// Flag indicating that the WLC/LCIC mechanism is ready/setup; should
|
||||
// only be false during (de)initialization.
|
||||
bool wlc_ready;
|
||||
|
||||
// True indicates that key rotation is enabled (UVM-wise).
|
||||
bool key_rotation_enabled;
|
||||
} conf_computing;
|
||||
};
|
||||
|
||||
// Create a channel manager for the GPU
|
||||
@ -501,6 +526,12 @@ uvm_channel_t *uvm_channel_lcic_get_paired_wlc(uvm_channel_t *lcic_channel);
|
||||
|
||||
uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel);
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel);
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel);
|
||||
|
||||
char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel);
|
||||
|
||||
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));
|
||||
@ -532,6 +563,17 @@ static uvm_channel_type_t uvm_channel_proxy_channel_type(void)
|
||||
return UVM_CHANNEL_TYPE_MEMOPS;
|
||||
}
|
||||
|
||||
// Force key rotation in the engine associated with the given channel pool.
|
||||
// Rotation may still not happen if RM cannot acquire the necessary locks (in
|
||||
// which case the function returns NV_ERR_STATE_IN_USE).
|
||||
//
|
||||
// This function should be only invoked in pools in which key rotation is
|
||||
// enabled.
|
||||
NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool);
|
||||
|
||||
// Retrieve the current encryption key version associated with the channel pool.
|
||||
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool);
|
||||
|
||||
// Privileged channels support all the Host and engine methods, while
|
||||
// non-privileged channels don't support privileged methods.
|
||||
//
|
||||
@ -579,12 +621,9 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
|
||||
// beginning.
|
||||
NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager);
|
||||
|
||||
// Check if WLC/LCIC mechanism is ready/setup
|
||||
// Should only return false during initialization
|
||||
static bool uvm_channel_manager_is_wlc_ready(uvm_channel_manager_t *manager)
|
||||
{
|
||||
return (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL) &&
|
||||
(manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] != NULL);
|
||||
return manager->conf_computing.wlc_ready;
|
||||
}
|
||||
// Get the GPU VA of semaphore_channel's tracking semaphore within the VA space
|
||||
// associated with access_channel.
|
||||
|
@ -796,11 +796,8 @@ done:
|
||||
static NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_push_t *pushes;
|
||||
uvm_gpu_t *gpu;
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
uvm_push_t *pushes = NULL;
|
||||
uvm_gpu_t *gpu = NULL;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
@ -810,9 +807,19 @@ static NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
uvm_channel_type_t channel_type;
|
||||
|
||||
// Key rotation is disabled because this test relies on nested pushes,
|
||||
// which is illegal. If any push other than the first one triggers key
|
||||
// rotation, the test won't complete. This is because key rotation
|
||||
// depends on waiting for ongoing pushes to end, which doesn't happen
|
||||
// if those pushes are ended after the current one begins.
|
||||
uvm_conf_computing_disable_key_rotation(gpu);
|
||||
|
||||
for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
TEST_CHECK_RET(pool != NULL);
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
uvm_channel_pool_t *pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
|
||||
TEST_CHECK_GOTO(pool != NULL, error);
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
@ -824,7 +831,7 @@ static NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);
|
||||
|
||||
pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
|
||||
TEST_CHECK_RET(pushes != NULL);
|
||||
TEST_CHECK_GOTO(pushes != NULL, error);
|
||||
|
||||
for (i = 0; i < num_pushes; i++) {
|
||||
uvm_push_t *push = &pushes[i];
|
||||
@ -841,12 +848,18 @@ static NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
|
||||
uvm_kvfree(pushes);
|
||||
}
|
||||
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
}
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
|
||||
return status;
|
||||
|
||||
error:
|
||||
if (gpu != NULL)
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
uvm_kvfree(pushes);
|
||||
|
||||
@ -948,6 +961,318 @@ release:
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS force_key_rotations(uvm_channel_pool_t *pool, unsigned num_rotations)
{
    unsigned num_tries;
    unsigned max_num_tries = 20;
    unsigned num_rotations_completed = 0;

    if (num_rotations == 0)
        return NV_OK;

    // The number of accepted rotations is kept low, so failed rotation
    // invocations due to RM not acquiring the necessary locks (which imply a
    // sleep in the test) do not balloon the test execution time.
    UVM_ASSERT(num_rotations <= 10);

    for (num_tries = 0; (num_tries < max_num_tries) && (num_rotations_completed < num_rotations); num_tries++) {
        // Force key rotation, irrespective of encryption usage.
        NV_STATUS status = uvm_channel_pool_rotate_key(pool);

        // Key rotation may not be able to complete due to RM failing to acquire
        // the necessary locks. Detect the situation, sleep for a bit, and then
        // try again.
        //
        // The maximum time spent sleeping in a single rotation call is
        // (max_num_tries * max_sleep_us)
        if (status == NV_ERR_STATE_IN_USE) {
            NvU32 min_sleep_us = 1000;
            NvU32 max_sleep_us = 10000;

            usleep_range(min_sleep_us, max_sleep_us);
            continue;
        }

        TEST_NV_CHECK_RET(status);

        num_rotations_completed++;
    }

    // If not a single key rotation occurred, the dependent tests still pass,
    // but there is not much value to them. Instead, return an error so the
    // maximum number of tries, or the maximum sleep time, are adjusted to
    // ensure that at least one rotation completes.
    if (num_rotations_completed > 0)
        return NV_OK;
    else
        return NV_ERR_STATE_IN_USE;
}

static NV_STATUS force_key_rotation(uvm_channel_pool_t *pool)
{
    return force_key_rotations(pool, 1);
}
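
// Worked example (values taken from force_key_rotations above): with
// max_num_tries == 20 and max_sleep_us == 10000, the worst case is 20 failed
// attempts that each sleep up to 10 ms, i.e. at most 20 * 10000 us = 200 ms of
// sleeping per force_key_rotations() call before giving up with
// NV_ERR_STATE_IN_USE.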
|
||||
|
||||
// Test key rotation in all pools. This is useful because key rotation may not
|
||||
// happen otherwise on certain engines during UVM test execution. For example,
|
||||
// if the MEMOPS channel type is mapped to a CE not shared with any other
|
||||
// channel type, then the only encryption taking place in the engine is due to
|
||||
// semaphore releases (4 bytes each). This small encryption size makes it
|
||||
// unlikely to exceed even small rotation thresholds.
|
||||
static NV_STATUS test_channel_key_rotation_basic(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
uvm_for_each_pool(pool, gpu->channel_manager) {
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
continue;
|
||||
|
||||
TEST_NV_CHECK_RET(force_key_rotation(pool));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Interleave GPU encryptions and decryptions, and their CPU counterparts, with
|
||||
// key rotations.
|
||||
static NV_STATUS test_channel_key_rotation_interleave(uvm_gpu_t *gpu)
|
||||
{
|
||||
int i;
|
||||
uvm_channel_pool_t *gpu_to_cpu_pool;
|
||||
uvm_channel_pool_t *cpu_to_gpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
void *initial_plain_cpu = NULL;
|
||||
void *final_plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
|
||||
|
||||
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
|
||||
|
||||
initial_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (initial_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
final_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (final_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
memset(initial_plain_cpu, 1, size);
|
||||
|
||||
for (i = 0; i < 5; i++) {
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
plain_gpu_address,
|
||||
initial_plain_cpu,
|
||||
size,
|
||||
NULL,
|
||||
"CPU > GPU"),
|
||||
out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
|
||||
final_plain_cpu,
|
||||
plain_gpu_address,
|
||||
size,
|
||||
NULL,
|
||||
"GPU > CPU"),
|
||||
out);
|
||||
|
||||
TEST_CHECK_GOTO(!memcmp(initial_plain_cpu, final_plain_cpu, size), out);
|
||||
|
||||
memset(final_plain_cpu, 0, size);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(final_plain_cpu);
|
||||
uvm_kvfree(initial_plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS memset_vidmem(uvm_mem_t *mem, NvU8 val)
|
||||
{
|
||||
uvm_push_t push;
|
||||
uvm_gpu_address_t gpu_address;
|
||||
uvm_gpu_t *gpu = mem->backing_gpu;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));
|
||||
|
||||
gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
|
||||
gpu->parent->ce_hal->memset_1(&push, gpu_address, val, mem->size);
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Custom version of uvm_conf_computing_util_memcopy_gpu_to_cpu that allows
|
||||
// testing to insert key rotations in between the push end, and the CPU
|
||||
// decryption
|
||||
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
unsigned num_rotations_to_insert)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
uvm_channel_t *channel;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Small GPU > CPU encryption");
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
channel = push.channel;
|
||||
uvm_conf_computing_log_gpu_encryption(channel, size, dma_buffer->decrypt_iv);
|
||||
dma_buffer->key_version[0] = uvm_channel_pool_key_version(channel->pool);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
TEST_NV_CHECK_GOTO(force_key_rotations(channel->pool, num_rotations_to_insert), out);
|
||||
|
||||
// If num_rotations_to_insert is not zero, the current encryption key will
|
||||
// be different from the one used during CE encryption.
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
dma_buffer->decrypt_iv,
|
||||
dma_buffer->key_version[0],
|
||||
size,
|
||||
auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_channel_key_rotation_cpu_decryption(uvm_gpu_t *gpu,
|
||||
unsigned num_repetitions,
|
||||
unsigned num_rotations_to_insert)
|
||||
{
|
||||
unsigned i;
|
||||
uvm_channel_pool_t *gpu_to_cpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
NvU8 *plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
|
||||
|
||||
plain_cpu = (NvU8 *) uvm_kvmalloc_zero(size);
|
||||
if (plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
TEST_NV_CHECK_GOTO(memset_vidmem(plain_gpu, 1), out);
|
||||
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
for (i = 0; i < num_repetitions; i++) {
|
||||
unsigned j;
|
||||
|
||||
TEST_NV_CHECK_GOTO(encrypted_memcopy_gpu_to_cpu(gpu,
|
||||
plain_cpu,
|
||||
plain_gpu_address,
|
||||
size,
|
||||
num_rotations_to_insert),
|
||||
out);
|
||||
|
||||
for (j = 0; j < size; j++)
|
||||
TEST_CHECK_GOTO(plain_cpu[j] == 1, out);
|
||||
|
||||
memset(plain_cpu, 0, size);
|
||||
|
||||
}
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Test that CPU decryptions can use old keys i.e. previous versions of the keys
|
||||
// that are no longer the current key, due to key rotation. Given that SEC2
|
||||
// does not expose encryption capabilities, the "decrypt-after-rotation" problem
|
||||
// is exclusive of CE encryptions.
|
||||
static NV_STATUS test_channel_key_rotation_decrypt_after_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
// Instruct encrypted_memcopy_gpu_to_cpu to insert several key rotations
|
||||
// between the GPU encryption, and the associated CPU decryption.
|
||||
unsigned num_rotations_to_insert = 8;
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_cpu_decryption(gpu, 1, num_rotations_to_insert));
|
||||
|
||||
return NV_OK;
|
||||
}
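The test above relies on the version-tagged decryption path added elsewhere in this change. A minimal sketch of that pattern, using helpers that appear in this diff (variable names here are illustrative only):

    NvU32 key_version = uvm_channel_pool_key_version(channel->pool); // captured at CE encryption time

    // ... the CE encryption completes, then any number of key rotations happen ...

    status = uvm_conf_computing_cpu_decrypt(channel,
                                            dst_plain,
                                            src_cipher,
                                            decrypt_iv,
                                            key_version,  // the old version still decrypts
                                            size,
                                            auth_tag);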
|
||||
|
||||
static NV_STATUS test_channel_key_rotation(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
break;
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_basic(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_interleave(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_decrypt_after_key_rotation(gpu));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
@ -1094,7 +1419,7 @@ static NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
|
||||
TEST_NV_CHECK_GOTO(uvm_channel_write_ctrl_gpfifo(channel, entry), error);
|
||||
|
||||
// Release the semaphore.
|
||||
UVM_WRITE_ONCE(*cpu_ptr, 1);
|
||||
WRITE_ONCE(*cpu_ptr, 1);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_wait(&push), error);
|
||||
|
||||
@ -1203,6 +1528,10 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
status = test_channel_key_rotation(va_space);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
// The following tests have side effects, they reset the GPU's
|
||||
// channel_manager.
|
||||
status = test_channel_pushbuffer_extension_base(va_space);
|
||||
@ -1338,6 +1667,126 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_cpu_encryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
int i;
|
||||
uvm_channel_pool_t *cpu_to_gpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
void *initial_plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU);
|
||||
|
||||
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
|
||||
|
||||
initial_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (initial_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
memset(initial_plain_cpu, 1, size);
|
||||
|
||||
for (i = 0; i < params->iterations; i++) {
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
plain_gpu_address,
|
||||
initial_plain_cpu,
|
||||
size,
|
||||
NULL,
|
||||
"CPU > GPU"),
|
||||
out);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(initial_plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_cpu_decryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
unsigned num_rotations_to_insert = 0;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU);
|
||||
|
||||
return test_channel_key_rotation_cpu_decryption(gpu, params->iterations, num_rotations_to_insert);
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_rotate(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE);
|
||||
|
||||
for (i = 0; i < params->iterations; ++i) {
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_channel_type_t type;
|
||||
|
||||
if ((i % 3) == 0)
|
||||
type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
|
||||
else if ((i % 3) == 1)
|
||||
type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
|
||||
else
|
||||
type = UVM_CHANNEL_TYPE_WLC;
|
||||
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[type];
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
return NV_ERR_INVALID_STATE;
|
||||
|
||||
status = force_key_rotation(pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// The objective of this test is documented in the user-level function
|
||||
static NV_STATUS uvm_test_channel_stress_key_rotation(uvm_va_space_t *va_space, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
uvm_test_rng_t rng;
|
||||
uvm_gpu_t *gpu;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
uvm_test_rng_init(&rng, params->seed);
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
|
||||
// Key rotation should be enabled, or disabled, in all GPUs. Pick a random
|
||||
// one.
|
||||
gpu = random_va_space_gpu(&rng, va_space);
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
goto out;
|
||||
|
||||
if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU)
|
||||
status = channel_stress_key_rotation_cpu_encryption(gpu, params);
|
||||
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU)
|
||||
status = channel_stress_key_rotation_cpu_decryption(gpu, params);
|
||||
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE)
|
||||
status = channel_stress_key_rotation_rotate(gpu, params);
|
||||
else
|
||||
status = NV_ERR_INVALID_PARAMETER;
|
||||
|
||||
out:
|
||||
uvm_va_space_up_read(va_space);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
@ -1349,6 +1798,8 @@ NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct
|
||||
return uvm_test_channel_stress_update_channels(va_space, params);
|
||||
case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH:
|
||||
return uvm_test_channel_noop_push(va_space, params);
|
||||
case UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION:
|
||||
return uvm_test_channel_stress_key_rotation(va_space, params);
|
||||
default:
|
||||
return NV_ERR_INVALID_PARAMETER;
|
||||
}
|
||||
|
@ -281,29 +281,6 @@ NV_STATUS uvm_spin_loop(uvm_spin_loop_t *spin)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// This formats a GPU UUID, in a UVM-friendly way. That is, nearly the same as
|
||||
// what nvidia-smi reports. It will always prefix the UUID with UVM-GPU so
|
||||
// that we know that we have a real, binary formatted UUID that will work in
|
||||
// the UVM APIs.
|
||||
//
|
||||
// It comes out like this:
|
||||
//
|
||||
// UVM-GPU-d802726c-df8d-a3c3-ec53-48bdec201c27
|
||||
//
|
||||
// This routine will always null-terminate the string for you. This is true
|
||||
// even if the buffer was too small!
|
||||
//
|
||||
// Return value is the number of non-null characters written.
|
||||
//
|
||||
// Note that if you were to let the NV2080_CTRL_CMD_GPU_GET_GID_INFO command
|
||||
// return it's default format, which is ascii, not binary, then you would get
|
||||
// this back:
|
||||
//
|
||||
// GPU-d802726c-df8d-a3c3-ec53-48bdec201c27
|
||||
//
|
||||
// ...which is actually a character string, and won't work for UVM API calls.
|
||||
// So it's very important to be able to see the difference.
|
||||
//
|
||||
static char uvm_digit_to_hex(unsigned value)
|
||||
{
|
||||
if (value >= 10)
|
||||
@ -312,27 +289,19 @@ static char uvm_digit_to_hex(unsigned value)
|
||||
return value + '0';
|
||||
}
|
||||
|
||||
int format_uuid_to_buffer(char *buffer, unsigned bufferLength, const NvProcessorUuid *pUuidStruct)
|
||||
void uvm_uuid_string(char *buffer, const NvProcessorUuid *pUuidStruct)
|
||||
{
|
||||
char *str = buffer+8;
|
||||
char *str = buffer;
|
||||
unsigned i;
|
||||
unsigned dashMask = 1 << 4 | 1 << 6 | 1 << 8 | 1 << 10;
|
||||
|
||||
if (bufferLength < (8 /*prefix*/+ 16 * 2 /*digits*/ + 4 * 1 /*dashes*/ + 1 /*null*/))
|
||||
return *buffer = 0;
|
||||
|
||||
memcpy(buffer, "UVM-GPU-", 8);
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
*str++ = uvm_digit_to_hex(pUuidStruct->uuid[i] >> 4);
|
||||
*str++ = uvm_digit_to_hex(pUuidStruct->uuid[i] & 0xF);
|
||||
|
||||
if (dashMask & (1 << (i+1)))
|
||||
if (dashMask & (1 << (i + 1)))
|
||||
*str++ = '-';
|
||||
}
|
||||
|
||||
*str = 0;
|
||||
|
||||
return (int)(str-buffer);
|
||||
}
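A minimal usage sketch of the rewritten helper (not taken from the change): callers now size the buffer with UVM_UUID_STRING_LENGTH from the header hunk below and add their own prefix when one is wanted.

    char uuid_str[UVM_UUID_STRING_LENGTH];

    uvm_uuid_string(uuid_str, &parent_gpu->uuid);
    // uuid_str now holds, e.g., "d802726c-df8d-a3c3-ec53-48bdec201c27"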
|
||||
|
||||
|
@ -50,9 +50,12 @@ enum {
|
||||
NVIDIA_UVM_NUM_MINOR_DEVICES
|
||||
};
|
||||
|
||||
#define UVM_GPU_UUID_TEXT_BUFFER_LENGTH (8+16*2+4+1)
|
||||
// UUID has the format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
|
||||
#define UVM_UUID_STRING_LENGTH ((8 + 1) + 3 * (4 + 1) + 12 + 1)
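// That is 9 + 15 + 12 + 1 = 37 bytes: 32 hex digits and 4 dashes in the
// 8-4-4-4-12 grouping, plus the terminating NUL.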
|
||||
|
||||
int format_uuid_to_buffer(char *buffer, unsigned bufferLength, const NvProcessorUuid *pGpuUuid);
|
||||
// Writes UVM_UUID_STRING_LENGTH characters into buffer, including a terminating
|
||||
// NULL.
|
||||
void uvm_uuid_string(char *buffer, const NvProcessorUuid *uuid);
|
||||
|
||||
#define UVM_PRINT_FUNC_PREFIX(func, prefix, fmt, ...) \
|
||||
func(prefix "%s:%u %s[pid:%d]" fmt, \
|
||||
@ -98,27 +101,9 @@ bool uvm_debug_prints_enabled(void);
|
||||
#define UVM_INFO_PRINT(fmt, ...) \
|
||||
UVM_PRINT_FUNC_PREFIX_CHECK(printk, KERN_INFO NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)
|
||||
|
||||
//
|
||||
// Please see the documentation of format_uuid_to_buffer, for details on what
|
||||
// this routine prints for you.
|
||||
//
|
||||
#define UVM_DBG_PRINT_UUID(msg, uuidPtr) \
|
||||
do { \
|
||||
char uuidBuffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH]; \
|
||||
format_uuid_to_buffer(uuidBuffer, sizeof(uuidBuffer), uuidPtr); \
|
||||
UVM_DBG_PRINT("%s: %s\n", msg, uuidBuffer); \
|
||||
} while (0)
|
||||
|
||||
#define UVM_ERR_PRINT_NV_STATUS(msg, rmStatus, ...) \
|
||||
UVM_ERR_PRINT("ERROR: %s : " msg "\n", nvstatusToString(rmStatus), ##__VA_ARGS__)
|
||||
|
||||
#define UVM_ERR_PRINT_UUID(msg, uuidPtr, ...) \
|
||||
do { \
|
||||
char uuidBuffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH]; \
|
||||
format_uuid_to_buffer(uuidBuffer, sizeof(uuidBuffer), uuidPtr); \
|
||||
UVM_ERR_PRINT("ERROR: %s : " msg "\n", uuidBuffer, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define UVM_PANIC() UVM_PRINT_FUNC(panic, "\n")
|
||||
#define UVM_PANIC_MSG(fmt, ...) UVM_PRINT_FUNC(panic, ": " fmt, ##__VA_ARGS__)
|
||||
|
||||
@ -395,7 +380,7 @@ static inline void uvm_touch_page(struct page *page)
|
||||
UVM_ASSERT(page);
|
||||
|
||||
mapping = (char *) kmap(page);
|
||||
(void)UVM_READ_ONCE(*mapping);
|
||||
(void)READ_ONCE(*mapping);
|
||||
kunmap(page);
|
||||
}
|
||||
|
||||
|
@ -33,6 +33,15 @@
|
||||
#include "nv_uvm_interface.h"
|
||||
#include "uvm_va_block.h"
|
||||
|
||||
// Amount of encrypted data on a given engine that triggers key rotation. This
|
||||
// is a UVM internal threshold, different from that of RM, and used only during
|
||||
// testing.
|
||||
//
|
||||
// Key rotation is triggered when the total encryption size, or the total
|
||||
// decryption size (whatever comes first) reaches this lower threshold on the
|
||||
// engine.
|
||||
#define UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD (UVM_SIZE_1MB * 8)
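A minimal sketch of the triggering condition described above; the actual check added by this change is conf_computing_is_key_rotation_pending_use_stats(), further down in this file:

    bool rotation_pending =
        atomic64_read(&pool->conf_computing.key_rotation.encrypted) >
            UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD ||
        atomic64_read(&pool->conf_computing.key_rotation.decrypted) >
            UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD;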
|
||||
|
||||
// The maximum number of secure operations per push is:
|
||||
// UVM_MAX_PUSH_SIZE / min(CE encryption size, CE decryption size)
|
||||
// + 1 (tracking semaphore) = 128 * 1024 / 56 + 1 = 2342
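// (With UVM_MAX_PUSH_SIZE = 128 * 1024 and a 56-byte minimum operation size,
// that is ceil(131072 / 56) = 2341 CE operations plus the tracking semaphore.)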
|
||||
@ -352,6 +361,19 @@ error:
|
||||
return status;
|
||||
}
|
||||
|
||||
// The production key rotation defaults are such that key rotations rarely
// happen. During UVM testing, more frequent rotations are triggered by relying
// on internal encryption usage accounting. When key rotations are triggered by
// UVM, the driver does not rely on channel key rotation notifiers.
|
||||
//
|
||||
// TODO: Bug 4612912: UVM should be able to programmatically set the rotation
|
||||
// lower threshold. This function, and all the metadata associated with it
|
||||
// (per-pool encryption accounting, for example) can be removed at that point.
|
||||
static bool key_rotation_is_notifier_driven(void)
|
||||
{
|
||||
return !uvm_enable_builtin_tests;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
@ -394,17 +416,35 @@ void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu)
|
||||
conf_computing_dma_buffer_pool_deinit(&gpu->conf_computing.dma_buffer_pool);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
|
||||
else
|
||||
pool = channel->pool;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
|
||||
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
|
||||
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (!key_rotation_is_notifier_driven())
|
||||
atomic64_add(size, &pool->conf_computing.key_rotation.encrypted);
|
||||
}
|
||||
|
||||
status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// IV rotation is done preemptively as needed, so the above
|
||||
// call cannot return failure.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
@ -428,27 +468,46 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
void *auth_tag_buffer)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
UVM_ASSERT(size);
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
|
||||
else
|
||||
pool = channel->pool;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
|
||||
status = nvUvmInterfaceCslEncrypt(&channel->csl.ctx,
|
||||
size,
|
||||
(NvU8 const *) src_plain,
|
||||
encrypt_iv,
|
||||
(NvU8 *) dst_cipher,
|
||||
(NvU8 *) auth_tag_buffer);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// IV rotation is done preemptively as needed, so the above
|
||||
// call cannot return failure.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
|
||||
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (!key_rotation_is_notifier_driven())
|
||||
atomic64_add(size, &pool->conf_computing.key_rotation.decrypted);
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
NvU32 key_version,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer)
|
||||
{
|
||||
@ -469,11 +528,19 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
size,
|
||||
(const NvU8 *) src_cipher,
|
||||
src_iv,
|
||||
NV_U32_MAX,
|
||||
key_version,
|
||||
(NvU8 *) dst_plain,
|
||||
NULL,
|
||||
0,
|
||||
(const NvU8 *) auth_tag_buffer);
|
||||
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, channel %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
channel->name,
|
||||
uvm_gpu_name(uvm_channel_get_gpu(channel)));
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
return status;
|
||||
@ -640,3 +707,231 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
|
||||
{
|
||||
return uvm_conf_computing_rotate_channel_ivs_below_limit(channel, uvm_conf_computing_channel_iv_rotation_limit, true);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
|
||||
|
||||
// Key rotation cannot be enabled on UVM if it is disabled on RM
|
||||
if (!gpu->parent->rm_info.gpuConfComputeCaps.bKeyRotationEnabled)
|
||||
return;
|
||||
|
||||
gpu->channel_manager->conf_computing.key_rotation_enabled = true;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
|
||||
|
||||
gpu->channel_manager->conf_computing.key_rotation_enabled = false;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu)
|
||||
{
|
||||
return gpu->channel_manager->conf_computing.key_rotation_enabled;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(pool->manager->gpu))
|
||||
return false;
|
||||
|
||||
// TODO: Bug 4586447: key rotation must be disabled in the SEC2 engine,
|
||||
// because currently the encryption key is shared between UVM and RM, but
|
||||
// UVM is not able to idle SEC2 channels owned by RM.
|
||||
if (uvm_channel_pool_is_sec2(pool))
|
||||
return false;
|
||||
|
||||
// Key rotation happens as part of channel reservation, and LCIC channels
|
||||
// are never reserved directly. Rotation of keys in LCIC channels happens
|
||||
// as the result of key rotation in WLC channels.
|
||||
//
|
||||
// Return false even if there is nothing fundamental prohibiting direct key
|
||||
// rotation on LCIC pools
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool conf_computing_is_key_rotation_pending_use_stats(uvm_channel_pool_t *pool)
|
||||
{
|
||||
NvU64 decrypted, encrypted;
|
||||
|
||||
UVM_ASSERT(!key_rotation_is_notifier_driven());
|
||||
|
||||
decrypted = atomic64_read(&pool->conf_computing.key_rotation.decrypted);
|
||||
|
||||
if (decrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
|
||||
return true;
|
||||
|
||||
encrypted = atomic64_read(&pool->conf_computing.key_rotation.encrypted);
|
||||
|
||||
if (encrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool conf_computing_is_key_rotation_pending_use_notifier(uvm_channel_pool_t *pool)
|
||||
{
|
||||
// If key rotation is pending for the pool's engine, then the key rotation
|
||||
// notifier in any of the engine channels can be used by UVM to detect the
|
||||
// situation. Note that RM doesn't update all the notifiers in a single
|
||||
// atomic operation, so it is possible that the channel read by UVM (the
|
||||
// first one in the pool) indicates that a key rotation is pending, but
|
||||
// another channel in the pool (temporarily) indicates the opposite, or vice
|
||||
// versa.
|
||||
uvm_channel_t *first_channel = pool->channels;
|
||||
|
||||
UVM_ASSERT(key_rotation_is_notifier_driven());
|
||||
UVM_ASSERT(first_channel != NULL);
|
||||
|
||||
return first_channel->channel_info.keyRotationNotifier->status == UVM_KEY_ROTATION_STATUS_PENDING;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
return false;
|
||||
|
||||
if (key_rotation_is_notifier_driven())
|
||||
return conf_computing_is_key_rotation_pending_use_notifier(pool);
|
||||
else
|
||||
return conf_computing_is_key_rotation_pending_use_stats(pool);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.num_csl_contexts > 0);
|
||||
|
||||
// NV_ERR_STATE_IN_USE indicates that RM was not able to acquire the
|
||||
// required locks at this time. This status is not interpreted as an error,
|
||||
// but as a sign for UVM to try again later. This is the same "protocol"
|
||||
// used in IV rotation.
|
||||
status = nvUvmInterfaceCslRotateKey(pool->conf_computing.key_rotation.csl_contexts,
|
||||
pool->conf_computing.key_rotation.num_csl_contexts);
|
||||
|
||||
if (status == NV_OK) {
|
||||
pool->conf_computing.key_rotation.version++;
|
||||
|
||||
if (!key_rotation_is_notifier_driven()) {
|
||||
atomic64_set(&pool->conf_computing.key_rotation.decrypted, 0);
|
||||
atomic64_set(&pool->conf_computing.key_rotation.encrypted, 0);
|
||||
}
|
||||
}
|
||||
else if (status != NV_ERR_STATE_IN_USE) {
|
||||
UVM_DBG_PRINT("nvUvmInterfaceCslRotateKey() failed in engine %u: %s\n",
|
||||
pool->engine_index,
|
||||
nvstatusToString(status));
|
||||
}
|
||||
|
||||
return status;
|
||||
}
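A minimal caller-side sketch of the retry protocol the comment above describes (not taken from the change; a real caller would bound the retries, for example with uvm_spin_loop()):

    do {
        status = uvm_conf_computing_rotate_pool_key(pool);
    } while (status == NV_ERR_STATE_IN_USE);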
|
||||
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
|
||||
void *dst_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
|
||||
|
||||
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
uvm_conf_computing_log_gpu_encryption(push.channel, size, dma_buffer->decrypt_iv);
|
||||
dma_buffer->key_version[0] = uvm_channel_pool_key_version(push.channel->pool);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(push.channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
dma_buffer->decrypt_iv,
|
||||
dma_buffer->key_version[0],
|
||||
size,
|
||||
auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
@ -87,9 +87,9 @@ typedef struct
|
||||
// a free buffer.
|
||||
uvm_tracker_t tracker;
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, SEC2
|
||||
// writes the authentication tag here. Later when the buffer is decrypted
|
||||
// on the CPU the authentication tag is used again (read) for CSL to verify
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, the
|
||||
// engine (CE or SEC2) writes the authentication tag here. When the buffer
|
||||
// is decrypted on the CPU the authentication tag is used by CSL to verify
|
||||
// the authenticity. The allocation is big enough for one authentication
|
||||
// tag per PAGE_SIZE page in the alloc buffer.
|
||||
uvm_mem_t *auth_tag;
|
||||
@ -98,7 +98,12 @@ typedef struct
|
||||
// to the authentication tag. The allocation is big enough for one IV per
|
||||
// PAGE_SIZE page in the alloc buffer. The granularity between the decrypt
|
||||
// IV and authentication tag must match.
|
||||
UvmCslIv decrypt_iv[(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE)];
|
||||
UvmCslIv decrypt_iv[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, the
|
||||
// key version used during GPU encryption of each PAGE_SIZE page can be
|
||||
// saved here, so CPU decryption uses the correct decryption key.
|
||||
NvU32 key_version[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
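// Note that decrypt_iv[i] and key_version[i] describe the same PAGE_SIZE page
// of the alloc buffer: both are captured at GPU encryption time and passed
// together to uvm_conf_computing_cpu_decrypt(), as
// uvm_conf_computing_util_memcopy_gpu_to_cpu() does in this change.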
|
||||
|
||||
// Bitmap of the encrypted pages in the backing allocation
|
||||
uvm_page_mask_t encrypted_page_mask;
|
||||
@ -147,7 +152,7 @@ NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu);
|
||||
|
||||
// Logs encryption information from the GPU and returns the IV.
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv);
|
||||
|
||||
// Acquires next CPU encryption IV and returns it.
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
@ -167,10 +172,14 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
// CPU side decryption helper. Decrypts data from src_cipher and writes the
// plain text in dst_plain. src_cipher and dst_plain can't overlap. The IV
// obtained from uvm_conf_computing_log_gpu_encryption() needs to be passed in
// src_iv.
//
// The caller must indicate which key to use for decryption by passing the
// appropriate key version number.
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
NvU32 key_version,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer);
|
||||
|
||||
@ -214,4 +223,71 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
|
||||
// Check if there are fewer than 'limit' messages available in either direction
|
||||
// and rotate if not.
|
||||
NV_STATUS uvm_conf_computing_rotate_channel_ivs_below_limit(uvm_channel_t *channel, NvU64 limit, bool retry_if_busy);
|
||||
|
||||
// Rotate the engine key associated with the given channel pool.
|
||||
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool);
|
||||
|
||||
// Returns true if key rotation is allowed in the channel pool.
|
||||
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool);
|
||||
|
||||
// Returns true if key rotation is pending in the channel pool.
|
||||
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool);
|
||||
|
||||
// Enable/disable key rotation on the given GPU. Note that UVM enablement is
// dependent on RM enablement: key rotation may still be disabled upon calling
// this function if it is disabled in RM. On the other hand, key rotation can
// be disabled in UVM even if it is enabled in RM.
//
// Enablement/disablement affects only kernel key rotation of keys owned by
// UVM. It doesn't affect user key rotation (CUDA, Video...), nor does it
// affect RM kernel key rotation.
|
||||
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu);
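A minimal sketch of how a test might use the pair declared above (hypothetical usage, not taken from the change):

    uvm_conf_computing_disable_key_rotation(gpu);

    // ... exercise a path that must not observe a key rotation ...

    uvm_conf_computing_enable_key_rotation(gpu); // stays disabled if RM disabled key rotation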
|
||||
|
||||
// Returns true if key rotation is enabled on UVM in the given GPU. Key rotation
|
||||
// can be enabled on the GPU but disabled on some GPU engines (LCEs or SEC2),
|
||||
// see uvm_conf_computing_is_key_rotation_enabled_in_pool.
|
||||
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu);
|
||||
|
||||
// Launch a synchronous, encrypted copy between CPU and GPU.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The source CPU buffer pointed to by src_plain contains the unencrypted (plain
|
||||
// text) contents; the function internally performs a CPU-side encryption step
|
||||
// before launching the GPU-side CE decryption. The source buffer can be in
|
||||
// protected or unprotected sysmem, while the destination buffer must be in
|
||||
// protected vidmem.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...);
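A minimal usage sketch of the helper declared above (not taken from the change; vidmem_alloc is a hypothetical uvm_mem_t vidmem allocation, and error handling is omitted):

    char init_data[256] = { 0 };
    uvm_gpu_address_t dst = uvm_mem_gpu_address_virtual_kernel(vidmem_alloc, gpu);

    status = uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
                                                        dst,
                                                        init_data,
                                                        sizeof(init_data),
                                                        NULL,
                                                        "init %s",
                                                        "metadata");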
|
||||
|
||||
// Launch a synchronous, encrypted copy between CPU and GPU.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The destination CPU buffer pointed to by dst_plain receives the unencrypted
// (plain text) contents; the function internally launches a GPU-side CE
// encryption into a staging DMA buffer and then performs a CPU-side decryption
// step. The source buffer is addressed by src_gpu_address (protected vidmem in
// the callers of this change), while the destination is a CPU (sysmem) buffer.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...);
|
||||
#endif // __UVM_CONF_COMPUTING_H__
|
||||
|
@ -1,53 +0,0 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
deal in the Software without restriction, including without limitation the
|
||||
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
// This file provides simple wrappers that are always built with optimizations
|
||||
// turned on to WAR issues with functions that don't build correctly otherwise.
|
||||
|
||||
#include "uvm_linux.h"
|
||||
|
||||
int nv_atomic_xchg(atomic_t *val, int new)
|
||||
{
|
||||
return atomic_xchg(val, new);
|
||||
}
|
||||
|
||||
int nv_atomic_cmpxchg(atomic_t *val, int old, int new)
|
||||
{
|
||||
return atomic_cmpxchg(val, old, new);
|
||||
}
|
||||
|
||||
long nv_atomic_long_cmpxchg(atomic_long_t *val, long old, long new)
|
||||
{
|
||||
return atomic_long_cmpxchg(val, old, new);
|
||||
}
|
||||
|
||||
unsigned long nv_copy_from_user(void *to, const void __user *from, unsigned long n)
|
||||
{
|
||||
return copy_from_user(to, from, n);
|
||||
}
|
||||
|
||||
unsigned long nv_copy_to_user(void __user *to, const void *from, unsigned long n)
|
||||
{
|
||||
return copy_to_user(to, from, n);
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -42,7 +42,6 @@ typedef struct uvm_gpu_semaphore_struct uvm_gpu_semaphore_t;
|
||||
typedef struct uvm_gpu_tracking_semaphore_struct uvm_gpu_tracking_semaphore_t;
|
||||
typedef struct uvm_gpu_semaphore_pool_struct uvm_gpu_semaphore_pool_t;
|
||||
typedef struct uvm_gpu_semaphore_pool_page_struct uvm_gpu_semaphore_pool_page_t;
|
||||
typedef struct uvm_gpu_peer_struct uvm_gpu_peer_t;
|
||||
typedef struct uvm_mmu_mode_hal_struct uvm_mmu_mode_hal_t;
|
||||
|
||||
typedef struct uvm_channel_manager_struct uvm_channel_manager_t;
|
||||
@ -57,6 +56,12 @@ typedef struct uvm_gpfifo_entry_struct uvm_gpfifo_entry_t;
|
||||
|
||||
typedef struct uvm_va_policy_struct uvm_va_policy_t;
|
||||
typedef struct uvm_va_range_struct uvm_va_range_t;
|
||||
typedef struct uvm_va_range_managed_struct uvm_va_range_managed_t;
|
||||
typedef struct uvm_va_range_external_struct uvm_va_range_external_t;
|
||||
typedef struct uvm_va_range_channel_struct uvm_va_range_channel_t;
|
||||
typedef struct uvm_va_range_sked_reflected_struct uvm_va_range_sked_reflected_t;
|
||||
typedef struct uvm_va_range_semaphore_pool_struct uvm_va_range_semaphore_pool_t;
|
||||
typedef struct uvm_va_range_device_p2p_struct uvm_va_range_device_p2p_t;
|
||||
typedef struct uvm_va_block_struct uvm_va_block_t;
|
||||
typedef struct uvm_va_block_test_struct uvm_va_block_test_t;
|
||||
typedef struct uvm_va_block_wrapper_struct uvm_va_block_wrapper_t;
|
||||
|
@ -115,8 +115,8 @@ static NV_STATUS verify_mapping_info(uvm_va_space_t *va_space,
|
||||
|
||||
TEST_CHECK_RET(skip);
|
||||
|
||||
memory_owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &memory_info->uuid);
|
||||
if (memory_owning_gpu == NULL)
|
||||
memory_owning_gpu = uvm_va_space_get_gpu_by_mem_info(va_space, memory_info);
|
||||
if (!memory_owning_gpu)
|
||||
return NV_ERR_INVALID_DEVICE;
|
||||
|
||||
aperture = get_aperture(va_space, memory_owning_gpu, memory_mapping_gpu, memory_info, sli_supported);
|
||||
@ -129,7 +129,8 @@ static NV_STATUS verify_mapping_info(uvm_va_space_t *va_space,
|
||||
phys_offset = mapping_offset;
|
||||
|
||||
// Add the physical offset for nvswitch connected peer mappings
|
||||
if (uvm_aperture_is_peer(aperture) && uvm_gpus_are_nvswitch_connected(memory_mapping_gpu, memory_owning_gpu))
|
||||
if (uvm_aperture_is_peer(aperture) &&
|
||||
uvm_parent_gpus_are_nvswitch_connected(memory_mapping_gpu->parent, memory_owning_gpu->parent))
|
||||
phys_offset += memory_owning_gpu->parent->nvswitch_info.fabric_memory_window_start;
|
||||
|
||||
for (index = 0; index < ext_mapping_info->numWrittenPtes; index++) {
|
||||
|
@ -412,7 +412,7 @@ void uvm_global_set_fatal_error_impl(NV_STATUS error)
|
||||
|
||||
UVM_ASSERT(error != NV_OK);
|
||||
|
||||
previous_error = nv_atomic_cmpxchg(&g_uvm_global.fatal_error, NV_OK, error);
|
||||
previous_error = atomic_cmpxchg(&g_uvm_global.fatal_error, NV_OK, error);
|
||||
|
||||
if (previous_error == NV_OK) {
|
||||
UVM_ERR_PRINT("Encountered a global fatal error: %s\n", nvstatusToString(error));
|
||||
@ -421,6 +421,8 @@ void uvm_global_set_fatal_error_impl(NV_STATUS error)
|
||||
UVM_ERR_PRINT("Encountered a global fatal error: %s after a global error has been already set: %s\n",
|
||||
nvstatusToString(error), nvstatusToString(previous_error));
|
||||
}
|
||||
|
||||
nvUvmInterfaceReportFatalError(error);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_global_reset_fatal_error(void)
|
||||
@ -430,7 +432,7 @@ NV_STATUS uvm_global_reset_fatal_error(void)
|
||||
return NV_ERR_INVALID_STATE;
|
||||
}
|
||||
|
||||
return nv_atomic_xchg(&g_uvm_global.fatal_error, NV_OK);
|
||||
return atomic_xchg(&g_uvm_global.fatal_error, NV_OK);
|
||||
}
|
||||
|
||||
void uvm_global_gpu_retain(const uvm_processor_mask_t *mask)
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -52,19 +52,23 @@ struct uvm_global_struct
|
||||
// Created on module load and destroyed on module unload
|
||||
uvmGpuSessionHandle rm_session_handle;
|
||||
|
||||
// peer-to-peer table
|
||||
// peer info is added and removed from this table when usermode
|
||||
// driver calls UvmEnablePeerAccess and UvmDisablePeerAccess
|
||||
// respectively.
|
||||
uvm_gpu_peer_t peers[UVM_MAX_UNIQUE_GPU_PAIRS];
|
||||
// Peer-to-peer table for storing parent GPU and MIG instance peer info.
|
||||
// Note that MIG instances can be peers within a single parent GPU or
|
||||
// be peers in different parent GPUs if NVLINK or PCIe peers is enabled.
|
||||
// PCIe and MIG peer info is added and removed from this table when
|
||||
// usermode driver calls UvmEnablePeerAccess() and UvmDisablePeerAccess()
|
||||
// respectively. NvLink and MIG peers are updated when UvmRegisterGpu() and
|
||||
// UvmUnregisterGpu() are called. Peer to peer state for MIG instances
|
||||
// within the same parent GPU are not stored here.
|
||||
uvm_parent_gpu_peer_t parent_gpu_peers[UVM_MAX_UNIQUE_PARENT_GPU_PAIRS];
|
||||
|
||||
// peer-to-peer copy mode
|
||||
// Pascal+ GPUs support virtual addresses in p2p copies.
|
||||
// Ampere+ GPUs add support for physical addresses in p2p copies.
|
||||
uvm_gpu_peer_copy_mode_t peer_copy_mode;
|
||||
|
||||
// Stores an NV_STATUS, once it becomes != NV_OK, the driver should refuse to
|
||||
// do most anything other than try and clean up as much as possible.
|
||||
// Stores an NV_STATUS, once it becomes != NV_OK, the driver should refuse
|
||||
// to do most anything other than try and clean up as much as possible.
|
||||
// An example of a fatal error is an unrecoverable ECC error on one of the
|
||||
// GPUs.
|
||||
atomic_t fatal_error;
|
||||
@ -232,12 +236,12 @@ static uvmGpuSessionHandle uvm_global_session_handle(void)
|
||||
// suspended.
|
||||
#define UVM_GPU_WRITE_ONCE(x, val) do { \
|
||||
UVM_ASSERT(!uvm_global_is_suspended()); \
|
||||
UVM_WRITE_ONCE(x, val); \
|
||||
WRITE_ONCE(x, val); \
|
||||
} while (0)
|
||||
|
||||
#define UVM_GPU_READ_ONCE(x) ({ \
|
||||
UVM_ASSERT(!uvm_global_is_suspended()); \
|
||||
UVM_READ_ONCE(x); \
|
||||
READ_ONCE(x); \
|
||||
})
|
||||
|
||||
static bool global_is_fatal_error_assert_disabled(void)
|
||||
@ -384,7 +388,7 @@ static uvm_gpu_t *uvm_gpu_find_next_valid_gpu_in_parent(uvm_parent_gpu_t *parent
|
||||
(parent_gpu) = uvm_global_find_next_parent_gpu((parent_gpu)))
|
||||
|
||||
// LOCKING: Must hold the global_lock
|
||||
#define for_each_gpu_in_parent(parent_gpu, gpu) \
|
||||
#define for_each_gpu_in_parent(gpu, parent_gpu) \
|
||||
for (({uvm_assert_mutex_locked(&g_uvm_global.global_lock); \
|
||||
(gpu) = uvm_gpu_find_next_valid_gpu_in_parent((parent_gpu), NULL);}); \
|
||||
(gpu) != NULL; \
|
||||
|
@ -57,12 +57,6 @@ MODULE_PARM_DESC(uvm_peer_copy, "Choose the addressing mode for peer copying, op
|
||||
UVM_PARAM_PEER_COPY_PHYSICAL " [default] or " UVM_PARAM_PEER_COPY_VIRTUAL ". "
|
||||
"Valid for Ampere+ GPUs.");
|
||||
|
||||
static void remove_gpu(uvm_gpu_t *gpu);
|
||||
static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
|
||||
static NV_STATUS discover_smc_peers(uvm_gpu_t *gpu);
|
||||
static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu);
|
||||
static void destroy_nvlink_peers(uvm_gpu_t *gpu);
|
||||
|
||||
static uvm_user_channel_t *get_user_channel(uvm_rb_tree_node_t *node)
|
||||
{
|
||||
return container_of(node, uvm_user_channel_t, instance_ptr.node);
|
||||
@ -92,7 +86,7 @@ static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
|
||||
|
||||
static void fill_parent_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_info)
|
||||
{
|
||||
char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
|
||||
char uuid_buffer[UVM_UUID_STRING_LENGTH];
|
||||
|
||||
parent_gpu->rm_info = *gpu_info;
|
||||
|
||||
@ -118,10 +112,10 @@ static void fill_parent_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo
|
||||
if (parent_gpu->nvswitch_info.is_nvswitch_connected)
|
||||
parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart;
|
||||
|
||||
format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &parent_gpu->uuid);
|
||||
uvm_uuid_string(uuid_buffer, &parent_gpu->uuid);
|
||||
snprintf(parent_gpu->name,
|
||||
sizeof(parent_gpu->name),
|
||||
"ID %u: %s: %s",
|
||||
"ID %u: %s: " UVM_PARENT_GPU_UUID_PREFIX "%s",
|
||||
uvm_parent_id_value(parent_gpu->id),
|
||||
parent_gpu->rm_info.name,
|
||||
uuid_buffer);
|
||||
@ -150,7 +144,6 @@ static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
|
||||
// Return a PASID to use with the internal address space (AS), or -1 if not
|
||||
// supported. This PASID is needed to enable ATS in the internal AS, but it is
|
||||
// not used in address translation requests, which only translate GPA->SPA.
|
||||
@ -231,10 +224,16 @@ static NV_STATUS alloc_and_init_address_space(uvm_gpu_t *gpu)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
int uvm_device_p2p_static_bar(uvm_gpu_t *gpu)
|
||||
{
|
||||
return nv_bar_index_to_os_bar_index(gpu->parent->pci_dev, NV_GPU_BAR_INDEX_FB);
|
||||
}
|
||||
|
||||
static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
UvmGpuFbInfo fb_info = {0};
|
||||
unsigned long pci_bar1_addr = pci_resource_start(gpu->parent->pci_dev, uvm_device_p2p_static_bar(gpu));
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceGetFbInfo(uvm_gpu_device_handle(gpu), &fb_info));
|
||||
if (status != NV_OK)
|
||||
@ -246,6 +245,8 @@ static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu)
|
||||
}
|
||||
|
||||
gpu->mem_info.max_vidmem_page_size = fb_info.maxVidmemPageSize;
|
||||
gpu->mem_info.static_bar1_start = pci_bar1_addr;
|
||||
gpu->mem_info.static_bar1_size = fb_info.staticBar1Size;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
@ -696,31 +697,136 @@ static void gpu_access_counters_print_common(uvm_parent_gpu_t *parent_gpu, struc
|
||||
(num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u));
|
||||
}
|
||||
|
||||
void uvm_gpu_print(uvm_gpu_t *gpu)
|
||||
// This function converts an index of 2D array of size [N x N] into an index
|
||||
// of upper triangular array of size [((N - 1) * ((N - 1) + 1)) / 2] which
|
||||
// does not include diagonal elements.
|
||||
static NvU32 peer_table_index(NvU32 index0, NvU32 index1, NvU32 N)
|
||||
{
|
||||
gpu_info_print_common(gpu, NULL);
|
||||
NvU32 square_index, triangular_index, min_index;
|
||||
|
||||
UVM_ASSERT(index0 != index1);
|
||||
UVM_ASSERT(index0 < N);
|
||||
UVM_ASSERT(index1 < N);
|
||||
|
||||
// Calculate an index of 2D array by re-ordering indices to always point
|
||||
// to the same entry.
|
||||
min_index = min(index0, index1);
|
||||
square_index = min_index * N + max(index0, index1);
|
||||
|
||||
// Calculate and subtract number of lower triangular matrix elements till
|
||||
// the current row (which includes diagonal elements) to get the correct
|
||||
// index in an upper triangular matrix.
|
||||
triangular_index = square_index - SUM_FROM_0_TO_N(min_index + 1);
|
||||
|
||||
return triangular_index;
|
||||
}
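A worked example of the mapping above (assuming SUM_FROM_0_TO_N(n) is the usual n * (n + 1) / 2), with N = 4: the six unordered pairs pack into (N - 1) * N / 2 = 6 slots:

    (0,1) -> 0   (0,2) -> 1   (0,3) -> 2
    (1,2) -> 3   (1,3) -> 4   (2,3) -> 5

For example, for (1,3): square_index = 1 * 4 + 3 = 7 and SUM_FROM_0_TO_N(2) = 3, so triangular_index = 4.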
|
||||
|
||||
static void gpu_peer_caps_print(uvm_gpu_t **gpu_pair, struct seq_file *s)
|
||||
// This function converts an index of 2D array of size [N x N] into an index
|
||||
// of upper triangular array of size [(N * (N + 1)) / 2] which does include
|
||||
// diagonal elements.
|
||||
static NvU32 sub_processor_peer_table_index(NvU32 index0, NvU32 index1)
|
||||
{
|
||||
NvU32 square_index, triangular_index, min_index;
|
||||
|
||||
UVM_ASSERT(index0 < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
|
||||
UVM_ASSERT(index1 < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
|
||||
|
||||
// Calculate an index of 2D array by re-ordering indices to always point
|
||||
// to the same entry.
|
||||
min_index = min(index0, index1);
|
||||
square_index = min_index * UVM_PARENT_ID_MAX_SUB_PROCESSORS + max(index0, index1);
|
||||
|
||||
// Calculate and subtract number of lower triangular matrix elements till
|
||||
// the current row (which doesn't include diagonal elements) to get the
|
||||
// correct index in an upper triangular matrix.
|
||||
triangular_index = square_index - SUM_FROM_0_TO_N(min_index);
|
||||
|
||||
return triangular_index;
|
||||
}
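The same scheme with the diagonal kept, illustrated for a hypothetical UVM_PARENT_ID_MAX_SUB_PROCESSORS of 3, packs 3 * 4 / 2 = 6 entries:

    (0,0) -> 0   (0,1) -> 1   (0,2) -> 2
    (1,1) -> 3   (1,2) -> 4   (2,2) -> 5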
|
||||
|
||||
NvU32 uvm_gpu_pair_index(const uvm_gpu_id_t id0, const uvm_gpu_id_t id1)
|
||||
{
|
||||
NvU32 index = peer_table_index(uvm_id_gpu_index(id0), uvm_id_gpu_index(id1), UVM_ID_MAX_GPUS);
|
||||
|
||||
UVM_ASSERT(index < UVM_MAX_UNIQUE_GPU_PAIRS);
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
static NvU32 parent_gpu_peer_table_index(const uvm_parent_gpu_id_t id0, const uvm_parent_gpu_id_t id1)
|
||||
{
|
||||
NvU32 index = peer_table_index(uvm_parent_id_gpu_index(id0), uvm_parent_id_gpu_index(id1), UVM_PARENT_ID_MAX_GPUS);
|
||||
|
||||
UVM_ASSERT(index < UVM_MAX_UNIQUE_PARENT_GPU_PAIRS);
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
static NvU32 sub_processor_pair_index(const uvm_gpu_id_t id0, const uvm_gpu_id_t id1)
|
||||
{
|
||||
NvU32 index = sub_processor_peer_table_index(uvm_id_sub_processor_index(id0), uvm_id_sub_processor_index(id1));
|
||||
|
||||
UVM_ASSERT(index < UVM_MAX_UNIQUE_SUB_PROCESSOR_PAIRS);
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
// Get the parent P2P capabilities between the given parent gpus.
|
||||
static uvm_parent_gpu_peer_t *parent_gpu_peer_caps(const uvm_parent_gpu_t *parent_gpu0,
|
||||
const uvm_parent_gpu_t *parent_gpu1)
|
||||
{
|
||||
return &g_uvm_global.parent_gpu_peers[parent_gpu_peer_table_index(parent_gpu0->id, parent_gpu1->id)];
|
||||
}
|
||||
|
||||
// Get the P2P capabilities between the given gpus.
|
||||
static uvm_gpu_peer_t *gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
|
||||
{
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps = parent_gpu_peer_caps(gpu0->parent, gpu1->parent);
|
||||
|
||||
return &parent_peer_caps->gpu_peers[sub_processor_pair_index(gpu0->id, gpu1->id)];
|
||||
}
|
||||
|
||||
static uvm_aperture_t parent_gpu_peer_aperture(uvm_parent_gpu_t *local,
|
||||
uvm_parent_gpu_t *remote,
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps)
|
||||
{
|
||||
size_t peer_index;
|
||||
|
||||
UVM_ASSERT(parent_peer_caps->ref_count);
|
||||
UVM_ASSERT(parent_peer_caps->link_type != UVM_GPU_LINK_INVALID);
|
||||
|
||||
if (uvm_parent_id_value(local->id) < uvm_parent_id_value(remote->id))
|
||||
peer_index = 0;
|
||||
else
|
||||
peer_index = 1;
|
||||
|
||||
return UVM_APERTURE_PEER(parent_peer_caps->peer_ids[peer_index]);
|
||||
}
|
||||
|
||||
static void parent_gpu_peer_caps_print(uvm_parent_gpu_t **parent_gpu_pair, struct seq_file *s)
|
||||
{
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps;
|
||||
bool nvswitch_connected;
|
||||
uvm_aperture_t aperture;
|
||||
uvm_gpu_peer_t *peer_caps;
|
||||
uvm_gpu_t *local;
|
||||
uvm_gpu_t *remote;
|
||||
uvm_parent_gpu_t *local;
|
||||
uvm_parent_gpu_t *remote;
|
||||
const char *link_type;
|
||||
NvU32 bandwidth;
|
||||
|
||||
UVM_ASSERT(uvm_procfs_is_debug_enabled());
|
||||
|
||||
local = gpu_pair[0];
|
||||
remote = gpu_pair[1];
|
||||
peer_caps = uvm_gpu_peer_caps(local, remote);
|
||||
aperture = uvm_gpu_peer_aperture(local, remote);
|
||||
nvswitch_connected = uvm_gpus_are_nvswitch_connected(local, remote);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Link type %s\n", uvm_gpu_link_type_string(peer_caps->link_type));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Bandwidth %uMBps\n", peer_caps->total_link_line_rate_mbyte_per_s);
|
||||
local = parent_gpu_pair[0];
|
||||
remote = parent_gpu_pair[1];
|
||||
parent_peer_caps = parent_gpu_peer_caps(local, remote);
|
||||
link_type = uvm_gpu_link_type_string(parent_peer_caps->link_type);
|
||||
bandwidth = parent_peer_caps->total_link_line_rate_mbyte_per_s;
|
||||
aperture = parent_gpu_peer_aperture(local, remote, parent_peer_caps);
|
||||
nvswitch_connected = uvm_parent_gpus_are_nvswitch_connected(local, remote);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Link type %s\n", link_type);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Bandwidth %uMBps\n", bandwidth);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Aperture %s\n", uvm_aperture_string(aperture));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Connected through NVSWITCH %s\n", nvswitch_connected ? "True" : "False");
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Refcount %llu\n", UVM_READ_ONCE(peer_caps->ref_count));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Refcount %llu\n", READ_ONCE(parent_peer_caps->ref_count));
|
||||
}
|
||||
|
||||
static int nv_procfs_read_gpu_info(struct seq_file *s, void *v)
|
||||
@ -784,31 +890,49 @@ UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_info_entry);
|
||||
UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_fault_stats_entry);
|
||||
UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_access_counters_entry);
|
||||
|
||||
static void uvm_parent_gpu_uuid_string(char *buffer, const NvProcessorUuid *uuid)
|
||||
{
|
||||
memcpy(buffer, UVM_PARENT_GPU_UUID_PREFIX, sizeof(UVM_PARENT_GPU_UUID_PREFIX) - 1);
|
||||
uvm_uuid_string(buffer + sizeof(UVM_PARENT_GPU_UUID_PREFIX) - 1, uuid);
|
||||
}
|
||||
|
||||
static void uvm_gpu_uuid_string(char *buffer, const NvProcessorUuid *uuid)
|
||||
{
|
||||
memcpy(buffer, UVM_GPU_UUID_PREFIX, sizeof(UVM_GPU_UUID_PREFIX) - 1);
|
||||
uvm_uuid_string(buffer + sizeof(UVM_GPU_UUID_PREFIX) - 1, uuid);
|
||||
}
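A usage sketch (not taken from the change); judging from the procfs comments in this change ("GPU-${physical-UUID}" directories and "GI-${GI-UUID}" symlinks), the two prefixes expand to "GPU-" and "GI-":

    char name[UVM_PARENT_GPU_UUID_STRING_LENGTH];

    uvm_parent_gpu_uuid_string(name, &parent_gpu->uuid);
    // name now holds, e.g., "GPU-d802726c-df8d-a3c3-ec53-48bdec201c27"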
|
||||
|
||||
static NV_STATUS init_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
struct proc_dir_entry *gpu_base_dir_entry;
|
||||
char uuid_text_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
|
||||
char gpu_dir_name[sizeof(uuid_text_buffer) + 1];
|
||||
char gpu_dir_name[UVM_PARENT_GPU_UUID_STRING_LENGTH];
|
||||
|
||||
if (!uvm_procfs_is_enabled())
|
||||
return NV_OK;
|
||||
|
||||
gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir();
|
||||
|
||||
format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &parent_gpu->uuid);
|
||||
|
||||
// Create UVM-GPU-${UUID} directory
|
||||
snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%s", uuid_text_buffer);
|
||||
// Create GPU-${physical-UUID} directory.
|
||||
uvm_parent_gpu_uuid_string(gpu_dir_name, &parent_gpu->uuid);
|
||||
|
||||
parent_gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu_base_dir_entry);
|
||||
if (parent_gpu->procfs.dir == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
// GPU peer files are debug only.
|
||||
if (!uvm_procfs_is_debug_enabled())
|
||||
return NV_OK;
|
||||
|
||||
parent_gpu->procfs.dir_peers = NV_CREATE_PROC_DIR(UVM_PROC_GPUS_PEER_DIR_NAME, parent_gpu->procfs.dir);
|
||||
if (parent_gpu->procfs.dir_peers == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void deinit_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
proc_remove(parent_gpu->procfs.dir_peers);
|
||||
proc_remove(parent_gpu->procfs.dir);
|
||||
}
|
||||
|
||||
@ -845,17 +969,17 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
|
||||
{
|
||||
struct proc_dir_entry *gpu_base_dir_entry;
|
||||
char symlink_name[16]; // Hold a uvm_gpu_id_t value in decimal.
|
||||
char uuid_text_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
|
||||
char gpu_dir_name[sizeof(symlink_name) + sizeof(uuid_text_buffer) + 1];
|
||||
char uuid_buffer[max(UVM_PARENT_GPU_UUID_STRING_LENGTH, UVM_GPU_UUID_STRING_LENGTH)];
|
||||
char gpu_dir_name[sizeof(symlink_name) + sizeof(uuid_buffer) + 1];
|
||||
|
||||
if (!uvm_procfs_is_enabled())
|
||||
return NV_OK;
|
||||
|
||||
format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &gpu->parent->uuid);
|
||||
uvm_parent_gpu_uuid_string(uuid_buffer, &gpu->parent->uuid);
|
||||
|
||||
gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir();
|
||||
|
||||
// Create UVM-GPU-${physical-UUID}/${sub_processor_index} directory
|
||||
// Create GPU-${physical-UUID}/${sub_processor_index} directory
|
||||
snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_id_sub_processor_index(gpu->id));
|
||||
|
||||
gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu->parent->procfs.dir);
|
||||
@ -863,34 +987,24 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
// Create symlink from ${gpu_id} to
|
||||
// UVM-GPU-${physical-UUID}/${sub_processor_index}
|
||||
// GPU-${physical-UUID}/${sub_processor_index}
|
||||
snprintf(symlink_name, sizeof(symlink_name), "%u", uvm_id_value(gpu->id));
|
||||
snprintf(gpu_dir_name,
|
||||
sizeof(gpu_dir_name),
|
||||
"%s/%u",
|
||||
uuid_text_buffer,
|
||||
uuid_buffer,
|
||||
uvm_id_sub_processor_index(gpu->id));
|
||||
|
||||
gpu->procfs.dir_symlink = proc_symlink(symlink_name, gpu_base_dir_entry, gpu_dir_name);
|
||||
if (gpu->procfs.dir_symlink == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
if (gpu->parent->smc.enabled) {
|
||||
// Create symlink from UVM-GPU-${GI-UUID} to
|
||||
// UVM-GPU-${physical-UUID}/${sub_processor_index}
|
||||
format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &gpu->uuid);
|
||||
// Create symlink from GI-${GI-UUID} to
|
||||
// GPU-${physical-UUID}/${sub_processor_index}
|
||||
uvm_gpu_uuid_string(uuid_buffer, &gpu->uuid);
|
||||
|
||||
gpu->procfs.gpu_instance_uuid_symlink = proc_symlink(uuid_text_buffer, gpu_base_dir_entry, gpu_dir_name);
|
||||
if (gpu->procfs.gpu_instance_uuid_symlink == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
}
|
||||
|
||||
// GPU peer files are debug only
|
||||
if (!uvm_procfs_is_debug_enabled())
|
||||
return NV_OK;
|
||||
|
||||
gpu->procfs.dir_peers = NV_CREATE_PROC_DIR(UVM_PROC_GPUS_PEER_DIR_NAME, gpu->procfs.dir);
|
||||
if (gpu->procfs.dir_peers == NULL)
|
||||
gpu->procfs.gpu_instance_uuid_symlink = proc_symlink(uuid_buffer, gpu_base_dir_entry, gpu_dir_name);
|
||||
if (gpu->procfs.gpu_instance_uuid_symlink == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
return NV_OK;
|
||||
@@ -899,7 +1013,6 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
// The kernel waits on readers to finish before returning from those calls
static void deinit_procfs_dirs(uvm_gpu_t *gpu)
{
    proc_remove(gpu->procfs.dir_peers);
    proc_remove(gpu->procfs.gpu_instance_uuid_symlink);
    proc_remove(gpu->procfs.dir_symlink);
    proc_remove(gpu->procfs.dir);
@@ -919,12 +1032,12 @@ static void deinit_procfs_files(uvm_gpu_t *gpu)
    proc_remove(gpu->procfs.info_file);
}

static void deinit_procfs_peer_cap_files(uvm_gpu_peer_t *peer_caps)
static void deinit_procfs_parent_peer_cap_files(uvm_parent_gpu_peer_t *parent_peer_caps)
{
    proc_remove(peer_caps->procfs.peer_symlink_file[0]);
    proc_remove(peer_caps->procfs.peer_symlink_file[1]);
    proc_remove(peer_caps->procfs.peer_file[0]);
    proc_remove(peer_caps->procfs.peer_file[1]);
    proc_remove(parent_peer_caps->procfs.peer_symlink_file[0]);
    proc_remove(parent_peer_caps->procfs.peer_symlink_file[1]);
    proc_remove(parent_peer_caps->procfs.peer_file[0]);
    proc_remove(parent_peer_caps->procfs.peer_file[1]);
}

static NV_STATUS init_semaphore_pools(uvm_gpu_t *gpu)
@@ -1019,11 +1132,18 @@ static NV_STATUS alloc_parent_gpu(const NvProcessorUuid *gpu_uuid,

    // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
    status = errno_to_nv_status(nv_kthread_q_init(&parent_gpu->lazy_free_q, "vidmem lazy free"));
    if (status != NV_OK)
        goto cleanup;

    nv_kref_init(&parent_gpu->gpu_kref);

    *parent_gpu_out = parent_gpu;

    return NV_OK;

cleanup:
    uvm_kvfree(parent_gpu);

    return status;
}

@@ -1239,8 +1359,8 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,

static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
{
    char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
    size_t len;
    char parent_uuid_buffer[UVM_UUID_STRING_LENGTH];
    char gi_uuid_buffer[UVM_UUID_STRING_LENGTH];
    NV_STATUS status;

    if (gpu->parent->smc.enabled) {
@@ -1258,19 +1378,14 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
        uvm_uuid_copy(&gpu->uuid, &gpu_info->uuid);
        gpu->smc.swizz_id = gpu_info->smcSwizzId;

    format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &gpu->parent->uuid);
    uvm_uuid_string(parent_uuid_buffer, &gpu->parent->uuid);
    uvm_uuid_string(gi_uuid_buffer, &gpu->uuid);
    snprintf(gpu->name,
             sizeof(gpu->name),
             "ID %u: %s",
             "ID %u: " UVM_PARENT_GPU_UUID_PREFIX "%s " UVM_GPU_UUID_PREFIX "%s",
             uvm_id_value(gpu->id),
             uuid_buffer + 4);

    format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &gpu->uuid);
    len = strlen(gpu->name);
    snprintf(gpu->name + len,
             sizeof(gpu->name) - len,
             " UVM-GI-%s",
             uuid_buffer + 8);
             parent_uuid_buffer,
             gi_uuid_buffer);

    // Initialize the per-GPU procfs dirs as early as possible so that other
    // parts of the driver can add files in them as part of their per-GPU init.
@@ -1318,6 +1433,8 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
        return status;
    }

    uvm_pmm_gpu_device_p2p_init(gpu);

    status = init_semaphore_pools(gpu);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to initialize the semaphore pool: %s, GPU %s\n",
@@ -1371,137 +1488,6 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
    return NV_OK;
}

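// Illustrative sketch, not part of this change: it shows the shape of the
// gpu->name string that the single snprintf() in the new init_gpu() builds.
// The literal prefixes "GPU-" and "GI-" stand in for UVM_PARENT_GPU_UUID_PREFIX
// and UVM_GPU_UUID_PREFIX and are assumptions for this example, as are the
// UUID strings and the ID value.
#include <stdio.h>

int main(void)
{
    char name[256];
    const char *parent_uuid = "12345678-1234-1234-1234-123456789abc"; // physical GPU UUID (example)
    const char *gi_uuid     = "87654321-4321-4321-4321-cba987654321"; // GPU instance UUID (example)
    unsigned id_value = 2;                                            // example uvm_gpu_id_t value

    snprintf(name, sizeof(name), "ID %u: " "GPU-" "%s " "GI-" "%s", id_value, parent_uuid, gi_uuid);
    printf("%s\n", name); // prints: ID 2: GPU-... GI-...
    return 0;
}
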
// Add a new gpu and register it with RM
|
||||
// TODO: Bug 2844714: Split parent-specific parts of this function out into a
|
||||
// separate add_parent_gpu() function.
|
||||
static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid,
|
||||
const uvm_gpu_id_t gpu_id,
|
||||
const UvmGpuInfo *gpu_info,
|
||||
const UvmGpuPlatformInfo *gpu_platform_info,
|
||||
uvm_parent_gpu_t *parent_gpu,
|
||||
uvm_gpu_t **gpu_out)
|
||||
{
|
||||
NV_STATUS status;
|
||||
bool alloc_parent = (parent_gpu == NULL);
|
||||
uvm_gpu_t *gpu = NULL;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
if (alloc_parent) {
|
||||
status = alloc_parent_gpu(gpu_uuid, uvm_parent_gpu_id_from_gpu_id(gpu_id), &parent_gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
gpu = alloc_gpu(parent_gpu, gpu_id);
|
||||
if (!gpu) {
|
||||
if (alloc_parent)
|
||||
uvm_parent_gpu_kref_put(parent_gpu);
|
||||
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
parent_gpu->num_retained_gpus++;
|
||||
|
||||
if (alloc_parent)
|
||||
fill_parent_gpu_info(parent_gpu, gpu_info);
|
||||
|
||||
// After this point all error clean up should be handled by remove_gpu()
|
||||
|
||||
if (!gpu_supports_uvm(parent_gpu)) {
|
||||
UVM_DBG_PRINT("Registration of non-UVM-capable GPU attempted: GPU %s\n", uvm_gpu_name(gpu));
|
||||
status = NV_ERR_NOT_SUPPORTED;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (alloc_parent) {
|
||||
status = init_parent_gpu(parent_gpu, gpu_uuid, gpu_info, gpu_platform_info);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
|
||||
status = init_gpu(gpu, gpu_info);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
status = uvm_gpu_check_ecc_error(gpu);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
atomic64_set(&gpu->retained_count, 1);
|
||||
uvm_processor_mask_set(&g_uvm_global.retained_gpus, gpu->id);
|
||||
|
||||
uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
if (alloc_parent)
|
||||
uvm_global_add_parent_gpu(parent_gpu);
|
||||
|
||||
// Mark the GPU as valid in the parent GPU's GPU table.
|
||||
UVM_ASSERT(!test_bit(uvm_id_sub_processor_index(gpu->id), parent_gpu->valid_gpus));
|
||||
__set_bit(uvm_id_sub_processor_index(gpu->id), parent_gpu->valid_gpus);
|
||||
|
||||
// Although locking correctness does not, at this early point (before the
|
||||
// GPU is visible in the table) strictly require holding the gpu_table_lock
|
||||
// in order to read gpu->isr.replayable_faults.handling, nor to enable page
|
||||
// fault interrupts (this could have been done earlier), it is best to do it
|
||||
// here, in order to avoid an interrupt storm. That way, we take advantage
|
||||
// of the spinlock_irqsave side effect of turning off local CPU interrupts,
|
||||
// part of holding the gpu_table_lock. That means that the local CPU won't
|
||||
// receive any of these interrupts, until the GPU is safely added to the
|
||||
// table (where the top half ISR can find it).
|
||||
//
|
||||
// As usual with spinlock_irqsave behavior, *other* CPUs can still handle
|
||||
// these interrupts, but the local CPU will not be slowed down (interrupted)
|
||||
// by such handling, and can quickly release the gpu_table_lock, thus
|
||||
// unblocking any other CPU's top half (which waits for the gpu_table_lock).
|
||||
if (alloc_parent && parent_gpu->isr.replayable_faults.handling) {
|
||||
parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu);
|
||||
|
||||
// Clear the interrupt bit and force the re-evaluation of the interrupt
|
||||
// condition to ensure that we don't miss any pending interrupt
|
||||
parent_gpu->fault_buffer_hal->clear_replayable_faults(parent_gpu,
|
||||
parent_gpu->fault_buffer_info.replayable.cached_get);
|
||||
}
|
||||
|
||||
// Access counters are enabled on demand
|
||||
|
||||
uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
if (gpu->parent->smc.enabled) {
|
||||
status = discover_smc_peers(gpu);
|
||||
if (status != NV_OK) {
|
||||
// Nobody can have retained the GPU yet, since we still hold the
|
||||
// global lock.
|
||||
UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1);
|
||||
atomic64_set(&gpu->retained_count, 0);
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
else if (alloc_parent) {
|
||||
status = discover_nvlink_peers(gpu);
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("Failed to discover NVLINK peers: %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(gpu));
|
||||
|
||||
// Nobody can have retained the GPU yet, since we still hold the
|
||||
// global lock.
|
||||
UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1);
|
||||
atomic64_set(&gpu->retained_count, 0);
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
*gpu_out = gpu;
|
||||
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
remove_gpu(gpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static void sync_parent_gpu_trackers(uvm_parent_gpu_t *parent_gpu,
|
||||
bool sync_replay_tracker,
|
||||
bool sync_clear_faulted_tracker)
|
||||
@ -1529,6 +1515,16 @@ static void sync_parent_gpu_trackers(uvm_parent_gpu_t *parent_gpu,
|
||||
if (status != NV_OK)
|
||||
UVM_ASSERT(status == uvm_global_get_status());
|
||||
}
|
||||
|
||||
// Sync the access counter clear tracker too.
|
||||
if (parent_gpu->access_counters_supported) {
|
||||
uvm_parent_gpu_access_counters_isr_lock(parent_gpu);
|
||||
status = uvm_tracker_wait(&parent_gpu->access_counter_buffer_info.clear_tracker);
|
||||
uvm_parent_gpu_access_counters_isr_unlock(parent_gpu);
|
||||
|
||||
if (status != NV_OK)
|
||||
UVM_ASSERT(status == uvm_global_get_status());
|
||||
}
|
||||
}
|
||||
|
||||
// Remove all references the given GPU has to other GPUs, since one of those
|
||||
@ -1629,6 +1625,8 @@ static void deinit_gpu(uvm_gpu_t *gpu)
|
||||
|
||||
deinit_semaphore_pools(gpu);
|
||||
|
||||
uvm_pmm_gpu_device_p2p_deinit(gpu);
|
||||
|
||||
uvm_pmm_sysmem_mappings_deinit(&gpu->pmm_reverse_sysmem_mappings);
|
||||
|
||||
uvm_pmm_gpu_deinit(&gpu->pmm);
|
||||
@ -1646,78 +1644,6 @@ static void deinit_gpu(uvm_gpu_t *gpu)
|
||||
gpu->magic = 0;
|
||||
}
|
||||
|
||||
// Remove a gpu and unregister it from RM
|
||||
// Note that this is also used in most error paths in add_gpu()
|
||||
static void remove_gpu(uvm_gpu_t *gpu)
|
||||
{
|
||||
NvU32 sub_processor_index;
|
||||
uvm_parent_gpu_t *parent_gpu;
|
||||
bool free_parent;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
sub_processor_index = uvm_id_sub_processor_index(gpu->id);
|
||||
parent_gpu = gpu->parent;
|
||||
|
||||
UVM_ASSERT_MSG(uvm_gpu_retained_count(gpu) == 0,
|
||||
"gpu_id %u retained_count %llu\n",
|
||||
uvm_id_value(gpu->id),
|
||||
uvm_gpu_retained_count(gpu));
|
||||
|
||||
UVM_ASSERT(parent_gpu->num_retained_gpus > 0);
|
||||
parent_gpu->num_retained_gpus--;
|
||||
|
||||
free_parent = (parent_gpu->num_retained_gpus == 0);
|
||||
|
||||
// NVLINK peers must be removed and the relevant access counter buffers must
|
||||
// be flushed before removing this GPU from the global table. See the
|
||||
// comment on discover_nvlink_peers in add_gpu.
|
||||
if (free_parent)
|
||||
destroy_nvlink_peers(gpu);
|
||||
|
||||
// uvm_mem_free and other uvm_mem APIs invoked by the Confidential Compute
|
||||
// deinitialization must be called before the GPU is removed from the global
|
||||
// table.
|
||||
//
|
||||
// TODO: Bug 2008200: Add and remove the GPU in a more reasonable spot.
|
||||
uvm_conf_computing_gpu_deinit(gpu);
|
||||
|
||||
// If the parent is not being freed, the following gpu_table_lock is only
|
||||
// needed to protect concurrent uvm_parent_gpu_find_first_valid_gpu() in BH
|
||||
// from the __clear_bit here.
|
||||
// In the free_parent case, gpu_table_lock protects the top half from the
|
||||
// uvm_global_remove_parent_gpu()
|
||||
uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
// Mark the GPU as invalid in the parent GPU's GPU table.
|
||||
__clear_bit(sub_processor_index, parent_gpu->valid_gpus);
|
||||
|
||||
// Remove the GPU from the table.
|
||||
if (free_parent)
|
||||
uvm_global_remove_parent_gpu(parent_gpu);
|
||||
|
||||
uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
uvm_processor_mask_clear(&g_uvm_global.retained_gpus, gpu->id);
|
||||
|
||||
// If the parent is being freed, stop scheduling new bottom halves and
|
||||
// update relevant software state. Else flush any pending bottom halves
|
||||
// before continuing.
|
||||
if (free_parent)
|
||||
uvm_parent_gpu_disable_isr(parent_gpu);
|
||||
else
|
||||
uvm_parent_gpu_flush_bottom_halves(parent_gpu);
|
||||
|
||||
deinit_gpu(gpu);
|
||||
|
||||
UVM_ASSERT(parent_gpu->gpus[sub_processor_index] == gpu);
|
||||
parent_gpu->gpus[sub_processor_index] = NULL;
|
||||
uvm_kvfree(gpu);
|
||||
|
||||
if (free_parent)
|
||||
deinit_parent_gpu(parent_gpu);
|
||||
}
|
||||
|
||||
// Do not not call this directly. It is called by nv_kref_put, when the
|
||||
// GPU's ref count drops to zero.
|
||||
static void uvm_parent_gpu_destroy(nv_kref_t *nv_kref)
|
||||
@ -1810,7 +1736,7 @@ static void update_stats_fault_cb(uvm_perf_event_t event_id, uvm_perf_event_data
|
||||
// The reported fault entry must be the "representative" fault entry
|
||||
UVM_ASSERT(!event_data->fault.gpu.buffer_entry->filtered);
|
||||
|
||||
parent_gpu = uvm_va_space_get_gpu(event_data->fault.space, event_data->fault.proc_id)->parent;
|
||||
parent_gpu = uvm_gpu_get(event_data->fault.proc_id)->parent;
|
||||
|
||||
fault_entry = event_data->fault.gpu.buffer_entry;
|
||||
|
||||
@ -1830,15 +1756,14 @@ static void update_stats_migration_cb(uvm_perf_event_t event_id, uvm_perf_event_
|
||||
bool is_replayable_fault;
|
||||
bool is_non_replayable_fault;
|
||||
bool is_access_counter;
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(event_data->migration.block);
|
||||
|
||||
UVM_ASSERT(event_id == UVM_PERF_EVENT_MIGRATION);
|
||||
|
||||
if (UVM_ID_IS_GPU(event_data->migration.dst))
|
||||
gpu_dst = uvm_va_space_get_gpu(va_space, event_data->migration.dst);
|
||||
gpu_dst = uvm_gpu_get(event_data->migration.dst);
|
||||
|
||||
if (UVM_ID_IS_GPU(event_data->migration.src))
|
||||
gpu_src = uvm_va_space_get_gpu(va_space, event_data->migration.src);
|
||||
gpu_src = uvm_gpu_get(event_data->migration.src);
|
||||
|
||||
if (!gpu_dst && !gpu_src)
|
||||
return;
|
||||
@ -1984,7 +1909,7 @@ static uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gp
|
||||
UVM_ASSERT(parent_gpu);
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
for_each_gpu_in_parent(parent_gpu, gpu) {
|
||||
for_each_gpu_in_parent(gpu, parent_gpu) {
|
||||
if (gpu->smc.swizz_id == swizz_id)
|
||||
return gpu;
|
||||
}
|
||||
@ -1992,6 +1917,739 @@ static uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gp
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void uvm_gpu_retain(uvm_gpu_t *gpu)
|
||||
{
|
||||
UVM_ASSERT(uvm_gpu_retained_count(gpu) > 0);
|
||||
atomic64_inc(&gpu->retained_count);
|
||||
}
|
||||
|
||||
bool uvm_parent_gpus_are_nvswitch_connected(const uvm_parent_gpu_t *parent_gpu0, const uvm_parent_gpu_t *parent_gpu1)
|
||||
{
|
||||
if (parent_gpu0 != parent_gpu1 &&
|
||||
parent_gpu0->nvswitch_info.is_nvswitch_connected &&
|
||||
parent_gpu1->nvswitch_info.is_nvswitch_connected) {
|
||||
UVM_ASSERT(parent_gpu_peer_caps(parent_gpu0, parent_gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu)
{
    // We may need to call service_interrupts() which cannot be done in the top
    // half interrupt handler so assert here as well to catch improper use as
    // early as possible.
    UVM_ASSERT(!in_interrupt());

    if (!gpu->ecc.enabled)
        return NV_OK;

    // Early out If a global ECC error is already set to not spam the logs with
    // the same error.
    if (uvm_global_get_status() == NV_ERR_ECC_ERROR)
        return NV_ERR_ECC_ERROR;

    if (*gpu->ecc.error_notifier) {
        UVM_ERR_PRINT("ECC error encountered, GPU %s\n", uvm_gpu_name(gpu));
        uvm_global_set_fatal_error(NV_ERR_ECC_ERROR);
        return NV_ERR_ECC_ERROR;
    }

    // RM hasn't seen an ECC error yet, check whether there is a pending
    // interrupt that might indicate one. We might get false positives because
    // the interrupt bits we read are not ECC-specific. They're just the
    // top-level bits for any interrupt on all engines which support ECC. On
    // Pascal for example, RM returns us a mask with the bits for GR, L2, and
    // FB, because any of those might raise an ECC interrupt. So if they're set
    // we have to ask RM to check whether it was really an ECC error (and a
    // double-bit ECC error at that), in which case it sets the notifier.
    if ((*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) == 0) {
        // No pending interrupts.
        return NV_OK;
    }

    // An interrupt that might mean an ECC error needs to be serviced, signal
    // that to the caller.
    return NV_WARN_MORE_PROCESSING_REQUIRED;
}

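// Illustrative sketch, not part of this change: the function above returns
// one of three outcomes, and (per its comments) NV_WARN_MORE_PROCESSING_REQUIRED
// means the caller should have RM service interrupts and then check again.
// Everything below is a standalone mock with invented status values; it only
// demonstrates that dispatch shape, not the driver's API.
#include <stdio.h>

typedef enum {
    STATUS_OK,                       // stands in for NV_OK
    STATUS_ECC_ERROR,                // stands in for NV_ERR_ECC_ERROR
    STATUS_MORE_PROCESSING_REQUIRED, // stands in for NV_WARN_MORE_PROCESSING_REQUIRED
} status_t;

static status_t check_ecc_no_rm(int pending_interrupt, int notifier_set)
{
    if (notifier_set)
        return STATUS_ECC_ERROR;            // RM already confirmed a double-bit error
    if (pending_interrupt)
        return STATUS_MORE_PROCESSING_REQUIRED; // might be ECC; only RM can tell
    return STATUS_OK;
}

int main(void)
{
    // A pending, not-yet-confirmed interrupt: a caller would service
    // interrupts through RM and re-check before trusting the result.
    status_t s = check_ecc_no_rm(1, 0);

    printf("%s\n", s == STATUS_MORE_PROCESSING_REQUIRED ? "service interrupts, then re-check" :
                   s == STATUS_ECC_ERROR                ? "fatal ECC error" : "ok");
    return 0;
}
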
static NV_STATUS get_parent_p2p_caps(uvm_parent_gpu_t *parent_gpu0,
|
||||
uvm_parent_gpu_t *parent_gpu1,
|
||||
UvmGpuP2PCapsParams *p2p_caps_params)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvmGpuDeviceHandle rm_device0, rm_device1;
|
||||
|
||||
if (uvm_parent_id_value(parent_gpu0->id) < uvm_parent_id_value(parent_gpu1->id)) {
|
||||
rm_device0 = parent_gpu0->rm_device;
|
||||
rm_device1 = parent_gpu1->rm_device;
|
||||
}
|
||||
else {
|
||||
rm_device0 = parent_gpu1->rm_device;
|
||||
rm_device1 = parent_gpu0->rm_device;
|
||||
}
|
||||
|
||||
memset(p2p_caps_params, 0, sizeof(*p2p_caps_params));
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceGetP2PCaps(rm_device0, rm_device1, p2p_caps_params));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceGetP2PCaps() failed with error: %s, for GPU0:%s and GPU1:%s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_parent_gpu_name(parent_gpu0),
|
||||
uvm_parent_gpu_name(parent_gpu1));
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
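// Illustrative sketch, not part of this change: the rm_device0/rm_device1
// swap in get_parent_p2p_caps() above makes the RM call independent of which
// parent GPU the caller names first, so (A, B) and (B, A) describe the same
// pair. The standalone demo below uses plain integers in place of parent GPU
// IDs and device handles.
#include <assert.h>
#include <stdio.h>

static void order_pair(unsigned a, unsigned b, unsigned *lo, unsigned *hi)
{
    // Mirror of the "if (id0 < id1) ... else ..." selection above.
    if (a < b) {
        *lo = a;
        *hi = b;
    }
    else {
        *lo = b;
        *hi = a;
    }
}

int main(void)
{
    unsigned lo0, hi0, lo1, hi1;

    order_pair(3, 7, &lo0, &hi0);
    order_pair(7, 3, &lo1, &hi1);
    assert(lo0 == lo1 && hi0 == hi1); // both argument orders name the same pair

    printf("(%u, %u)\n", lo0, hi0);
    return 0;
}
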
static NV_STATUS create_parent_p2p_object(uvm_parent_gpu_t *parent_gpu0,
|
||||
uvm_parent_gpu_t *parent_gpu1,
|
||||
NvHandle *p2p_handle)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvmGpuDeviceHandle rm_device0, rm_device1;
|
||||
|
||||
if (uvm_parent_id_value(parent_gpu0->id) < uvm_parent_id_value(parent_gpu1->id)) {
|
||||
rm_device0 = parent_gpu0->rm_device;
|
||||
rm_device1 = parent_gpu1->rm_device;
|
||||
}
|
||||
else {
|
||||
rm_device0 = parent_gpu1->rm_device;
|
||||
rm_device1 = parent_gpu0->rm_device;
|
||||
}
|
||||
|
||||
*p2p_handle = 0;
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceP2pObjectCreate(rm_device0, rm_device1, p2p_handle));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceP2pObjectCreate() failed with error: %s, for GPU0:%s and GPU1:%s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_parent_gpu_name(parent_gpu0),
|
||||
uvm_parent_gpu_name(parent_gpu1));
|
||||
return status;
|
||||
}
|
||||
|
||||
UVM_ASSERT(*p2p_handle);
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void set_optimal_p2p_write_ces(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_parent_gpu_peer_t *parent_peer_caps = parent_gpu_peer_caps(gpu0->parent, gpu1->parent);
    bool sorted;
    NvU32 ce0, ce1;

    UVM_ASSERT(parent_peer_caps->ref_count);

    if (parent_peer_caps->link_type < UVM_GPU_LINK_NVLINK_1)
        return;

    sorted = uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id);
    ce0 = parent_peer_caps->optimalNvlinkWriteCEs[sorted ? 0 : 1];
    ce1 = parent_peer_caps->optimalNvlinkWriteCEs[sorted ? 1 : 0];

    uvm_channel_manager_set_p2p_ce(gpu0->channel_manager, gpu1, ce0);
    uvm_channel_manager_set_p2p_ce(gpu1->channel_manager, gpu0, ce1);
}

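// Illustrative sketch, not part of this change: slot 0 of the CE table above
// appears to hold the optimal write CE for the (smaller ID -> larger ID)
// direction and slot 1 the reverse, matching the peer_ids[] convention noted
// elsewhere in this diff. The standalone demo below mirrors the
// "sorted ? 0 : 1" selection with made-up CE numbers and ID values.
#include <stdio.h>

int main(void)
{
    unsigned optimal_write_ces[2] = {4, 5}; // example: [0] = low->high, [1] = high->low
    unsigned id0 = 9, id1 = 2;              // example GPU ID values, in caller order

    int sorted = id0 < id1;                              // false here: gpu0 has the larger ID
    unsigned ce0 = optimal_write_ces[sorted ? 0 : 1];    // CE gpu0 uses to write to gpu1
    unsigned ce1 = optimal_write_ces[sorted ? 1 : 0];    // CE gpu1 uses to write to gpu0

    printf("gpu%u writes its peer with CE%u, gpu%u writes its peer with CE%u\n", id0, ce0, id1, ce1);
    return 0;
}
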
static int nv_procfs_read_parent_gpu_peer_caps(struct seq_file *s, void *v)
{
    if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
        return -EAGAIN;

    parent_gpu_peer_caps_print((uvm_parent_gpu_t **)s->private, s);

    uvm_up_read(&g_uvm_global.pm.lock);

    return 0;
}

static int nv_procfs_read_parent_gpu_peer_caps_entry(struct seq_file *s, void *v)
{
    UVM_ENTRY_RET(nv_procfs_read_parent_gpu_peer_caps(s, v));
}

UVM_DEFINE_SINGLE_PROCFS_FILE(parent_gpu_peer_caps_entry);

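// Illustrative sketch, not part of this change: the read handler above does
// not block on the power-management lock; if the lock cannot be taken it
// reports -EAGAIN, presumably so a procfs read never stalls behind a PM
// transition. The standalone analogue below uses a pthread rwlock in place of
// g_uvm_global.pm.lock and a printf in place of parent_gpu_peer_caps_print().
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t pm_lock = PTHREAD_RWLOCK_INITIALIZER;

static int read_peer_caps(void)
{
    if (pthread_rwlock_tryrdlock(&pm_lock) != 0)
        return -EAGAIN; // lock held for writing, e.g. a PM transition in progress

    printf("print peer caps here\n");
    pthread_rwlock_unlock(&pm_lock);
    return 0;
}

int main(void)
{
    return read_peer_caps() == 0 ? 0 : 1;
}
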
static NV_STATUS init_procfs_parent_peer_cap_files(uvm_parent_gpu_t *local, uvm_parent_gpu_t *remote, size_t local_idx)
{
    // This needs to hold a uvm_parent_gpu_id_t in decimal
    char gpu_dir_name[16];

    // This needs to hold a GPU UUID
    char symlink_name[UVM_GPU_UUID_STRING_LENGTH];
    uvm_parent_gpu_peer_t *parent_peer_caps;

    UVM_ASSERT(uvm_procfs_is_debug_enabled());

    parent_peer_caps = parent_gpu_peer_caps(local, remote);
    parent_peer_caps->procfs.pairs[local_idx][0] = local;
    parent_peer_caps->procfs.pairs[local_idx][1] = remote;

    // Create gpus/gpuA/peers/gpuB
    snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_parent_id_value(remote->id));
    parent_peer_caps->procfs.peer_file[local_idx] = NV_CREATE_PROC_FILE(gpu_dir_name,
                                                                        local->procfs.dir_peers,
                                                                        parent_gpu_peer_caps_entry,
                                                                        &parent_peer_caps->procfs.pairs[local_idx]);

    if (parent_peer_caps->procfs.peer_file[local_idx] == NULL)
        return NV_ERR_OPERATING_SYSTEM;

    // Create a symlink from UVM GPU UUID (GPU-...) to the UVM GPU ID gpuB
    uvm_parent_gpu_uuid_string(symlink_name, &remote->uuid);
    parent_peer_caps->procfs.peer_symlink_file[local_idx] = proc_symlink(symlink_name,
                                                                         local->procfs.dir_peers,
                                                                         gpu_dir_name);
    if (parent_peer_caps->procfs.peer_symlink_file[local_idx] == NULL)
        return NV_ERR_OPERATING_SYSTEM;

    return NV_OK;
}

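// Illustrative sketch, not part of this change: per direction, the function
// above ends up creating
//     gpus/<local-id>/peers/<remote-id>        (capability file)
//     gpus/<local-id>/peers/GPU-<remote-UUID>  (symlink to the file above)
// The program below only prints that naming for one invented pair so the
// layout is easy to see; the IDs and UUID are examples and the procfs root is
// omitted.
#include <stdio.h>

int main(void)
{
    unsigned local_id = 1, remote_id = 2;
    const char *remote_uuid = "GPU-12345678-1234-1234-1234-123456789abc"; // example

    printf("gpus/%u/peers/%u\n", local_id, remote_id);
    printf("gpus/%u/peers/%s -> %u\n", local_id, remote_uuid, remote_id);
    return 0;
}
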
static NV_STATUS init_procfs_parent_peer_files(uvm_parent_gpu_t *parent_gpu0, uvm_parent_gpu_t *parent_gpu1)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
if (!uvm_procfs_is_debug_enabled())
|
||||
return NV_OK;
|
||||
|
||||
status = init_procfs_parent_peer_cap_files(parent_gpu0, parent_gpu1, 0);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = init_procfs_parent_peer_cap_files(parent_gpu1, parent_gpu0, 1);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS parent_peers_init(uvm_parent_gpu_t *parent_gpu0,
|
||||
uvm_parent_gpu_t *parent_gpu1,
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps)
|
||||
{
|
||||
UvmGpuP2PCapsParams p2p_caps_params;
|
||||
uvm_gpu_link_type_t link_type;
|
||||
NvHandle p2p_handle;
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(parent_peer_caps->ref_count == 0);
|
||||
|
||||
status = create_parent_p2p_object(parent_gpu0, parent_gpu1, &p2p_handle);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = get_parent_p2p_caps(parent_gpu0, parent_gpu1, &p2p_caps_params);
|
||||
if (status != NV_OK)
|
||||
goto cleanup;
|
||||
|
||||
// check for peer-to-peer compatibility (PCI-E or NvLink).
|
||||
link_type = get_gpu_link_type(p2p_caps_params.p2pLink);
|
||||
if (link_type == UVM_GPU_LINK_INVALID || link_type == UVM_GPU_LINK_C2C) {
|
||||
status = NV_ERR_NOT_SUPPORTED;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
status = init_procfs_parent_peer_files(parent_gpu0, parent_gpu1);
|
||||
if (status != NV_OK)
|
||||
goto cleanup;
|
||||
|
||||
parent_peer_caps->ref_count = 1;
|
||||
parent_peer_caps->p2p_handle = p2p_handle;
|
||||
parent_peer_caps->link_type = link_type;
|
||||
parent_peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params.totalLinkLineRateMBps;
|
||||
|
||||
// Initialize peer ids and establish peer mappings
|
||||
// Peer id from min(gpu_id0, gpu_id1) -> max(gpu_id0, gpu_id1)
|
||||
parent_peer_caps->peer_ids[0] = p2p_caps_params.peerIds[0];
|
||||
|
||||
// Peer id from max(gpu_id0, gpu_id1) -> min(gpu_id0, gpu_id1)
|
||||
parent_peer_caps->peer_ids[1] = p2p_caps_params.peerIds[1];
|
||||
|
||||
parent_peer_caps->optimalNvlinkWriteCEs[0] = p2p_caps_params.optimalNvlinkWriteCEs[0];
|
||||
parent_peer_caps->optimalNvlinkWriteCEs[1] = p2p_caps_params.optimalNvlinkWriteCEs[1];
|
||||
|
||||
return NV_OK;
|
||||
|
||||
cleanup:
|
||||
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS parent_peers_retain(uvm_parent_gpu_t *parent_gpu0, uvm_parent_gpu_t *parent_gpu1)
{
    uvm_parent_gpu_peer_t *parent_peer_caps = parent_gpu_peer_caps(parent_gpu0, parent_gpu1);
    NV_STATUS status = NV_OK;

    if (parent_peer_caps->ref_count == 0)
        status = parent_peers_init(parent_gpu0, parent_gpu1, parent_peer_caps);
    else
        parent_peer_caps->ref_count++;

    return status;
}

static void parent_peers_destroy(uvm_parent_gpu_t *parent_gpu0,
                                 uvm_parent_gpu_t *parent_gpu1,
                                 uvm_parent_gpu_peer_t *parent_peer_caps)
{
    UVM_ASSERT(parent_peer_caps->p2p_handle);

    deinit_procfs_parent_peer_cap_files(parent_peer_caps);

    uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), parent_peer_caps->p2p_handle));

    memset(parent_peer_caps, 0, sizeof(*parent_peer_caps));
}

static void parent_peers_release(uvm_parent_gpu_t *parent_gpu0, uvm_parent_gpu_t *parent_gpu1)
{
    uvm_parent_gpu_peer_t *parent_peer_caps = parent_gpu_peer_caps(parent_gpu0, parent_gpu1);

    UVM_ASSERT(parent_peer_caps->ref_count);

    if (--parent_peer_caps->ref_count == 0)
        parent_peers_destroy(parent_gpu0, parent_gpu1, parent_peer_caps);
}

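// Illustrative sketch, not part of this change: parent_peers_retain() and
// parent_peers_release() implement a plain reference count where the first
// retain performs the expensive setup (parent_peers_init) and the last
// release tears it down (parent_peers_destroy). The standalone mock below
// shows that shape with a simple counter; the init/destroy bodies are
// placeholders.
#include <assert.h>
#include <stdio.h>

static unsigned ref_count;

static int retain(void)
{
    if (ref_count == 0) {
        // First user pays the setup cost (P2P object, procfs files, ...).
        printf("init\n");
        ref_count = 1;
    }
    else {
        ref_count++;
    }
    return 0;
}

static void release(void)
{
    assert(ref_count > 0);
    if (--ref_count == 0)
        printf("destroy\n"); // last user tears everything down
}

int main(void)
{
    retain();   // first retain initializes
    retain();   // second retain only bumps the count
    release();
    release();  // last release destroys
    return 0;
}
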
static NV_STATUS peers_init(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_peer_t *peer_caps)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(peer_caps->ref_count == 0);
|
||||
|
||||
status = parent_peers_retain(gpu0->parent, gpu1->parent);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// Establish peer mappings from each GPU to the other.
|
||||
status = uvm_mmu_create_peer_identity_mappings(gpu0, gpu1);
|
||||
if (status != NV_OK)
|
||||
goto cleanup_parent;
|
||||
|
||||
status = uvm_mmu_create_peer_identity_mappings(gpu1, gpu0);
|
||||
if (status != NV_OK)
|
||||
goto cleanup_mappings;
|
||||
|
||||
peer_caps->ref_count = 1;
|
||||
|
||||
set_optimal_p2p_write_ces(gpu0, gpu1);
|
||||
|
||||
UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
|
||||
UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);
|
||||
|
||||
// In the case of NVLINK peers, this initialization will happen during
|
||||
// add_gpu. As soon as the peer info table is assigned below, the access
|
||||
// counter bottom half could start operating on the GPU being newly
|
||||
// added and inspecting the peer caps, so all of the appropriate
|
||||
// initialization must happen before this point.
|
||||
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
|
||||
|
||||
uvm_processor_mask_set(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
|
||||
UVM_ASSERT(gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] == NULL);
|
||||
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = gpu1;
|
||||
|
||||
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
|
||||
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
|
||||
|
||||
uvm_processor_mask_set(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
|
||||
UVM_ASSERT(gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] == NULL);
|
||||
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = gpu0;
|
||||
|
||||
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
|
||||
|
||||
return NV_OK;
|
||||
|
||||
cleanup_mappings:
|
||||
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
|
||||
|
||||
cleanup_parent:
|
||||
parent_peers_release(gpu0->parent, gpu1->parent);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS peers_retain(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
uvm_gpu_peer_t *peer_caps = gpu_peer_caps(gpu0, gpu1);
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (peer_caps->ref_count == 0)
|
||||
status = peers_init(gpu0, gpu1, peer_caps);
|
||||
else
|
||||
peer_caps->ref_count++;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static void peers_destroy(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_peer_t *peer_caps)
|
||||
{
|
||||
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
|
||||
uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
|
||||
|
||||
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
|
||||
uvm_processor_mask_clear(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
|
||||
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = NULL;
|
||||
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
|
||||
|
||||
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
|
||||
uvm_processor_mask_clear(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
|
||||
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = NULL;
|
||||
uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);
|
||||
|
||||
// Flush the access counter buffer to avoid getting stale notifications for
|
||||
// accesses to GPUs to which peer access is being disabled. This is also
|
||||
// needed in the case of disabling automatic (NVLINK) peers on GPU
|
||||
// unregister, because access counter processing might still be using GPU
|
||||
// IDs queried from the peer table above which are about to be removed from
|
||||
// the global table.
|
||||
if (gpu0->parent->access_counters_supported)
|
||||
uvm_parent_gpu_access_counter_buffer_flush(gpu0->parent);
|
||||
if (gpu1->parent->access_counters_supported)
|
||||
uvm_parent_gpu_access_counter_buffer_flush(gpu1->parent);
|
||||
|
||||
parent_peers_release(gpu0->parent, gpu1->parent);
|
||||
|
||||
memset(peer_caps, 0, sizeof(*peer_caps));
|
||||
}
|
||||
|
||||
static void peers_release(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
uvm_gpu_peer_t *peer_caps = gpu_peer_caps(gpu0, gpu1);
|
||||
|
||||
UVM_ASSERT(peer_caps->ref_count);
|
||||
|
||||
if (--peer_caps->ref_count == 0)
|
||||
peers_destroy(gpu0, gpu1, peer_caps);
|
||||
}
|
||||
|
||||
static void parent_peers_destroy_nvlink(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *other_parent_gpu;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
for_each_parent_gpu(other_parent_gpu) {
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps;
|
||||
|
||||
if (other_parent_gpu == parent_gpu)
|
||||
continue;
|
||||
|
||||
parent_peer_caps = parent_gpu_peer_caps(parent_gpu, other_parent_gpu);
|
||||
|
||||
// PCIe peers need to be explicitly destroyed via UvmDisablePeerAccess
|
||||
if (parent_peer_caps->ref_count == 0 || parent_peer_caps->link_type == UVM_GPU_LINK_PCIE)
|
||||
continue;
|
||||
|
||||
parent_peers_release(parent_gpu, other_parent_gpu);
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS parent_peers_discover_nvlink(uvm_parent_gpu_t *parent_gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *other_parent_gpu;
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
for_each_parent_gpu(other_parent_gpu) {
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps;
|
||||
UvmGpuP2PCapsParams p2p_caps_params;
|
||||
|
||||
if (other_parent_gpu == parent_gpu)
|
||||
continue;
|
||||
|
||||
status = get_parent_p2p_caps(parent_gpu, other_parent_gpu, &p2p_caps_params);
|
||||
if (status != NV_OK)
|
||||
goto cleanup;
|
||||
|
||||
// PCIe peers need to be explicitly enabled via UvmEnablePeerAccess
|
||||
if (p2p_caps_params.p2pLink == UVM_LINK_TYPE_NONE || p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE)
|
||||
continue;
|
||||
|
||||
parent_peer_caps = parent_gpu_peer_caps(parent_gpu, other_parent_gpu);
|
||||
UVM_ASSERT(parent_peer_caps->ref_count == 0);
|
||||
status = parent_peers_init(parent_gpu, other_parent_gpu, parent_peer_caps);
|
||||
if (status != NV_OK)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
|
||||
cleanup:
|
||||
parent_peers_destroy_nvlink(parent_gpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
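// Illustrative sketch, not part of this change: parent_peers_discover_nvlink()
// above initializes every eligible peer pair and, on any failure, unwinds all
// pairs it already set up (the "goto cleanup" path releases every previously
// initialized pair, not only the failing one). The standalone mock below shows
// that all-or-nothing shape with a fixed array of fake peers.
#include <stdio.h>

#define N_PEERS 3

static int initialized[N_PEERS];

static int init_peer(int i)
{
    if (i == 2)
        return -1; // pretend the third pair fails
    initialized[i] = 1;
    return 0;
}

static void destroy_all_peers(void)
{
    for (int i = 0; i < N_PEERS; i++) {
        if (initialized[i]) {
            initialized[i] = 0;
            printf("tore down peer %d\n", i);
        }
    }
}

int main(void)
{
    for (int i = 0; i < N_PEERS; i++) {
        if (init_peer(i) != 0) {
            destroy_all_peers(); // unwind everything on partial failure
            return 1;
        }
    }
    return 0;
}
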
static void peers_destroy_nvlink(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *other_parent_gpu;
|
||||
uvm_parent_gpu_t *parent_gpu;
|
||||
|
||||
UVM_ASSERT(gpu);
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
parent_gpu = gpu->parent;
|
||||
|
||||
for_each_parent_gpu(other_parent_gpu) {
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps;
|
||||
uvm_gpu_t *other_gpu;
|
||||
|
||||
if (other_parent_gpu == parent_gpu)
|
||||
continue;
|
||||
|
||||
parent_peer_caps = parent_gpu_peer_caps(parent_gpu, other_parent_gpu);
|
||||
|
||||
// PCIe peers need to be explicitly destroyed via UvmDisablePeerAccess
|
||||
if (parent_peer_caps->ref_count == 0 || parent_peer_caps->link_type == UVM_GPU_LINK_PCIE)
|
||||
continue;
|
||||
|
||||
for_each_gpu_in_parent(other_gpu, other_parent_gpu) {
|
||||
uvm_gpu_peer_t *peer_caps = gpu_peer_caps(gpu, other_gpu);
|
||||
|
||||
if (peer_caps->ref_count == 0)
|
||||
continue;
|
||||
|
||||
peers_release(gpu, other_gpu);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS peers_discover_nvlink(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *parent_gpu = gpu->parent;
|
||||
uvm_parent_gpu_t *other_parent_gpu;
|
||||
uvm_gpu_t *other_gpu;
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
for_each_parent_gpu(other_parent_gpu) {
|
||||
uvm_parent_gpu_peer_t *parent_peer_caps;
|
||||
|
||||
if (other_parent_gpu == parent_gpu)
|
||||
continue;
|
||||
|
||||
parent_peer_caps = parent_gpu_peer_caps(parent_gpu, other_parent_gpu);
|
||||
if (parent_peer_caps->ref_count == 0 || parent_peer_caps->link_type == UVM_GPU_LINK_PCIE)
|
||||
continue;
|
||||
|
||||
for_each_gpu_in_parent(other_gpu, other_parent_gpu) {
|
||||
uvm_gpu_peer_t *peer_caps = gpu_peer_caps(gpu, other_gpu);
|
||||
|
||||
UVM_ASSERT(peer_caps->ref_count == 0);
|
||||
status = peers_init(gpu, other_gpu, peer_caps);
|
||||
if (status != NV_OK)
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
|
||||
cleanup:
|
||||
peers_destroy_nvlink(gpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Remove a gpu and unregister it from RM
|
||||
// Note that this is also used in most error paths in add_gpu()
|
||||
static void remove_gpu(uvm_gpu_t *gpu)
|
||||
{
|
||||
NvU32 sub_processor_index;
|
||||
uvm_parent_gpu_t *parent_gpu;
|
||||
bool free_parent;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
sub_processor_index = uvm_id_sub_processor_index(gpu->id);
|
||||
parent_gpu = gpu->parent;
|
||||
|
||||
UVM_ASSERT_MSG(uvm_gpu_retained_count(gpu) == 0,
|
||||
"gpu_id %u retained_count %llu\n",
|
||||
uvm_id_value(gpu->id),
|
||||
uvm_gpu_retained_count(gpu));
|
||||
|
||||
UVM_ASSERT(parent_gpu->num_retained_gpus > 0);
|
||||
parent_gpu->num_retained_gpus--;
|
||||
|
||||
free_parent = (parent_gpu->num_retained_gpus == 0);
|
||||
|
||||
// NVLINK peers must be removed and the relevant access counter buffers must
|
||||
// be flushed before removing this GPU from the global table.
|
||||
peers_destroy_nvlink(gpu);
|
||||
|
||||
if (free_parent)
|
||||
parent_peers_destroy_nvlink(parent_gpu);
|
||||
|
||||
// uvm_mem_free and other uvm_mem APIs invoked by the Confidential Compute
|
||||
// deinitialization must be called before the GPU is removed from the global
|
||||
// table.
|
||||
//
|
||||
// TODO: Bug 2008200: Add and remove the GPU in a more reasonable spot.
|
||||
uvm_conf_computing_gpu_deinit(gpu);
|
||||
|
||||
// If the parent is not being freed, the following gpu_table_lock is only
|
||||
// needed to protect concurrent uvm_parent_gpu_find_first_valid_gpu() in BH
|
||||
// from the __clear_bit here.
|
||||
// In the free_parent case, gpu_table_lock protects the top half from the
|
||||
// uvm_global_remove_parent_gpu()
|
||||
uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
// Mark the GPU as invalid in the parent GPU's GPU table.
|
||||
__clear_bit(sub_processor_index, parent_gpu->valid_gpus);
|
||||
|
||||
// Remove the GPU from the table.
|
||||
if (free_parent)
|
||||
uvm_global_remove_parent_gpu(parent_gpu);
|
||||
|
||||
uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
uvm_processor_mask_clear(&g_uvm_global.retained_gpus, gpu->id);
|
||||
|
||||
// If the parent is being freed, stop scheduling new bottom halves and
|
||||
// update relevant software state. Else flush any pending bottom halves
|
||||
// before continuing.
|
||||
if (free_parent)
|
||||
uvm_parent_gpu_disable_isr(parent_gpu);
|
||||
else
|
||||
uvm_parent_gpu_flush_bottom_halves(parent_gpu);
|
||||
|
||||
deinit_gpu(gpu);
|
||||
|
||||
UVM_ASSERT(parent_gpu->gpus[sub_processor_index] == gpu);
|
||||
parent_gpu->gpus[sub_processor_index] = NULL;
|
||||
uvm_kvfree(gpu);
|
||||
|
||||
if (free_parent)
|
||||
deinit_parent_gpu(parent_gpu);
|
||||
}
|
||||
|
||||
// Add a new gpu and register it with RM
|
||||
static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid,
|
||||
const uvm_gpu_id_t gpu_id,
|
||||
const UvmGpuInfo *gpu_info,
|
||||
const UvmGpuPlatformInfo *gpu_platform_info,
|
||||
uvm_parent_gpu_t *parent_gpu,
|
||||
uvm_gpu_t **gpu_out)
|
||||
{
|
||||
NV_STATUS status;
|
||||
bool alloc_parent = (parent_gpu == NULL);
|
||||
uvm_gpu_t *gpu = NULL;
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
||||
|
||||
if (alloc_parent) {
|
||||
status = alloc_parent_gpu(gpu_uuid, uvm_parent_gpu_id_from_gpu_id(gpu_id), &parent_gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
gpu = alloc_gpu(parent_gpu, gpu_id);
|
||||
if (!gpu) {
|
||||
if (alloc_parent)
|
||||
uvm_parent_gpu_kref_put(parent_gpu);
|
||||
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
parent_gpu->num_retained_gpus++;
|
||||
|
||||
if (alloc_parent)
|
||||
fill_parent_gpu_info(parent_gpu, gpu_info);
|
||||
|
||||
// After this point all error clean up should be handled by remove_gpu()
|
||||
|
||||
if (!gpu_supports_uvm(parent_gpu)) {
|
||||
UVM_DBG_PRINT("Registration of non-UVM-capable GPU attempted: GPU %s\n", uvm_gpu_name(gpu));
|
||||
status = NV_ERR_NOT_SUPPORTED;
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (alloc_parent) {
|
||||
status = init_parent_gpu(parent_gpu, gpu_uuid, gpu_info, gpu_platform_info);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
|
||||
status = init_gpu(gpu, gpu_info);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
status = uvm_gpu_check_ecc_error(gpu);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
atomic64_set(&gpu->retained_count, 1);
|
||||
uvm_processor_mask_set(&g_uvm_global.retained_gpus, gpu->id);
|
||||
|
||||
uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
if (alloc_parent)
|
||||
uvm_global_add_parent_gpu(parent_gpu);
|
||||
|
||||
// Mark the GPU as valid in the parent GPU's GPU table.
|
||||
UVM_ASSERT(!test_bit(uvm_id_sub_processor_index(gpu->id), parent_gpu->valid_gpus));
|
||||
__set_bit(uvm_id_sub_processor_index(gpu->id), parent_gpu->valid_gpus);
|
||||
|
||||
// Although locking correctness does not, at this early point (before the
|
||||
// GPU is visible in the table) strictly require holding the gpu_table_lock
|
||||
// in order to read gpu->isr.replayable_faults.handling, nor to enable page
|
||||
// fault interrupts (this could have been done earlier), it is best to do it
|
||||
// here, in order to avoid an interrupt storm. That way, we take advantage
|
||||
// of the spinlock_irqsave side effect of turning off local CPU interrupts,
|
||||
// part of holding the gpu_table_lock. That means that the local CPU won't
|
||||
// receive any of these interrupts, until the GPU is safely added to the
|
||||
// table (where the top half ISR can find it).
|
||||
//
|
||||
// As usual with spinlock_irqsave behavior, *other* CPUs can still handle
|
||||
// these interrupts, but the local CPU will not be slowed down (interrupted)
|
||||
// by such handling, and can quickly release the gpu_table_lock, thus
|
||||
// unblocking any other CPU's top half (which waits for the gpu_table_lock).
|
||||
if (alloc_parent && parent_gpu->isr.replayable_faults.handling) {
|
||||
parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu);
|
||||
|
||||
// Clear the interrupt bit and force the re-evaluation of the interrupt
|
||||
// condition to ensure that we don't miss any pending interrupt
|
||||
parent_gpu->fault_buffer_hal->clear_replayable_faults(parent_gpu,
|
||||
parent_gpu->fault_buffer_info.replayable.cached_get);
|
||||
}
|
||||
|
||||
// Access counters are enabled on demand
|
||||
|
||||
uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock);
|
||||
|
||||
if (alloc_parent) {
|
||||
status = parent_peers_discover_nvlink(parent_gpu);
|
||||
if (status != NV_OK)
|
||||
goto error_retained;
|
||||
}
|
||||
|
||||
status = peers_discover_nvlink(gpu);
|
||||
if (status != NV_OK)
|
||||
goto error_retained;
|
||||
|
||||
*gpu_out = gpu;
|
||||
|
||||
return NV_OK;
|
||||
|
||||
error_retained:
|
||||
UVM_ERR_PRINT("Failed to discover NVLINK peers: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
|
||||
|
||||
// Nobody can have retained the GPU yet, since we still hold the
|
||||
// global lock.
|
||||
UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1);
|
||||
atomic64_set(&gpu->retained_count, 0);
|
||||
error:
|
||||
remove_gpu(gpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Increment the refcount for the GPU with the given UUID. If this is the first
|
||||
// time that this UUID is retained, the GPU is added to UVM.
|
||||
// When SMC partitioning is enabled, user_rm_device contains the user handles
|
||||
@ -2086,12 +2744,6 @@ NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid,
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_gpu_retain(uvm_gpu_t *gpu)
|
||||
{
|
||||
UVM_ASSERT(uvm_gpu_retained_count(gpu) > 0);
|
||||
atomic64_inc(&gpu->retained_count);
|
||||
}
|
||||
|
||||
void uvm_gpu_release_locked(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *parent_gpu = gpu->parent;
|
||||
@ -2115,496 +2767,20 @@ void uvm_gpu_release(uvm_gpu_t *gpu)
|
||||
uvm_mutex_unlock(&g_uvm_global.global_lock);
|
||||
}
|
||||
|
||||
// Note: Peer table is an upper triangular matrix packed into a flat array.
// This function converts an index of 2D array of size [N x N] into an index
// of upper triangular array of size [((N - 1) * ((N - 1) + 1)) / 2] which
// does not include diagonal elements.
NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1)
{
    NvU32 square_index, triangular_index;
    NvU32 gpu_index0 = uvm_id_gpu_index(gpu_id0);
    NvU32 gpu_index1 = uvm_id_gpu_index(gpu_id1);

    UVM_ASSERT(!uvm_id_equal(gpu_id0, gpu_id1));

    // Calculate an index of 2D array by re-ordering indices to always point
    // to the same entry.
    square_index = min(gpu_index0, gpu_index1) * UVM_ID_MAX_GPUS +
                   max(gpu_index0, gpu_index1);

    // Calculate and subtract number of lower triangular matrix elements till
    // the current row (which includes diagonal elements) to get the correct
    // index in an upper triangular matrix.
    // Note: As gpu_id can be [1, N), no extra logic is needed to calculate
    // diagonal elements.
    triangular_index = square_index - SUM_FROM_0_TO_N(min(uvm_id_value(gpu_id0), uvm_id_value(gpu_id1)));

    UVM_ASSERT(triangular_index < UVM_MAX_UNIQUE_GPU_PAIRS);

    return triangular_index;
}

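// Illustrative sketch, not part of this change: a standalone check of the
// triangular packing above for a few small pairs. UVM_ID_MAX_GPUS is assumed
// to be 32 here, SUM_FROM_0_TO_N(n) is assumed to be n*(n+1)/2, and gpu_index
// is assumed to be gpu_id_value - 1, matching the "gpu_id can be [1, N)" note
// in the comment above.
#include <assert.h>
#include <stdio.h>

#define MAX_GPUS 32u
#define SUM_FROM_0_TO_N(n) ((n) * ((n) + 1) / 2)

static unsigned peer_table_index(unsigned id0, unsigned id1)
{
    unsigned i0 = id0 - 1, i1 = id1 - 1; // gpu_index = id value - 1 (assumption)
    unsigned lo = i0 < i1 ? i0 : i1;
    unsigned hi = i0 < i1 ? i1 : i0;
    unsigned square = lo * MAX_GPUS + hi;
    unsigned lo_id = id0 < id1 ? id0 : id1;

    return square - SUM_FROM_0_TO_N(lo_id);
}

int main(void)
{
    // Pairs involving GPU 1 occupy the first row of the packed table...
    assert(peer_table_index(1, 2) == 0);
    assert(peer_table_index(1, 32) == 30);
    // ...and the row for GPU 2 starts right after it, with no gaps.
    assert(peer_table_index(2, 3) == 31);
    // The index is symmetric in its arguments.
    assert(peer_table_index(7, 3) == peer_table_index(3, 7));

    printf("packed peer-table indexing checks out\n");
    return 0;
}
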
NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu)
|
||||
{
|
||||
// We may need to call service_interrupts() which cannot be done in the top
|
||||
// half interrupt handler so assert here as well to catch improper use as
|
||||
// early as possible.
|
||||
UVM_ASSERT(!in_interrupt());
|
||||
|
||||
if (!gpu->ecc.enabled)
|
||||
return NV_OK;
|
||||
|
||||
// Early out If a global ECC error is already set to not spam the logs with
|
||||
// the same error.
|
||||
if (uvm_global_get_status() == NV_ERR_ECC_ERROR)
|
||||
return NV_ERR_ECC_ERROR;
|
||||
|
||||
if (*gpu->ecc.error_notifier) {
|
||||
UVM_ERR_PRINT("ECC error encountered, GPU %s\n", uvm_gpu_name(gpu));
|
||||
uvm_global_set_fatal_error(NV_ERR_ECC_ERROR);
|
||||
return NV_ERR_ECC_ERROR;
|
||||
}
|
||||
|
||||
// RM hasn't seen an ECC error yet, check whether there is a pending
|
||||
// interrupt that might indicate one. We might get false positives because
|
||||
// the interrupt bits we read are not ECC-specific. They're just the
|
||||
// top-level bits for any interrupt on all engines which support ECC. On
|
||||
// Pascal for example, RM returns us a mask with the bits for GR, L2, and
|
||||
// FB, because any of those might raise an ECC interrupt. So if they're set
|
||||
// we have to ask RM to check whether it was really an ECC error (and a
|
||||
// double-bit ECC error at that), in which case it sets the notifier.
|
||||
if ((*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) == 0) {
|
||||
// No pending interrupts.
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// An interrupt that might mean an ECC error needs to be serviced, signal
|
||||
// that to the caller.
|
||||
return NV_WARN_MORE_PROCESSING_REQUIRED;
|
||||
}
|
||||
|
||||
static NV_STATUS get_p2p_caps(uvm_gpu_t *gpu0,
|
||||
uvm_gpu_t *gpu1,
|
||||
UvmGpuP2PCapsParams *p2p_caps_params)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvmGpuDeviceHandle rm_device0, rm_device1;
|
||||
|
||||
if (uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id)) {
|
||||
rm_device0 = uvm_gpu_device_handle(gpu0);
|
||||
rm_device1 = uvm_gpu_device_handle(gpu1);
|
||||
}
|
||||
else {
|
||||
rm_device0 = uvm_gpu_device_handle(gpu1);
|
||||
rm_device1 = uvm_gpu_device_handle(gpu0);
|
||||
}
|
||||
|
||||
memset(p2p_caps_params, 0, sizeof(*p2p_caps_params));
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceGetP2PCaps(rm_device0, rm_device1, p2p_caps_params));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceGetP2PCaps() failed with error: %s, for GPU0:%s and GPU1:%s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(gpu0),
|
||||
uvm_gpu_name(gpu1));
|
||||
return status;
|
||||
}
|
||||
|
||||
if (p2p_caps_params->p2pLink != UVM_LINK_TYPE_NONE) {
|
||||
// P2P is not supported under SMC partitioning
|
||||
UVM_ASSERT(!gpu0->parent->smc.enabled);
|
||||
UVM_ASSERT(!gpu1->parent->smc.enabled);
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS create_p2p_object(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, NvHandle *p2p_handle)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvmGpuDeviceHandle rm_device0, rm_device1;
|
||||
|
||||
if (uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id)) {
|
||||
rm_device0 = uvm_gpu_device_handle(gpu0);
|
||||
rm_device1 = uvm_gpu_device_handle(gpu1);
|
||||
}
|
||||
else {
|
||||
rm_device0 = uvm_gpu_device_handle(gpu1);
|
||||
rm_device1 = uvm_gpu_device_handle(gpu0);
|
||||
}
|
||||
|
||||
*p2p_handle = 0;
|
||||
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceP2pObjectCreate(rm_device0, rm_device1, p2p_handle));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceP2pObjectCreate() failed with error: %s, for GPU0:%s and GPU1:%s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_gpu_name(gpu0),
|
||||
uvm_gpu_name(gpu1));
|
||||
return status;
|
||||
}
|
||||
|
||||
UVM_ASSERT(*p2p_handle);
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static void set_optimal_p2p_write_ces(const UvmGpuP2PCapsParams *p2p_caps_params,
|
||||
const uvm_gpu_peer_t *peer_caps,
|
||||
uvm_gpu_t *gpu0,
|
||||
uvm_gpu_t *gpu1)
|
||||
{
|
||||
bool sorted;
|
||||
NvU32 ce0, ce1;
|
||||
|
||||
if (peer_caps->link_type < UVM_GPU_LINK_NVLINK_1)
|
||||
return;
|
||||
|
||||
sorted = uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id);
|
||||
ce0 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 0 : 1];
|
||||
ce1 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 1 : 0];
|
||||
|
||||
uvm_channel_manager_set_p2p_ce(gpu0->channel_manager, gpu1, ce0);
|
||||
uvm_channel_manager_set_p2p_ce(gpu1->channel_manager, gpu0, ce1);
|
||||
}
|
||||
|
||||
static int nv_procfs_read_gpu_peer_caps(struct seq_file *s, void *v)
|
||||
{
|
||||
if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
|
||||
return -EAGAIN;
|
||||
|
||||
gpu_peer_caps_print((uvm_gpu_t **)s->private, s);
|
||||
|
||||
uvm_up_read(&g_uvm_global.pm.lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nv_procfs_read_gpu_peer_caps_entry(struct seq_file *s, void *v)
|
||||
{
|
||||
UVM_ENTRY_RET(nv_procfs_read_gpu_peer_caps(s, v));
|
||||
}
|
||||
|
||||
UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_peer_caps_entry);
|
||||
|
||||
static NV_STATUS init_procfs_peer_cap_files(uvm_gpu_t *local, uvm_gpu_t *remote, size_t local_idx)
|
||||
{
|
||||
// This needs to hold a gpu_id_t in decimal
|
||||
char gpu_dir_name[16];
|
||||
|
||||
// This needs to hold a GPU UUID
|
||||
char symlink_name[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
|
||||
uvm_gpu_peer_t *peer_caps;
|
||||
|
||||
if (!uvm_procfs_is_enabled())
|
||||
return NV_OK;
|
||||
|
||||
peer_caps = uvm_gpu_peer_caps(local, remote);
|
||||
peer_caps->procfs.pairs[local_idx][0] = local;
|
||||
peer_caps->procfs.pairs[local_idx][1] = remote;
|
||||
|
||||
// Create gpus/gpuA/peers/gpuB
|
||||
snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_id_value(remote->id));
|
||||
peer_caps->procfs.peer_file[local_idx] = NV_CREATE_PROC_FILE(gpu_dir_name,
|
||||
local->procfs.dir_peers,
|
||||
gpu_peer_caps_entry,
|
||||
&peer_caps->procfs.pairs[local_idx]);
|
||||
|
||||
if (peer_caps->procfs.peer_file[local_idx] == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
// Create a symlink from UVM GPU UUID (UVM-GPU-...) to the UVM GPU ID gpuB
|
||||
format_uuid_to_buffer(symlink_name, sizeof(symlink_name), &remote->uuid);
|
||||
peer_caps->procfs.peer_symlink_file[local_idx] = proc_symlink(symlink_name,
|
||||
local->procfs.dir_peers,
|
||||
gpu_dir_name);
|
||||
if (peer_caps->procfs.peer_symlink_file[local_idx] == NULL)
|
||||
return NV_ERR_OPERATING_SYSTEM;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS init_procfs_peer_files(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
if (!uvm_procfs_is_debug_enabled())
|
||||
return NV_OK;
|
||||
|
||||
status = init_procfs_peer_cap_files(gpu0, gpu1, 0);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = init_procfs_peer_cap_files(gpu1, gpu0, 1);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
|
||||
uvm_gpu_t *gpu1,
|
||||
const UvmGpuP2PCapsParams *p2p_caps_params,
|
||||
uvm_gpu_peer_t *peer_caps)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(p2p_caps_params->p2pLink != UVM_LINK_TYPE_C2C);
|
||||
|
||||
// check for peer-to-peer compatibility (PCI-E or NvLink).
|
||||
peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
|
||||
if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_C2C)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps;
|
||||
|
||||
// Initialize peer ids and establish peer mappings
|
||||
// Peer id from min(gpu_id0, gpu_id1) -> max(gpu_id0, gpu_id1)
|
||||
peer_caps->peer_ids[0] = p2p_caps_params->peerIds[0];
|
||||
|
||||
// Peer id from max(gpu_id0, gpu_id1) -> min(gpu_id0, gpu_id1)
|
||||
peer_caps->peer_ids[1] = p2p_caps_params->peerIds[1];
|
||||
|
||||
// Establish peer mappings from each GPU to the other.
|
||||
status = uvm_mmu_create_peer_identity_mappings(gpu0, gpu1);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_mmu_create_peer_identity_mappings(gpu1, gpu0);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1);
|
||||
|
||||
UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
|
||||
UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);
|
||||
|
||||
// In the case of NVLINK peers, this initialization will happen during
|
||||
// add_gpu. As soon as the peer info table is assigned below, the access
|
||||
// counter bottom half could start operating on the GPU being newly
|
||||
// added and inspecting the peer caps, so all of the appropriate
|
||||
// initialization must happen before this point.
|
||||
uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
|
||||
|
||||
uvm_processor_mask_set(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
|
||||
UVM_ASSERT(gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] == NULL);
|
||||
gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = gpu1;
|
||||
|
||||
uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);
|
||||
uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
|
||||
|
||||
uvm_processor_mask_set(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
|
||||
UVM_ASSERT(gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] == NULL);
|
||||
gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = gpu0;
|
||||
|
||||
    uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);

    return init_procfs_peer_files(gpu0, gpu1);
}

static NV_STATUS discover_smc_peers(uvm_gpu_t *gpu)
{
    NvU32 sub_processor_index;
    uvm_gpu_t *other_gpu;
    NV_STATUS status;

    UVM_ASSERT(gpu);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);
    UVM_ASSERT(gpu->parent->smc.enabled);

    for_each_sub_processor_index(sub_processor_index) {
        uvm_gpu_peer_t *peer_caps;

        other_gpu = gpu->parent->gpus[sub_processor_index];
        if (!other_gpu || other_gpu == gpu)
            continue;

        peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);
        if (peer_caps->ref_count == 1)
            continue;

        UVM_ASSERT(peer_caps->ref_count == 0);

        memset(peer_caps, 0, sizeof(*peer_caps));
        peer_caps->ref_count = 1;

        status = init_procfs_peer_files(gpu, other_gpu);
        if (status != NV_OK) {
            peer_caps->ref_count = 0;
            return status;
        }
    }

    return NV_OK;
}

static NV_STATUS enable_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    NV_STATUS status = NV_OK;
    UvmGpuP2PCapsParams p2p_caps_params;
    uvm_gpu_peer_t *peer_caps;
    NvHandle p2p_handle;

    UVM_ASSERT(gpu0);
    UVM_ASSERT(gpu1);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
    UVM_ASSERT(peer_caps->link_type == UVM_GPU_LINK_INVALID);
    UVM_ASSERT(peer_caps->ref_count == 0);

    status = create_p2p_object(gpu0, gpu1, &p2p_handle);
    if (status != NV_OK)
        return status;

    // Store the handle in the global table.
    peer_caps->p2p_handle = p2p_handle;

    status = get_p2p_caps(gpu0, gpu1, &p2p_caps_params);
    if (status != NV_OK)
        goto cleanup;

    // Sanity checks
    UVM_ASSERT(p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE);

    status = init_peer_access(gpu0, gpu1, &p2p_caps_params, peer_caps);
    if (status != NV_OK)
        goto cleanup;

    return NV_OK;

cleanup:
    disable_peer_access(gpu0, gpu1);
    return status;
}

static NV_STATUS enable_nvlink_peer_access(uvm_gpu_t *gpu0,
                                           uvm_gpu_t *gpu1,
                                           UvmGpuP2PCapsParams *p2p_caps_params)
{
    NV_STATUS status = NV_OK;
    NvHandle p2p_handle;
    uvm_gpu_peer_t *peer_caps;

    UVM_ASSERT(gpu0);
    UVM_ASSERT(gpu1);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
    UVM_ASSERT(peer_caps->ref_count == 0);
    peer_caps->ref_count = 1;

    // Create P2P object for direct NVLink peers
    status = create_p2p_object(gpu0, gpu1, &p2p_handle);
    if (status != NV_OK) {
        UVM_ERR_PRINT("failed to create a P2P object with error: %s, for GPU1:%s and GPU2:%s \n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu0),
                      uvm_gpu_name(gpu1));
        return status;
    }

    UVM_ASSERT(p2p_handle != 0);

    // Store the handle in the global table.
    peer_caps->p2p_handle = p2p_handle;

    // Update p2p caps after p2p object creation as it generates the peer ids.
    status = get_p2p_caps(gpu0, gpu1, p2p_caps_params);
    if (status != NV_OK)
        goto cleanup;

    status = init_peer_access(gpu0, gpu1, p2p_caps_params, peer_caps);
    if (status != NV_OK)
        goto cleanup;

    return NV_OK;

cleanup:
    disable_peer_access(gpu0, gpu1);
    return status;
}

static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *other_gpu;

    UVM_ASSERT(gpu);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);
    UVM_ASSERT(!gpu->parent->smc.enabled);

    for_each_gpu(other_gpu) {
        UvmGpuP2PCapsParams p2p_caps_params;

        if ((other_gpu == gpu) || other_gpu->parent->smc.enabled)
            continue;

        status = get_p2p_caps(gpu, other_gpu, &p2p_caps_params);
        if (status != NV_OK)
            goto cleanup;

        // PCIe peers need to be explicitly enabled via UvmEnablePeerAccess
        if (p2p_caps_params.p2pLink == UVM_LINK_TYPE_NONE || p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE)
            continue;

        status = enable_nvlink_peer_access(gpu, other_gpu, &p2p_caps_params);
        if (status != NV_OK)
            goto cleanup;
    }

    return NV_OK;

cleanup:
    destroy_nvlink_peers(gpu);

    return status;
}

static void destroy_nvlink_peers(uvm_gpu_t *gpu)
{
    uvm_gpu_t *other_gpu;

    UVM_ASSERT(gpu);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    if (gpu->parent->smc.enabled)
        return;

    for_each_gpu(other_gpu) {
        uvm_gpu_peer_t *peer_caps;

        if ((other_gpu == gpu) || other_gpu->parent->smc.enabled)
            continue;

        peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);

        // PCIe peers need to be explicitly destroyed via UvmDisablePeerAccess
        if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_PCIE)
            continue;

        disable_peer_access(gpu, other_gpu);
    }
}

NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_peer_t *peer_caps;
    NV_STATUS status;

    UVM_ASSERT(gpu0);
    UVM_ASSERT(gpu1);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
    status = peers_retain(gpu0, gpu1);
    if (status != NV_OK)
        return status;

    // Insert an entry into global peer table, if not present.
    if (peer_caps->link_type == UVM_GPU_LINK_INVALID) {
        UVM_ASSERT(peer_caps->ref_count == 0);

        status = enable_pcie_peer_access(gpu0, gpu1);
        if (status != NV_OK)
            return status;
    }
    else if (peer_caps->link_type != UVM_GPU_LINK_PCIE) {
        if (uvm_parent_gpu_peer_link_type(gpu0->parent, gpu1->parent) != UVM_GPU_LINK_PCIE) {
            peers_release(gpu0, gpu1);
            return NV_ERR_INVALID_DEVICE;
        }

@@ -2613,103 +2789,53 @@ NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
    uvm_gpu_retain(gpu0);
    uvm_gpu_retain(gpu1);

    peer_caps->ref_count++;

    return status;
}

static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_gpu_peer_t *peer_caps;
    NvHandle p2p_handle = 0;

    UVM_ASSERT(gpu0);
    UVM_ASSERT(gpu1);

    uvm_assert_mutex_locked(&g_uvm_global.global_lock);

    peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);

    if (uvm_procfs_is_debug_enabled())
        deinit_procfs_peer_cap_files(peer_caps);

    p2p_handle = peer_caps->p2p_handle;
    UVM_ASSERT(p2p_handle);

    uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
    uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);

    uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));

    UVM_ASSERT(uvm_gpu_get(gpu0->id) == gpu0);
    UVM_ASSERT(uvm_gpu_get(gpu1->id) == gpu1);

    uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
    uvm_processor_mask_clear(&gpu0->peer_info.peer_gpu_mask, gpu1->id);
    gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = NULL;
    uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);

    uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock);
    uvm_processor_mask_clear(&gpu1->peer_info.peer_gpu_mask, gpu0->id);
    gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = NULL;
    uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock);

    // Flush the access counter buffer to avoid getting stale notifications for
    // accesses to GPUs to which peer access is being disabled. This is also
    // needed in the case of disabling automatic (NVLINK) peers on GPU
    // unregister, because access counter processing might still be using GPU
    // IDs queried from the peer table above which are about to be removed from
    // the global table.
    if (gpu0->parent->access_counters_supported)
        uvm_parent_gpu_access_counter_buffer_flush(gpu0->parent);
    if (gpu1->parent->access_counters_supported)
        uvm_parent_gpu_access_counter_buffer_flush(gpu1->parent);

    memset(peer_caps, 0, sizeof(*peer_caps));
    return NV_OK;
}

void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_gpu_peer_t *peer_caps;
    UVM_ASSERT(gpu0);
    UVM_ASSERT(gpu1);
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);
    UVM_ASSERT(uvm_parent_gpu_peer_link_type(gpu0->parent, gpu1->parent) == UVM_GPU_LINK_PCIE);

    peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);

    UVM_ASSERT(peer_caps->ref_count > 0);
    UVM_ASSERT(peer_caps->link_type == UVM_GPU_LINK_PCIE);
    peer_caps->ref_count--;

    if (peer_caps->ref_count == 0)
        disable_peer_access(gpu0, gpu1);
    peers_release(gpu0, gpu1);

    uvm_gpu_release_locked(gpu0);
    uvm_gpu_release_locked(gpu1);
}

static uvm_aperture_t uvm_gpu_peer_caps_aperture(uvm_gpu_peer_t *peer_caps, uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu)
uvm_gpu_link_type_t uvm_parent_gpu_peer_link_type(uvm_parent_gpu_t *parent_gpu0, uvm_parent_gpu_t *parent_gpu1)
{
    size_t peer_index;
    uvm_parent_gpu_peer_t *parent_peer_caps;

    // MIG instances in the same physical GPU have vidmem addresses
    if (local_gpu->parent == remote_gpu->parent)
        return UVM_APERTURE_VID;
    if (parent_gpu0 == parent_gpu1)
        return UVM_GPU_LINK_INVALID;

    UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_INVALID);
    parent_peer_caps = parent_gpu_peer_caps(parent_gpu0, parent_gpu1);
    if (parent_peer_caps->ref_count == 0)
        return UVM_GPU_LINK_INVALID;

    if (uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id))
        peer_index = 0;
    else
        peer_index = 1;

    return UVM_APERTURE_PEER(peer_caps->peer_ids[peer_index]);
    return parent_peer_caps->link_type;
}
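
// Illustrative sketch only, not part of this change: the peer_ids[] indexing
// convention used above (slot 0 describes min(id) -> max(id), slot 1 the
// reverse) can be read as a tiny helper. It assumes nothing beyond
// uvm_id_value() giving a total order over GPU ids, as the code above relies on.
static size_t example_peer_index(const uvm_gpu_t *local_gpu, const uvm_gpu_t *remote_gpu)
{
    // The GPU with the smaller id always reads slot 0, its peer reads slot 1.
    return (uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id)) ? 0 : 1;
}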
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu)
{
    uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(local_gpu, remote_gpu);
    return uvm_gpu_peer_caps_aperture(peer_caps, local_gpu, remote_gpu);
    uvm_parent_gpu_peer_t *parent_peer_caps;

    // MIG instances in the same physical GPU have vidmem addresses
    if (uvm_gpus_are_smc_peers(local_gpu, remote_gpu))
        return UVM_APERTURE_VID;

    parent_peer_caps = parent_gpu_peer_caps(local_gpu->parent, remote_gpu->parent);
    return parent_gpu_peer_aperture(local_gpu->parent, remote_gpu->parent, parent_peer_caps);
}

NvU64 uvm_gpu_peer_ref_count(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    UVM_ASSERT(!uvm_gpus_are_smc_peers(gpu0, gpu1));

    return gpu_peer_caps(gpu0, gpu1)->ref_count;
}

uvm_aperture_t uvm_get_page_tree_location(const uvm_parent_gpu_t *parent_gpu)
@@ -2741,10 +2867,11 @@ uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_p

    for_each_gpu_id_in_mask(id, &gpu->peer_info.peer_gpu_mask) {
        uvm_gpu_t *other_gpu = gpu->peer_info.peer_gpus[uvm_id_gpu_index(id)];

        UVM_ASSERT(other_gpu);
        UVM_ASSERT(!uvm_gpus_are_smc_peers(gpu, other_gpu));

        if (uvm_gpus_are_nvswitch_connected(gpu, other_gpu)) {
        if (uvm_parent_gpus_are_nvswitch_connected(gpu->parent, other_gpu->parent)) {
            // NVSWITCH connected systems use an extended physical address to
            // map to peers. Find the physical memory 'slot' containing the
            // given physical address to find the peer gpu that owns the
@@ -2766,12 +2893,6 @@ uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_p
    return id;
}

uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1)
{
    NvU32 table_index = uvm_gpu_peer_table_index(gpu_id0, gpu_id1);
    return &g_uvm_global.peers[table_index];
}

static NvU64 instance_ptr_to_key(uvm_gpu_phys_address_t instance_ptr)
{
    NvU64 key;

@@ -49,9 +49,13 @@
#include <linux/mmu_notifier.h>
#include "uvm_conf_computing.h"

// Buffer length to store uvm gpu id, RM device name and gpu uuid.
#define UVM_GPU_NICE_NAME_BUFFER_LENGTH (sizeof("ID 999: : ") + \
    UVM_GPU_NAME_LENGTH + UVM_GPU_UUID_TEXT_BUFFER_LENGTH)
#define UVM_PARENT_GPU_UUID_PREFIX "GPU-"
#define UVM_GPU_UUID_PREFIX "GI-"

// UVM_UUID_STRING_LENGTH already includes NULL, don't double-count it with
// sizeof()
#define UVM_PARENT_GPU_UUID_STRING_LENGTH (sizeof(UVM_PARENT_GPU_UUID_PREFIX) - 1 + UVM_UUID_STRING_LENGTH)
#define UVM_GPU_UUID_STRING_LENGTH (sizeof(UVM_GPU_UUID_PREFIX) - 1 + UVM_UUID_STRING_LENGTH)

#define UVM_GPU_MAGIC_VALUE 0xc001d00d12341993ULL

@@ -184,29 +188,45 @@ struct uvm_service_block_context_struct

typedef struct
{
    // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
    // VMA. Used for batching ATS faults in a vma. This is unused for access
    // counter service requests.
    uvm_page_mask_t read_fault_mask;
    union
    {
        struct
        {
            // Mask of read faulted pages in a UVM_VA_BLOCK_SIZE aligned region
            // of a SAM VMA. Used for batching ATS faults in a vma.
            uvm_page_mask_t read_fault_mask;

    // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region of a
    // SAM VMA. Used for batching ATS faults in a vma. This is unused for access
    // counter service requests.
    uvm_page_mask_t write_fault_mask;
            // Mask of write faulted pages in a UVM_VA_BLOCK_SIZE aligned region
            // of a SAM VMA. Used for batching ATS faults in a vma.
            uvm_page_mask_t write_fault_mask;

    // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE aligned region
    // of a SAM VMA. Used to return ATS fault status. This is unused for access
    // counter service requests.
    uvm_page_mask_t faults_serviced_mask;
            // Mask of all faulted pages in a UVM_VA_BLOCK_SIZE aligned region
            // of a SAM VMA. This is a logical or of read_fault_mask and
            // write_mask.
            uvm_page_mask_t accessed_mask;

    // Mask of successfully serviced read faults on pages in write_fault_mask.
    // This is unused for access counter service requests.
    uvm_page_mask_t reads_serviced_mask;
            // Mask of successfully serviced pages in a UVM_VA_BLOCK_SIZE
            // aligned region of a SAM VMA. Used to return ATS fault status.
            uvm_page_mask_t faults_serviced_mask;

    // Mask of all accessed pages in a UVM_VA_BLOCK_SIZE aligned region of a SAM
    // VMA. This is used as input for access counter service requests and output
    // of fault service requests.
    uvm_page_mask_t accessed_mask;
            // Mask of successfully serviced read faults on pages in
            // write_fault_mask.
            uvm_page_mask_t reads_serviced_mask;

        } faults;

        struct
        {
            // Mask of all accessed pages in a UVM_VA_BLOCK_SIZE aligned region
            // of a SAM VMA.
            uvm_page_mask_t accessed_mask;

            // Mask of successfully migrated pages in a UVM_VA_BLOCK_SIZE
            // aligned region of a SAM VMA.
            uvm_page_mask_t migrated_mask;

        } access_counters;
    };

    // Client type of the service requestor.
    uvm_fault_client_type_t client_type;
@@ -633,9 +653,10 @@ struct uvm_gpu_struct
    NvProcessorUuid uuid;

    // Nice printable name in the format:
    // ID: 999: GPU-<parent_uuid> UVM-GI-<gi_uuid>.
    // UVM_GPU_UUID_TEXT_BUFFER_LENGTH includes the null character.
    char name[9 + 2 * UVM_GPU_UUID_TEXT_BUFFER_LENGTH];
    // ID: 999: GPU-<parent_uuid> GI-<gi_uuid>
    // UVM_PARENT_GPU_UUID_STRING_LENGTH includes a NULL character but will be
    // used for a space instead.
    char name[sizeof("ID: 999: ") - 1 + UVM_PARENT_GPU_UUID_STRING_LENGTH - 1 + 1 + UVM_GPU_UUID_STRING_LENGTH];

    // Refcount of the gpu, i.e. how many times it has been retained. This is
    // roughly a count of how many times it has been registered with a VA space,
@@ -682,6 +703,12 @@ struct uvm_gpu_struct
            bool enabled;
            unsigned int node_id;
        } numa;

        // Physical address of the start of statically mapped fb memory in BAR1
        NvU64 static_bar1_start;

        // Size of statically mapped fb memory in BAR1.
        NvU64 static_bar1_size;
    } mem_info;

    struct
@@ -706,9 +733,6 @@ struct uvm_gpu_struct
    struct
    {
        // Mask of peer_gpus set
        //
        // We can use a regular processor id because P2P is not allowed between
        // partitioned GPUs when SMC is enabled
        uvm_processor_mask_t peer_gpu_mask;

        // lazily-populated array of peer GPUs, indexed by the peer's GPU index
@@ -859,16 +883,19 @@ struct uvm_gpu_struct

    struct
    {
        // "gpus/UVM-GPU-${physical-UUID}/${sub_processor_index}/"
        struct proc_dir_entry *dir;

        // "gpus/${gpu_id}" -> "UVM-GPU-${physical-UUID}/${sub_processor_index}"
        struct proc_dir_entry *dir_symlink;

        // The GPU instance UUID symlink if SMC is enabled.
        // The GPU instance UUID symlink.
        // "gpus/UVM-GI-${GI-UUID}" ->
        // "UVM-GPU-${physical-UUID}/${sub_processor_index}"
        struct proc_dir_entry *gpu_instance_uuid_symlink;

        // "gpus/UVM-GPU-${physical-UUID}/${sub_processor_index}/info"
        struct proc_dir_entry *info_file;

        struct proc_dir_entry *dir_peers;
    } procfs;

    // Placeholder for per-GPU performance heuristics information
@@ -876,6 +903,13 @@ struct uvm_gpu_struct

    // Force pushbuffer's GPU VA to be >= 1TB; used only for testing purposes.
    bool uvm_test_force_upper_pushbuffer_segment;

    // Have we initialised device p2p pages.
    bool device_p2p_initialised;

    // Used to protect allocation of p2p_mem and assignment of the page
    // zone_device_data fields.
    uvm_mutex_t device_p2p_lock;
};

// In order to support SMC/MIG GPU partitions, we split UVM GPUs into two
@@ -905,7 +939,7 @@ struct uvm_parent_gpu_struct
    NvProcessorUuid uuid;

    // Nice printable name including the uvm gpu id, ascii name from RM and uuid
    char name[UVM_GPU_NICE_NAME_BUFFER_LENGTH];
    char name[sizeof("ID 999: : ") - 1 + UVM_GPU_NAME_LENGTH + UVM_PARENT_GPU_UUID_STRING_LENGTH];

    // GPU information and provided by RM (architecture, implementation,
    // hardware classes, etc.).
@@ -1087,11 +1121,17 @@ struct uvm_parent_gpu_struct

    struct
    {
        // "gpus/UVM-GPU-${physical-UUID}/"
        struct proc_dir_entry *dir;

        // "gpus/UVM-GPU-${physical-UUID}/fault_stats"
        struct proc_dir_entry *fault_stats_file;

        // "gpus/UVM-GPU-${physical-UUID}/access_counters"
        struct proc_dir_entry *access_counters_file;

        // "gpus/UVM-GPU-${physical-UUID}/peers/"
        struct proc_dir_entry *dir_peers;
    } procfs;

    // Interrupt handling state and locks
@@ -1239,42 +1279,59 @@ static uvmGpuDeviceHandle uvm_gpu_device_handle(uvm_gpu_t *gpu)
    return gpu->parent->rm_device;
}

struct uvm_gpu_peer_struct
typedef struct
{
    // ref_count also controls state maintained in each GPU instance
    // (uvm_gpu_t). See init_peer_access().
    NvU64 ref_count;
} uvm_gpu_peer_t;

typedef struct
{
    // The fields in this global structure can only be inspected under one of
    // the following conditions:
    //
    // - The VA space lock is held for either read or write, both GPUs are
    //   registered in the VA space, and the corresponding bit in the
    // - The VA space lock is held for either read or write, both parent GPUs
    //   are registered in the VA space, and the corresponding bit in the
    //   va_space.enabled_peers bitmap is set.
    //
    // - The global lock is held.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    //   to be SMC peers and were both retained.
    // - While the global lock was held in the past, the two parent GPUs were
    //   both retained.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    //   to be NVLINK peers and were both retained.
    // - While the global lock was held in the past, the two parent GPUs were
    //   detected to be NVLINK peers and were both retained.
    //
    // - While the global lock was held in the past, the two GPUs were detected
    //   to be PCIe peers and uvm_gpu_retain_pcie_peer_access() was called.
    // - While the global lock was held in the past, the two parent GPUs were
    //   detected to be PCIe peers and uvm_gpu_retain_pcie_peer_access() was
    //   called.
    //
    // - The peer_gpus_lock is held on one of the GPUs. In this case, the other
    //   GPU must be read from the original GPU's peer_gpus table. The fields
    //   will not change while the lock is held, but they may no longer be valid
    //   because the other GPU might be in teardown.

    // Peer Id associated with this device w.r.t. to a peer GPU.
    // This field is used to determine when this struct has been initialized
    // (ref_count != 0). NVLink peers are initialized at GPU registration time.
    // PCIe peers are initialized when retain_pcie_peers_from_uuids() is called.
    NvU64 ref_count;

    // Saved values from UvmGpuP2PCapsParams to be used after GPU instance
    // creation. This should be per GPU instance since LCEs are associated with
    // GPU instances, not parent GPUs, but for now MIG is not supported for
    // NVLINK peers so RM associates this state with the parent GPUs. This will
    // need to be revisited if that NVLINK MIG peer support is added.
    NvU8 optimalNvlinkWriteCEs[2];

    // Peer Id associated with this device with respect to a peer parent GPU.
    // Note: peerId (A -> B) != peerId (B -> A)
    // peer_id[0] from min(gpu_id_1, gpu_id_2) -> max(gpu_id_1, gpu_id_2)
    // peer_id[1] from max(gpu_id_1, gpu_id_2) -> min(gpu_id_1, gpu_id_2)
    NvU8 peer_ids[2];

    // The link type between the peer GPUs, currently either PCIe or NVLINK.
    // This field is used to determine the when this peer struct has been
    // initialized (link_type != UVM_GPU_LINK_INVALID). NVLink peers are
    // initialized at GPU registration time. PCIe peers are initialized when
    // the refcount below goes from 0 to 1.
    // The link type between the peer parent GPUs, currently either PCIe or
    // NVLINK.
    uvm_gpu_link_type_t link_type;

    // Maximum unidirectional bandwidth between the peers in megabytes per
@@ -1282,10 +1339,6 @@ struct uvm_gpu_peer_struct
    // See UvmGpuP2PCapsParams.
    NvU32 total_link_line_rate_mbyte_per_s;

    // For PCIe, the number of times that this has been retained by a VA space.
    // For NVLINK this will always be 1.
    NvU64 ref_count;

    // This handle gets populated when enable_peer_access successfully creates
    // an NV50_P2P object. disable_peer_access resets the same on the object
    // deletion.
@@ -1299,9 +1352,13 @@ struct uvm_gpu_peer_struct
        // GPU-A <-> GPU-B link is bidirectional, pairs[x][0] is always the
        // local GPU, while pairs[x][1] is the remote GPU. The table shall be
        // filled like so: [[GPU-A, GPU-B], [GPU-B, GPU-A]].
        uvm_gpu_t *pairs[2][2];
        uvm_parent_gpu_t *pairs[2][2];
    } procfs;
};

    // Peer-to-peer state for MIG instance pairs between two different parent
    // GPUs.
    uvm_gpu_peer_t gpu_peers[UVM_MAX_UNIQUE_SUB_PROCESSOR_PAIRS];
} uvm_parent_gpu_peer_t;
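
// Illustrative sketch only, not part of this change: gpu_peers[] above needs a
// unique slot per unordered pair of MIG instances. The indexing helper the
// driver uses (uvm_gpu_pair_index(), declared later in this header) is not
// shown in this diff; a standard triangular-number mapping for two distinct
// 0-based indexes looks like this and yields values in [0 .. N*(N-1)/2) when
// both indexes are below N.
static size_t example_unordered_pair_index(size_t index0, size_t index1)
{
    size_t lo = (index0 < index1) ? index0 : index1;
    size_t hi = (index0 < index1) ? index1 : index0;

    return (hi * (hi - 1)) / 2 + lo;
}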
// Initialize global gpu state
NV_STATUS uvm_gpu_init(void);
@@ -1380,12 +1437,12 @@ static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
    return atomic64_read(&gpu->retained_count);
}

// Decrease the refcount on the parent GPU object, and actually delete the object
// if the refcount hits zero.
// Decrease the refcount on the parent GPU object, and actually delete the
// object if the refcount hits zero.
void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);

// Calculates peer table index using GPU ids.
NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Returns a GPU peer pair index in the range [0 .. UVM_MAX_UNIQUE_GPU_PAIRS).
NvU32 uvm_gpu_pair_index(const uvm_gpu_id_t id0, const uvm_gpu_id_t id1);

// Either retains an existing PCIe peer entry or creates a new one. In both
// cases the two GPUs are also each retained.
@@ -1396,35 +1453,26 @@ NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
// LOCKING: requires the global lock to be held
void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);

uvm_gpu_link_type_t uvm_parent_gpu_peer_link_type(uvm_parent_gpu_t *parent_gpu0, uvm_parent_gpu_t *parent_gpu1);

// Get the aperture for local_gpu to use to map memory resident on remote_gpu.
// They must not be the same gpu.
uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);

// Return the reference count for the P2P state between the given GPUs.
// The two GPUs must have different parents.
NvU64 uvm_gpu_peer_ref_count(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);

// Get the processor id accessible by the given GPU for the given physical
// address.
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);

// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
bool uvm_parent_gpus_are_nvswitch_connected(const uvm_parent_gpu_t *parent_gpu0, const uvm_parent_gpu_t *parent_gpu1);

// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
static bool uvm_gpus_are_smc_peers(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}
    UVM_ASSERT(gpu0 != gpu1);

static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
        UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
        return true;
    }

    return false;
}

static bool uvm_gpus_are_smc_peers(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    return gpu0->parent == gpu1->parent;
}

@@ -1595,9 +1643,6 @@ static bool uvm_parent_gpu_needs_proxy_channel_pool(const uvm_parent_gpu_t *pare

uvm_aperture_t uvm_get_page_tree_location(const uvm_parent_gpu_t *parent_gpu);

// Debug print of GPU properties
void uvm_gpu_print(uvm_gpu_t *gpu);

// Add the given instance pointer -> user_channel mapping to this GPU. The
// bottom half GPU page fault handler uses this to look up the VA space for GPU
// faults.
@@ -1637,4 +1682,7 @@ typedef enum
    UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
} uvm_gpu_buffer_flush_mode_t;

// PCIe BAR containing static framebuffer memory mappings for PCIe P2P
int uvm_device_p2p_static_bar(uvm_gpu_t *gpu);

#endif // __UVM_GPU_H__

@@ -24,6 +24,7 @@
#include "nv_uvm_interface.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_global.h"
#include "uvm_api.h"
#include "uvm_gpu.h"
#include "uvm_hal.h"
#include "uvm_kvmalloc.h"
@@ -43,8 +44,9 @@
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MAX ((1 << 16) - 1)
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT 256

#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x1
#define UVM_ACCESS_COUNTER_PHYS_ON_MANAGED 0x2
#define UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR 0x1
#define UVM_ACCESS_COUNTER_ACTION_TARGETED_CLEAR 0x2
#define UVM_ACCESS_COUNTER_PHYS_ON_MANAGED 0x4

// Each page in a tracked physical range may belong to a different VA Block. We
// preallocate an array of reverse map translations. However, access counter
@@ -600,7 +602,7 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
    uvm_parent_gpu_access_counters_isr_lock(gpu->parent);

    if (uvm_parent_processor_mask_test(&va_space->access_counters_enabled_processors, gpu->parent->id)) {
        status = NV_ERR_INVALID_DEVICE;
        status = NV_OK;
    }
    else {
        UvmGpuAccessCntrConfig default_config =
@@ -684,7 +686,10 @@ static void access_counter_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu,

    while (get != put) {
        // Wait until valid bit is set
        UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin);
        UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin) {
            if (uvm_global_get_status() != NV_OK)
                goto done;
        }

        parent_gpu->access_counter_buffer_hal->entry_clear_valid(parent_gpu, get);
        ++get;
@@ -692,6 +697,7 @@ static void access_counter_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu,
            get = 0;
    }

done:
    write_get(parent_gpu, get);
}

@@ -830,12 +836,18 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_parent_gpu_t *parent_gpu,
           (fetch_mode == NOTIFICATION_FETCH_MODE_ALL || notification_index < access_counters->max_batch_size)) {
        uvm_access_counter_buffer_entry_t *current_entry = &notification_cache[notification_index];

        // We cannot just wait for the last entry (the one pointed by put) to become valid, we have to do it
        // individually since entries can be written out of order
        // We cannot just wait for the last entry (the one pointed by put) to
        // become valid, we have to do it individually since entries can be
        // written out of order
        UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin) {
            // We have some entry to work on. Let's do the rest later.
            if (fetch_mode != NOTIFICATION_FETCH_MODE_ALL && notification_index > 0)
                goto done;

            // There's no entry to work on and something has gone wrong. Ignore
            // the rest.
            if (uvm_global_get_status() != NV_OK)
                goto done;
        }

        // Prevent later accesses being moved above the read of the valid bit
@@ -991,7 +1003,9 @@ static NV_STATUS notify_tools_broadcast_and_process_flags(uvm_parent_gpu_t *pare
        uvm_tools_broadcast_access_counter(gpu, notification_start[i], flags & UVM_ACCESS_COUNTER_PHYS_ON_MANAGED);
    }

    if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
    UVM_ASSERT(!(flags & UVM_ACCESS_COUNTER_ACTION_TARGETED_CLEAR));

    if (flags & UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR)
        status = access_counter_clear_notifications(gpu, notification_start, num_entries);

    return status;
@@ -999,9 +1013,11 @@ static NV_STATUS notify_tools_broadcast_and_process_flags(uvm_parent_gpu_t *pare

static NV_STATUS notify_tools_and_process_flags(uvm_va_space_t *va_space,
                                                uvm_gpu_t *gpu,
                                                NvU64 base,
                                                uvm_access_counter_buffer_entry_t **notification_start,
                                                NvU32 num_entries,
                                                NvU32 flags)
                                                NvU32 flags,
                                                uvm_page_mask_t *migrated_mask)
{
    NV_STATUS status = NV_OK;

@@ -1016,8 +1032,39 @@ static NV_STATUS notify_tools_and_process_flags(uvm_va_space_t *va_space,
        }
    }

    if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
    if (flags & UVM_ACCESS_COUNTER_ACTION_TARGETED_CLEAR) {
        NvU32 i;

        UVM_ASSERT(base);
        UVM_ASSERT(migrated_mask);

        for (i = 0; i < num_entries; i++) {
            NvU32 start_index = i;
            NvU32 end_index;

            for (end_index = i; end_index < num_entries; end_index++) {
                NvU32 mask_index = (notification_start[end_index]->address.address - base) / PAGE_SIZE;

                if (!uvm_page_mask_test(migrated_mask, mask_index))
                    break;
            }

            if (end_index > start_index) {
                status = access_counter_clear_notifications(gpu,
                                                            &notification_start[start_index],
                                                            end_index - start_index);
                if (status != NV_OK)
                    return status;
            }

            i = end_index;
        }
    }
    else if (flags & UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR) {
        UVM_ASSERT(!base);
        UVM_ASSERT(!migrated_mask);
        status = access_counter_clear_notifications(gpu, notification_start, num_entries);
    }

    return status;
}
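
// Illustrative sketch only, not driver code: the targeted-clear path above
// batches hardware clears over maximal runs of consecutive notifications whose
// pages actually migrated. The same run-finding logic, reduced to a plain bool
// array with a caller-supplied clear callback:
static void example_clear_migrated_runs(const bool *migrated,
                                        size_t num_entries,
                                        void (*clear)(size_t start, size_t count))
{
    size_t i = 0;

    while (i < num_entries) {
        size_t start;

        // Skip entries whose pages did not migrate; their notifications are
        // left pending so they fire again later.
        if (!migrated[i]) {
            i++;
            continue;
        }

        // Extend the run as far as the mask allows, then clear it in one call.
        start = i;
        while (i < num_entries && migrated[i])
            i++;

        clear(start, i - start);
    }
}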
@@ -1242,7 +1289,7 @@ static NV_STATUS service_phys_single_va_block(uvm_access_counter_service_batch_c
    const uvm_processor_id_t processor = current_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC?
                                             gpu->id: UVM_ID_CPU;

    *out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
    *out_flags &= ~UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR;

    UVM_ASSERT(num_reverse_mappings > 0);

@@ -1304,7 +1351,7 @@ static NV_STATUS service_phys_single_va_block(uvm_access_counter_service_batch_c
        }

        if (status == NV_OK)
            *out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
            *out_flags |= UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR;
    }

done:
@@ -1329,7 +1376,7 @@ static NV_STATUS service_phys_va_blocks(uvm_access_counter_service_batch_context
    NV_STATUS status = NV_OK;
    size_t index;

    *out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
    *out_flags &= ~UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR;

    for (index = 0; index < num_reverse_mappings; ++index) {
        NvU32 out_flags_local = 0;
@@ -1341,7 +1388,7 @@ static NV_STATUS service_phys_va_blocks(uvm_access_counter_service_batch_context
        if (status != NV_OK)
            break;

        UVM_ASSERT((out_flags_local & ~UVM_ACCESS_COUNTER_ACTION_CLEAR) == 0);
        UVM_ASSERT((out_flags_local & ~UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR) == 0);
        *out_flags |= out_flags_local;
    }

@@ -1473,7 +1520,7 @@ static NV_STATUS service_phys_notification(uvm_access_counter_service_batch_cont
        resident_gpu = uvm_gpu_get(current_entry->physical_info.resident_id);
        UVM_ASSERT(resident_gpu != NULL);

        if (gpu != resident_gpu && uvm_gpus_are_nvswitch_connected(gpu, resident_gpu)) {
        if (gpu != resident_gpu && uvm_parent_gpus_are_nvswitch_connected(gpu->parent, resident_gpu->parent)) {
            UVM_ASSERT(address >= resident_gpu->parent->nvswitch_info.fabric_memory_window_start);
            address -= resident_gpu->parent->nvswitch_info.fabric_memory_window_start;
        }
@@ -1499,7 +1546,7 @@ static NV_STATUS service_phys_notification(uvm_access_counter_service_batch_cont
                                               &out_flags_local);
        total_reverse_mappings += num_reverse_mappings;

        UVM_ASSERT((out_flags_local & ~UVM_ACCESS_COUNTER_ACTION_CLEAR) == 0);
        UVM_ASSERT((out_flags_local & ~UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR) == 0);
        flags |= out_flags_local;

        if (status != NV_OK)
@@ -1610,7 +1657,7 @@ static void expand_notification_block(uvm_gpu_va_space_t *gpu_va_space,
        return;

    if (UVM_ID_IS_GPU(resident_id))
        resident_gpu = uvm_va_space_get_gpu(gpu_va_space->va_space, resident_id);
        resident_gpu = uvm_gpu_get(resident_id);

    if (uvm_va_block_get_physical_size(va_block, resident_id, page_index) != granularity) {
        uvm_page_mask_set(accessed_pages, page_index);
@@ -1692,9 +1739,15 @@ static NV_STATUS service_virt_notifications_in_block(uvm_gpu_va_space_t *gpu_va_
    uvm_mutex_unlock(&va_block->lock);

    if (status == NV_OK)
        flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
        flags |= UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR;

    flags_status = notify_tools_and_process_flags(va_space, gpu, &notifications[index], *out_index - index, flags);
    flags_status = notify_tools_and_process_flags(va_space,
                                                  gpu,
                                                  0,
                                                  &notifications[index],
                                                  *out_index - index,
                                                  flags,
                                                  NULL);

    if ((status == NV_OK) && (flags_status != NV_OK))
        status = flags_status;
@@ -1713,7 +1766,6 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
    NvU64 base;
    NvU64 end;
    NvU64 address;
    NvU32 flags = UVM_ACCESS_COUNTER_ACTION_CLEAR;
    NV_STATUS status = NV_OK;
    NV_STATUS flags_status;
    struct vm_area_struct *vma = NULL;
@@ -1733,7 +1785,13 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
    if (!vma) {
        // Clear the notification entry to continue receiving access counter
        // notifications when a new VMA is allocated in this range.
        status = notify_tools_and_process_flags(va_space, gpu, &notifications[index], 1, flags);
        status = notify_tools_and_process_flags(va_space,
                                                gpu,
                                                0,
                                                &notifications[index],
                                                1,
                                                UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR,
                                                NULL);
        *out_index = index + 1;
        return status;
    }
@@ -1741,7 +1799,7 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
    base = UVM_VA_BLOCK_ALIGN_DOWN(address);
    end = min(base + UVM_VA_BLOCK_SIZE, (NvU64)vma->vm_end);

    uvm_page_mask_zero(&ats_context->accessed_mask);
    uvm_page_mask_zero(&ats_context->access_counters.accessed_mask);

    for (i = index; i < batch_context->virt.num_notifications; i++) {
        uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
@@ -1750,7 +1808,7 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
        if (current_entry->virtual_info.va_space != va_space || current_entry->gpu != gpu || address >= end)
            break;

        uvm_page_mask_set(&ats_context->accessed_mask, (address - base) / PAGE_SIZE);
        uvm_page_mask_set(&ats_context->access_counters.accessed_mask, (address - base) / PAGE_SIZE);
    }

    *out_index = i;
@@ -1762,10 +1820,15 @@ static NV_STATUS service_virt_notification_ats(uvm_gpu_va_space_t *gpu_va_space,
    // location is set
    // If no pages were actually migrated, don't clear the access counters.
    status = uvm_ats_service_access_counters(gpu_va_space, vma, base, ats_context);
    if (status != NV_OK)
        flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;

    flags_status = notify_tools_and_process_flags(va_space, gpu, &notifications[index], *out_index - index, flags);
    flags_status = notify_tools_and_process_flags(va_space,
                                                  gpu,
                                                  base,
                                                  &notifications[index],
                                                  *out_index - index,
                                                  UVM_ACCESS_COUNTER_ACTION_TARGETED_CLEAR,
                                                  &ats_context->access_counters.migrated_mask);

    if ((status == NV_OK) && (flags_status != NV_OK))
        status = flags_status;

@@ -1799,25 +1862,32 @@ static NV_STATUS service_virt_notifications_batch(uvm_gpu_va_space_t *gpu_va_spa
        // Avoid clearing the entry by default.
        NvU32 flags = 0;
        uvm_va_block_t *va_block = NULL;
        uvm_va_range_managed_t *managed_range = uvm_va_range_to_managed_or_null(va_range);

        if (va_range->type == UVM_VA_RANGE_TYPE_MANAGED) {
            size_t index = uvm_va_range_block_index(va_range, address);
        if (managed_range) {
            size_t index = uvm_va_range_block_index(managed_range, address);

            va_block = uvm_va_range_block(va_range, index);
            va_block = uvm_va_range_block(managed_range, index);

            // If the va_range is a managed range, the notification belongs to a
            // recently freed va_range if va_block is NULL. If va_block is not
            // NULL, service_virt_notifications_in_block will process flags.
            // Clear the notification entry to continue receiving notifications
            // when a new va_range is allocated in that region.
            flags = UVM_ACCESS_COUNTER_ACTION_CLEAR;
            flags = UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR;
        }

        if (va_block) {
            status = service_virt_notifications_in_block(gpu_va_space, mm, va_block, batch_context, index, out_index);
        }
        else {
            status = notify_tools_and_process_flags(va_space, gpu_va_space->gpu, batch_context->virt.notifications, 1, flags);
            status = notify_tools_and_process_flags(va_space,
                                                    gpu_va_space->gpu,
                                                    0,
                                                    batch_context->virt.notifications,
                                                    1,
                                                    flags,
                                                    NULL);
            *out_index = index + 1;
        }
    }
@@ -1839,7 +1909,7 @@ static NV_STATUS service_virt_notifications_batch(uvm_gpu_va_space_t *gpu_va_spa
        // - If the va_block isn't HMM, the notification belongs to a recently
        //   freed va_range. Clear the notification entry to continue receiving
        //   notifications when a new va_range is allocated in this region.
        flags = va_block ? 0 : UVM_ACCESS_COUNTER_ACTION_CLEAR;
        flags = va_block ? 0 : UVM_ACCESS_COUNTER_ACTION_BATCH_CLEAR;

        UVM_ASSERT((status == NV_ERR_OBJECT_NOT_FOUND) ||
                   (status == NV_ERR_INVALID_ADDRESS) ||
@@ -1849,9 +1919,11 @@ static NV_STATUS service_virt_notifications_batch(uvm_gpu_va_space_t *gpu_va_spa
        // in the batch.
        status = notify_tools_and_process_flags(va_space,
                                                gpu_va_space->gpu,
                                                0,
                                                batch_context->virt.notifications,
                                                1,
                                                flags);
                                                flags,
                                                NULL);

        *out_index = index + 1;
    }
@@ -1917,9 +1989,11 @@ static NV_STATUS service_virt_notifications(uvm_parent_gpu_t *parent_gpu,
        else {
            status = notify_tools_and_process_flags(va_space,
                                                    current_entry->gpu,
                                                    0,
                                                    &batch_context->virt.notifications[i],
                                                    1,
                                                    0);
                                                    0,
                                                    NULL);
            i++;
        }
    }
@@ -1979,6 +2053,64 @@ void uvm_parent_gpu_service_access_counters(uvm_parent_gpu_t *parent_gpu)
        }
    }

NV_STATUS uvm_api_clear_all_access_counters(UVM_CLEAR_ALL_ACCESS_COUNTERS_PARAMS *params, struct file *filp)
{
    uvm_gpu_t *gpu;
    uvm_parent_gpu_t *parent_gpu = NULL;
    NV_STATUS status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_processor_mask_t *retained_gpus;

    retained_gpus = uvm_processor_mask_cache_alloc();
    if (!retained_gpus)
        return NV_ERR_NO_MEMORY;

    uvm_processor_mask_zero(retained_gpus);

    uvm_va_space_down_read(va_space);

    for_each_va_space_gpu(gpu, va_space) {

        if (gpu->parent == parent_gpu)
            continue;

        uvm_gpu_retain(gpu);
        uvm_processor_mask_set(retained_gpus, gpu->id);
        parent_gpu = gpu->parent;
    }

    uvm_va_space_up_read(va_space);

    for_each_gpu_in_mask(gpu, retained_gpus) {

        if (!gpu->parent->access_counters_supported)
            continue;

        uvm_parent_gpu_access_counters_isr_lock(gpu->parent);

        // Access counters not enabled. Nothing to clear
        if (gpu->parent->isr.access_counters.handling_ref_count) {
            uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;

            status = access_counter_clear_all(gpu);
            if (status == NV_OK)
                status = uvm_tracker_wait(&access_counters->clear_tracker);
        }

        uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);

        if (status != NV_OK)
            break;
    }

    for_each_gpu_in_mask(gpu, retained_gpus)
        uvm_gpu_release(gpu);

    uvm_processor_mask_cache_free(retained_gpus);

    return status;
}

static const NvU32 g_uvm_access_counters_threshold_max = (1 << 15) - 1;

static NV_STATUS access_counters_config_from_test_params(const UVM_TEST_RECONFIGURE_ACCESS_COUNTERS_PARAMS *params,

@@ -579,8 +579,9 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
    uvm_fault_access_type_t fault_access_type = fault_entry->fault_access_type;
    uvm_ats_fault_context_t *ats_context = &non_replayable_faults->ats_context;

    uvm_page_mask_zero(&ats_context->read_fault_mask);
    uvm_page_mask_zero(&ats_context->write_fault_mask);
    uvm_page_mask_zero(&ats_context->faults.read_fault_mask);
    uvm_page_mask_zero(&ats_context->faults.write_fault_mask);
    uvm_page_mask_zero(&ats_context->faults.accessed_mask);

    ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;

@@ -597,14 +598,17 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,
    }
    else {
        NvU64 base = UVM_VA_BLOCK_ALIGN_DOWN(fault_address);
        uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
        uvm_page_mask_t *faults_serviced_mask = &ats_context->faults.faults_serviced_mask;
        uvm_page_mask_t *accessed_mask = &ats_context->faults.accessed_mask;
        uvm_page_index_t page_index = (fault_address - base) / PAGE_SIZE;
        uvm_page_mask_t *fault_mask = (fault_access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) ?
                                          &ats_context->write_fault_mask :
                                          &ats_context->read_fault_mask;
                                          &ats_context->faults.write_fault_mask :
                                          &ats_context->faults.read_fault_mask;

        uvm_page_mask_set(fault_mask, page_index);

        uvm_page_mask_set(accessed_mask, page_index);

        status = uvm_ats_service_faults(gpu_va_space, vma, base, ats_context);
        if (status == NV_OK) {
            // Invalidate ATS TLB entries if needed

@@ -644,7 +644,15 @@ static NV_STATUS fault_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu,

    while (get != put) {
        // Wait until valid bit is set
        UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin);
        UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin) {
            // Channels might be idle (e.g. in teardown) so check for errors
            // actively. In that case the gpu pointer is valid.
            status = gpu ? uvm_channel_manager_check_errors(gpu->channel_manager) : uvm_global_get_status();
            if (status != NV_OK) {
                write_get(parent_gpu, get);
                return status;
            }
        }

        fault_buffer_skip_replayable_entry(parent_gpu, get);
        ++get;
@@ -890,6 +898,10 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_parent_gpu_t *parent_gpu,
            // We have some entry to work on. Let's do the rest later.
            if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
                goto done;

            status = uvm_global_get_status();
            if (status != NV_OK)
                goto done;
        }

        // Prevent later accesses being moved above the read of the valid bit
@@ -1410,7 +1422,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
                                          &end);
    }
    else {
        policy = uvm_va_range_get_policy(va_block->va_range);
        policy = &va_block->managed_range->policy;
        end = va_block->end;
    }

@@ -1689,11 +1701,11 @@ static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_spac
    NvU32 i;
    NV_STATUS status = NV_OK;
    uvm_ats_fault_context_t *ats_context = &batch_context->ats_context;
    const uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
    const uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
    const uvm_page_mask_t *reads_serviced_mask = &ats_context->reads_serviced_mask;
    uvm_page_mask_t *faults_serviced_mask = &ats_context->faults_serviced_mask;
    uvm_page_mask_t *accessed_mask = &ats_context->accessed_mask;
    const uvm_page_mask_t *read_fault_mask = &ats_context->faults.read_fault_mask;
    const uvm_page_mask_t *write_fault_mask = &ats_context->faults.write_fault_mask;
    const uvm_page_mask_t *reads_serviced_mask = &ats_context->faults.reads_serviced_mask;
    uvm_page_mask_t *faults_serviced_mask = &ats_context->faults.faults_serviced_mask;
    uvm_page_mask_t *accessed_mask = &ats_context->faults.accessed_mask;

    UVM_ASSERT(vma);

@@ -1763,8 +1775,8 @@ static void start_new_sub_batch(NvU64 *sub_batch_base,
                                NvU32 fault_index,
                                uvm_ats_fault_context_t *ats_context)
{
    uvm_page_mask_zero(&ats_context->read_fault_mask);
    uvm_page_mask_zero(&ats_context->write_fault_mask);
    uvm_page_mask_zero(&ats_context->faults.read_fault_mask);
    uvm_page_mask_zero(&ats_context->faults.write_fault_mask);

    *sub_batch_fault_index = fault_index;
    *sub_batch_base = UVM_VA_BLOCK_ALIGN_DOWN(address);
@@ -1784,8 +1796,8 @@ static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
        uvm_fault_buffer_entry_t *previous_entry = NULL;
        uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
        uvm_ats_fault_context_t *ats_context = &batch_context->ats_context;
        uvm_page_mask_t *read_fault_mask = &ats_context->read_fault_mask;
        uvm_page_mask_t *write_fault_mask = &ats_context->write_fault_mask;
        uvm_page_mask_t *read_fault_mask = &ats_context->faults.read_fault_mask;
        uvm_page_mask_t *write_fault_mask = &ats_context->faults.write_fault_mask;
        uvm_gpu_t *gpu = gpu_va_space->gpu;
        bool replay_per_va_block =
            (gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK);

@@ -507,11 +507,12 @@ uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semapho
    return uvm_gpu_address_virtual_unprotected(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
}

NvU32 *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
{
    char *notifier_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);
    uvm_gpu_semaphore_notifier_t *notifier_base_va =
        uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);

    return (NvU32*)(notifier_base_va + semaphore->index * sizeof(NvU32));
    return notifier_base_va + semaphore->index;
}

uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore)
@@ -519,7 +520,8 @@ uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *sem
    NvU64 notifier_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.notifier_memory,
                                                       semaphore->page->pool->gpu);

    return uvm_gpu_address_virtual_unprotected(notifier_base_va + semaphore->index * sizeof(NvU32));
    return uvm_gpu_address_virtual_unprotected(notifier_base_va +
                                               semaphore->index * sizeof(uvm_gpu_semaphore_notifier_t));
}

void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore)
@@ -622,22 +624,11 @@ void uvm_gpu_tracking_semaphore_free(uvm_gpu_tracking_semaphore_t *tracking_sem)
    uvm_gpu_semaphore_free(&tracking_sem->semaphore);
}

static bool should_skip_secure_semaphore_update(NvU32 last_observed_notifier, NvU32 gpu_notifier)
static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
{
    // No new value, or the GPU is currently writing the new encrypted material
    // and no change in value would still result in corrupted data.
    return (last_observed_notifier == gpu_notifier) || (gpu_notifier % 2);
}

static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
{
    UvmCslIv local_iv;
    NvU32 local_payload;
    NvU32 new_sem_value;
    NvU32 gpu_notifier;
    NvU32 last_observed_notifier;
    NvU32 new_gpu_notifier = 0;
    NvU32 iv_index = 0;
    uvm_gpu_semaphore_notifier_t gpu_notifier;
    uvm_gpu_semaphore_notifier_t new_gpu_notifier = 0;

    // A channel can have multiple entries pending and the tracking semaphore
    // update of each entry can race with this function. Since the semaphore
@@ -646,62 +637,72 @@ static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, u
    unsigned tries_left = channel->num_gpfifo_entries;
    NV_STATUS status = NV_OK;
    NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
    UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
    NvU32 *gpu_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);
    uvm_gpu_semaphore_notifier_t *semaphore_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);

    UVM_ASSERT(g_uvm_global.conf_computing_enabled);
    UVM_ASSERT(uvm_channel_is_ce(channel));

    last_observed_notifier = semaphore->conf_computing.last_observed_notifier;
    gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
    UVM_ASSERT(last_observed_notifier <= gpu_notifier);

    if (should_skip_secure_semaphore_update(last_observed_notifier, gpu_notifier))
        return;

    do {
        gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
        gpu_notifier = READ_ONCE(*semaphore_notifier_cpu_addr);

        UVM_ASSERT(gpu_notifier >= semaphore->conf_computing.last_observed_notifier);

        // Odd notifier value means there's an update in progress.
        if (gpu_notifier % 2)
            continue;

        // There's no change since last time
        if (gpu_notifier == semaphore->conf_computing.last_observed_notifier)
            return;

        // Make sure no memory accesses happen before we read the notifier
        smp_mb__after_atomic();

        iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
        memcpy(local_auth_tag, uvm_gpu_semaphore_get_auth_tag_cpu_va(semaphore), sizeof(local_auth_tag));
        local_payload = UVM_READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
        memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));
        local_payload = READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));

        // Make sure the second read of notifier happens after
        // all memory accesses.
        smp_mb__before_atomic();
        new_gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
        new_gpu_notifier = READ_ONCE(*semaphore_notifier_cpu_addr);
        tries_left--;
    } while ((tries_left > 0) && ((gpu_notifier != new_gpu_notifier) || (gpu_notifier % 2)));

    if (!tries_left) {
        status = NV_ERR_INVALID_STATE;
        goto error;
    }
    else {
        NvU32 key_version;
        const NvU32 iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
        NvU32 new_semaphore_value;

        UVM_ASSERT(gpu_notifier == new_gpu_notifier);
        UVM_ASSERT(gpu_notifier % 2 == 0);

        // CPU decryption is guaranteed to use the same key version as the
        // associated GPU encryption, because if there was any key rotation in
        // between, then key rotation waited for all channels to complete before
        // proceeding. The wait implies that the semaphore value matches the
        // last one encrypted on the GPU, so this CPU decryption should happen
        // before the key is rotated.
        key_version = uvm_channel_pool_key_version(channel->pool);

    if (gpu_notifier == new_gpu_notifier) {
        status = uvm_conf_computing_cpu_decrypt(channel,
                                                &new_sem_value,
                                                &new_semaphore_value,
                                                &local_payload,
                                                &local_iv,
                                                sizeof(new_sem_value),
                                                &semaphore->conf_computing.ivs[iv_index],
                                                key_version,
                                                sizeof(new_semaphore_value),
                                                &local_auth_tag);

        if (status != NV_OK)
            goto error;

        uvm_gpu_semaphore_set_payload(semaphore, new_sem_value);
        UVM_WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);
    }
        uvm_gpu_semaphore_set_payload(semaphore, new_semaphore_value);
        WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);

        return;
        return;
    }

error:
    // Decryption failure is a fatal error as well as running out of try left.
@@ -728,7 +729,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
        // TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
        // mechanism to all semaphore
        uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
        uvm_gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
        gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
    }

    new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);
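
// Illustrative sketch only, not driver code: the do/while loop above is a
// seqlock-style consistent read. The GPU writes an odd notifier value before
// it rewrites the encrypted payload and the next even value afterwards, so the
// CPU only accepts a snapshot when it observes the same even value on both
// sides of the copy. Reduced to its skeleton (the real code additionally needs
// the smp_mb barriers and READ_ONCE shown above):
typedef struct
{
    unsigned seq;     // even: payload stable, odd: GPU update in progress
    unsigned payload; // stands in for the encrypted payload, IV and auth tag
} example_snapshot_t;

static bool example_read_consistent(const example_snapshot_t *s, unsigned *out)
{
    unsigned before = s->seq;
    unsigned copy;

    if (before % 2)
        return false;   // update in progress, caller retries

    copy = s->payload;  // copy everything the decryption step will consume

    if (s->seq != before)
        return false;   // torn snapshot, caller retries

    *out = copy;
    return true;
}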
|
||||
|
@@ -29,6 +29,8 @@
#include "uvm_rm_mem.h"
#include "uvm_linux.h"

typedef NvU32 uvm_gpu_semaphore_notifier_t;

// A GPU semaphore is a memory location accessible by the GPUs and the CPU
// that's used for synchronization among them.
// The GPU has primitives to acquire (wait for) and release (set) 4-byte memory
@@ -52,8 +54,8 @@ struct uvm_gpu_semaphore_struct
UvmCslIv *ivs;
NvU32 cached_payload;

NvU32 last_pushed_notifier;
NvU32 last_observed_notifier;
uvm_gpu_semaphore_notifier_t last_pushed_notifier;
uvm_gpu_semaphore_notifier_t last_observed_notifier;
} conf_computing;
};

@@ -154,7 +156,7 @@ NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore);
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore);

NvU32 *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore);

void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore);
@@ -73,6 +73,24 @@ module_param(uvm_disable_hmm, bool, 0444);
#include "uvm_va_policy.h"
#include "uvm_tools.h"

// The function nv_PageSwapCache() wraps the check for page swap cache flag in
// order to support a wide variety of kernel versions.
// The function PageSwapCache() is removed after 32f51ead3d77 ("mm: remove
// PageSwapCache") in v6.12-rc1.
// The function folio_test_swapcache() was added in Linux 5.16 (d389a4a811551
// "mm: Add folio flag manipulation functions")
// Systems with HMM patches backported to 5.14 are possible, but those systems
// do not include folio_test_swapcache()
// TODO: Bug 4050579: Remove this when migration of swap cached pages is updated
static __always_inline bool nv_PageSwapCache(struct page *page)
{
#if defined(NV_FOLIO_TEST_SWAPCACHE_PRESENT)
return folio_test_swapcache(page_folio(page));
#else
return PageSwapCache(page);
#endif
}

static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page);
@@ -853,7 +871,7 @@ static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,

uvm_mutex_lock(&va_block->lock);

status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL);
status = uvm_va_block_split_locked(va_block, new_end, new_va_block);
if (status != NV_OK)
goto err;

@@ -1351,7 +1369,7 @@ void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
uvm_processor_mask_andnot(map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);

for_each_gpu_id_in_mask(id, map_processors) {
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
uvm_gpu_t *gpu = uvm_gpu_get(id);
uvm_va_block_gpu_state_t *gpu_state;

if (!gpu->parent->access_counters_supported)
@@ -1981,7 +1999,7 @@ static void fill_dst_pfns(uvm_va_block_t *va_block,
uvm_page_mask_t *same_devmem_page_mask,
uvm_processor_id_t dest_id)
{
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id);
uvm_gpu_t *gpu = uvm_gpu_get(dest_id);
uvm_page_index_t page_index;

uvm_page_mask_zero(same_devmem_page_mask);
@@ -2694,7 +2712,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
continue;
}

if (PageSwapCache(src_page)) {
if (nv_PageSwapCache(src_page)) {
// TODO: Bug 4050579: Remove this when swap cached pages can be
// migrated.
status = NV_WARN_MISMATCHED_TARGET;
@@ -3512,17 +3530,17 @@ NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
*endp = end;

if (params) {
uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
uvm_processor_get_uuid(UVM_ID_CPU, &params->resident_on[0]);
params->resident_physical_size[0] = PAGE_SIZE;
params->resident_on_count = 1;

uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
uvm_processor_get_uuid(UVM_ID_CPU, &params->mapped_on[0]);
params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
params->page_size[0] = PAGE_SIZE;
params->mapped_on_count = 1;

uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
uvm_processor_get_uuid(UVM_ID_CPU, &params->populated_on[0]);
params->populated_on_count = 1;
}

@@ -3676,12 +3694,12 @@ NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
params->read_duplication = node->policy.read_duplication;

if (!UVM_ID_IS_INVALID(node->policy.preferred_location)) {
uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
uvm_processor_get_uuid(node->policy.preferred_location, &params->preferred_location);
params->preferred_cpu_nid = node->policy.preferred_nid;
}

for_each_id_in_mask(processor_id, &node->policy.accessed_by)
uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
uvm_processor_get_uuid(processor_id, &params->accessed_by[params->accessed_by_count++]);
}
else {
uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
@@ -186,19 +186,19 @@ static NV_STATUS test_semaphore_acquire(uvm_gpu_t *gpu)
uvm_push_end(&push);

// Wait for sema_A release.
UVM_SPIN_WHILE(UVM_READ_ONCE(*cpu_sema_A) != 1, &spin);
UVM_SPIN_WHILE(READ_ONCE(*cpu_sema_A) != 1, &spin);

// Sleep for 10ms, the GPU waits while sema_B is held by us.
msleep(10);

check_sema_C = UVM_READ_ONCE(*cpu_sema_C) == 0;
check_sema_C = READ_ONCE(*cpu_sema_C) == 0;

// memory fence/barrier, check comment in
// uvm_gpu_semaphore.c:uvm_gpu_semaphore_set_payload() for details.
mb();

// Release sema_B.
UVM_WRITE_ONCE(*cpu_sema_B, 1);
WRITE_ONCE(*cpu_sema_B, 1);

// Wait for the GPU to release sema_C, i.e., the end of the push.
status = uvm_push_wait(&push);
@@ -207,7 +207,7 @@ static NV_STATUS test_semaphore_acquire(uvm_gpu_t *gpu)
// check_sema_C is validated here to ensure the push has ended and was not
// interrupted in the middle, had the check failed.
TEST_CHECK_GOTO(check_sema_C, done);
TEST_CHECK_GOTO(UVM_READ_ONCE(*cpu_sema_C) == 1, done);
TEST_CHECK_GOTO(READ_ONCE(*cpu_sema_C) == 1, done);

done:
test_semaphore_free_sem(gpu, &mem);
@@ -847,7 +847,6 @@ typedef struct
NvProcessorUuid processor; // IN
NvU32 allProcessors; // IN
NvU32 uvmFd; // IN
NvU32 version; // IN (UvmToolsEventQueueVersion)
NV_STATUS rmStatus; // OUT
} UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS;

@@ -934,7 +933,6 @@ typedef struct
typedef struct
{
NvU64 tablePtr NV_ALIGN_BYTES(8); // IN
NvU32 version; // IN (UvmToolsEventQueueVersion)
NV_STATUS rmStatus; // OUT
} UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS;

@@ -1097,6 +1095,36 @@ typedef struct
NV_STATUS rmStatus; // OUT
} UVM_MM_INITIALIZE_PARAMS;

#define UVM_TOOLS_INIT_EVENT_TRACKER_V2 UVM_IOCTL_BASE(76)
typedef UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS UVM_TOOLS_INIT_EVENT_TRACKER_V2_PARAMS;

#define UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_V2 UVM_IOCTL_BASE(77)
typedef UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_V2_PARAMS;

//
// UvmAllocDeviceP2P
//
#define UVM_ALLOC_DEVICE_P2P UVM_IOCTL_BASE(78)
typedef struct
{
NvU64 base NV_ALIGN_BYTES(8); // IN
NvU64 length NV_ALIGN_BYTES(8); // IN
NvU64 offset NV_ALIGN_BYTES(8); // IN
NvProcessorUuid gpuUuid; // IN
NvS32 rmCtrlFd; // IN
NvU32 hClient; // IN
NvU32 hMemory; // IN

NV_STATUS rmStatus; // OUT
} UVM_ALLOC_DEVICE_P2P_PARAMS;

#define UVM_CLEAR_ALL_ACCESS_COUNTERS UVM_IOCTL_BASE(79)

typedef struct
{
NV_STATUS rmStatus; // OUT
} UVM_CLEAR_ALL_ACCESS_COUNTERS_PARAMS;

//
// Temporary ioctls which should be removed before UVM 8 release
// Number backwards from 2047 - highest custom ioctl function number
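
The new requests above are reached through ioctl() on the UVM character device. The fragment below is a hedged sketch of a minimal caller; it assumes the conventional /dev/nvidia-uvm node, that UVM_CLEAR_ALL_ACCESS_COUNTERS expands to the raw command number consumed by the driver's dispatch switch, and that uvm_ioctl.h provides the params type and NV_OK. Real clients normally go through the UVM user-mode library and initialize the file descriptor first.

// Hypothetical user-space sketch: issue UVM_CLEAR_ALL_ACCESS_COUNTERS directly.
// Assumes uvm_ioctl.h (and the NV status definitions it pulls in) are on the
// include path; error handling is kept minimal.
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "uvm_ioctl.h"

static int clear_all_access_counters(void)
{
    UVM_CLEAR_ALL_ACCESS_COUNTERS_PARAMS params = {0};
    int ret = -1;
    int fd = open("/dev/nvidia-uvm", O_RDWR);

    if (fd < 0)
        return -1;

    // The driver reports its own status in params.rmStatus in addition to errno.
    if (ioctl(fd, UVM_CLEAR_ALL_ACCESS_COUNTERS, &params) == 0 && params.rmStatus == NV_OK)
        ret = 0;

    close(fd);
    return ret;
}
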
@ -49,6 +49,7 @@
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
#if defined(NV_ASM_BARRIER_H_PRESENT)
|
||||
#include <asm/barrier.h>
|
||||
@ -147,21 +148,8 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
|
||||
#endif
|
||||
|
||||
// See bug 1707453 for further details about setting the minimum kernel version.
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
|
||||
# error This driver does not support kernels older than 3.10!
|
||||
#endif
|
||||
|
||||
#if !defined(VM_RESERVED)
|
||||
#define VM_RESERVED 0x00000000
|
||||
#endif
|
||||
#if !defined(VM_DONTEXPAND)
|
||||
#define VM_DONTEXPAND 0x00000000
|
||||
#endif
|
||||
#if !defined(VM_DONTDUMP)
|
||||
#define VM_DONTDUMP 0x00000000
|
||||
#endif
|
||||
#if !defined(VM_MIXEDMAP)
|
||||
#define VM_MIXEDMAP 0x00000000
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
|
||||
# error This driver does not support kernels older than 4.4!
|
||||
#endif
|
||||
|
||||
//
|
||||
@ -185,94 +173,8 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
|
||||
printk(fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
// printk_ratelimited was added in 2.6.33 via commit
|
||||
// 8a64f336bc1d4aa203b138d29d5a9c414a9fbb47. If not available, we prefer not
|
||||
// printing anything since it's supposed to be rate-limited.
|
||||
#if !defined(printk_ratelimited)
|
||||
#define printk_ratelimited UVM_NO_PRINT
|
||||
#endif
|
||||
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
|
||||
// Just too much compilation trouble with the rate-limiting printk feature
|
||||
// until about k3.8. Because the non-rate-limited printing will cause
|
||||
// surprises and problems, just turn it off entirely in this situation.
|
||||
//
|
||||
#undef pr_debug_ratelimited
|
||||
#define pr_debug_ratelimited UVM_NO_PRINT
|
||||
#endif
|
||||
|
||||
#if defined(NVCPU_X86) || defined(NVCPU_X86_64)
|
||||
#if !defined(pmd_large)
|
||||
#define pmd_large(_pmd) \
|
||||
((pmd_val(_pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
|
||||
#endif
|
||||
#endif /* defined(NVCPU_X86) || defined(NVCPU_X86_64) */
|
||||
|
||||
#if !defined(GFP_DMA32)
|
||||
/*
|
||||
* GFP_DMA32 is similar to GFP_DMA, but instructs the Linux zone
|
||||
* allocator to allocate memory from the first 4GB on platforms
|
||||
* such as Linux/x86-64; the alternative is to use an IOMMU such
|
||||
* as the one implemented with the K8 GART, if available.
|
||||
*/
|
||||
#define GFP_DMA32 0
|
||||
#endif
|
||||
|
||||
#if !defined(__GFP_NOWARN)
|
||||
#define __GFP_NOWARN 0
|
||||
#endif
|
||||
|
||||
#if !defined(__GFP_NORETRY)
|
||||
#define __GFP_NORETRY 0
|
||||
#endif
|
||||
|
||||
#define NV_UVM_GFP_FLAGS (GFP_KERNEL)
|
||||
|
||||
// Develop builds define DEBUG but enable optimization
|
||||
#if defined(DEBUG) && !defined(NVIDIA_UVM_DEVELOP)
|
||||
// Wrappers for functions not building correctly without optimizations on,
|
||||
// implemented in uvm_debug_optimized.c. Notably the file is only built for
|
||||
// debug builds, not develop or release builds.
|
||||
|
||||
// Unoptimized builds of atomic_xchg() hit a BUILD_BUG() on arm64 as it relies
// on __xchg being completely inlined:
// /usr/src/linux-3.12.19/arch/arm64/include/asm/cmpxchg.h:67:3: note: in expansion of macro 'BUILD_BUG'
//
// PowerPC hits a similar issue, but ends up with an undefined symbol:
// WARNING: "__xchg_called_with_bad_pointer" [...] undefined!
int nv_atomic_xchg(atomic_t *val, int new);

// Same problem as atomic_xchg() on PowerPC:
// WARNING: "__cmpxchg_called_with_bad_pointer" [...] undefined!
int nv_atomic_cmpxchg(atomic_t *val, int old, int new);

// Same problem as atomic_xchg() on PowerPC:
// WARNING: "__cmpxchg_called_with_bad_pointer" [...] undefined!
long nv_atomic_long_cmpxchg(atomic_long_t *val, long old, long new);
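
Since the wrappers above only swap in out-of-line builds for debug kernels, callers use them exactly like the kernel primitives they stand in for. The small caller below is a hypothetical sketch; it relies on the usual copy_from_user() contract of returning the number of bytes that could not be copied.

// Hypothetical caller of the wrapper declared above: nv_copy_from_user() keeps
// the copy_from_user() contract, returning the number of uncopied bytes.
#include <linux/errno.h>
#include <linux/uaccess.h>

static int demo_copy_params(void *dst, const void __user *src, unsigned long size)
{
    if (nv_copy_from_user(dst, src, size) != 0)
        return -EFAULT;

    return 0;
}
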
|
||||
|
||||
// This Linux kernel commit:
|
||||
// 2016-08-30 0d025d271e55f3de21f0aaaf54b42d20404d2b23
|
||||
// leads to build failures on x86_64, when compiling without optimization. Avoid
|
||||
// that problem, by providing our own builds of copy_from_user / copy_to_user,
|
||||
// for debug (non-optimized) UVM builds. Those are accessed via these
|
||||
// nv_copy_to/from_user wrapper functions.
|
||||
//
|
||||
// Bug 1849583 has further details.
|
||||
unsigned long nv_copy_from_user(void *to, const void __user *from, unsigned long n);
|
||||
unsigned long nv_copy_to_user(void __user *to, const void *from, unsigned long n);
|
||||
|
||||
#else
|
||||
#define nv_atomic_xchg atomic_xchg
|
||||
#define nv_atomic_cmpxchg atomic_cmpxchg
|
||||
#define nv_atomic_long_cmpxchg atomic_long_cmpxchg
|
||||
#define nv_copy_to_user copy_to_user
|
||||
#define nv_copy_from_user copy_from_user
|
||||
#endif
|
||||
|
||||
#ifndef NV_ALIGN_DOWN
|
||||
#define NV_ALIGN_DOWN(v,g) ((v) & ~((g) - 1))
|
||||
#endif
|
||||
|
||||
#if defined(NVCPU_X86)
|
||||
/* Some old IA32 kernels don't have 64/64 division routines,
|
||||
* they only support 64/32 division with do_div(). */
|
||||
@ -295,7 +197,6 @@ static inline uint64_t NV_DIV64(uint64_t dividend, uint64_t divisor, uint64_t *r
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CLOCK_MONOTONIC_RAW)
|
||||
/* Return a nanosecond-precise value */
|
||||
static inline NvU64 NV_GETTIME(void)
|
||||
{
|
||||
@ -304,60 +205,6 @@ static inline NvU64 NV_GETTIME(void)
|
||||
ktime_get_raw_ts64(&tm);
|
||||
return (NvU64) timespec64_to_ns(&tm);
|
||||
}
|
||||
#else
|
||||
/* We can only return a microsecond-precise value with the
|
||||
* available non-GPL symbols. */
|
||||
static inline NvU64 NV_GETTIME(void)
|
||||
{
|
||||
struct timespec64 tm;
|
||||
|
||||
ktime_get_real_ts64(&tm);
|
||||
return (NvU64) timespec64_to_ns(&tm);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(ilog2)
|
||||
static inline int NV_ILOG2_U32(u32 n)
|
||||
{
|
||||
return fls(n) - 1;
|
||||
}
|
||||
static inline int NV_ILOG2_U64(u64 n)
|
||||
{
|
||||
return fls64(n) - 1;
|
||||
}
|
||||
#define ilog2(n) (sizeof(n) <= 4 ? NV_ILOG2_U32(n) : NV_ILOG2_U64(n))
|
||||
#endif
|
||||
|
||||
// for_each_bit added in 2.6.24 via commit 3e037454bcfa4b187e8293d2121bd8c0f5a5c31c
|
||||
// later renamed in 2.6.34 via commit 984b3f5746ed2cde3d184651dabf26980f2b66e5
|
||||
#if !defined(for_each_set_bit)
|
||||
#define for_each_set_bit(bit, addr, size) for_each_bit((bit), (addr), (size))
|
||||
#endif
|
||||
|
||||
// for_each_set_bit_cont was added in 3.2 via 1e2ad28f80b4e155678259238f51edebc19e4014
|
||||
// It was renamed to for_each_set_bit_from in 3.3 via 307b1cd7ecd7f3dc5ce3d3860957f034f0abe4df
|
||||
#if !defined(for_each_set_bit_from)
|
||||
#define for_each_set_bit_from(bit, addr, size) \
|
||||
for ((bit) = find_next_bit((addr), (size), (bit)); \
|
||||
(bit) < (size); \
|
||||
(bit) = find_next_bit((addr), (size), (bit) + 1))
|
||||
#endif
|
||||
|
||||
// for_each_clear_bit and for_each_clear_bit_from were added in 3.10 via
|
||||
// 03f4a8226c2f9c14361f75848d1e93139bab90c4
|
||||
#if !defined(for_each_clear_bit)
|
||||
#define for_each_clear_bit(bit, addr, size) \
|
||||
for ((bit) = find_first_zero_bit((addr), (size)); \
|
||||
(bit) < (size); \
|
||||
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
|
||||
#endif
|
||||
|
||||
#if !defined(for_each_clear_bit_from)
|
||||
#define for_each_clear_bit_from(bit, addr, size) \
|
||||
for ((bit) = find_next_zero_bit((addr), (size), (bit)); \
|
||||
(bit) < (size); \
|
||||
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
|
||||
#endif
|
||||
|
||||
#if !defined(NV_FIND_NEXT_BIT_WRAP_PRESENT)
|
||||
static inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset)
|
||||
@ -400,71 +247,6 @@ static inline unsigned long __for_each_wrap(const unsigned long *bitmap,
|
||||
(bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
|
||||
#endif
|
||||
|
||||
// Added in 2.6.24
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
#endif

// WRITE_ONCE/READ_ONCE have incompatible definitions across versions, which produces warnings.
// Therefore, we define our own macros
#define UVM_WRITE_ONCE(x, val) (ACCESS_ONCE(x) = (val))
#define UVM_READ_ONCE(x) ACCESS_ONCE(x)

// smp_mb__before_atomic was added in 3.16, provide a fallback
#ifndef smp_mb__before_atomic
#if NVCPU_IS_X86 || NVCPU_IS_X86_64
// That's what the kernel does for x86
#define smp_mb__before_atomic() barrier()
#else
// That's what the kernel does for at least arm32, arm64 and powerpc as of 4.3
#define smp_mb__before_atomic() smp_mb()
#endif
#endif

// smp_mb__after_atomic was added in 3.16, provide a fallback
#ifndef smp_mb__after_atomic
#if NVCPU_IS_X86 || NVCPU_IS_X86_64
// That's what the kernel does for x86
#define smp_mb__after_atomic() barrier()
#else
// That's what the kernel does for at least arm32, arm64 and powerpc as of 4.3
#define smp_mb__after_atomic() smp_mb()
#endif
#endif

// smp_load_acquire and smp_store_release were added in commit
// 47933ad41a86a4a9b50bed7c9b9bd2ba242aac63 ("arch: Introduce
// smp_load_acquire(), smp_store_release()") in v3.14 (2013-11-06).
#ifndef smp_load_acquire
#define smp_load_acquire(p) \
({ \
typeof(*(p)) __v = UVM_READ_ONCE(*(p)); \
smp_mb(); \
__v; \
})
#endif

#ifndef smp_store_release
#define smp_store_release(p, v) \
do { \
smp_mb(); \
UVM_WRITE_ONCE(*(p), v); \
} while (0)
#endif

// atomic_read_acquire and atomic_set_release were added in commit
// 654672d4ba1a6001c365833be895f9477c4d5eab ("locking/atomics:
// Add _{acquire|release|relaxed}() variants of some atomic operations") in v4.3
// (2015-08-06).
// TODO: Bug 3849079: We always use this definition on newer kernels.
#ifndef atomic_read_acquire
#define atomic_read_acquire(p) smp_load_acquire(&(p)->counter)
#endif

#ifndef atomic_set_release
#define atomic_set_release(p, v) smp_store_release(&(p)->counter, v)
#endif
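
The fallbacks above combine a full barrier with the plain UVM_READ_ONCE()/UVM_WRITE_ONCE() accessors. The sketch below is a hypothetical producer/consumer pair, outside the driver, showing the ordering these macros are meant to guarantee: a reader that observes the released flag also observes the data written before the release.

// Hypothetical illustration of acquire/release pairing with the fallbacks above.
// 'data' is published before 'ready'; a reader that sees ready != 0 through
// smp_load_acquire() is guaranteed to also see data == 42.
static int data;
static int ready;

static void producer(void)
{
    data = 42;                    // ordered before the release below
    smp_store_release(&ready, 1); // full barrier + UVM_WRITE_ONCE in the fallback
}

static int consumer(void)
{
    if (smp_load_acquire(&ready)) // UVM_READ_ONCE + full barrier in the fallback
        return data;              // reads 42 once ready is observed
    return -1;                    // not published yet
}
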
|
||||
|
||||
// atomic_long_read_acquire and atomic_long_set_release were added in commit
|
||||
// b5d47ef9ea5c5fe31d7eabeb79f697629bd9e2cb ("locking/atomics: Switch to
|
||||
// generated atomic-long") in v5.1 (2019-05-05).
|
||||
@ -484,29 +266,6 @@ static inline void uvm_atomic_long_set_release(atomic_long_t *p, long v)
|
||||
atomic_long_set(p, v);
|
||||
}
|
||||
|
||||
// Added in 3.11
|
||||
#ifndef PAGE_ALIGNED
|
||||
#define PAGE_ALIGNED(addr) (((addr) & (PAGE_SIZE - 1)) == 0)
|
||||
#endif
|
||||
|
||||
// Changed in 3.17 via commit 743162013d40ca612b4cb53d3a200dff2d9ab26e
|
||||
#if (NV_WAIT_ON_BIT_LOCK_ARGUMENT_COUNT == 3)
|
||||
#define UVM_WAIT_ON_BIT_LOCK(word, bit, mode) \
|
||||
wait_on_bit_lock(word, bit, mode)
|
||||
#elif (NV_WAIT_ON_BIT_LOCK_ARGUMENT_COUNT == 4)
|
||||
static __sched int uvm_bit_wait(void *word)
|
||||
{
|
||||
if (signal_pending_state(current->state, current))
|
||||
return 1;
|
||||
schedule();
|
||||
return 0;
|
||||
}
|
||||
#define UVM_WAIT_ON_BIT_LOCK(word, bit, mode) \
|
||||
wait_on_bit_lock(word, bit, uvm_bit_wait, mode)
|
||||
#else
|
||||
#error "Unknown number of arguments"
|
||||
#endif
|
||||
|
||||
static void uvm_init_radix_tree_preloadable(struct radix_tree_root *tree)
|
||||
{
|
||||
// GFP_NOWAIT, or some combination of flags that avoids setting
|
||||
@ -596,6 +355,8 @@ typedef struct
|
||||
#include <asm/pgtable_types.h>
|
||||
#endif
|
||||
|
||||
// Added in 57bd1905b228f (acpi, x86/mm: Remove encryption mask from ACPI page
|
||||
// protection type), v4.13
|
||||
#if !defined(PAGE_KERNEL_NOENC)
|
||||
#define PAGE_KERNEL_NOENC PAGE_KERNEL
|
||||
#endif
|
||||
@ -621,15 +382,4 @@ static inline pgprot_t uvm_pgprot_decrypted(pgprot_t prot)
|
||||
return prot;
|
||||
}
|
||||
|
||||
// Commit 1dff8083a024650c75a9c961c38082473ceae8cf (v4.7).
|
||||
//
|
||||
// Archs with CONFIG_MMU should have their own page.h, and can't include
|
||||
// asm-generic/page.h. However, x86, powerpc, arm64 don't define page_to_virt()
|
||||
// macro in their version of page.h.
|
||||
#include <linux/mm.h>
|
||||
#ifndef page_to_virt
|
||||
#include <asm/page.h>
|
||||
#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
|
||||
#endif
|
||||
|
||||
#endif // _UVM_LINUX_H
|
||||
|
@@ -27,7 +27,7 @@

const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
{
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 34);
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 36);

switch (lock_order) {
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_INVALID);
@@ -48,7 +48,9 @@ const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHUNK_MAPPING);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PAGE_TREE);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_PUSH);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION_WLC);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_WLC_PUSH);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_SEC2_PUSH);
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PUSH);

@@ -322,6 +322,15 @@
// Operations not allowed while holding this lock
// - GPU memory allocation which can evict
//
// - Channel pool key rotation lock
// Order: UVM_LOCK_ORDER_KEY_ROTATION
// Condition: Confidential Computing is enabled
// Mutex per channel pool
//
// The lock ensures mutual exclusion during key rotation affecting all the
// channels in the associated pool. Key rotation in WLC pools is handled
// using a separate lock order, see UVM_LOCK_ORDER_KEY_ROTATION_WLC below.
//
// - CE channel CSL channel pool semaphore
// Order: UVM_LOCK_ORDER_CSL_PUSH
// Condition: The Confidential Computing feature is enabled
@@ -338,6 +347,15 @@
// Operations allowed while holding this lock
// - Pushing work to CE channels (except for WLC channels)
//
// - WLC channel pool key rotation lock
// Order: UVM_LOCK_ORDER_KEY_ROTATION_WLC
// Condition: Confidential Computing is enabled
// Mutex of WLC channel pool
//
// The lock has the same purpose as the regular channel pool key rotation
// lock. Using a different order lock for WLC channels allows key rotation
// on those channels during indirect work submission.
//
// - WLC CSL channel pool semaphore
// Order: UVM_LOCK_ORDER_CSL_WLC_PUSH
// Condition: The Confidential Computing feature is enabled
@@ -484,7 +502,9 @@ typedef enum
UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL,
UVM_LOCK_ORDER_CHUNK_MAPPING,
UVM_LOCK_ORDER_PAGE_TREE,
UVM_LOCK_ORDER_KEY_ROTATION,
UVM_LOCK_ORDER_CSL_PUSH,
UVM_LOCK_ORDER_KEY_ROTATION_WLC,
UVM_LOCK_ORDER_CSL_WLC_PUSH,
UVM_LOCK_ORDER_CSL_SEC2_PUSH,
UVM_LOCK_ORDER_PUSH,
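
To make the new orders concrete, the sketch below shows the only legal nesting for a thread that needs both a pool's key rotation lock and its CSL push lock: key rotation first, CSL push second, released in reverse. The struct and field names are hypothetical placeholders, not the driver's actual types.

// Hypothetical illustration of the documented ordering
// (UVM_LOCK_ORDER_KEY_ROTATION is taken before UVM_LOCK_ORDER_CSL_PUSH).
#include <linux/mutex.h>
#include <linux/semaphore.h>

struct demo_pool {
    struct mutex key_rotation_mutex;  // stands in for the KEY_ROTATION order
    struct semaphore csl_push_sem;    // stands in for the CSL_PUSH order
};

static void demo_rotate_key(struct demo_pool *pool)
{
    // Every thread that needs both locks takes them in this order, so two
    // threads can never each hold one lock while waiting for the other.
    mutex_lock(&pool->key_rotation_mutex);
    down(&pool->csl_push_sem);

    /* ... re-key the channels in the pool ... */

    up(&pool->csl_push_sem);
    mutex_unlock(&pool->key_rotation_mutex);
}
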
|
||||
@ -1208,7 +1228,7 @@ static void __uvm_bit_lock(uvm_bit_locks_t *bit_locks, unsigned long bit)
|
||||
{
|
||||
int res;
|
||||
|
||||
res = UVM_WAIT_ON_BIT_LOCK(bit_locks->bits, bit, TASK_UNINTERRUPTIBLE);
|
||||
res = wait_on_bit_lock(bit_locks->bits, bit, TASK_UNINTERRUPTIBLE);
|
||||
UVM_ASSERT_MSG(res == 0, "Uninterruptible task interrupted: %d\n", res);
|
||||
uvm_assert_bit_locked(bit_locks, bit);
|
||||
}
|
||||
|
@ -171,8 +171,11 @@ static NV_STATUS uvm_pte_buffer_get(uvm_pte_buffer_t *pte_buffer,
|
||||
pte_buffer->mapping_info.pteBufferSize = pte_buffer->num_ptes * pte_buffer->pte_size;
|
||||
|
||||
if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
|
||||
uvm_va_range_channel_t *channel_range;
|
||||
|
||||
channel_range = uvm_va_range_to_channel(va_range);
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceGetChannelResourcePtes(gpu_va_space->duped_gpu_va_space,
|
||||
va_range->channel.rm_descriptor,
|
||||
channel_range->rm_descriptor,
|
||||
map_offset,
|
||||
pte_buffer->num_ptes * pte_buffer->page_size,
|
||||
&pte_buffer->mapping_info));
|
||||
@ -345,8 +348,8 @@ static NV_STATUS map_rm_pt_range(uvm_page_tree_t *tree,
|
||||
static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_gpu_map_t *ext_gpu_map)
|
||||
{
|
||||
if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
|
||||
return uvm_hal_downgrade_membar_type(va_range->channel.gpu_va_space->gpu,
|
||||
va_range->channel.aperture == UVM_APERTURE_VID);
|
||||
return uvm_hal_downgrade_membar_type(uvm_va_range_to_channel(va_range)->gpu_va_space->gpu,
|
||||
uvm_va_range_to_channel(va_range)->aperture == UVM_APERTURE_VID);
|
||||
}
|
||||
|
||||
// If there is no mem_handle, this is a sparse mapping.
|
||||
@ -412,7 +415,7 @@ NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
|
||||
}
|
||||
else {
|
||||
node = &va_range->node;
|
||||
pt_range_vec = &va_range->channel.pt_range_vec;
|
||||
pt_range_vec = &uvm_va_range_to_channel(va_range)->pt_range_vec;
|
||||
}
|
||||
|
||||
if (map_offset + uvm_range_tree_node_size(node) > mem_info->size)
|
||||
@ -593,7 +596,7 @@ static void uvm_release_rm_handle(struct nv_kref *ref)
|
||||
|
||||
static NV_STATUS uvm_create_external_range(uvm_va_space_t *va_space, UVM_CREATE_EXTERNAL_RANGE_PARAMS *params)
|
||||
{
|
||||
uvm_va_range_t *va_range = NULL;
|
||||
uvm_va_range_external_t *external_range = NULL;
|
||||
struct mm_struct *mm;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
@ -611,7 +614,7 @@ static NV_STATUS uvm_create_external_range(uvm_va_space_t *va_space, UVM_CREATE_
|
||||
// Create the new external VA range.
|
||||
// uvm_va_range_create_external handles any collisions when it attempts to
|
||||
// insert the new range into the va_space range tree.
|
||||
status = uvm_va_range_create_external(va_space, mm, params->base, params->length, &va_range);
|
||||
status = uvm_va_range_create_external(va_space, mm, params->base, params->length, &external_range);
|
||||
if (status != NV_OK) {
|
||||
UVM_DBG_PRINT_RL("Failed to create external VA range [0x%llx, 0x%llx)\n",
|
||||
params->base,
|
||||
@ -651,7 +654,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
}
|
||||
// This is a local or peer allocation, so the owning GPU must have been
|
||||
// registered. This also checks for if EGM owning GPU is registered.
|
||||
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
|
||||
owning_gpu = uvm_va_space_get_gpu_by_mem_info(va_space, mem_info);
|
||||
if (!owning_gpu)
|
||||
return NV_ERR_INVALID_DEVICE;
|
||||
|
||||
@ -678,18 +681,19 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static uvm_ext_gpu_map_t *uvm_va_range_ext_gpu_map(uvm_va_range_t *va_range, uvm_gpu_t *mapping_gpu, NvU64 addr)
|
||||
static uvm_ext_gpu_map_t *uvm_va_range_ext_gpu_map(uvm_va_range_external_t *external_range,
|
||||
uvm_gpu_t *mapping_gpu,
|
||||
NvU64 addr)
|
||||
{
|
||||
uvm_ext_gpu_map_t *ext_gpu_map = NULL;
|
||||
uvm_range_tree_node_t *node;
|
||||
uvm_ext_gpu_range_tree_t *range_tree;
|
||||
|
||||
UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL);
|
||||
uvm_assert_rwsem_locked(&va_range->va_space->lock);
|
||||
uvm_assert_rwsem_locked(&external_range->va_range.va_space->lock);
|
||||
|
||||
range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
|
||||
range_tree = uvm_ext_gpu_range_tree(external_range, mapping_gpu);
|
||||
|
||||
if (uvm_processor_mask_test(&va_range->external.mapped_gpus, mapping_gpu->id)) {
|
||||
if (uvm_processor_mask_test(&external_range->mapped_gpus, mapping_gpu->id)) {
|
||||
UVM_ASSERT(!uvm_range_tree_empty(&range_tree->tree));
|
||||
node = uvm_range_tree_find(&range_tree->tree, addr);
|
||||
if (node) {
|
||||
@ -759,13 +763,13 @@ static NV_STATUS uvm_ext_gpu_map_split(uvm_range_tree_t *tree,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_t *va_range,
|
||||
static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_external_t *external_range,
|
||||
uvm_gpu_t *gpu,
|
||||
NvU64 start,
|
||||
NvU64 end,
|
||||
struct list_head *deferred_list)
|
||||
{
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(external_range, gpu);
|
||||
uvm_ext_gpu_map_t *ext_map, *ext_map_next = NULL;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
@ -813,7 +817,7 @@ static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_t *va_range,
|
||||
// 2. It needs to visit newly created uvm_ext_gpu_map_t, as a result of
|
||||
// splits. This means it can't use safe iterators as they will skip the
|
||||
// newly created uvm_ext_gpu_map_t.
|
||||
ext_map = uvm_ext_gpu_map_iter_first(va_range, gpu, start, end);
|
||||
ext_map = uvm_ext_gpu_map_iter_first(external_range, gpu, start, end);
|
||||
while (ext_map) {
|
||||
if (start > ext_map->node.start) {
|
||||
status = uvm_ext_gpu_map_split(&range_tree->tree, ext_map, start - 1, &ext_map_next);
|
||||
@ -828,10 +832,10 @@ static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_t *va_range,
|
||||
ext_map_next = NULL;
|
||||
}
|
||||
else {
|
||||
ext_map_next = uvm_ext_gpu_map_iter_next(va_range, ext_map, end);
|
||||
ext_map_next = uvm_ext_gpu_map_iter_next(external_range, ext_map, end);
|
||||
}
|
||||
|
||||
uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_list);
|
||||
uvm_ext_gpu_map_destroy(external_range, ext_map, deferred_list);
|
||||
}
|
||||
|
||||
ext_map = ext_map_next;
|
||||
@ -840,7 +844,7 @@ static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_t *va_range,
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_external_t *external_range,
|
||||
uvm_gpu_t *mapping_gpu,
|
||||
const uvm_rm_user_object_t *user_rm_mem,
|
||||
const uvm_map_rm_params_t *map_rm_params,
|
||||
@ -848,9 +852,9 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
NvU64 length,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
uvm_va_space_t *va_space = va_range->va_space;
|
||||
uvm_va_space_t *va_space = external_range->va_range.va_space;
|
||||
uvm_ext_gpu_map_t *ext_gpu_map = NULL;
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(external_range, mapping_gpu);
|
||||
UvmGpuMemoryInfo mem_info;
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
|
||||
NvU64 mapping_page_size;
|
||||
@ -870,7 +874,7 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
|
||||
uvm_mutex_lock(&range_tree->lock);
|
||||
|
||||
status = uvm_unmap_external_in_range(va_range, mapping_gpu, base, base + length - 1, NULL);
|
||||
status = uvm_unmap_external_in_range(external_range, mapping_gpu, base, base + length - 1, NULL);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
@ -880,8 +884,8 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
goto error;
|
||||
}
|
||||
|
||||
// Insert the ext_gpu_map into the VA range immediately since some of the
|
||||
// below calls require it to be there.
|
||||
// Insert the ext_gpu_map into the external range immediately since some of
|
||||
// the below calls require it to be there.
|
||||
ext_gpu_map->node.start = base;
|
||||
ext_gpu_map->node.end = base + length - 1;
|
||||
RB_CLEAR_NODE(&ext_gpu_map->node.rb_node);
|
||||
@ -897,14 +901,14 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
status = uvm_range_tree_add(&range_tree->tree, &ext_gpu_map->node);
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, mapping_gpu->id);
|
||||
uvm_processor_mask_set_atomic(&external_range->mapped_gpus, mapping_gpu->id);
|
||||
ext_gpu_map->gpu = mapping_gpu;
|
||||
ext_gpu_map->mem_handle->gpu = mapping_gpu;
|
||||
nv_kref_init(&ext_gpu_map->mem_handle->ref_count);
|
||||
|
||||
// Error paths after this point may call uvm_va_range_ext_gpu_map, so do a
|
||||
// sanity check now to make sure it doesn't trigger any asserts.
|
||||
UVM_ASSERT(uvm_va_range_ext_gpu_map(va_range, mapping_gpu, base) == ext_gpu_map);
|
||||
UVM_ASSERT(uvm_va_range_ext_gpu_map(external_range, mapping_gpu, base) == ext_gpu_map);
|
||||
|
||||
// Dup the memory. This verifies the input handles, takes a ref count on the
|
||||
// physical allocation so it can't go away under us, and returns us the
|
||||
@ -953,7 +957,12 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
|
||||
mem_info.pageSize = mapping_page_size;
|
||||
|
||||
status = uvm_va_range_map_rm_allocation(va_range, mapping_gpu, &mem_info, map_rm_params, ext_gpu_map, out_tracker);
|
||||
status = uvm_va_range_map_rm_allocation(&external_range->va_range,
|
||||
mapping_gpu,
|
||||
&mem_info,
|
||||
map_rm_params,
|
||||
ext_gpu_map,
|
||||
out_tracker);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
@ -961,7 +970,7 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_ext_gpu_map_destroy(va_range, ext_gpu_map, NULL);
|
||||
uvm_ext_gpu_map_destroy(external_range, ext_gpu_map, NULL);
|
||||
uvm_mutex_unlock(&range_tree->lock);
|
||||
return status;
|
||||
}
|
||||
@ -969,7 +978,7 @@ error:
|
||||
// Actual implementation of UvmMapExternalAllocation
|
||||
static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_EXTERNAL_ALLOCATION_PARAMS *params)
|
||||
{
|
||||
uvm_va_range_t *va_range = NULL;
|
||||
uvm_va_range_external_t *external_range = NULL;
|
||||
uvm_gpu_t *mapping_gpu;
|
||||
uvm_processor_mask_t *mapped_gpus;
|
||||
NV_STATUS status = NV_OK;
|
||||
@ -994,11 +1003,10 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
uvm_va_space_down_read_rm(va_space);
|
||||
va_range = uvm_va_range_find(va_space, params->base);
|
||||
external_range = uvm_va_range_external_find(va_space, params->base);
|
||||
|
||||
if (!va_range ||
|
||||
va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL ||
|
||||
va_range->node.end < params->base + params->length - 1) {
|
||||
if (!external_range ||
|
||||
external_range->va_range.node.end < params->base + params->length - 1) {
|
||||
uvm_va_space_up_read_rm(va_space);
|
||||
uvm_processor_mask_cache_free(mapped_gpus);
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
@ -1030,7 +1038,7 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E
|
||||
map_rm_params.format_type = params->perGpuAttributes[i].gpuFormatType;
|
||||
map_rm_params.element_bits = params->perGpuAttributes[i].gpuElementBits;
|
||||
map_rm_params.compression_type = params->perGpuAttributes[i].gpuCompressionType;
|
||||
status = uvm_map_external_allocation_on_gpu(va_range,
|
||||
status = uvm_map_external_allocation_on_gpu(external_range,
|
||||
mapping_gpu,
|
||||
&user_rm_mem,
|
||||
&map_rm_params,
|
||||
@ -1060,17 +1068,17 @@ error:
|
||||
|
||||
// Tear down only those mappings we created during this call
|
||||
for_each_va_space_gpu_in_mask(mapping_gpu, va_space, mapped_gpus) {
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(external_range, mapping_gpu);
|
||||
uvm_ext_gpu_map_t *ext_map, *ext_map_next;
|
||||
|
||||
uvm_mutex_lock(&range_tree->lock);
|
||||
uvm_ext_gpu_map_for_each_in_safe(ext_map,
|
||||
ext_map_next,
|
||||
va_range,
|
||||
external_range,
|
||||
mapping_gpu,
|
||||
params->base,
|
||||
params->base + params->length - 1)
|
||||
uvm_ext_gpu_map_destroy(va_range, ext_map, NULL);
|
||||
uvm_ext_gpu_map_destroy(external_range, ext_map, NULL);
|
||||
uvm_mutex_unlock(&range_tree->lock);
|
||||
}
|
||||
|
||||
@ -1091,15 +1099,15 @@ static NvU64 external_sparse_pte_maker(uvm_page_table_range_vec_t *range_vec, Nv
|
||||
return range_vec->tree->hal->make_sparse_pte();
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_t *va_range,
|
||||
static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_external_t *external_range,
|
||||
uvm_gpu_t *mapping_gpu,
|
||||
NvU64 base,
|
||||
NvU64 length,
|
||||
struct list_head *deferred_free_list)
|
||||
{
|
||||
uvm_va_space_t *va_space = va_range->va_space;
|
||||
uvm_va_space_t *va_space = external_range->va_range.va_space;
|
||||
uvm_ext_gpu_map_t *ext_gpu_map = NULL;
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(external_range, mapping_gpu);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
|
||||
uvm_page_tree_t *page_tree;
|
||||
NV_STATUS status;
|
||||
@ -1115,7 +1123,7 @@ static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_t *va_range,
|
||||
|
||||
uvm_mutex_lock(&range_tree->lock);
|
||||
|
||||
status = uvm_unmap_external_in_range(va_range, mapping_gpu, base, base + length - 1, deferred_free_list);
|
||||
status = uvm_unmap_external_in_range(external_range, mapping_gpu, base, base + length - 1, deferred_free_list);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
@ -1135,10 +1143,10 @@ static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_t *va_range,
|
||||
status = uvm_range_tree_add(&range_tree->tree, &ext_gpu_map->node);
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, mapping_gpu->id);
|
||||
uvm_processor_mask_set_atomic(&external_range->mapped_gpus, mapping_gpu->id);
|
||||
ext_gpu_map->gpu = mapping_gpu;
|
||||
|
||||
UVM_ASSERT(uvm_va_range_ext_gpu_map(va_range, mapping_gpu, base) == ext_gpu_map);
|
||||
UVM_ASSERT(uvm_va_range_ext_gpu_map(external_range, mapping_gpu, base) == ext_gpu_map);
|
||||
|
||||
status = uvm_page_table_range_vec_init(page_tree,
|
||||
ext_gpu_map->node.start,
|
||||
@ -1160,14 +1168,14 @@ static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_t *va_range,
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_ext_gpu_map_destroy(va_range, ext_gpu_map, NULL);
|
||||
uvm_ext_gpu_map_destroy(external_range, ext_gpu_map, NULL);
|
||||
uvm_mutex_unlock(&range_tree->lock);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_map_external_sparse(uvm_va_space_t *va_space, UVM_MAP_EXTERNAL_SPARSE_PARAMS *params)
|
||||
{
|
||||
uvm_va_range_t *va_range = NULL;
|
||||
uvm_va_range_external_t *external_range = NULL;
|
||||
uvm_gpu_t *mapping_gpu = NULL;
|
||||
NV_STATUS status = NV_OK;
|
||||
LIST_HEAD(deferred_free_list);
|
||||
@ -1176,10 +1184,9 @@ static NV_STATUS uvm_map_external_sparse(uvm_va_space_t *va_space, UVM_MAP_EXTER
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
va_range = uvm_va_range_find(va_space, params->base);
|
||||
if (!va_range ||
|
||||
va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL ||
|
||||
va_range->node.end < params->base + params->length - 1) {
|
||||
external_range = uvm_va_range_external_find(va_space, params->base);
|
||||
if (!external_range ||
|
||||
external_range->va_range.node.end < params->base + params->length - 1) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto out;
|
||||
}
|
||||
@ -1196,7 +1203,11 @@ static NV_STATUS uvm_map_external_sparse(uvm_va_space_t *va_space, UVM_MAP_EXTER
|
||||
goto out;
|
||||
}
|
||||
|
||||
status = uvm_map_external_sparse_on_gpu(va_range, mapping_gpu, params->base, params->length, &deferred_free_list);
|
||||
status = uvm_map_external_sparse_on_gpu(external_range,
|
||||
mapping_gpu,
|
||||
params->base,
|
||||
params->length,
|
||||
&deferred_free_list);
|
||||
|
||||
if (!list_empty(&deferred_free_list))
|
||||
uvm_gpu_retain(mapping_gpu);
|
||||
@ -1244,7 +1255,7 @@ void uvm_ext_gpu_map_free(uvm_ext_gpu_map_t *ext_gpu_map)
|
||||
uvm_gpu_release(owning_gpu);
|
||||
}
|
||||
|
||||
void uvm_ext_gpu_map_destroy(uvm_va_range_t *va_range,
|
||||
void uvm_ext_gpu_map_destroy(uvm_va_range_external_t *external_range,
|
||||
uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
struct list_head *deferred_free_list)
|
||||
{
|
||||
@ -1268,16 +1279,16 @@ void uvm_ext_gpu_map_destroy(uvm_va_range_t *va_range,
|
||||
|
||||
mapped_gpu = ext_gpu_map->gpu;
|
||||
|
||||
range_tree = uvm_ext_gpu_range_tree(va_range, mapped_gpu);
|
||||
range_tree = uvm_ext_gpu_range_tree(external_range, mapped_gpu);
|
||||
|
||||
uvm_assert_mutex_locked(&range_tree->lock);
|
||||
UVM_ASSERT(uvm_gpu_va_space_get(va_range->va_space, mapped_gpu));
|
||||
UVM_ASSERT(uvm_gpu_va_space_get(external_range->va_range.va_space, mapped_gpu));
|
||||
|
||||
uvm_range_tree_remove(&range_tree->tree, &ext_gpu_map->node);
|
||||
|
||||
// Unmap the PTEs
|
||||
if (ext_gpu_map->pt_range_vec.ranges) {
|
||||
membar = va_range_downgrade_membar(va_range, ext_gpu_map);
|
||||
membar = va_range_downgrade_membar(&external_range->va_range, ext_gpu_map);
|
||||
uvm_page_table_range_vec_clear_ptes(&ext_gpu_map->pt_range_vec, membar);
|
||||
uvm_page_table_range_vec_deinit(&ext_gpu_map->pt_range_vec);
|
||||
}
|
||||
@ -1299,7 +1310,7 @@ void uvm_ext_gpu_map_destroy(uvm_va_range_t *va_range,
|
||||
// Check if the sub-range tree is empty. Only then can the GPU be removed from
|
||||
// the mapped_gpus bitmap.
|
||||
if (uvm_range_tree_empty(&range_tree->tree))
|
||||
uvm_processor_mask_clear_atomic(&va_range->external.mapped_gpus, mapped_gpu->id);
|
||||
uvm_processor_mask_clear_atomic(&external_range->mapped_gpus, mapped_gpu->id);
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_unmap_external(uvm_va_space_t *va_space,
|
||||
@ -1307,7 +1318,7 @@ static NV_STATUS uvm_unmap_external(uvm_va_space_t *va_space,
|
||||
NvU64 length,
|
||||
const NvProcessorUuid *gpu_uuid)
|
||||
{
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_external_t *external_range;
|
||||
uvm_gpu_t *gpu = NULL;
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_ext_gpu_range_tree_t *range_tree;
|
||||
@ -1318,8 +1329,8 @@ static NV_STATUS uvm_unmap_external(uvm_va_space_t *va_space,
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
|
||||
va_range = uvm_va_range_find(va_space, base);
|
||||
if (!va_range || va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL || base + length - 1 > va_range->node.end) {
|
||||
external_range = uvm_va_range_external_find(va_space, base);
|
||||
if (!external_range || base + length - 1 > external_range->va_range.node.end) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto out;
|
||||
}
|
||||
@ -1330,9 +1341,9 @@ static NV_STATUS uvm_unmap_external(uvm_va_space_t *va_space,
|
||||
goto out;
|
||||
}
|
||||
|
||||
range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
|
||||
range_tree = uvm_ext_gpu_range_tree(external_range, gpu);
|
||||
uvm_mutex_lock(&range_tree->lock);
|
||||
status = uvm_unmap_external_in_range(va_range, gpu, base, base + length - 1, &deferred_free_list);
|
||||
status = uvm_unmap_external_in_range(external_range, gpu, base, base + length - 1, &deferred_free_list);
|
||||
uvm_mutex_unlock(&range_tree->lock);
|
||||
|
||||
// If the deferred_free_list is not empty, retain the GPU which maps the
|
||||
@ -1359,13 +1370,14 @@ NV_STATUS uvm_api_unmap_external(UVM_UNMAP_EXTERNAL_PARAMS *params, struct file
|
||||
}
|
||||
|
||||
// This destroys VA ranges created by UvmMapExternalAllocation,
|
||||
// UvmMapDynamicParallelismRegion, and UvmAllocSemaphorePool *only*. VA ranges
|
||||
// created by UvmMemMap and UvmAlloc go through mmap/munmap.
|
||||
// UvmMapDynamicParallelismRegion, UvmAllocDeviceP2P and UvmAllocSemaphorePool
|
||||
// *only*. VA ranges created by UvmMemMap and UvmAlloc go through mmap/munmap.
|
||||
static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length)
|
||||
{
|
||||
uvm_va_range_t *va_range;
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_processor_mask_t *retained_mask = NULL;
|
||||
uvm_gpu_t *retained_gpu = NULL;
|
||||
LIST_HEAD(deferred_free_list);
|
||||
|
||||
if (uvm_api_range_invalid_4k(base, length))
|
||||
@ -1382,6 +1394,7 @@ static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length)
|
||||
if (!va_range ||
|
||||
(va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL &&
|
||||
va_range->type != UVM_VA_RANGE_TYPE_SKED_REFLECTED &&
|
||||
va_range->type != UVM_VA_RANGE_TYPE_DEVICE_P2P &&
|
||||
va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) ||
|
||||
va_range->node.start != base ||
|
||||
va_range->node.end != base + length - 1) {
|
||||
@ -1390,7 +1403,7 @@ static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length)
|
||||
}
|
||||
|
||||
if ((va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) &&
|
||||
uvm_mem_mapped_on_cpu_user(va_range->semaphore_pool.mem)) {
|
||||
uvm_mem_mapped_on_cpu_user(uvm_va_range_to_semaphore_pool(va_range)->mem)) {
|
||||
// Semaphore pools must be first unmapped from the CPU with munmap to
|
||||
// invalidate the vma.
|
||||
status = NV_ERR_INVALID_ARGUMENT;
|
||||
@ -1398,25 +1411,37 @@ static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length)
|
||||
}
|
||||
|
||||
if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) {
|
||||
retained_mask = va_range->external.retained_mask;
|
||||
uvm_va_range_external_t *external_range = uvm_va_range_to_external(va_range);
|
||||
|
||||
retained_mask = external_range->retained_mask;
|
||||
|
||||
// Set the retained_mask to NULL to prevent
|
||||
// uvm_va_range_destroy_external() from freeing the mask.
|
||||
va_range->external.retained_mask = NULL;
|
||||
external_range->retained_mask = NULL;
|
||||
|
||||
UVM_ASSERT(retained_mask);
|
||||
|
||||
// External ranges may have deferred free work, so the GPUs may have to
|
||||
// be retained. Construct the mask of all the GPUs that need to be
|
||||
// retained.
|
||||
uvm_processor_mask_and(retained_mask, &va_range->external.mapped_gpus, &va_space->registered_gpus);
|
||||
uvm_processor_mask_and(retained_mask, &external_range->mapped_gpus, &va_space->registered_gpus);
|
||||
}
|
||||
|
||||
if (va_range->type == UVM_VA_RANGE_TYPE_DEVICE_P2P) {
|
||||
uvm_va_range_device_p2p_t *device_p2p_range = uvm_va_range_to_device_p2p(va_range);
|
||||
|
||||
retained_gpu = device_p2p_range->gpu;
|
||||
}
|
||||
|
||||
uvm_va_range_destroy(va_range, &deferred_free_list);
|
||||
|
||||
// If there is deferred work, retain the required GPUs.
|
||||
if (!list_empty(&deferred_free_list))
|
||||
uvm_global_gpu_retain(retained_mask);
|
||||
if (!list_empty(&deferred_free_list)) {
|
||||
if (retained_mask)
|
||||
uvm_global_gpu_retain(retained_mask);
|
||||
else
|
||||
uvm_gpu_retain(retained_gpu);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_va_space_up_write(va_space);
|
||||
@ -1424,7 +1449,10 @@ out:
|
||||
if (!list_empty(&deferred_free_list)) {
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
uvm_deferred_free_object_list(&deferred_free_list);
|
||||
uvm_global_gpu_release(retained_mask);
|
||||
if (retained_mask)
|
||||
uvm_global_gpu_release(retained_mask);
|
||||
else
|
||||
uvm_gpu_release(retained_gpu);
|
||||
}
|
||||
|
||||
// Free the mask allocated in uvm_va_range_create_external() since
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -40,33 +40,35 @@ typedef struct
|
||||
UvmGpuCompressionType compression_type;
|
||||
} uvm_map_rm_params_t;
|
||||
|
||||
static uvm_ext_gpu_range_tree_t *uvm_ext_gpu_range_tree(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
|
||||
static uvm_ext_gpu_range_tree_t *uvm_ext_gpu_range_tree(uvm_va_range_external_t *external_range, uvm_gpu_t *gpu)
|
||||
{
|
||||
UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL);
|
||||
|
||||
return &va_range->external.gpu_ranges[uvm_id_gpu_index(gpu->id)];
|
||||
return &external_range->gpu_ranges[uvm_id_gpu_index(gpu->id)];
|
||||
}
|
||||
|
||||
// Returns the first external map (if any) in the gpu's range tree.
|
||||
// va_range should be of type UVM_VA_RANGE_TYPE_EXTERNAL.
|
||||
// The caller must hold the range tree lock.
|
||||
static uvm_ext_gpu_map_t *uvm_ext_gpu_map_iter_first(uvm_va_range_t *va_range, uvm_gpu_t *gpu, NvU64 start, NvU64 end)
|
||||
static uvm_ext_gpu_map_t *uvm_ext_gpu_map_iter_first(uvm_va_range_external_t *external_range,
|
||||
uvm_gpu_t *gpu,
|
||||
NvU64 start,
|
||||
NvU64 end)
|
||||
{
|
||||
uvm_ext_gpu_range_tree_t *range_tree;
|
||||
uvm_range_tree_node_t *node;
|
||||
|
||||
UVM_ASSERT(start >= va_range->node.start);
|
||||
UVM_ASSERT(end <= va_range->node.end);
|
||||
UVM_ASSERT(start >= external_range->va_range.node.start);
|
||||
UVM_ASSERT(end <= external_range->va_range.node.end);
|
||||
|
||||
range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
|
||||
range_tree = uvm_ext_gpu_range_tree(external_range, gpu);
|
||||
node = uvm_range_tree_iter_first(&range_tree->tree, start, end);
|
||||
return uvm_ext_gpu_map_container(node);
|
||||
}
|
||||
|
||||
// Returns the external map following the provided map (if any) in address order from
|
||||
// the gpu's range tree. va_range should be of type UVM_VA_RANGE_TYPE_EXTERNAL.
|
||||
// the gpu's range tree.
|
||||
// The caller must hold the range tree lock.
|
||||
static uvm_ext_gpu_map_t *uvm_ext_gpu_map_iter_next(uvm_va_range_t *va_range, uvm_ext_gpu_map_t *ext_gpu_map, NvU64 end)
|
||||
static uvm_ext_gpu_map_t *uvm_ext_gpu_map_iter_next(uvm_va_range_external_t *external_range,
|
||||
uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
NvU64 end)
|
||||
{
|
||||
uvm_ext_gpu_range_tree_t *range_tree;
|
||||
uvm_range_tree_node_t *node;
|
||||
@ -74,37 +76,41 @@ static uvm_ext_gpu_map_t *uvm_ext_gpu_map_iter_next(uvm_va_range_t *va_range, uv
|
||||
if (!ext_gpu_map)
|
||||
return NULL;
|
||||
|
||||
UVM_ASSERT(end <= va_range->node.end);
|
||||
UVM_ASSERT(end <= external_range->va_range.node.end);
|
||||
|
||||
range_tree = uvm_ext_gpu_range_tree(va_range, ext_gpu_map->gpu);
|
||||
range_tree = uvm_ext_gpu_range_tree(external_range, ext_gpu_map->gpu);
|
||||
node = uvm_range_tree_iter_next(&range_tree->tree, &ext_gpu_map->node, end);
|
||||
return uvm_ext_gpu_map_container(node);
|
||||
}
|
||||
|
||||
// The four iterators below require that the caller hold the gpu's range tree
|
||||
// lock.
|
||||
#define uvm_ext_gpu_map_for_each_in(ext_gpu_map, va_range, gpu, start, end) \
|
||||
for ((ext_gpu_map) = uvm_ext_gpu_map_iter_first((va_range), (gpu), (start), (end)); \
|
||||
(ext_gpu_map); \
|
||||
(ext_gpu_map) = uvm_ext_gpu_map_iter_next((va_range), (ext_gpu_map), (end)))
|
||||
#define uvm_ext_gpu_map_for_each_in(ext_gpu_map, external_range, gpu, start, end) \
|
||||
for ((ext_gpu_map) = uvm_ext_gpu_map_iter_first((external_range), (gpu), (start), (end)); \
|
||||
(ext_gpu_map); \
|
||||
(ext_gpu_map) = uvm_ext_gpu_map_iter_next((external_range), (ext_gpu_map), (end)))
|
||||
|
||||
#define uvm_ext_gpu_map_for_each_in_safe(ext_gpu_map, ext_gpu_map_next, va_range, gpu, start, end) \
|
||||
for ((ext_gpu_map) = uvm_ext_gpu_map_iter_first((va_range), (gpu), (start), (end)), \
|
||||
(ext_gpu_map_next) = uvm_ext_gpu_map_iter_next((va_range), (ext_gpu_map), (end)); \
|
||||
(ext_gpu_map); \
|
||||
(ext_gpu_map) = (ext_gpu_map_next), \
|
||||
(ext_gpu_map_next) = uvm_ext_gpu_map_iter_next((va_range), (ext_gpu_map), (end)))
|
||||
#define uvm_ext_gpu_map_for_each_in_safe(ext_gpu_map, ext_gpu_map_next, external_range, gpu, start, end) \
|
||||
for ((ext_gpu_map) = uvm_ext_gpu_map_iter_first((external_range), (gpu), (start), (end)), \
|
||||
(ext_gpu_map_next) = uvm_ext_gpu_map_iter_next((external_range), (ext_gpu_map), (end)); \
|
||||
(ext_gpu_map); \
|
||||
(ext_gpu_map) = (ext_gpu_map_next), \
|
||||
(ext_gpu_map_next) = uvm_ext_gpu_map_iter_next((external_range), (ext_gpu_map), (end)))
|
||||
|
||||
#define uvm_ext_gpu_map_for_each(ext_gpu_map, va_range, gpu) \
|
||||
uvm_ext_gpu_map_for_each_in(ext_gpu_map, va_range, gpu, (va_range)->node.start, (va_range)->node.end)
|
||||
#define uvm_ext_gpu_map_for_each(ext_gpu_map, external_range, gpu) \
|
||||
uvm_ext_gpu_map_for_each_in(ext_gpu_map, \
|
||||
external_range, \
|
||||
gpu, \
|
||||
(external_range)->va_range.node.start, \
|
||||
(external_range)->va_range.node.end)
|
||||
|
||||
#define uvm_ext_gpu_map_for_each_safe(ext_gpu_map, ext_gpu_map_next, va_range, gpu) \
|
||||
uvm_ext_gpu_map_for_each_in_safe(ext_gpu_map, \
|
||||
ext_gpu_map_next, \
|
||||
va_range, \
|
||||
gpu, \
|
||||
(va_range)->node.start, \
|
||||
(va_range)->node.end)
|
||||
#define uvm_ext_gpu_map_for_each_safe(ext_gpu_map, ext_gpu_map_next, external_range, gpu) \
|
||||
uvm_ext_gpu_map_for_each_in_safe(ext_gpu_map, \
|
||||
ext_gpu_map_next, \
|
||||
external_range, \
|
||||
gpu, \
|
||||
(external_range)->va_range.node.start, \
|
||||
(external_range)->va_range.node.end)
|
||||
|
||||
// User-facing APIs (uvm_api_map_external_allocation, uvm_api_free) are declared
|
||||
// uvm_api.h.
|
||||
@ -141,7 +147,7 @@ NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
|
||||
//
|
||||
// The caller must hold the range tree lock for the mapping gpu and is
|
||||
// responsible for making sure that mapping gpu is retained across those calls.
|
||||
void uvm_ext_gpu_map_destroy(uvm_va_range_t *va_range,
|
||||
void uvm_ext_gpu_map_destroy(uvm_va_range_external_t *external_range,
|
||||
uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
struct list_head *deferred_free_list);
|
||||
|
||||
|
@ -1051,8 +1051,6 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
|
||||
// (uvm_conf_computing_dma_buffer_pool_t). Because we would typically
|
||||
// already hold the DMA_BUFFER_POOL lock at this time, we cannot hold
|
||||
// the block lock. Allocate PTEs without eviction in this context.
|
||||
//
|
||||
// See uvm_pmm_gpu_alloc()
|
||||
if (uvm_mem_is_sysmem_dma(mem))
|
||||
pmm_flags = UVM_PMM_ALLOC_FLAGS_NONE;
|
||||
|
||||
|
@ -97,7 +97,8 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
|
||||
}
|
||||
|
||||
uvm_push_set_description(&push,
|
||||
"Memcopy %zd bytes from virtual sys_mem 0x%llx to %s mem 0x%llx [mem loc: %s, page size: %u]",
|
||||
"Memcopy %zu bytes from virtual sys_mem 0x%llx to %s mem 0x%llx [mem loc: %s, page "
|
||||
"size: %llu]",
|
||||
size_this_time,
|
||||
sys_mem_gpu_address.address,
|
||||
mem_gpu_address.is_virtual ? "virtual" : "physical",
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -60,14 +60,14 @@ module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO);
|
||||
static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly;
|
||||
static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly;
|
||||
|
||||
static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length)
|
||||
static bool is_migration_single_block(uvm_va_range_managed_t *first_managed_range, NvU64 base, NvU64 length)
|
||||
{
|
||||
NvU64 end = base + length - 1;
|
||||
|
||||
if (end > first_va_range->node.end)
|
||||
if (end > first_managed_range->va_range.node.end)
|
||||
return false;
|
||||
|
||||
return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end);
|
||||
return uvm_va_range_block_index(first_managed_range, base) == uvm_va_range_block_index(first_managed_range, end);
|
||||
}
|
||||
|
||||
static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
|
||||
@ -236,7 +236,7 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
|
||||
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
|
||||
}
|
||||
else {
|
||||
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
|
||||
uvm_va_policy_t *policy = &va_block->managed_range->policy;
|
||||
|
||||
if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
|
||||
status = uvm_va_block_make_resident_read_duplicate(va_block,
|
||||
@ -401,28 +401,27 @@ static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
|
||||
return num_cpu_unchanged_pages == 0;
|
||||
}
|
||||
|
||||
static void preunmap_multi_block(uvm_va_range_t *va_range,
|
||||
static void preunmap_multi_block(uvm_va_range_managed_t *managed_range,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
NvU64 start,
|
||||
NvU64 end,
|
||||
uvm_processor_id_t dest_id)
|
||||
{
|
||||
size_t i;
|
||||
const size_t first_block_index = uvm_va_range_block_index(va_range, start);
|
||||
const size_t last_block_index = uvm_va_range_block_index(va_range, end);
|
||||
const size_t first_block_index = uvm_va_range_block_index(managed_range, start);
|
||||
const size_t last_block_index = uvm_va_range_block_index(managed_range, end);
|
||||
NvU32 num_unmap_pages = 0;
|
||||
|
||||
UVM_ASSERT(start >= va_range->node.start);
|
||||
UVM_ASSERT(end <= va_range->node.end);
|
||||
UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
|
||||
uvm_assert_rwsem_locked(&va_range->va_space->lock);
|
||||
UVM_ASSERT(start >= managed_range->va_range.node.start);
|
||||
UVM_ASSERT(end <= managed_range->va_range.node.end);
|
||||
uvm_assert_rwsem_locked(&managed_range->va_range.va_space->lock);
|
||||
|
||||
UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
|
||||
UVM_ASSERT(uvm_range_group_all_migratable(managed_range->va_range.va_space, start, end));
|
||||
|
||||
for (i = first_block_index; i <= last_block_index; i++) {
|
||||
NvU32 num_block_unmap_pages;
|
||||
|
||||
if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i),
|
||||
if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(managed_range, i),
|
||||
va_block_context,
|
||||
start,
|
||||
end,
|
||||
@ -435,10 +434,10 @@ static void preunmap_multi_block(uvm_va_range_t *va_range,
|
||||
}
|
||||
|
||||
if (num_unmap_pages > 0)
|
||||
unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
|
||||
unmap_mapping_range(managed_range->va_range.va_space->mapping, start, end - start + 1, 1);
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
|
||||
static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_managed_t *managed_range,
|
||||
uvm_service_block_context_t *service_context,
|
||||
NvU64 start,
|
||||
NvU64 end,
|
||||
@ -447,22 +446,21 @@ static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
size_t i;
|
||||
const size_t first_block_index = uvm_va_range_block_index(va_range, start);
|
||||
const size_t last_block_index = uvm_va_range_block_index(va_range, end);
|
||||
const size_t first_block_index = uvm_va_range_block_index(managed_range, start);
|
||||
const size_t last_block_index = uvm_va_range_block_index(managed_range, end);
|
||||
|
||||
UVM_ASSERT(start >= va_range->node.start);
|
||||
UVM_ASSERT(end <= va_range->node.end);
|
||||
UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
|
||||
uvm_assert_rwsem_locked(&va_range->va_space->lock);
|
||||
UVM_ASSERT(start >= managed_range->va_range.node.start);
|
||||
UVM_ASSERT(end <= managed_range->va_range.node.end);
|
||||
uvm_assert_rwsem_locked(&managed_range->va_range.va_space->lock);
|
||||
|
||||
UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
|
||||
UVM_ASSERT(uvm_range_group_all_migratable(managed_range->va_range.va_space, start, end));
|
||||
|
||||
// Iterate over blocks, populating them if necessary
|
||||
for (i = first_block_index; i <= last_block_index; i++) {
|
||||
uvm_va_block_retry_t va_block_retry;
|
||||
uvm_va_block_region_t region;
|
||||
uvm_va_block_t *va_block;
|
||||
NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block);
|
||||
NV_STATUS status = uvm_va_range_block_create(managed_range, i, &va_block);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
@ -487,7 +485,7 @@ static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
|
||||
static NV_STATUS uvm_va_range_migrate(uvm_va_range_managed_t *managed_range,
|
||||
uvm_service_block_context_t *service_context,
|
||||
NvU64 start,
|
||||
NvU64 end,
|
||||
@ -497,9 +495,10 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
NvU64 preunmap_range_start = start;
|
||||
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
|
||||
uvm_va_policy_t *policy = &managed_range->policy;
|
||||
|
||||
should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(policy, va_range->va_space);
|
||||
should_do_cpu_preunmap = should_do_cpu_preunmap &&
|
||||
va_range_should_do_cpu_preunmap(policy, managed_range->va_range.va_space);
|
||||
|
||||
// Divide migrations into groups of contiguous VA blocks. This is to trigger
|
||||
// CPU unmaps for that region before the migration starts.
|
||||
@ -511,7 +510,7 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
|
||||
preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size);
|
||||
preunmap_range_end = min(preunmap_range_end - 1, end);
|
||||
|
||||
preunmap_multi_block(va_range,
|
||||
preunmap_multi_block(managed_range,
|
||||
service_context->block_context,
|
||||
preunmap_range_start,
|
||||
preunmap_range_end,
|
||||
@ -521,7 +520,7 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
|
||||
preunmap_range_end = end;
|
||||
}
|
||||
|
||||
status = uvm_va_range_migrate_multi_block(va_range,
|
||||
status = uvm_va_range_migrate_multi_block(managed_range,
|
||||
service_context,
|
||||
preunmap_range_start,
|
||||
preunmap_range_end,
|
||||
@ -539,7 +538,7 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
|
||||
|
||||
static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
|
||||
uvm_service_block_context_t *service_context,
|
||||
uvm_va_range_t *first_va_range,
|
||||
uvm_va_range_managed_t *first_managed_range,
|
||||
NvU64 base,
|
||||
NvU64 length,
|
||||
uvm_processor_id_t dest_id,
|
||||
@ -547,39 +546,33 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
|
||||
bool should_do_cpu_preunmap,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
uvm_va_range_t *va_range, *va_range_last;
|
||||
uvm_va_range_managed_t *managed_range, *managed_range_last;
|
||||
NvU64 end = base + length - 1;
|
||||
NV_STATUS status = NV_OK;
|
||||
bool skipped_migrate = false;
|
||||
|
||||
if (!first_va_range) {
|
||||
// For HMM, we iterate over va_blocks since there is no va_range.
|
||||
if (!first_managed_range) {
|
||||
// For HMM, we iterate over va_blocks since there is no managed_range.
|
||||
return uvm_hmm_migrate_ranges(va_space, service_context, base, length, dest_id, mode, out_tracker);
|
||||
}
|
||||
|
||||
UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));
|
||||
UVM_ASSERT(first_managed_range == uvm_va_space_iter_managed_first(va_space, base, base));
|
||||
|
||||
va_range_last = NULL;
|
||||
uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) {
|
||||
managed_range_last = NULL;
|
||||
uvm_for_each_va_range_managed_in_contig_from(managed_range, va_space, first_managed_range, end) {
|
||||
uvm_range_group_range_iter_t iter;
|
||||
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
|
||||
uvm_va_policy_t *policy = &managed_range->policy;
|
||||
|
||||
va_range_last = va_range;
|
||||
managed_range_last = managed_range;
|
||||
|
||||
// Only managed ranges can be migrated
|
||||
if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
break;
|
||||
}
|
||||
|
||||
// For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range
|
||||
// into many range groups. For this reason, we iterate over each va_range first
|
||||
// then through the range groups within.
|
||||
// For UVM-Lite GPUs, the CUDA driver may suballocate a single
|
||||
// managed_range into many range groups. For this reason, we iterate
|
||||
// over each managed_range first then through the range groups within.
|
||||
uvm_range_group_for_each_migratability_in(&iter,
|
||||
va_space,
|
||||
max(base, va_range->node.start),
|
||||
min(end, va_range->node.end)) {
|
||||
// Skip non-migratable VA ranges
|
||||
max(base, managed_range->va_range.node.start),
|
||||
min(end, managed_range->va_range.node.end)) {
|
||||
// Skip non-migratable ranges
|
||||
if (!iter.migratable) {
|
||||
// Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages aren't
|
||||
// already resident at dest_id.
|
||||
@ -588,7 +581,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
|
||||
service_context->block_context->make_resident.dest_nid))
|
||||
skipped_migrate = true;
|
||||
}
|
||||
else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
|
||||
else if (uvm_processor_mask_test(&managed_range->va_range.uvm_lite_gpus, dest_id) &&
|
||||
!uvm_va_policy_preferred_location_equal(policy, dest_id, NUMA_NO_NODE)) {
|
||||
// Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
|
||||
// unless it's the preferred location
|
||||
@ -596,7 +589,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
|
||||
break;
|
||||
}
|
||||
else {
|
||||
status = uvm_va_range_migrate(va_range,
|
||||
status = uvm_va_range_migrate(managed_range,
|
||||
service_context,
|
||||
iter.start,
|
||||
iter.end,
|
||||
@ -614,7 +607,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
|
||||
return status;
|
||||
|
||||
// Check that we were able to iterate over the entire range without any gaps
|
||||
if (!va_range_last || va_range_last->node.end < end)
|
||||
if (!managed_range_last || managed_range_last->va_range.node.end < end)
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
if (skipped_migrate)
|
||||
@ -630,7 +623,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
|
||||
uvm_processor_id_t dest_id,
|
||||
int dest_nid,
|
||||
NvU32 migrate_flags,
|
||||
uvm_va_range_t *first_va_range,
|
||||
uvm_va_range_managed_t *first_managed_range,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
@ -644,12 +637,12 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
|
||||
|
||||
// If the GPU has its memory disabled, just skip the migration and let
|
||||
// faults take care of things.
|
||||
if (!uvm_va_space_processor_has_memory(va_space, dest_id))
|
||||
if (!uvm_processor_has_memory(dest_id))
|
||||
return NV_OK;
|
||||
|
||||
if (mm)
|
||||
uvm_assert_mmap_lock_locked(mm);
|
||||
else if (!first_va_range)
|
||||
else if (!first_managed_range)
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
service_context = uvm_service_block_context_alloc(mm);
|
||||
@ -677,8 +670,8 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
|
||||
// 2- Go block by block reexecuting the transfer (in case someone moved it
|
||||
// since the first pass), and adding the mappings.
|
||||
//
|
||||
// For HMM (!first_va_range), we always do a single pass.
|
||||
is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
|
||||
// For HMM (!first_managed_range), we always do a single pass.
|
||||
is_single_block = !first_managed_range || is_migration_single_block(first_managed_range, base, length);
|
||||
do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
|
||||
do_two_passes = do_mappings && !is_single_block;
|
||||
|
||||
@ -687,7 +680,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
|
||||
|
||||
status = uvm_migrate_ranges(va_space,
|
||||
service_context,
|
||||
first_va_range,
|
||||
first_managed_range,
|
||||
base,
|
||||
length,
|
||||
dest_id,
|
||||
@ -705,7 +698,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
|
||||
|
||||
status = uvm_migrate_ranges(va_space,
|
||||
service_context,
|
||||
first_va_range,
|
||||
first_managed_range,
|
||||
base,
|
||||
length,
|
||||
dest_id,
|
||||
@ -792,7 +785,7 @@ static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore
|
||||
|
||||
semaphore_cpu_va = (char *) uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset;
|
||||
|
||||
UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
|
||||
WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
|
||||
}
|
||||
|
||||
static NV_STATUS semaphore_release(NvU64 semaphore_address,
|
||||
@ -872,7 +865,7 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
|
||||
uvm_tracker_t tracker = UVM_TRACKER_INIT();
|
||||
uvm_tracker_t *tracker_ptr = NULL;
|
||||
uvm_gpu_t *dest_gpu = NULL;
|
||||
uvm_va_range_t *sema_va_range = NULL;
|
||||
uvm_va_range_semaphore_pool_t *sema_va_range = NULL;
|
||||
struct mm_struct *mm;
|
||||
NV_STATUS status = NV_OK;
|
||||
bool flush_events = false;
|
||||
@ -917,9 +910,9 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
|
||||
}
|
||||
}
|
||||
else {
|
||||
sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
|
||||
sema_va_range = uvm_va_range_semaphore_pool_find(va_space, params->semaphoreAddress);
|
||||
if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
|
||||
!sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
|
||||
!sema_va_range) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
@ -980,18 +973,19 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
|
||||
if (type == UVM_API_RANGE_TYPE_ATS) {
|
||||
uvm_migrate_args_t uvm_migrate_args =
|
||||
{
|
||||
.va_space = va_space,
|
||||
.mm = mm,
|
||||
.start = params->base,
|
||||
.length = params->length,
|
||||
.dst_id = dest_id,
|
||||
.dst_node_id = cpu_numa_node,
|
||||
.populate_permissions = UVM_POPULATE_PERMISSIONS_INHERIT,
|
||||
.touch = false,
|
||||
.skip_mapped = false,
|
||||
.populate_on_cpu_alloc_failures = false,
|
||||
.user_space_start = ¶ms->userSpaceStart,
|
||||
.user_space_length = ¶ms->userSpaceLength,
|
||||
.va_space = va_space,
|
||||
.mm = mm,
|
||||
.start = params->base,
|
||||
.length = params->length,
|
||||
.dst_id = dest_id,
|
||||
.dst_node_id = cpu_numa_node,
|
||||
.populate_permissions = UVM_POPULATE_PERMISSIONS_INHERIT,
|
||||
.touch = false,
|
||||
.skip_mapped = false,
|
||||
.populate_on_cpu_alloc_failures = false,
|
||||
.populate_on_migrate_vma_failures = true,
|
||||
.user_space_start = ¶ms->userSpaceStart,
|
||||
.user_space_length = ¶ms->userSpaceLength,
|
||||
};
|
||||
|
||||
status = uvm_migrate_pageable(&uvm_migrate_args);
|
||||
@ -1004,7 +998,7 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
|
||||
dest_id,
|
||||
(UVM_ID_IS_CPU(dest_id) ? cpu_numa_node : NUMA_NO_NODE),
|
||||
params->flags,
|
||||
uvm_va_space_iter_first(va_space, params->base, params->base),
|
||||
uvm_va_space_iter_managed_first(va_space, params->base, params->base),
|
||||
tracker_ptr);
|
||||
}
|
||||
}
|
||||
@ -1025,7 +1019,7 @@ done:
|
||||
if (params->semaphoreAddress && (status == NV_OK)) {
|
||||
status = semaphore_release(params->semaphoreAddress,
|
||||
params->semaphorePayload,
|
||||
&sema_va_range->semaphore_pool,
|
||||
sema_va_range,
|
||||
dest_gpu,
|
||||
tracker_ptr);
|
||||
}
|
||||
@ -1104,9 +1098,9 @@ NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, st
|
||||
status = NV_ERR_OUT_OF_RANGE;
|
||||
}
|
||||
else {
|
||||
uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);
|
||||
uvm_va_range_managed_t *first_managed_range = uvm_va_space_iter_managed_first(va_space, start, start);
|
||||
|
||||
if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
|
||||
if (!first_managed_range) {
|
||||
status = NV_ERR_INVALID_ADDRESS;
|
||||
goto done;
|
||||
}
|
||||
@ -1118,7 +1112,7 @@ NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, st
|
||||
dest_id,
|
||||
NUMA_NO_NODE,
|
||||
migrate_flags,
|
||||
first_va_range,
|
||||
first_managed_range,
|
||||
&local_tracker);
|
||||
}
|
||||
|
||||
|
@ -49,7 +49,7 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
|
||||
uvm_gpu_address_t *gpu_addr)
|
||||
{
|
||||
uvm_va_space_t *va_space = state->uvm_migrate_args->va_space;
|
||||
uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id)? NULL: uvm_va_space_get_gpu(va_space, resident_id);
|
||||
uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id)? NULL: uvm_gpu_get(resident_id);
|
||||
const bool can_copy_from = uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(copying_gpu->id)],
|
||||
resident_id);
|
||||
|
||||
@ -111,8 +111,8 @@ static NV_STATUS migrate_vma_zero_begin_push(uvm_va_space_t *va_space,
|
||||
channel_type,
|
||||
push,
|
||||
"Zero %s from %s VMA region [0x%lx, 0x%lx]",
|
||||
uvm_va_space_processor_name(va_space, dst_id),
|
||||
uvm_va_space_processor_name(va_space, gpu->id),
|
||||
uvm_processor_get_name(dst_id),
|
||||
uvm_processor_get_name(gpu->id),
|
||||
start,
|
||||
outer);
|
||||
}
|
||||
@ -130,20 +130,20 @@ static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
|
||||
|
||||
UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
|
||||
"Unexpected copy to self, processor %s\n",
|
||||
uvm_va_space_processor_name(va_space, src_id));
|
||||
uvm_processor_get_name(src_id));
|
||||
|
||||
if (UVM_ID_IS_CPU(src_id)) {
|
||||
gpu = uvm_va_space_get_gpu(va_space, dst_id);
|
||||
gpu = uvm_gpu_get(dst_id);
|
||||
channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
|
||||
}
|
||||
else if (UVM_ID_IS_CPU(dst_id)) {
|
||||
gpu = uvm_va_space_get_gpu(va_space, src_id);
|
||||
gpu = uvm_gpu_get(src_id);
|
||||
channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
|
||||
}
|
||||
else {
|
||||
// For GPU to GPU copies, prefer to "push" the data from the source as
|
||||
// that works better
|
||||
gpu = uvm_va_space_get_gpu(va_space, src_id);
|
||||
gpu = uvm_gpu_get(src_id);
|
||||
|
||||
channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
|
||||
}
|
||||
@ -154,24 +154,24 @@ static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
|
||||
if (!gpu->mem_info.numa.enabled) {
|
||||
UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], dst_id),
|
||||
"GPU %s dst %s src %s\n",
|
||||
uvm_va_space_processor_name(va_space, gpu->id),
|
||||
uvm_va_space_processor_name(va_space, dst_id),
|
||||
uvm_va_space_processor_name(va_space, src_id));
|
||||
uvm_processor_get_name(gpu->id),
|
||||
uvm_processor_get_name(dst_id),
|
||||
uvm_processor_get_name(src_id));
|
||||
UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], src_id),
|
||||
"GPU %s dst %s src %s\n",
|
||||
uvm_va_space_processor_name(va_space, gpu->id),
|
||||
uvm_va_space_processor_name(va_space, dst_id),
|
||||
uvm_va_space_processor_name(va_space, src_id));
|
||||
uvm_processor_get_name(gpu->id),
|
||||
uvm_processor_get_name(dst_id),
|
||||
uvm_processor_get_name(src_id));
|
||||
}
|
||||
|
||||
if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
|
||||
uvm_gpu_t *dst_gpu = uvm_va_space_get_gpu(va_space, dst_id);
|
||||
uvm_gpu_t *dst_gpu = uvm_gpu_get(dst_id);
|
||||
return uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
|
||||
dst_gpu,
|
||||
push,
|
||||
"Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
|
||||
uvm_va_space_processor_name(va_space, src_id),
|
||||
uvm_va_space_processor_name(va_space, dst_id),
|
||||
uvm_processor_get_name(src_id),
|
||||
uvm_processor_get_name(dst_id),
|
||||
start,
|
||||
outer);
|
||||
}
|
||||
@ -180,8 +180,8 @@ static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
|
||||
channel_type,
|
||||
push,
|
||||
"Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
|
||||
uvm_va_space_processor_name(va_space, src_id),
|
||||
uvm_va_space_processor_name(va_space, dst_id),
|
||||
uvm_processor_get_name(src_id),
|
||||
uvm_processor_get_name(dst_id),
|
||||
start,
|
||||
outer);
|
||||
}
|
||||
@ -356,7 +356,7 @@ static NV_STATUS migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
|
||||
copying_gpu = uvm_va_space_find_first_gpu(va_space);
|
||||
}
|
||||
else {
|
||||
copying_gpu = uvm_va_space_get_gpu(va_space, dst_id);
|
||||
copying_gpu = uvm_gpu_get(dst_id);
|
||||
}
|
||||
|
||||
UVM_ASSERT(copying_gpu);
|
||||
@ -928,7 +928,7 @@ static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
|
||||
|
||||
status = migrate_pageable_vma(vma, start, outer, state, &next_addr);
|
||||
if (status == NV_WARN_NOTHING_TO_DO) {
|
||||
NV_STATUS populate_status = NV_OK;
|
||||
NV_STATUS populate_status;
|
||||
bool touch = uvm_migrate_args->touch;
|
||||
uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;
|
||||
|
||||
@ -948,8 +948,16 @@ static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
|
||||
if (current->mm != mm && !(current->flags & PF_KTHREAD))
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
// Populate pages with uvm_populate_pageable
|
||||
populate_status = uvm_populate_pageable_vma(vma, start, length, 0, touch, populate_permissions);
|
||||
// Populate pages with uvm_populate_pageable if requested.
|
||||
if (uvm_migrate_args->populate_on_migrate_vma_failures) {
|
||||
populate_status = uvm_populate_pageable_vma(vma, start, length, 0, touch, populate_permissions);
|
||||
}
|
||||
else {
|
||||
*user_space_start = start;
|
||||
*user_space_length = outer - start;
|
||||
populate_status = NV_WARN_NOTHING_TO_DO;
|
||||
}
|
||||
|
||||
if (populate_status == NV_OK) {
|
||||
*user_space_start = max(vma->vm_start, start);
|
||||
*user_space_length = min(vma->vm_end, outer) - *user_space_start;
|
||||
@ -983,7 +991,6 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
|
||||
{
|
||||
migrate_vma_state_t *state = NULL;
|
||||
NV_STATUS status;
|
||||
uvm_va_space_t *va_space = uvm_migrate_args->va_space;
|
||||
uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
|
||||
|
||||
UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->start));
|
||||
@ -997,7 +1004,7 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
|
||||
else {
|
||||
// Incoming dst_node_id is only valid if dst_id belongs to the CPU. Use
|
||||
// dst_node_id as the GPU node id if dst_id doesn't belong to the CPU.
|
||||
uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(uvm_va_space_get_gpu(va_space, dst_id));
|
||||
uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(uvm_gpu_get(dst_id));
|
||||
}
|
||||
|
||||
state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
|
||||
|
@@ -44,6 +44,7 @@ typedef struct
    bool touch : 1;
    bool skip_mapped : 1;
    bool populate_on_cpu_alloc_failures : 1;
    bool populate_on_migrate_vma_failures : 1;
    NvU64 *user_space_start;
    NvU64 *user_space_length;
} uvm_migrate_args_t;
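uvm_migrate_args_t packs its boolean knobs as one-bit bitfields, and the new populate_on_migrate_vma_failures bit slots in next to the existing ones. A reduced, illustrative stand-in for the struct; the field names mirror the real ones, but the struct itself is not the driver's:

```c
#include <stdio.h>
#include <stdbool.h>

// Reduced stand-in for the migrate args flags, for illustration only.
typedef struct {
    bool touch : 1;
    bool skip_mapped : 1;
    bool populate_on_cpu_alloc_failures : 1;
    bool populate_on_migrate_vma_failures : 1;
} migrate_flags_t;

int main(void)
{
    migrate_flags_t flags = {
        .populate_on_migrate_vma_failures = true, // fall back to a plain populate on failure
    };

    printf("touch=%d populate_on_migrate_vma_failures=%d sizeof=%zu\n",
           flags.touch, flags.populate_on_migrate_vma_failures, sizeof(flags));
    return 0;
}
```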
|
||||
|
@@ -50,18 +50,18 @@
// because that type is normally associated with the LCE mapped to the most
// PCEs. The higher bandwidth is beneficial when doing bulk operations such as
// clearing PTEs, or initializing a page directory/table.
#define page_tree_begin_acquire(tree, tracker, push, format, ...) ({ \
    NV_STATUS status; \
    uvm_channel_manager_t *manager = (tree)->gpu->channel_manager; \
    \
    if (manager == NULL) \
        status = uvm_push_begin_fake((tree)->gpu, (push)); \
    else if (uvm_parent_gpu_is_virt_mode_sriov_heavy((tree)->gpu->parent)) \
        status = uvm_push_begin_acquire(manager, UVM_CHANNEL_TYPE_MEMOPS, (tracker), (push), (format), ##__VA_ARGS__); \
    else \
        status = uvm_push_begin_acquire(manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, (tracker), (push), (format), ##__VA_ARGS__);\
    \
    status; \
#define page_tree_begin_acquire(tree, tracker, push, format, ...) ({ \
    NV_STATUS __status; \
    uvm_channel_manager_t *__manager = (tree)->gpu->channel_manager; \
    \
    if (__manager == NULL) \
        __status = uvm_push_begin_fake((tree)->gpu, (push)); \
    else if (uvm_parent_gpu_is_virt_mode_sriov_heavy((tree)->gpu->parent)) \
        __status = uvm_push_begin_acquire(__manager, UVM_CHANNEL_TYPE_MEMOPS, (tracker), (push), (format), ##__VA_ARGS__); \
    else \
        __status = uvm_push_begin_acquire(__manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, (tracker), (push), (format), ##__VA_ARGS__);\
    \
    __status; \
})
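The rewrite above only renames the macro's locals to __status and __manager, presumably so the statement expression cannot capture or shadow identifiers at the expansion site; a caller with its own status variable would otherwise collide with the macro's. A compile-and-run illustration of the GNU statement-expression idiom (a gcc/clang extension), independent of the UVM push API:

```c
#include <stdio.h>

// Statement-expression macro: evaluates to __status. The double-underscore
// locals cannot collide with names the caller passes in or has in scope.
#define begin_op(result_out)            \
    ({                                  \
        int __status = 0;               \
        if ((result_out) == NULL)       \
            __status = -1;              \
        else                            \
            *(result_out) = 42;         \
        __status;                       \
    })

int main(void)
{
    int status = 7; // the caller's own 'status' is untouched by the macro's __status
    int result;

    int rc = begin_op(&result);
    printf("rc=%d result=%d caller status=%d\n", rc, result, status);

    rc = begin_op((int *)NULL);
    printf("rc=%d\n", rc);
    return 0;
}
```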
|
||||
|
||||
// Default location of page table allocations
|
||||
@ -2368,7 +2368,7 @@ NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
|
||||
peer_mapping = uvm_gpu_get_peer_mapping(gpu, peer->id);
|
||||
phys_offset = 0ULL;
|
||||
|
||||
if (uvm_gpus_are_nvswitch_connected(gpu, peer)) {
|
||||
if (uvm_parent_gpus_are_nvswitch_connected(gpu->parent, peer->parent)) {
|
||||
// Add the 47-bit physical address routing bits for this peer to the
|
||||
// generated PTEs
|
||||
phys_offset = peer->parent->nvswitch_info.fabric_memory_window_start;
|
||||
@ -2658,7 +2658,7 @@ NV_STATUS uvm_mmu_chunk_map(uvm_gpu_chunk_t *chunk)
|
||||
// mappings are reference counted as multiples of PAGE_SIZE. User chunk
|
||||
// sizes are guaranteed to be a multiple of that page size, but kernel chunk
|
||||
// sizes can be smaller.
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
|
||||
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
|
||||
|
||||
UVM_ASSERT(PAGE_ALIGNED(chunk_size));
|
||||
|
||||
@ -2705,7 +2705,7 @@ void uvm_mmu_chunk_unmap(uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker)
|
||||
chunk_size = uvm_gpu_chunk_get_size(chunk);
|
||||
num_unmapped_pages = chunk_size / PAGE_SIZE;
|
||||
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
|
||||
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
|
||||
UVM_ASSERT(PAGE_ALIGNED(chunk_size));
|
||||
|
||||
root_chunk_mapping = root_chunk_mapping_from_chunk(gpu, chunk);
|
||||
|
@ -114,7 +114,7 @@ typedef union
|
||||
|
||||
// Only one of these two can be set. The other one must be NULL
|
||||
uvm_va_block_t *block;
|
||||
uvm_va_range_t *range;
|
||||
uvm_va_range_managed_t *range;
|
||||
} module_unload;
|
||||
|
||||
struct
|
||||
@ -151,6 +151,8 @@ typedef union
|
||||
{
|
||||
NvU64 fault_va;
|
||||
|
||||
NvU32 cpu_num;
|
||||
|
||||
bool is_write;
|
||||
|
||||
NvU64 pc;
|
||||
@ -181,8 +183,8 @@ typedef union
|
||||
|
||||
// For CPU-to-CPU migrations, these two fields indicate the source
|
||||
// and destination NUMA node IDs.
|
||||
NvU16 dst_nid;
|
||||
NvU16 src_nid;
|
||||
NvS16 dst_nid;
|
||||
NvS16 src_nid;
|
||||
|
||||
// Start address of the memory range being migrated
|
||||
NvU64 address;
|
||||
@ -336,8 +338,8 @@ static inline void uvm_perf_event_notify_migration_cpu(uvm_perf_va_space_events_
|
||||
.block = va_block,
|
||||
.dst = UVM_ID_CPU,
|
||||
.src = UVM_ID_CPU,
|
||||
.src_nid = (NvU16)src_nid,
|
||||
.dst_nid = (NvU16)dst_nid,
|
||||
.src_nid = (NvS16)src_nid,
|
||||
.dst_nid = (NvS16)dst_nid,
|
||||
.address = address,
|
||||
.bytes = bytes,
|
||||
.transfer_mode = transfer_mode,
|
||||
@ -384,6 +386,7 @@ static inline void uvm_perf_event_notify_cpu_fault(uvm_perf_va_space_events_t *v
|
||||
uvm_processor_id_t preferred_location,
|
||||
NvU64 fault_va,
|
||||
bool is_write,
|
||||
NvU32 cpu_num,
|
||||
NvU64 pc)
|
||||
{
|
||||
uvm_perf_event_data_t event_data =
|
||||
@@ -397,9 +400,10 @@ static inline void uvm_perf_event_notify_cpu_fault(uvm_perf_va_space_events_t *v
        }
    };

    event_data.fault.cpu.fault_va = fault_va,
    event_data.fault.cpu.is_write = is_write,
    event_data.fault.cpu.pc = pc,
    event_data.fault.cpu.fault_va = fault_va;
    event_data.fault.cpu.is_write = is_write;
    event_data.fault.cpu.pc = pc;
    event_data.fault.cpu.cpu_num = cpu_num;

    uvm_perf_event_notify(va_space_events, UVM_PERF_EVENT_FAULT, &event_data);
}
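The change above turns trailing commas into semicolons. The original lines still compiled because the comma operator chains the assignments into one expression statement, but that glue is easy to trip over later, as the small demonstration below shows.

```c
#include <stdio.h>

int main(void)
{
    int fault_va = 0, is_write = 0, pc = 0;

    // Comma-chained assignments: legal, but this is ONE statement.
    fault_va = 1,
    is_write = 2,
    pc = 3;
    printf("%d %d %d\n", fault_va, is_write, pc); // 1 2 3

    // The hazard: an if without braces now guards all three assignments,
    // even though they look like three independent lines.
    fault_va = is_write = pc = 0;
    if (0)
        fault_va = 1,
        is_write = 2,
        pc = 3;
    printf("%d %d %d\n", fault_va, is_write, pc); // 0 0 0, none of them ran

    return 0;
}
```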
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -78,7 +78,7 @@ error:
|
||||
void uvm_perf_module_unload(uvm_perf_module_t *module, uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_perf_event_data_t event_data;
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
uvm_va_block_t *block;
|
||||
size_t i;
|
||||
|
||||
@ -89,12 +89,10 @@ void uvm_perf_module_unload(uvm_perf_module_t *module, uvm_va_space_t *va_space)
|
||||
|
||||
event_data.module_unload.module = module;
|
||||
|
||||
// Iterate over all va_range/va_blocks in the va_space
|
||||
uvm_for_each_va_range(va_range, va_space) {
|
||||
if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
|
||||
continue;
|
||||
// Iterate over all managed ranges/va_blocks in the va_space
|
||||
uvm_for_each_va_range_managed(managed_range, va_space) {
|
||||
|
||||
for_each_va_block_in_va_range(va_range, block) {
|
||||
for_each_va_block_in_va_range(managed_range, block) {
|
||||
uvm_mutex_lock(&block->lock);
|
||||
|
||||
// Notify a fake va_block destruction to destroy the module-allocated data
|
||||
@ -106,7 +104,7 @@ void uvm_perf_module_unload(uvm_perf_module_t *module, uvm_va_space_t *va_space)
|
||||
}
|
||||
// Notify a fake va_range destruction to destroy the module-allocated data
|
||||
event_data.module_unload.block = NULL;
|
||||
event_data.module_unload.range = va_range;
|
||||
event_data.module_unload.range = managed_range;
|
||||
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_MODULE_UNLOAD, &event_data);
|
||||
}
|
||||
|
||||
|
@ -260,7 +260,7 @@ static void update_bitmap_tree_from_va_block(uvm_perf_prefetch_bitmap_tree_t *bi
|
||||
// registered in the current process for this GPU.
|
||||
if (UVM_ID_IS_GPU(new_residency) &&
|
||||
uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, new_residency)) {
|
||||
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, new_residency);
|
||||
uvm_gpu_t *gpu = uvm_gpu_get(new_residency);
|
||||
|
||||
big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@@ -223,9 +223,9 @@ typedef struct
// uvm_procfs_is_debug_enabled() returns true.
static processor_thrashing_stats_t g_cpu_thrashing_stats;

#define PROCESSOR_THRASHING_STATS_INC(va_space, proc, field) \
#define PROCESSOR_THRASHING_STATS_INC(proc, field) \
    do { \
        processor_thrashing_stats_t *_processor_stats = thrashing_stats_get_or_null(va_space, proc); \
        processor_thrashing_stats_t *_processor_stats = thrashing_stats_get_or_null(proc); \
        if (_processor_stats) \
            atomic64_inc(&_processor_stats->field); \
    } while (0)
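The macro keeps its do { } while (0) wrapper so it expands to a single statement, and it only increments when a stats object actually exists for the processor. A userspace analog with a plain counter standing in for atomic64_inc() and made-up names:

```c
#include <stdio.h>

typedef struct {
    long num_throttle;
    long num_thrashing;
} thrashing_stats_t;

static thrashing_stats_t g_cpu_stats;

// Returns NULL when stats are not being collected for this processor.
static thrashing_stats_t *stats_get_or_null(int processor_is_cpu)
{
    return processor_is_cpu ? &g_cpu_stats : NULL;
}

// do/while(0) makes the macro behave as one statement, so it composes
// safely with if/else at the call site.
#define STATS_INC(proc_is_cpu, field)                                   \
    do {                                                                \
        thrashing_stats_t *_stats = stats_get_or_null(proc_is_cpu);     \
        if (_stats)                                                     \
            _stats->field++;                                            \
    } while (0)

int main(void)
{
    STATS_INC(1, num_thrashing); // counted
    STATS_INC(0, num_thrashing); // no stats object, silently skipped
    printf("cpu num_thrashing=%ld\n", g_cpu_stats.num_thrashing); // 1
    return 0;
}
```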
|
||||
@ -474,7 +474,7 @@ static processor_thrashing_stats_t *gpu_thrashing_stats_get_or_null(uvm_gpu_t *g
|
||||
return uvm_perf_module_type_data(gpu->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
|
||||
}
|
||||
|
||||
static processor_thrashing_stats_t *thrashing_stats_get_or_null(uvm_va_space_t *va_space, uvm_processor_id_t id)
|
||||
static processor_thrashing_stats_t *thrashing_stats_get_or_null(uvm_processor_id_t id)
|
||||
{
|
||||
if (UVM_ID_IS_CPU(id)) {
|
||||
if (g_cpu_thrashing_stats.procfs_file)
|
||||
@ -483,7 +483,7 @@ static processor_thrashing_stats_t *thrashing_stats_get_or_null(uvm_va_space_t *
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return gpu_thrashing_stats_get_or_null(uvm_va_space_get_gpu(va_space, id));
|
||||
return gpu_thrashing_stats_get_or_null(uvm_gpu_get(id));
|
||||
}
|
||||
|
||||
// Create the thrashing stats struct for the given GPU
|
||||
@ -1034,7 +1034,7 @@ static void thrashing_detected(uvm_va_block_t *va_block,
|
||||
if (!uvm_page_mask_test_and_set(&block_thrashing->thrashing_pages, page_index))
|
||||
++block_thrashing->num_thrashing_pages;
|
||||
|
||||
PROCESSOR_THRASHING_STATS_INC(va_space, processor_id, num_thrashing);
|
||||
PROCESSOR_THRASHING_STATS_INC(processor_id, num_thrashing);
|
||||
|
||||
UVM_ASSERT(thrashing_state_checks(va_block, block_thrashing, page_thrashing, page_index));
|
||||
}
|
||||
@ -1269,7 +1269,7 @@ void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_
|
||||
// read duplication is supported.
|
||||
read_duplication = uvm_va_block_is_hmm(va_block) ?
|
||||
UVM_READ_DUPLICATION_UNSET :
|
||||
uvm_va_range_get_policy(va_block->va_range)->read_duplication;
|
||||
va_block->managed_range->policy.read_duplication;
|
||||
|
||||
// We only care about migrations due to replayable faults, access
|
||||
// counters and page prefetching. For non-replayable faults, UVM will
|
||||
@@ -1405,7 +1405,16 @@ static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space,
    uvm_processor_mask_and(fast_to,
                           &va_space->has_nvlink[uvm_id_value(to)],
                           &va_space->has_native_atomics[uvm_id_value(to)]);
    uvm_processor_mask_set(fast_to, to);
    if (UVM_ID_IS_CPU(to)) {
        uvm_processor_mask_set(fast_to, to);
    }
    else {
        // Include registered SMC peers and the processor 'to'.
        uvm_processor_mask_range_fill(fast_to,
                                      uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_from_gpu_id(to), 0),
                                      UVM_PARENT_ID_MAX_SUB_PROCESSORS);
        uvm_processor_mask_and(fast_to, fast_to, &va_space->registered_gpu_va_spaces);
    }

    return uvm_processor_mask_subset(&page_thrashing->processors, fast_to);
}
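The new GPU branch fills the mask with the GPU's possible SMC sub-processors, intersects it with the registered GPU VA spaces, and then asks whether every thrashing processor falls inside the resulting mask. The subset test itself is ordinary bitmask arithmetic; below, a toy 64-bit mask stands in for uvm_processor_mask_t and all values are invented:

```c
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

// Toy processor mask: bit i set means processor i is in the set.
typedef uint64_t proc_mask_t;

static bool mask_subset(proc_mask_t subset, proc_mask_t set)
{
    // subset is contained in set iff no bit of subset falls outside set.
    return (subset & ~set) == 0;
}

// Fill bits [first, first + count), the shape of a range-fill helper.
static proc_mask_t mask_range_fill(unsigned first, unsigned count)
{
    return (count >= 64 ? ~0ULL : (1ULL << count) - 1) << first;
}

int main(void)
{
    proc_mask_t thrashing  = (1ULL << 2) | (1ULL << 3);               // processors 2 and 3
    proc_mask_t fast_to    = mask_range_fill(2, 4);                   // e.g. sub-processors 2..5
    proc_mask_t registered = (1ULL << 2) | (1ULL << 3) | (1ULL << 7); // registered VA spaces

    fast_to &= registered; // keep only registered peers

    printf("subset=%d\n", mask_subset(thrashing, fast_to));            // 1
    printf("subset=%d\n", mask_subset(thrashing | 1ULL, fast_to));     // 0: processor 0 is not fast
    return 0;
}
```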
|
||||
@ -1491,10 +1500,10 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras
|
||||
else if (!preferred_location_is_thrashing(preferred_location, page_thrashing) &&
|
||||
thrashing_processors_have_fast_access_to(va_space, va_block_context, page_thrashing, closest_resident_id)){
|
||||
// This is a fast path for those scenarios in which all thrashing
|
||||
// processors have fast (NVLINK + native atomics) access to the current
|
||||
// residency. This is skipped if the preferred location is thrashing and
|
||||
// not accessible by the rest of thrashing processors. Otherwise, we
|
||||
// would be in the condition above.
|
||||
// processors have fast access (NVLINK + native atomics or SMC peers)
|
||||
// to the current residency. This is skipped if the preferred location
|
||||
// is thrashing and not accessible by the rest of thrashing processors.
|
||||
// Otherwise, we would be in the condition above.
|
||||
if (UVM_ID_IS_CPU(closest_resident_id)) {
|
||||
// On P9 systems, we prefer the CPU to map vidmem (since it can
|
||||
// cache it), so don't map the GPU to sysmem.
|
||||
@ -1577,8 +1586,7 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras
|
||||
hint.pin.residency = requester;
|
||||
}
|
||||
|
||||
if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN &&
|
||||
!uvm_va_space_processor_has_memory(va_space, hint.pin.residency))
|
||||
if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN && !uvm_processor_has_memory(hint.pin.residency))
|
||||
hint.pin.residency = UVM_ID_CPU;
|
||||
|
||||
return hint;
|
||||
@ -1741,9 +1749,9 @@ done:
|
||||
}
|
||||
else {
|
||||
if (uvm_id_equal(hint.pin.residency, requester))
|
||||
PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_pin_local);
|
||||
PROCESSOR_THRASHING_STATS_INC(requester, num_pin_local);
|
||||
else
|
||||
PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_pin_remote);
|
||||
PROCESSOR_THRASHING_STATS_INC(requester, num_pin_remote);
|
||||
|
||||
uvm_processor_mask_copy(&hint.pin.processors, &page_thrashing->processors);
|
||||
}
|
||||
@ -1756,7 +1764,7 @@ done:
|
||||
page_index,
|
||||
requester);
|
||||
|
||||
PROCESSOR_THRASHING_STATS_INC(va_space, requester, num_throttle);
|
||||
PROCESSOR_THRASHING_STATS_INC(requester, num_throttle);
|
||||
|
||||
hint.throttle.end_time_stamp = page_thrashing_get_throttling_end_time_stamp(page_thrashing);
|
||||
}
|
||||
@ -2102,15 +2110,12 @@ NV_STATUS uvm_test_set_page_thrashing_policy(UVM_TEST_SET_PAGE_THRASHING_POLICY_
|
||||
// When disabling thrashing detection, destroy the thrashing tracking
|
||||
// information for all VA blocks and unpin pages
|
||||
if (!va_space_thrashing->params.enable) {
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
|
||||
uvm_for_each_va_range(va_range, va_space) {
|
||||
uvm_for_each_va_range_managed(managed_range, va_space) {
|
||||
uvm_va_block_t *va_block;
|
||||
|
||||
if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
|
||||
continue;
|
||||
|
||||
for_each_va_block_in_va_range(va_range, va_block) {
|
||||
for_each_va_block_in_va_range(managed_range, va_block) {
|
||||
uvm_va_block_region_t va_block_region = uvm_va_block_region_from_block(va_block);
|
||||
uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
|
||||
|
||||
|
@ -50,7 +50,7 @@
|
||||
// root_chunk_lock()) as synchronizing any pending operations might take a long
|
||||
// time and it would be undesirable for that to block other operations of PMM.
|
||||
// Notably some synchronization is required as part of allocation to handle GPU
|
||||
// lifetime issues across VA spaces (see comments in uvm_pmm_gpu_alloc()). Bit
|
||||
// lifetime issues across VA spaces (see comments in pmm_gpu_alloc()). Bit
|
||||
// locks (instead of a mutex in each root chunk) are used to save space.
|
||||
//
|
||||
// All free chunks (UVM_PMM_GPU_CHUNK_STATE_FREE) are kept on free lists, with
|
||||
@ -87,8 +87,9 @@
|
||||
// chunk become free, they are merged into one bigger chunk. See
|
||||
// free_chunk_with_merges().
|
||||
//
|
||||
// Splitting and merging already allocated chunks is also exposed to the users of
|
||||
// allocated chunks. See uvm_pmm_gpu_split_chunk() and uvm_pmm_gpu_merge_chunk().
|
||||
// Splitting and merging already allocated chunks is also exposed to the users
|
||||
// of allocated chunks. See uvm_pmm_gpu_split_chunk() and
|
||||
// uvm_pmm_gpu_merge_chunk().
|
||||
//
|
||||
// As splits and merges are protected by a single PMM mutex, they are only
|
||||
// performed when really necessary. See alloc_chunk() that falls back to split
|
||||
@ -170,6 +171,7 @@
|
||||
#include "uvm_kvmalloc.h"
|
||||
#include "uvm_va_space.h"
|
||||
#include "uvm_va_block.h"
|
||||
#include "uvm_va_range.h"
|
||||
#include "uvm_test.h"
|
||||
#include "uvm_linux.h"
|
||||
|
||||
@ -464,7 +466,7 @@ bool uvm_pmm_gpu_memory_type_is_user(uvm_pmm_gpu_memory_type_t type)
|
||||
}
|
||||
}
|
||||
|
||||
static bool memory_type_is_protected(uvm_pmm_gpu_memory_type_t type)
|
||||
bool uvm_pmm_gpu_memory_type_is_protected(uvm_pmm_gpu_memory_type_t type)
|
||||
{
|
||||
switch (type) {
|
||||
case UVM_PMM_GPU_MEMORY_TYPE_USER: // Alias UVM_PMM_GPU_MEMORY_TYPE_USER_PROTECTED
|
||||
@ -477,8 +479,9 @@ static bool memory_type_is_protected(uvm_pmm_gpu_memory_type_t type)
|
||||
|
||||
static void uvm_gpu_chunk_set_in_eviction(uvm_gpu_chunk_t *chunk, bool in_eviction)
|
||||
{
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
|
||||
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
|
||||
UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
|
||||
|
||||
chunk->in_eviction = in_eviction;
|
||||
}
|
||||
|
||||
@ -546,13 +549,13 @@ static uvm_pmm_gpu_memory_type_t pmm_squash_memory_type(uvm_pmm_gpu_memory_type_
|
||||
return UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_gpu_memory_type_t mem_type,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
static NV_STATUS pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_gpu_memory_type_t mem_type,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
|
||||
NV_STATUS status;
|
||||
@ -620,16 +623,17 @@ error:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_gpu_memory_type_t memory_type,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
NV_STATUS status;
|
||||
size_t i;
|
||||
NV_STATUS status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
|
||||
|
||||
status = pmm_gpu_alloc(pmm, num_chunks, chunk_size, UVM_PMM_GPU_MEMORY_TYPE_KERNEL, flags, chunks, out_tracker);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
@ -645,13 +649,23 @@ static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_pmm_gpu_alloc_user(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
return pmm_gpu_alloc(pmm, num_chunks, chunk_size, UVM_PMM_GPU_MEMORY_TYPE_USER, flags, chunks, out_tracker);
|
||||
}
|
||||
|
||||
static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
|
||||
|
||||
uvm_assert_spinlock_locked(&pmm->list_lock);
|
||||
|
||||
if (uvm_pmm_gpu_memory_type_is_user(chunk->type)) {
|
||||
if (uvm_gpu_chunk_is_user(chunk)) {
|
||||
if (chunk_is_root_chunk_pinned(pmm, chunk)) {
|
||||
UVM_ASSERT(root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT ||
|
||||
root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
|
||||
@ -677,7 +691,7 @@ static void gpu_unpin_temp(uvm_pmm_gpu_t *pmm,
|
||||
bool is_referenced)
|
||||
{
|
||||
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
|
||||
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
|
||||
|
||||
INIT_LIST_HEAD(&chunk->list);
|
||||
|
||||
@ -782,7 +796,8 @@ static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
uvm_gpu_chunk_t *child = chunk->suballoc->subchunks[i];
|
||||
|
||||
UVM_ASSERT(child->state == first_child->state);
|
||||
if (first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
|
||||
|
||||
if ((first_child->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) && uvm_gpu_chunk_is_user(first_child)) {
|
||||
uvm_gpu_chunk_t *prev_child = chunk->suballoc->subchunks[i-1];
|
||||
|
||||
UVM_ASSERT(child->va_block == child_va_block);
|
||||
@ -825,11 +840,12 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
// the subchunk state.
|
||||
uvm_spin_lock(&pmm->list_lock);
|
||||
|
||||
child_state = chunk->suballoc->subchunks[0]->state;
|
||||
subchunk = chunk->suballoc->subchunks[0];
|
||||
child_state = subchunk->state;
|
||||
|
||||
if (child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
|
||||
subchunk = chunk->suballoc->subchunks[0];
|
||||
if ((child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) && uvm_gpu_chunk_is_user(subchunk)) {
|
||||
UVM_ASSERT(subchunk->va_block);
|
||||
|
||||
chunk->va_block = subchunk->va_block;
|
||||
chunk->va_block_page_index = subchunk->va_block_page_index;
|
||||
chunk->is_referenced = subchunk->is_referenced;
|
||||
@ -845,7 +861,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
|
||||
// The resulting chunk is assumed to be non-zero as a simplification,
|
||||
// instead of checking that all the subchunks are zero, since callers of
|
||||
// uvm_pmm_gpu_alloc are not required to clear it. However, we think that
|
||||
// pmm_gpu_alloc are not required to clear it. However, we think that
|
||||
// this covers all relevant cases since it is uncommon to split a chunk and
|
||||
// not to use any of the subchunks later on.
|
||||
chunk->is_zero = false;
|
||||
@ -859,7 +875,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
// merge.
|
||||
UVM_ASSERT(list_empty(&subchunk->list));
|
||||
|
||||
if (child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED)
|
||||
if ((child_state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) && uvm_gpu_chunk_is_user(subchunk))
|
||||
UVM_ASSERT(subchunk->va_block != NULL);
|
||||
|
||||
kmem_cache_free(CHUNK_CACHE, subchunk);
|
||||
@ -1191,7 +1207,7 @@ uvm_gpu_phys_address_t uvm_pmm_gpu_peer_phys_address(uvm_pmm_gpu_t *pmm,
|
||||
uvm_aperture_t aperture = uvm_gpu_peer_aperture(accessing_gpu, gpu);
|
||||
NvU64 addr;
|
||||
|
||||
if (uvm_gpus_are_nvswitch_connected(accessing_gpu, gpu))
|
||||
if (uvm_parent_gpus_are_nvswitch_connected(accessing_gpu->parent, gpu->parent))
|
||||
addr = chunk->address + gpu->parent->nvswitch_info.fabric_memory_window_start;
|
||||
else
|
||||
addr = chunk->address;
|
||||
@ -1215,7 +1231,9 @@ uvm_gpu_address_t uvm_pmm_gpu_peer_copy_address(uvm_pmm_gpu_t *pmm,
|
||||
return uvm_gpu_address_virtual(gpu_peer_mapping->base + chunk->address);
|
||||
}
|
||||
|
||||
static NV_STATUS evict_root_chunk_from_va_block(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, uvm_va_block_t *va_block)
|
||||
static NV_STATUS evict_root_chunk_from_va_block(uvm_pmm_gpu_t *pmm,
|
||||
uvm_gpu_root_chunk_t *root_chunk,
|
||||
uvm_va_block_t *va_block)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
|
||||
NV_STATUS status;
|
||||
@ -1477,7 +1495,7 @@ static void root_chunk_update_eviction_list(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t
|
||||
uvm_spin_lock(&pmm->list_lock);
|
||||
|
||||
UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_MAX);
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
|
||||
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
|
||||
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
|
||||
chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
|
||||
|
||||
@ -1797,8 +1815,8 @@ static NV_STATUS alloc_chunk_with_splits(uvm_pmm_gpu_t *pmm,
|
||||
NvU32 i;
|
||||
uvm_gpu_chunk_t *parent;
|
||||
|
||||
UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == cur_size);
|
||||
UVM_ASSERT(chunk->type == type);
|
||||
UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == cur_size);
|
||||
UVM_ASSERT(chunk->type == type);
|
||||
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
|
||||
|
||||
if (chunk->parent) {
|
||||
@ -1953,8 +1971,7 @@ NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
|
||||
//
|
||||
// TODO: Bug 2446832: Most (all?) kernel chunks don't require scrubbing.
|
||||
// Also, user pages that are about to be overwritten, don't need to be
|
||||
// zeroed, either. Add an interface to uvm_pmm_gpu_alloc for callers to
|
||||
// specify when they don't need zeroed pages.
|
||||
// zeroed, either. Add a flag to uvm_pmm_gpu_alloc_* to skip scrubbing.
|
||||
const bool skip_pma_scrubbing = gpu->mem_info.numa.enabled;
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(type) || uvm_pmm_gpu_memory_type_is_kernel(type));
|
||||
|
||||
@ -1973,7 +1990,7 @@ NV_STATUS alloc_root_chunk(uvm_pmm_gpu_t *pmm,
|
||||
|
||||
// When the Confidential Computing feature is enabled, allocate GPU memory
|
||||
// in the protected region, unless specified otherwise.
|
||||
if (g_uvm_global.conf_computing_enabled && memory_type_is_protected(type))
|
||||
if (g_uvm_global.conf_computing_enabled && uvm_pmm_gpu_memory_type_is_protected(type))
|
||||
options.flags |= UVM_PMA_ALLOCATE_PROTECTED_REGION;
|
||||
|
||||
if (!gpu->parent->rm_info.isSimulated &&
|
||||
@ -2063,7 +2080,8 @@ void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_
|
||||
|
||||
status = uvm_tracker_wait_deinit(&root_chunk->tracker);
|
||||
if (status != NV_OK) {
|
||||
// TODO: Bug 1766184: Handle RC/ECC. For now just go ahead and free the chunk anyway.
|
||||
// TODO: Bug 1766184: Handle RC/ECC. For now just go ahead and free the
|
||||
// chunk anyway.
|
||||
UVM_ASSERT(uvm_global_get_status() != NV_OK);
|
||||
}
|
||||
|
||||
@ -2161,9 +2179,10 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
// The child inherits the parent's state.
|
||||
subchunk->state = chunk->state;
|
||||
|
||||
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
|
||||
if ((chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) && uvm_gpu_chunk_is_user(chunk)) {
|
||||
UVM_ASSERT(chunk->va_block);
|
||||
uvm_assert_mutex_locked(&chunk->va_block->lock);
|
||||
|
||||
subchunk->va_block = chunk->va_block;
|
||||
subchunk->va_block_page_index = chunk->va_block_page_index + (i * subchunk_size) / PAGE_SIZE;
|
||||
subchunk->is_referenced = chunk->is_referenced;
|
||||
@ -2185,7 +2204,8 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
|
||||
chunk->is_referenced = false;
|
||||
}
|
||||
else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
|
||||
// -1 for the parent chunk that is going to transition into the split state.
|
||||
// -1 for the parent chunk that is going to transition into the split
|
||||
// state.
|
||||
root_chunk->chunk.suballoc->pinned_leaf_chunks += num_sub - 1;
|
||||
|
||||
// When a pinned root chunk gets split, the count starts at 0 not
|
||||
@ -2680,9 +2700,9 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm,
|
||||
|
||||
// Make sure that all pending allocations, that could have started before
|
||||
// the eviction callback was called, are done. This is required to guarantee
|
||||
// that any address that, PMA thinks, is owned by UVM has been indeed recorded
|
||||
// in PMM's state. Taking the pma_lock in write mode will make sure all
|
||||
// readers (pending allocations and frees) are done, but will also
|
||||
// that any address that, PMA thinks, is owned by UVM has been indeed
|
||||
// recorded in PMM's state. Taking the pma_lock in write mode will make sure
|
||||
// all readers (pending allocations and frees) are done, but will also
|
||||
// unnecessarily stop new allocations from starting until it's released.
|
||||
// TODO: Bug 1795559: SRCU would likely be better for this type of
|
||||
// synchronization, but that's GPL. Figure out whether we can do anything
|
||||
@ -2704,7 +2724,7 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm,
|
||||
uvm_spin_lock(&pmm->list_lock);
|
||||
|
||||
if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED) {
|
||||
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
|
||||
UVM_ASSERT(uvm_gpu_chunk_is_user(chunk));
|
||||
|
||||
if (chunk_is_evictable(pmm, chunk)) {
|
||||
chunk_start_eviction(pmm, chunk);
|
||||
@ -2722,7 +2742,7 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm,
|
||||
} while (!eviction_started && chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
|
||||
|
||||
// The eviction callback gets called with a physical range that might be
|
||||
// only partially allocated by UVM. Skip the chunks that UVM doesn't own.
|
||||
// only partially allocated by UVM. Skip the chunks that UVM doesn't own
|
||||
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED)
|
||||
continue;
|
||||
|
||||
@ -2998,9 +3018,9 @@ static NV_STATUS get_chunk_mappings_in_range(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t
|
||||
reverse_map = &get_chunk_mappings_data->mappings[get_chunk_mappings_data->num_mappings];
|
||||
|
||||
reverse_map->va_block = chunk->va_block;
|
||||
reverse_map->region = uvm_va_block_region(chunk->va_block_page_index,
|
||||
chunk->va_block_page_index + uvm_gpu_chunk_get_size(chunk) / PAGE_SIZE);
|
||||
reverse_map->owner = gpu->id;
|
||||
reverse_map->region = uvm_va_block_region(chunk->va_block_page_index,
|
||||
chunk->va_block_page_index + uvm_gpu_chunk_get_size(chunk) / PAGE_SIZE);
|
||||
reverse_map->owner = gpu->id;
|
||||
|
||||
// If we land in the middle of a chunk, adjust the offset
|
||||
if (get_chunk_mappings_data->phys_start > chunk->address) {
|
||||
@ -3039,9 +3059,9 @@ NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region
|
||||
uvm_gpu_chunk_t *chunk = &root_chunk->chunk;
|
||||
get_chunk_mappings_data_t get_chunk_mappings_data;
|
||||
|
||||
get_chunk_mappings_data.phys_start = phys_addr;
|
||||
get_chunk_mappings_data.phys_end = phys_addr + size_in_chunk - 1;
|
||||
get_chunk_mappings_data.mappings = out_mappings + num_mappings;
|
||||
get_chunk_mappings_data.phys_start = phys_addr;
|
||||
get_chunk_mappings_data.phys_end = phys_addr + size_in_chunk - 1;
|
||||
get_chunk_mappings_data.mappings = out_mappings + num_mappings;
|
||||
get_chunk_mappings_data.num_mappings = 0;
|
||||
|
||||
// Walk the chunks for the current root chunk
|
||||
@ -3305,6 +3325,83 @@ static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
|
||||
}
|
||||
#endif // UVM_IS_CONFIG_HMM()
|
||||
|
||||
#if defined(CONFIG_PCI_P2PDMA) && defined(NV_STRUCT_PAGE_HAS_ZONE_DEVICE_DATA)
static void device_p2p_page_free_wake(struct nv_kref *ref)
{
    uvm_device_p2p_mem_t *p2p_mem = container_of(ref, uvm_device_p2p_mem_t, refcount);
    wake_up(&p2p_mem->waitq);
}

static void device_p2p_page_free(struct page *page)
{
    uvm_device_p2p_mem_t *p2p_mem = page->zone_device_data;

    page->zone_device_data = NULL;
    nv_kref_put(&p2p_mem->refcount, device_p2p_page_free_wake);
}

static const struct dev_pagemap_ops uvm_device_p2p_pgmap_ops =
{
    .page_free = device_p2p_page_free,
};
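The page_free callback drops a reference on the owning allocation, and the release callback wakes the wait queue, presumably so a teardown path can sleep until every page it handed out has come back. A self-contained pthread sketch of that refcount-plus-wait pattern; the mutex, condition variable and counter stand in for nv_kref and the kernel wait queue:

```c
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

// Stand-in for the p2p memory object: a refcount plus a waitqueue.
typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t waitq;
    int refcount;
} p2p_mem_t;

static void p2p_mem_put(p2p_mem_t *mem)
{
    pthread_mutex_lock(&mem->lock);
    if (--mem->refcount == 0)
        pthread_cond_signal(&mem->waitq); // last "page" freed: wake the waiter
    pthread_mutex_unlock(&mem->lock);
}

// Analog of the page_free callback: one call per outstanding page.
static void *page_free_cb(void *arg)
{
    p2p_mem_t *mem = arg;

    usleep(1000);
    p2p_mem_put(mem);
    return NULL;
}

int main(void)
{
    p2p_mem_t mem = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 4 };
    pthread_t threads[4];

    for (int i = 0; i < 4; i++)
        pthread_create(&threads[i], NULL, page_free_cb, &mem);

    // Analog of waiting on the waitqueue before tearing the allocation down.
    pthread_mutex_lock(&mem.lock);
    while (mem.refcount != 0)
        pthread_cond_wait(&mem.waitq, &mem.lock);
    pthread_mutex_unlock(&mem.lock);

    for (int i = 0; i < 4; i++)
        pthread_join(threads[i], NULL);

    printf("all pages freed\n");
    return 0;
}
```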
|
||||
|
||||
void uvm_pmm_gpu_device_p2p_init(uvm_gpu_t *gpu)
|
||||
{
|
||||
unsigned long pci_start_pfn = pci_resource_start(gpu->parent->pci_dev,
|
||||
uvm_device_p2p_static_bar(gpu)) >> PAGE_SHIFT;
|
||||
unsigned long pci_end_pfn = pci_start_pfn + (gpu->mem_info.static_bar1_size >> PAGE_SHIFT);
|
||||
struct page *p2p_page;
|
||||
|
||||
gpu->device_p2p_initialised = false;
|
||||
uvm_mutex_init(&gpu->device_p2p_lock, UVM_LOCK_ORDER_GLOBAL);
|
||||
|
||||
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
|
||||
// A coherent system uses normal struct pages.
|
||||
gpu->device_p2p_initialised = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// RM sets this when it has created a contiguous BAR mapping large enough to
|
||||
// cover all of GPU memory that will be allocated to userspace buffers. This
|
||||
// is required to support the P2PDMA feature to ensure we have a P2PDMA page
|
||||
// available for every mapping.
|
||||
if (!gpu->mem_info.static_bar1_size)
|
||||
return;
|
||||
|
||||
if (pci_p2pdma_add_resource(gpu->parent->pci_dev, uvm_device_p2p_static_bar(gpu), 0, 0)) {
|
||||
UVM_ERR_PRINT("Unable to initialse PCI P2PDMA pages\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// The current upstream PCIe P2PDMA architecture does not allow drivers to
|
||||
// specify a page_free callback. We plan to work with upstream maintainers
|
||||
// to resolve this but in the mean time we can work around the issue by
|
||||
// overwriting the existing dev_pagemap_ops struct with our own.
|
||||
// TODO: Bug 4672502: [Linux Upstream][UVM] Allow drivers to manage and
|
||||
// allocate PCI P2PDMA pages directly
|
||||
p2p_page = pfn_to_page(pci_start_pfn);
|
||||
p2p_page->pgmap->ops = &uvm_device_p2p_pgmap_ops;
|
||||
for (; page_to_pfn(p2p_page) < pci_end_pfn; p2p_page++)
|
||||
p2p_page->zone_device_data = NULL;
|
||||
|
||||
gpu->device_p2p_initialised = true;
|
||||
}
|
||||
|
void uvm_pmm_gpu_device_p2p_deinit(uvm_gpu_t *gpu)
{
    unsigned long pci_start_pfn = pci_resource_start(gpu->parent->pci_dev,
                                                     uvm_device_p2p_static_bar(gpu)) >> PAGE_SHIFT;
    struct page *p2p_page;

    if (gpu->device_p2p_initialised && !uvm_parent_gpu_is_coherent(gpu->parent)) {
        p2p_page = pfn_to_page(pci_start_pfn);
        devm_memunmap_pages(&gpu->parent->pci_dev->dev, p2p_page->pgmap);
    }

    gpu->device_p2p_initialised = false;
}
#endif // CONFIG_PCI_P2PDMA
||||
|
||||
static void process_lazy_free(uvm_pmm_gpu_t *pmm)
|
||||
{
|
||||
uvm_gpu_chunk_t *chunk;
|
||||
@ -3346,8 +3443,8 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t i, j, k;
|
||||
|
||||
// UVM_CHUNK_SIZE_INVALID is UVM_CHUNK_SIZE_MAX shifted left by 1. This protects
|
||||
// UVM_CHUNK_SIZE_INVALID from being negative
|
||||
// UVM_CHUNK_SIZE_INVALID is UVM_CHUNK_SIZE_MAX shifted left by 1. This
|
||||
// protects UVM_CHUNK_SIZE_INVALID from being negative
|
||||
BUILD_BUG_ON(UVM_CHUNK_SIZE_MAX >= UVM_CHUNK_SIZE_INVALID);
|
||||
|
||||
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
|
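The BUILD_BUG_ON above encodes the relationship spelled out in the comment: the invalid marker sits one bit above the largest real chunk size, so it compares greater than, and never collides with, any valid size. A standalone illustration with made-up values:

#define EXAMPLE_CHUNK_SIZE_MAX     (2UL * 1024 * 1024)           // assumed largest chunk size (2MB)
#define EXAMPLE_CHUNK_SIZE_INVALID (EXAMPLE_CHUNK_SIZE_MAX << 1) // sentinel: the next power of two up

_Static_assert(EXAMPLE_CHUNK_SIZE_MAX < EXAMPLE_CHUNK_SIZE_INVALID,
               "the sentinel must compare greater than every valid chunk size");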
||||
@ -3637,7 +3734,8 @@ NV_STATUS uvm_test_evict_chunk(UVM_TEST_EVICT_CHUNK_PARAMS *params, struct file
|
||||
|
||||
if (!root_chunk) {
|
||||
// Not finding a chunk to evict is not considered an error, the caller
|
||||
// can inspect the targeted_chunk_size to see whether anything was evicted.
|
||||
// can inspect the targeted_chunk_size to see whether anything was
|
||||
// evicted.
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -3860,8 +3958,8 @@ NV_STATUS uvm_test_pmm_query_pma_stats(UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS *para
|
||||
if (!gpu)
|
||||
return NV_ERR_INVALID_DEVICE;
|
||||
|
||||
params->pma_stats.numFreePages64k = UVM_READ_ONCE(gpu->pmm.pma_stats->numFreePages64k);
|
||||
params->pma_stats.numFreePages2m = UVM_READ_ONCE(gpu->pmm.pma_stats->numFreePages2m);
|
||||
params->pma_stats.numFreePages64k = READ_ONCE(gpu->pmm.pma_stats->numFreePages64k);
|
||||
params->pma_stats.numFreePages2m = READ_ONCE(gpu->pmm.pma_stats->numFreePages2m);
|
||||
|
||||
uvm_gpu_release(gpu);
|
||||
return NV_OK;
|
||||
|
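The switch from UVM_READ_ONCE to the kernel's READ_ONCE a few lines up is mechanical, but the intent is worth spelling out: the PMA statistics are updated concurrently, and READ_ONCE forces a single, untorn load instead of letting the compiler re-read or cache the value. A minimal, self-contained sketch with an illustrative struct:

#include <linux/compiler.h>

struct pma_stats_sketch {
    unsigned long numFreePages64k;
    unsigned long numFreePages2m;
};

// Snapshot a counter that another context may be updating; READ_ONCE keeps the
// compiler from splitting or repeating the load.
static unsigned long snapshot_free_64k_sketch(struct pma_stats_sketch *stats)
{
    return READ_ONCE(stats->numFreePages64k);
}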
@ -59,7 +59,7 @@
|
||||
#include "uvm_linux.h"
|
||||
#include "uvm_types.h"
|
||||
#include "nv_uvm_types.h"
|
||||
#if UVM_IS_CONFIG_HMM()
|
||||
#if UVM_IS_CONFIG_HMM() || defined(CONFIG_PCI_P2PDMA)
|
||||
#include <linux/memremap.h>
|
||||
#endif
|
||||
|
||||
@ -126,6 +126,8 @@ static bool uvm_pmm_gpu_memory_type_is_kernel(uvm_pmm_gpu_memory_type_t type)
|
||||
return !uvm_pmm_gpu_memory_type_is_user(type);
|
||||
}
|
||||
|
||||
bool uvm_pmm_gpu_memory_type_is_protected(uvm_pmm_gpu_memory_type_t type);
|
||||
|
||||
typedef enum
|
||||
{
|
||||
// Chunk belongs to PMA. Code outside PMM should not have access to
|
||||
@ -144,11 +146,13 @@ typedef enum
|
||||
|
||||
// Chunk is temporarily pinned.
|
||||
//
|
||||
// This state is used for user memory chunks that have been allocated, but haven't
|
||||
// been unpinned yet and also internally when a chunk is about to be split.
|
||||
// This state is used for user memory chunks that have been allocated, but
|
||||
// haven't been unpinned yet and also internally when a chunk is about to be
|
||||
// split.
|
||||
UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED,
|
||||
|
||||
// Chunk is allocated. That is it is backing some VA block
|
||||
// Chunk is allocated. In the case of a user chunk, this state implies that
|
||||
// the chunk is backing a VA block.
|
||||
UVM_PMM_GPU_CHUNK_STATE_ALLOCATED,
|
||||
|
||||
// Number of states - MUST BE LAST
|
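Taken together, the comments above describe a simple lifecycle for user chunks: they come out of the allocator temporarily pinned, are unpinned into the allocated state once a VA block adopts them, and return to PMA when freed. A purely illustrative checker for the one transition that matters here (the uvm_pmm_gpu_chunk_state_t typedef name is taken from its use further down in this header):

// Illustrative only: a user chunk may move to ALLOCATED solely from
// TEMP_PINNED; kernel chunks are handed out already ALLOCATED.
static bool can_unpin_to_allocated_sketch(uvm_pmm_gpu_chunk_state_t state)
{
    return state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
}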
||||
@ -173,7 +177,6 @@ typedef enum
|
||||
UVM_PMM_ALLOC_FLAGS_MASK = (1 << 2) - 1
|
||||
} uvm_pmm_alloc_flags_t;
|
||||
|
||||
|
||||
typedef enum
|
||||
{
|
||||
// Identifier for lists with zeroed chunks
|
||||
@ -227,6 +230,16 @@ unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *ch
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_PCI_P2PDMA) && defined(NV_STRUCT_PAGE_HAS_ZONE_DEVICE_DATA)
#include <linux/pci-p2pdma.h>

void uvm_pmm_gpu_device_p2p_init(uvm_gpu_t *gpu);
void uvm_pmm_gpu_device_p2p_deinit(uvm_gpu_t *gpu);
#else
static inline void uvm_pmm_gpu_device_p2p_init(uvm_gpu_t *gpu) {}
static inline void uvm_pmm_gpu_device_p2p_deinit(uvm_gpu_t *gpu) {}
#endif
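The #else branch above supplies empty inline stubs, so callers never need their own CONFIG_PCI_P2PDMA guards. Hypothetical call sites, for illustration only; the real ones live in the GPU add/remove paths:

static void gpu_add_sketch(uvm_gpu_t *gpu)
{
    uvm_pmm_gpu_device_p2p_init(gpu);    // real init, or the inline no-op above
}

static void gpu_remove_sketch(uvm_gpu_t *gpu)
{
    uvm_pmm_gpu_device_p2p_deinit(gpu);  // likewise compiles away when unsupported
}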
||||
|
||||
struct uvm_gpu_chunk_struct
|
||||
{
|
||||
// Physical address of GPU chunk. This may be removed to save memory
|
||||
@ -251,14 +264,16 @@ struct uvm_gpu_chunk_struct
|
||||
|
||||
// This flag is initialized when allocating a new root chunk from PMA.
|
||||
// It is set to true, if PMA already scrubbed the chunk. The flag is
|
||||
// only valid at allocation time (after uvm_pmm_gpu_alloc call), and
|
||||
// only valid at allocation time (after uvm_pmm_gpu_alloc_* call), and
|
||||
// the caller is not required to clear it before freeing the chunk. The
|
||||
// VA block chunk population code can query it to skip zeroing the
|
||||
// chunk.
|
||||
bool is_zero : 1;
|
||||
|
||||
// This flag indicates an allocated chunk is referenced by a device
|
||||
// This flag indicates an allocated user chunk is referenced by a device
|
||||
// private struct page PTE and therefore expects a page_free() callback.
|
||||
//
|
||||
// This field is always false in kernel chunks.
|
||||
bool is_referenced : 1;
|
||||
|
||||
uvm_pmm_gpu_chunk_state_t state : order_base_2(UVM_PMM_GPU_CHUNK_STATE_COUNT + 1);
|
||||
@ -286,6 +301,8 @@ struct uvm_gpu_chunk_struct
|
||||
// The VA block using the chunk, if any.
|
||||
// User chunks that are not backed by a VA block are considered to be
|
||||
// temporarily pinned and cannot be evicted.
|
||||
//
|
||||
// This field is always NULL in kernel chunks.
|
||||
uvm_va_block_t *va_block;
|
||||
|
||||
// If this is subchunk it points to the parent - in other words
|
||||
@ -403,6 +420,12 @@ static void uvm_gpu_chunk_set_size(uvm_gpu_chunk_t *chunk, uvm_chunk_size_t size
|
||||
// use it if the owning GPU is retained.
|
||||
uvm_gpu_t *uvm_gpu_chunk_get_gpu(const uvm_gpu_chunk_t *chunk);
|
||||
|
||||
// Returns true if the memory type of the chunk is a user type.
|
||||
static bool uvm_gpu_chunk_is_user(const uvm_gpu_chunk_t *chunk)
|
||||
{
|
||||
return uvm_pmm_gpu_memory_type_is_user(chunk->type);
|
||||
}
|
||||
|
||||
// Return the first struct page corresponding to the physical address range
|
||||
// of the given chunk.
|
||||
//
|
||||
@ -412,7 +435,10 @@ uvm_gpu_t *uvm_gpu_chunk_get_gpu(const uvm_gpu_chunk_t *chunk);
|
||||
// page containing the chunk's starting address.
|
||||
struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
|
||||
|
||||
// Allocates num_chunks chunks of size chunk_size in caller-supplied array (chunks).
|
||||
// User memory allocator.
|
||||
//
|
||||
// Allocates num_chunks chunks of size chunk_size in caller-supplied array
|
||||
// (chunks).
|
||||
//
|
||||
// Returned chunks are in the TEMP_PINNED state, requiring a call to either
|
||||
// uvm_pmm_gpu_unpin_allocated, uvm_pmm_gpu_unpin_referenced, or
|
||||
@ -432,47 +458,24 @@ struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
|
||||
// If the memory returned by the PMM allocator cannot be physically addressed,
|
||||
// the MMU interface provides user chunk mapping and unmapping functions
|
||||
// (uvm_mmu_chunk_map/unmap) that enable virtual addressing.
|
||||
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_gpu_memory_type_t mem_type,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker);
|
||||
NV_STATUS uvm_pmm_gpu_alloc_user(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker);
|
||||
|
||||
// Helper for allocating kernel memory
|
||||
// Kernel memory allocator.
|
||||
//
|
||||
// Internally calls uvm_pmm_gpu_alloc() and sets the state of all chunks to
|
||||
// allocated on success.
|
||||
//
|
||||
// If Confidential Computing is enabled, this helper allocates protected kernel
|
||||
// memory.
|
||||
static NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
return uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, UVM_PMM_GPU_MEMORY_TYPE_KERNEL, flags, chunks, out_tracker);
|
||||
}
|
||||
|
||||
// Helper for allocating user memory
|
||||
//
|
||||
// Simple wrapper that just uses UVM_PMM_GPU_MEMORY_TYPE_USER for the memory
|
||||
// type.
|
||||
//
|
||||
// If Confidential Computing is enabled, this helper allocates protected user
|
||||
// memory.
|
||||
static NV_STATUS uvm_pmm_gpu_alloc_user(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
return uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, UVM_PMM_GPU_MEMORY_TYPE_USER, flags, chunks, out_tracker);
|
||||
}
|
||||
// See uvm_pmm_gpu_alloc_user documentation for details on the behavior of this
|
||||
// function, with one exception: the returned kernel chunks are in the ALLOCATED
|
||||
// state.
|
||||
NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
|
||||
size_t num_chunks,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_alloc_flags_t flags,
|
||||
uvm_gpu_chunk_t **chunks,
|
||||
uvm_tracker_t *out_tracker);
|
||||
|
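As a usage sketch for the split declared above: user chunks arrive TEMP_PINNED and must be adopted by a VA block, while kernel chunks are returned already ALLOCATED. The chunk size, flags, and tracker handling below are illustrative choices, not taken from any particular caller in the driver:

// Hedged sketch: allocate one user chunk and hand it to a VA block.
static NV_STATUS alloc_one_user_chunk_sketch(uvm_pmm_gpu_t *pmm,
                                             uvm_va_block_t *va_block,
                                             uvm_gpu_chunk_t **out_chunk)
{
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NV_STATUS status;

    status = uvm_pmm_gpu_alloc_user(pmm, 1, UVM_CHUNK_SIZE_64K, UVM_PMM_ALLOC_FLAGS_NONE, out_chunk, &tracker);
    if (status != NV_OK)
        return status;

    // Unpin into the ALLOCATED state by attaching the chunk to its VA block.
    uvm_pmm_gpu_unpin_allocated(pmm, *out_chunk, va_block);

    status = uvm_tracker_wait(&tracker);
    uvm_tracker_deinit(&tracker);
    return status;
}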
||||
// Unpin a temporarily pinned chunk, set its reverse map to a VA block, and
|
||||
// mark it as allocated.
|
||||
@ -486,7 +489,7 @@ void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm
|
||||
// Can only be used on user memory.
|
||||
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block);
|
||||
|
||||
// Frees the chunk. This also unpins the chunk if it is temporarily pinned.
|
||||
// Free a user or kernel chunk. Temporarily pinned chunks are unpinned.
|
||||
//
|
||||
// The tracker is optional and a NULL tracker indicates that no new operation
|
||||
// has been pushed for the chunk, but the tracker returned as part of
|
||||
@ -542,7 +545,8 @@ size_t uvm_pmm_gpu_get_subchunks(uvm_pmm_gpu_t *pmm,
|
||||
// leaf children must be allocated.
|
||||
void uvm_pmm_gpu_merge_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
|
||||
|
||||
// Waits for all free chunk trackers (removing their completed entries) to complete.
|
||||
// Waits for all free chunk trackers (removing their completed entries) to
|
||||
// complete.
|
||||
//
|
||||
// This inherently races with any chunks being freed to this PMM. The assumption
|
||||
// is that the caller doesn't care about preventing new chunks from being freed,
|
||||
|
@ -126,7 +126,7 @@ NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
|
||||
NvU64 remove_key;
|
||||
|
||||
for (remove_key = base_key; remove_key < key; ++remove_key)
|
||||
(void *)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
|
||||
(void)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
|
||||
|
||||
kmem_cache_free(g_reverse_page_map_cache, new_reverse_map);
|
||||
status = errno_to_nv_status(ret);
|
||||
@ -455,7 +455,7 @@ static NvU32 compute_gpu_mappings_entry_index(uvm_parent_processor_mask_t *dma_a
|
||||
// above and including the id and then counting the number of bits
|
||||
// remaining.
|
||||
uvm_parent_processor_mask_zero(&subset_mask);
|
||||
bitmap_set(subset_mask.bitmap, UVM_PARENT_ID_GPU0_VALUE, uvm_parent_id_gpu_index(id));
|
||||
uvm_parent_processor_mask_range_fill(&subset_mask, uvm_parent_gpu_id_from_index(0), uvm_parent_id_gpu_index(id));
|
||||
uvm_parent_processor_mask_and(&subset_mask, dma_addrs_mask, &subset_mask);
|
||||
|
||||
return uvm_parent_processor_mask_get_gpu_count(&subset_mask);
|
||||
|
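The replacement above computes the entry index by filling a mask over all GPU ids below the target (via the new _range_fill macro added to uvm_processors.h later in this diff) and counting how many of them actually have a DMA address. The same counting trick, reduced to plain kernel bitmap helpers; the fixed width and names here are illustrative:

#include <linux/bitmap.h>

#define SKETCH_NBITS 64

// Index of `id` among the set bits of present_mask = number of set bits
// strictly below it.
static unsigned int entry_index_sketch(const unsigned long *present_mask, unsigned int id)
{
    DECLARE_BITMAP(below, SKETCH_NBITS);

    bitmap_zero(below, SKETCH_NBITS);
    bitmap_set(below, 0, id);                             // bits [0, id)
    bitmap_and(below, below, present_mask, SKETCH_NBITS); // keep only present ids

    return bitmap_weight(below, SKETCH_NBITS);
}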
@ -771,7 +771,7 @@ static NV_STATUS test_cpu_chunk_mig(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
NvU64 dma_addr_gpu0;
|
||||
|
||||
UVM_ASSERT(gpu0->parent == gpu1->parent);
|
||||
UVM_ASSERT(uvm_gpus_are_smc_peers(gpu0, gpu1));
|
||||
|
||||
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
|
||||
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
|
||||
@ -1379,7 +1379,7 @@ static void find_shared_gpu_pair(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_gpu_t *gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
|
||||
|
||||
while (gpu1) {
|
||||
if (gpu0->parent == gpu1->parent) {
|
||||
if (uvm_gpus_are_smc_peers(gpu0, gpu1)) {
|
||||
*out_gpu0 = gpu0;
|
||||
*out_gpu1 = gpu1;
|
||||
return;
|
||||
|
@ -112,6 +112,15 @@ static uvm_pmm_gpu_memory_type_t pmm_squash_memory_type(uvm_pmm_gpu_memory_type_
|
||||
return UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
|
||||
}
|
||||
|
||||
// Validate the chunk state upon allocation.
|
||||
static bool gpu_chunk_allocation_state_is_valid(uvm_gpu_chunk_t *chunk)
|
||||
{
|
||||
if (uvm_gpu_chunk_is_user(chunk))
|
||||
return chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED;
|
||||
else
|
||||
return chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED;
|
||||
}
|
||||
|
||||
// Verify that the input chunks are in the correct state following alloc
|
||||
static NV_STATUS check_chunks(uvm_gpu_chunk_t **chunks,
|
||||
size_t num_chunks,
|
||||
@ -121,12 +130,13 @@ static NV_STATUS check_chunks(uvm_gpu_chunk_t **chunks,
|
||||
size_t i;
|
||||
|
||||
mem_type = pmm_squash_memory_type(mem_type);
|
||||
|
||||
for (i = 0; i < num_chunks; i++) {
|
||||
TEST_CHECK_RET(chunks[i]);
|
||||
TEST_CHECK_RET(chunks[i]->suballoc == NULL);
|
||||
TEST_CHECK_RET(chunks[i]->type == mem_type);
|
||||
TEST_CHECK_RET(chunks[i]->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
|
||||
TEST_CHECK_RET(uvm_gpu_chunk_get_size(chunks[i]) == chunk_size);
|
||||
TEST_CHECK_RET(chunks[i]->type == mem_type);
|
||||
TEST_CHECK_RET(gpu_chunk_allocation_state_is_valid(chunks[i]));
|
||||
TEST_CHECK_RET(uvm_gpu_chunk_get_size(chunks[i]) == chunk_size);
|
||||
TEST_CHECK_RET(IS_ALIGNED(chunks[i]->address, chunk_size));
|
||||
}
|
||||
|
||||
@ -198,7 +208,18 @@ static NV_STATUS chunk_alloc_check(uvm_pmm_gpu_t *pmm,
|
||||
if (gpu->mem_info.size == 0)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, mem_type, flags, chunks, &local_tracker);
|
||||
// TODO: Bug 4287430: the calls to uvm_pmm_gpu_alloc_* request protected
|
||||
// memory, ignoring the protection type in mem_type. But unprotected memory
|
||||
// is currently not used in UVM, so the default protection (protected) is
|
||||
// correct.
|
||||
mem_type = pmm_squash_memory_type(mem_type);
|
||||
TEST_CHECK_RET(uvm_pmm_gpu_memory_type_is_protected(mem_type));
|
||||
|
||||
if (uvm_pmm_gpu_memory_type_is_user(mem_type))
|
||||
status = uvm_pmm_gpu_alloc_user(pmm, num_chunks, chunk_size, flags, chunks, &local_tracker);
|
||||
else
|
||||
status = uvm_pmm_gpu_alloc_kernel(pmm, num_chunks, chunk_size, flags, chunks, &local_tracker);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
@ -216,14 +237,25 @@ static NV_STATUS chunk_alloc_user_check(uvm_pmm_gpu_t *pmm,
|
||||
NV_STATUS status;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
|
||||
status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, mem_type, flags, chunks, &local_tracker);
|
||||
// TODO: Bug 4287430: the call to uvm_pmm_gpu_alloc_user requests protected
|
||||
// memory, ignoring the protection type in mem_type. But unprotected memory
|
||||
// is currently not used in UVM, so the default protection (protected) is
|
||||
// correct.
|
||||
mem_type = pmm_squash_memory_type(mem_type);
|
||||
TEST_CHECK_RET(uvm_pmm_gpu_memory_type_is_protected(mem_type));
|
||||
|
||||
status = uvm_pmm_gpu_alloc_user(pmm, num_chunks, chunk_size, flags, chunks, &local_tracker);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
return chunk_alloc_check_common(pmm, num_chunks, chunk_size, mem_type, flags, chunks, &local_tracker, tracker);
|
||||
}
|
||||
|
||||
static NV_STATUS check_leak(uvm_gpu_t *gpu, uvm_chunk_size_t chunk_size, uvm_pmm_gpu_memory_type_t type, NvS64 limit, NvU64 *chunks)
|
||||
static NV_STATUS check_leak(uvm_gpu_t *gpu,
|
||||
uvm_chunk_size_t chunk_size,
|
||||
uvm_pmm_gpu_memory_type_t type,
|
||||
NvS64 limit,
|
||||
NvU64 *chunks)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
pmm_leak_bucket_t *bucket, *next;
|
||||
@ -702,9 +734,9 @@ static NV_STATUS split_test_single(uvm_pmm_gpu_t *pmm,
|
||||
TEST_CHECK_GOTO(split_chunks[i], error);
|
||||
TEST_CHECK_GOTO(split_chunks[i]->address == parent_addr + i * child_size, error);
|
||||
TEST_CHECK_GOTO(split_chunks[i]->suballoc == NULL, error);
|
||||
TEST_CHECK_GOTO(split_chunks[i]->type == parent_type, error);
|
||||
TEST_CHECK_GOTO(split_chunks[i]->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED, error);
|
||||
TEST_CHECK_GOTO(uvm_gpu_chunk_get_size(split_chunks[i]) == child_size, error);
|
||||
TEST_CHECK_GOTO(split_chunks[i]->type == parent_type, error);
|
||||
TEST_CHECK_GOTO(gpu_chunk_allocation_state_is_valid(split_chunks[i]), error);
|
||||
TEST_CHECK_GOTO(uvm_gpu_chunk_get_size(split_chunks[i]) == child_size, error);
|
||||
}
|
||||
|
||||
status = get_subchunks_test(pmm, temp_chunk, split_chunks, num_children);
|
||||
@ -714,9 +746,11 @@ static NV_STATUS split_test_single(uvm_pmm_gpu_t *pmm,
|
||||
if (mode == SPLIT_TEST_MODE_MERGE) {
|
||||
parent->chunk = temp_chunk;
|
||||
uvm_pmm_gpu_merge_chunk(pmm, parent->chunk);
|
||||
|
||||
TEST_CHECK_GOTO(parent->chunk->address == parent_addr, error);
|
||||
TEST_CHECK_GOTO(parent->chunk->suballoc == NULL, error);
|
||||
TEST_CHECK_GOTO(parent->chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED, error);
|
||||
TEST_CHECK_GOTO(gpu_chunk_allocation_state_is_valid(parent->chunk), error);
|
||||
|
||||
status = destroy_test_chunk(pmm, parent, verif_mem);
|
||||
}
|
||||
else {
|
||||
@ -1080,7 +1114,7 @@ static NV_STATUS test_pmm_reverse_map_single(uvm_gpu_t *gpu, uvm_va_space_t *va_
|
||||
|
||||
static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t *va_space, NvU64 addr, NvU64 size)
|
||||
{
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
uvm_va_block_t *va_block = NULL;
|
||||
uvm_va_block_context_t *va_block_context = NULL;
|
||||
NvU32 num_blocks;
|
||||
@ -1089,12 +1123,12 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
|
||||
bool is_resident;
|
||||
|
||||
// In this test, the [addr:addr + size) VA region contains
|
||||
// several VA ranges with different sizes.
|
||||
// several managed ranges with different sizes.
|
||||
|
||||
// Find the first block to compute the base physical address of the root
|
||||
// chunk
|
||||
uvm_for_each_va_range_in(va_range, va_space, addr, addr + size - 1) {
|
||||
va_block = uvm_va_range_block(va_range, 0);
|
||||
uvm_for_each_va_range_managed_in(managed_range, va_space, addr, addr + size - 1) {
|
||||
va_block = uvm_va_range_block(managed_range, 0);
|
||||
if (va_block)
|
||||
break;
|
||||
}
|
||||
@ -1121,15 +1155,13 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
|
||||
num_blocks = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, size, g_reverse_map_entries);
|
||||
TEST_CHECK_RET(num_blocks != 0);
|
||||
|
||||
// Iterate over all VA ranges and their VA blocks within the 2MB VA region.
|
||||
// Some blocks are not populated. However, we assume that blocks have been
|
||||
// populated in order so they have been assigned physical addresses
|
||||
// incrementally. Therefore, the reverse translations will show them in
|
||||
// order.
|
||||
uvm_for_each_va_range_in(va_range, va_space, addr, addr + size - 1) {
|
||||
uvm_va_block_t *va_block;
|
||||
|
||||
for_each_va_block_in_va_range(va_range, va_block) {
|
||||
// Iterate over all managed ranges and their VA blocks within the 2MB VA
|
||||
// region. Some blocks are not populated. However, we assume that blocks
|
||||
// have been populated in order so they have been assigned physical
|
||||
// addresses incrementally. Therefore, the reverse translations will show
|
||||
// them in order.
|
||||
uvm_for_each_va_range_managed_in(managed_range, va_space, addr, addr + size - 1) {
|
||||
for_each_va_block_in_va_range(managed_range, va_block) {
|
||||
NvU32 num_va_block_pages = 0;
|
||||
|
||||
// Iterate over all the translations for the current VA block. One
|
||||
|
@ -53,7 +53,7 @@ static bool uvm_is_valid_vma_range(struct mm_struct *mm, NvU64 start, NvU64 leng
|
||||
|
||||
uvm_api_range_type_t uvm_api_range_type_check(uvm_va_space_t *va_space, struct mm_struct *mm, NvU64 base, NvU64 length)
|
||||
{
|
||||
uvm_va_range_t *va_range, *va_range_last;
|
||||
uvm_va_range_managed_t *managed_range, *managed_range_last;
|
||||
const NvU64 last_address = base + length - 1;
|
||||
|
||||
if (mm)
|
||||
@ -81,17 +81,17 @@ uvm_api_range_type_t uvm_api_range_type_check(uvm_va_space_t *va_space, struct m
|
||||
}
|
||||
}
|
||||
|
||||
va_range_last = NULL;
|
||||
managed_range_last = NULL;
|
||||
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address)
|
||||
va_range_last = va_range;
|
||||
uvm_for_each_va_range_managed_in_contig(managed_range, va_space, base, last_address)
|
||||
managed_range_last = managed_range;
|
||||
|
||||
// Check if passed interval overlaps with an unmanaged VA range, or a
|
||||
// sub-interval not tracked by a VA range
|
||||
if (!va_range_last || va_range_last->node.end < last_address)
|
||||
// sub-interval not tracked by a managed range
|
||||
if (!managed_range_last || managed_range_last->va_range.node.end < last_address)
|
||||
return UVM_API_RANGE_TYPE_INVALID;
|
||||
|
||||
// Passed interval is fully covered by managed VA ranges
|
||||
// Passed interval is fully covered by managed ranges
|
||||
return UVM_API_RANGE_TYPE_MANAGED;
|
||||
}
|
||||
|
||||
@ -100,6 +100,7 @@ static NV_STATUS split_as_needed(uvm_va_space_t *va_space,
|
||||
uvm_va_policy_is_split_needed_t split_needed_cb,
|
||||
void *data)
|
||||
{
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
uvm_va_range_t *va_range;
|
||||
|
||||
UVM_ASSERT(PAGE_ALIGNED(addr));
|
||||
@ -113,12 +114,14 @@ static NV_STATUS split_as_needed(uvm_va_space_t *va_space,
|
||||
if (addr == va_range->node.start)
|
||||
return NV_OK;
|
||||
|
||||
managed_range = uvm_va_range_to_managed_or_null(va_range);
|
||||
|
||||
// Only managed ranges can be split.
|
||||
if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
|
||||
if (!managed_range)
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
if (split_needed_cb(uvm_va_range_get_policy(va_range), data))
|
||||
return uvm_va_range_split(va_range, addr - 1, NULL);
|
||||
if (split_needed_cb(&managed_range->policy, data))
|
||||
return uvm_va_range_split(managed_range, addr - 1, NULL);
|
||||
|
||||
return NV_OK;
|
||||
}
|
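The new uvm_va_range_to_managed_or_null() call above replaces the open-coded type check. Its shape is presumably the usual checked container_of downcast; the body below is an assumption for illustration, not the driver's implementation:

#include <linux/kernel.h>   // container_of()

static uvm_va_range_managed_t *to_managed_or_null_sketch(uvm_va_range_t *va_range)
{
    if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
        return NULL;

    // Managed ranges embed the generic va_range, as the managed_range->va_range
    // accesses elsewhere in this diff show.
    return container_of(va_range, uvm_va_range_managed_t, va_range);
}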
||||
@ -228,10 +231,10 @@ static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
|
||||
NvU64 length,
|
||||
uvm_processor_id_t preferred_location,
|
||||
int preferred_cpu_nid,
|
||||
uvm_va_range_t **first_va_range_to_migrate,
|
||||
uvm_va_range_managed_t **first_managed_range_to_migrate,
|
||||
uvm_tracker_t *out_tracker)
|
||||
{
|
||||
uvm_va_range_t *va_range, *va_range_last;
|
||||
uvm_va_range_managed_t *managed_range, *managed_range_last;
|
||||
const NvU64 last_address = base + length - 1;
|
||||
bool preferred_location_is_faultable_gpu = false;
|
||||
preferred_location_split_params_t split_params;
|
||||
@ -240,7 +243,7 @@ static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
|
||||
uvm_assert_rwsem_locked_write(&va_space->lock);
|
||||
|
||||
if (UVM_ID_IS_VALID(preferred_location)) {
|
||||
*first_va_range_to_migrate = NULL;
|
||||
*first_managed_range_to_migrate = NULL;
|
||||
preferred_location_is_faultable_gpu = UVM_ID_IS_GPU(preferred_location) &&
|
||||
uvm_processor_mask_test(&va_space->faultable_processors,
|
||||
preferred_location);
|
||||
@ -256,19 +259,19 @@ static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_range_last = NULL;
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
managed_range_last = NULL;
|
||||
uvm_for_each_va_range_managed_in_contig(managed_range, va_space, base, last_address) {
|
||||
bool found_non_migratable_interval = false;
|
||||
|
||||
va_range_last = va_range;
|
||||
managed_range_last = managed_range;
|
||||
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, preferred_location));
|
||||
if (managed_range->va_range.node.start < base || managed_range->va_range.node.end > last_address)
|
||||
UVM_ASSERT(uvm_id_equal(managed_range->policy.preferred_location, preferred_location));
|
||||
|
||||
if (UVM_ID_IS_VALID(preferred_location)) {
|
||||
const NvU64 start = max(base, va_range->node.start);
|
||||
const NvU64 end = min(last_address, va_range->node.end);
|
||||
const NvU64 start = max(base, managed_range->va_range.node.start);
|
||||
const NvU64 end = min(last_address, managed_range->va_range.node.end);
|
||||
|
||||
found_non_migratable_interval = !uvm_range_group_all_migratable(va_space, start, end);
|
||||
|
||||
@ -276,18 +279,22 @@ static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
|
||||
return NV_ERR_INVALID_DEVICE;
|
||||
}
|
||||
|
||||
status = uvm_va_range_set_preferred_location(va_range, preferred_location, preferred_cpu_nid, mm, out_tracker);
|
||||
status = uvm_va_range_set_preferred_location(managed_range,
|
||||
preferred_location,
|
||||
preferred_cpu_nid,
|
||||
mm,
|
||||
out_tracker);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// Return the first VA range that needs to be migrated so the caller
|
||||
// function doesn't need to traverse the tree again
|
||||
if (found_non_migratable_interval && (*first_va_range_to_migrate == NULL))
|
||||
*first_va_range_to_migrate = va_range;
|
||||
// Return the first managed range that needs to be migrated so the
|
||||
// caller function doesn't need to traverse the tree again
|
||||
if (found_non_migratable_interval && (*first_managed_range_to_migrate == NULL))
|
||||
*first_managed_range_to_migrate = managed_range;
|
||||
}
|
||||
|
||||
if (va_range_last) {
|
||||
UVM_ASSERT(va_range_last->node.end >= last_address);
|
||||
if (managed_range_last) {
|
||||
UVM_ASSERT(managed_range_last->va_range.node.end >= last_address);
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
@ -308,8 +315,8 @@ NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS
|
||||
NV_STATUS tracker_status;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
uvm_va_range_t *va_range = NULL;
|
||||
uvm_va_range_t *first_va_range_to_migrate = NULL;
|
||||
uvm_va_range_managed_t *managed_range = NULL;
|
||||
uvm_va_range_managed_t *first_managed_range_to_migrate = NULL;
|
||||
struct mm_struct *mm;
|
||||
uvm_processor_id_t preferred_location_id;
|
||||
int preferred_cpu_nid = NUMA_NO_NODE;
|
||||
@ -386,27 +393,30 @@ NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS
|
||||
length,
|
||||
preferred_location_id,
|
||||
preferred_cpu_nid,
|
||||
&first_va_range_to_migrate,
|
||||
&first_managed_range_to_migrate,
|
||||
&local_tracker);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
// No VA range to migrate, early exit
|
||||
if (!first_va_range_to_migrate)
|
||||
// No managed range to migrate, early exit
|
||||
if (!first_managed_range_to_migrate)
|
||||
goto done;
|
||||
|
||||
uvm_va_space_downgrade_write(va_space);
|
||||
has_va_space_write_lock = false;
|
||||
|
||||
// No need to check for holes in the VA ranges span here, this was checked by preferred_location_set
|
||||
for (va_range = first_va_range_to_migrate; va_range; va_range = uvm_va_space_iter_next(va_range, end)) {
|
||||
// No need to check for holes in the managed ranges span here, this was
|
||||
// checked by preferred_location_set
|
||||
for (managed_range = first_managed_range_to_migrate;
|
||||
managed_range;
|
||||
managed_range = uvm_va_space_iter_managed_next(managed_range, end)) {
|
||||
uvm_range_group_range_iter_t iter;
|
||||
NvU64 cur_start = max(start, va_range->node.start);
|
||||
NvU64 cur_end = min(end, va_range->node.end);
|
||||
NvU64 cur_start = max(start, managed_range->va_range.node.start);
|
||||
NvU64 cur_end = min(end, managed_range->va_range.node.end);
|
||||
|
||||
uvm_range_group_for_each_migratability_in(&iter, va_space, cur_start, cur_end) {
|
||||
if (!iter.migratable) {
|
||||
status = uvm_range_group_va_range_migrate(va_range, iter.start, iter.end, &local_tracker);
|
||||
status = uvm_range_group_va_range_migrate(managed_range, iter.start, iter.end, &local_tracker);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
@ -504,7 +514,7 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
|
||||
uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
|
||||
NV_STATUS status;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
|
||||
uvm_va_policy_t *policy = &va_block->managed_range->policy;
|
||||
|
||||
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
|
||||
|
||||
@ -604,29 +614,29 @@ static NV_STATUS accessed_by_set(uvm_va_space_t *va_space,
|
||||
goto done;
|
||||
|
||||
if (type == UVM_API_RANGE_TYPE_MANAGED) {
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_t *va_range_last = NULL;
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
uvm_va_range_managed_t *managed_range_last = NULL;
|
||||
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
va_range_last = va_range;
|
||||
uvm_for_each_va_range_managed_in_contig(managed_range, va_space, base, last_address) {
|
||||
managed_range_last = managed_range;
|
||||
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by,
|
||||
if (managed_range->va_range.node.start < base || managed_range->va_range.node.end > last_address)
|
||||
UVM_ASSERT(uvm_processor_mask_test(&managed_range->policy.accessed_by,
|
||||
processor_id) == set_bit);
|
||||
|
||||
if (set_bit) {
|
||||
status = uvm_va_range_set_accessed_by(va_range, processor_id, mm, &local_tracker);
|
||||
status = uvm_va_range_set_accessed_by(managed_range, processor_id, mm, &local_tracker);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
else {
|
||||
uvm_va_range_unset_accessed_by(va_range, processor_id, &local_tracker);
|
||||
uvm_va_range_unset_accessed_by(managed_range, processor_id, &local_tracker);
|
||||
}
|
||||
}
|
||||
|
||||
UVM_ASSERT(va_range_last);
|
||||
UVM_ASSERT(va_range_last->node.end >= last_address);
|
||||
UVM_ASSERT(managed_range_last);
|
||||
UVM_ASSERT(managed_range_last->va_range.node.end >= last_address);
|
||||
}
|
||||
else {
|
||||
// NULL mm case already filtered by uvm_api_range_type_check()
|
||||
@ -672,7 +682,7 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
|
||||
// Force CPU page residency to be on the preferred NUMA node.
|
||||
va_block_context->make_resident.dest_nid = uvm_va_range_get_policy(va_block->va_range)->preferred_nid;
|
||||
va_block_context->make_resident.dest_nid = va_block->managed_range->policy.preferred_nid;
|
||||
|
||||
for_each_id_in_mask(src_id, &va_block->resident) {
|
||||
NV_STATUS status;
|
||||
@ -721,7 +731,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
|
||||
uvm_processor_id_t processor_id;
|
||||
uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
|
||||
uvm_page_mask_t *break_read_duplication_pages = &va_block_context->caller_page_mask;
|
||||
const uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
|
||||
const uvm_va_policy_t *policy = &va_block->managed_range->policy;
|
||||
uvm_processor_id_t preferred_location = policy->preferred_location;
|
||||
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
@ -863,15 +873,15 @@ static NV_STATUS read_duplication_set(uvm_va_space_t *va_space, NvU64 base, NvU6
|
||||
goto done;
|
||||
|
||||
if (type == UVM_API_RANGE_TYPE_MANAGED) {
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_t *va_range_last = NULL;
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
uvm_va_range_managed_t *managed_range_last = NULL;
|
||||
|
||||
uvm_for_each_managed_va_range_in_contig(va_range, va_space, base, last_address) {
|
||||
va_range_last = va_range;
|
||||
uvm_for_each_va_range_managed_in_contig(managed_range, va_space, base, last_address) {
|
||||
managed_range_last = managed_range;
|
||||
|
||||
// If we didn't split the ends, check that they match
|
||||
if (va_range->node.start < base || va_range->node.end > last_address)
|
||||
UVM_ASSERT(uvm_va_range_get_policy(va_range)->read_duplication == new_policy);
|
||||
if (managed_range->va_range.node.start < base || managed_range->va_range.node.end > last_address)
|
||||
UVM_ASSERT(managed_range->policy.read_duplication == new_policy);
|
||||
|
||||
// If the va_space cannot currently read duplicate, only change the user
|
||||
// state. All memory should already have read duplication unset.
|
||||
@ -879,22 +889,22 @@ static NV_STATUS read_duplication_set(uvm_va_space_t *va_space, NvU64 base, NvU6
|
||||
|
||||
// Handle SetAccessedBy mappings
|
||||
if (new_policy == UVM_READ_DUPLICATION_ENABLED) {
|
||||
status = uvm_va_range_set_read_duplication(va_range, mm);
|
||||
status = uvm_va_range_set_read_duplication(managed_range, mm);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
}
|
||||
else {
|
||||
// If unsetting read duplication fails, the return status is
|
||||
// not propagated back to the caller
|
||||
(void)uvm_va_range_unset_read_duplication(va_range, mm);
|
||||
(void)uvm_va_range_unset_read_duplication(managed_range, mm);
|
||||
}
|
||||
}
|
||||
|
||||
uvm_va_range_get_policy(va_range)->read_duplication = new_policy;
|
||||
managed_range->policy.read_duplication = new_policy;
|
||||
}
|
||||
|
||||
UVM_ASSERT(va_range_last);
|
||||
UVM_ASSERT(va_range_last->node.end >= last_address);
|
||||
UVM_ASSERT(managed_range_last);
|
||||
UVM_ASSERT(managed_range_last->va_range.node.end >= last_address);
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(type == UVM_API_RANGE_TYPE_HMM);
|
||||
@ -947,19 +957,16 @@ static NV_STATUS system_wide_atomics_set(uvm_va_space_t *va_space, const NvProce
|
||||
|
||||
already_enabled = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, gpu->id);
|
||||
if (enable && !already_enabled) {
|
||||
uvm_va_range_t *va_range;
|
||||
uvm_va_range_managed_t *managed_range;
|
||||
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
||||
uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, NULL);
|
||||
NV_STATUS tracker_status;
|
||||
|
||||
// Revoke atomic mappings from the calling GPU
|
||||
uvm_for_each_va_range(va_range, va_space) {
|
||||
uvm_for_each_va_range_managed(managed_range, va_space) {
|
||||
uvm_va_block_t *va_block;
|
||||
|
||||
if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
|
||||
continue;
|
||||
|
||||
for_each_va_block_in_va_range(va_range, va_block) {
|
||||
for_each_va_block_in_va_range(managed_range, va_block) {
|
||||
uvm_page_mask_t *non_resident_pages = &va_block_context->caller_page_mask;
|
||||
|
||||
uvm_mutex_lock(&va_block->lock);
|
||||
|
@ -23,6 +23,8 @@
|
||||
|
||||
#include "uvm_linux.h"
|
||||
#include "uvm_processors.h"
|
||||
#include "uvm_global.h"
|
||||
#include "uvm_gpu.h"
|
||||
|
||||
static struct kmem_cache *g_uvm_processor_mask_cache __read_mostly;
|
||||
const uvm_processor_mask_t g_uvm_processor_mask_cpu = { .bitmap = { 1 << UVM_PARENT_ID_CPU_VALUE }};
|
||||
@ -107,3 +109,31 @@ bool uvm_numa_id_eq(int nid0, int nid1)
|
||||
|
||||
return nid0 == nid1;
|
||||
}
|
||||
|
||||
const char *uvm_processor_get_name(uvm_processor_id_t id)
|
||||
{
|
||||
if (UVM_ID_IS_CPU(id))
|
||||
return "0: CPU";
|
||||
else
|
||||
return uvm_gpu_name(uvm_gpu_get(id));
|
||||
}
|
||||
|
||||
void uvm_processor_get_uuid(uvm_processor_id_t id, NvProcessorUuid *out_uuid)
|
||||
{
|
||||
if (UVM_ID_IS_CPU(id)) {
|
||||
memcpy(out_uuid, &NV_PROCESSOR_UUID_CPU_DEFAULT, sizeof(*out_uuid));
|
||||
}
|
||||
else {
|
||||
uvm_gpu_t *gpu = uvm_gpu_get(id);
|
||||
UVM_ASSERT(gpu);
|
||||
memcpy(out_uuid, &gpu->uuid, sizeof(*out_uuid));
|
||||
}
|
||||
}
|
||||
|
||||
bool uvm_processor_has_memory(uvm_processor_id_t id)
|
||||
{
|
||||
if (UVM_ID_IS_CPU(id))
|
||||
return true;
|
||||
|
||||
return uvm_gpu_get(id)->mem_info.size > 0;
|
||||
}
|
||||
|
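A small usage sketch for the three helpers added above. UVM_DBG_PRINT is assumed to be the driver's usual debug print macro, and, per the comments added to the header further down, the GPU behind a non-CPU id must be retained (for example via the va_space lock) before calling these:

// Hypothetical caller, for illustration only.
static void log_processor_sketch(uvm_processor_id_t id)
{
    NvProcessorUuid uuid;

    uvm_processor_get_uuid(id, &uuid);   // uuid available for richer logging
    (void)uuid;

    UVM_DBG_PRINT("processor %s, has_memory=%d\n",
                  uvm_processor_get_name(id),
                  uvm_processor_has_memory(id));
}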
@ -176,6 +176,13 @@ static void prefix_fn_mask##_copy(mask_t *dst, const mask_t *src)
|
||||
bitmap_copy(dst->bitmap, src->bitmap, (maxval)); \
|
||||
} \
|
||||
\
|
||||
static void prefix_fn_mask##_range_fill(mask_t *mask, proc_id_t start, NvU32 nbits) \
|
||||
{ \
|
||||
UVM_ASSERT_MSG(start.val + nbits <= (maxval), "start %u nbits %u\n", start.val, nbits); \
|
||||
\
|
||||
bitmap_set(mask->bitmap, start.val, nbits); \
|
||||
} \
|
||||
\
|
||||
static bool prefix_fn_mask##_and(mask_t *dst, const mask_t *src1, const mask_t *src2) \
|
||||
{ \
|
||||
return bitmap_and(dst->bitmap, src1->bitmap, src2->bitmap, (maxval)) != 0; \
|
||||
@ -276,6 +283,12 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
|
||||
// Maximum number of GPUs/processors that can be represented with the id types
|
||||
#define UVM_PARENT_ID_MAX_GPUS NV_MAX_DEVICES
|
||||
#define UVM_PARENT_ID_MAX_PROCESSORS (UVM_PARENT_ID_MAX_GPUS + 1)
|
||||
#define UVM_MAX_UNIQUE_PARENT_GPU_PAIRS SUM_FROM_0_TO_N(UVM_PARENT_ID_MAX_GPUS - 1)
|
||||
|
||||
// Note that this is the number of MIG instance pairs between two different
|
||||
// parent GPUs so parent A sub-processor ID 0 to parent B sub-processor ID 0
|
||||
// is valid.
|
||||
#define UVM_MAX_UNIQUE_SUB_PROCESSOR_PAIRS SUM_FROM_0_TO_N(UVM_PARENT_ID_MAX_SUB_PROCESSORS)
|
||||
|
||||
#define UVM_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
|
||||
#define UVM_ID_MAX_PROCESSORS (UVM_ID_MAX_GPUS + 1)
|
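A quick sanity check of the counting macros above, using the usual closed form SUM_FROM_0_TO_N(n) = n*(n+1)/2; the macro body here is an assumption, only the arithmetic is being illustrated. For example, with 8 parent GPUs there are SUM_FROM_0_TO_N(7) = 28 unique unordered parent GPU pairs:

#define SUM_FROM_0_TO_N_SKETCH(n) ((n) * ((n) + 1) / 2)

_Static_assert(SUM_FROM_0_TO_N_SKETCH(8 - 1) == 28,
               "8 parent GPUs -> 28 unique unordered parent GPU pairs");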
||||
@ -711,4 +724,16 @@ void uvm_processor_mask_cache_exit(void);
|
||||
uvm_processor_mask_t *uvm_processor_mask_cache_alloc(void);
|
||||
void uvm_processor_mask_cache_free(uvm_processor_mask_t *mask);
|
||||
|
||||
// Return the name of the given processor ID.
|
||||
// Locking: This should only be called when the ID is the CPU or the GPU is
|
||||
// retained (such as the va_space lock being held).
|
||||
const char *uvm_processor_get_name(uvm_processor_id_t id);
|
||||
|
||||
// Return the UUID in 'out_uuid' for the given processor ID 'id'.
|
||||
// Locking: This should only be called when the ID is the CPU or the GPU is
|
||||
// retained (such as the va_space lock being held).
|
||||
void uvm_processor_get_uuid(uvm_processor_id_t id, NvProcessorUuid *out_uuid);
|
||||
|
||||
bool uvm_processor_has_memory(uvm_processor_id_t id);
|
||||
|
||||
#endif
|