Mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git, synced 2025-01-31 21:52:11 +01:00

Commit 3084c04453 (parent caa2dd11a0): 555.42.02
(cherry picked from commit 5a1c474040e1c3ed20760267510cc9d9332898f1)
@@ -1,11 +1,11 @@
# Changelog

## Release 555 Entries

### [555.42.02] 2024-05-21

## Release 550 Entries

### [550.100] 2024-07-09

### [550.90.07] 2024-06-04

### [550.78] 2024-04-25

### [550.76] 2024-04-17
README.md
@@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source

This is the source release of the NVIDIA Linux open GPU kernel modules,
version 550.100.
version 555.42.02.

## How to Build
@@ -17,7 +17,7 @@ as root:

Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
550.100 driver release. This can be achieved by installing
555.42.02 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,

@@ -74,7 +74,7 @@ kernel.

The NVIDIA open kernel modules support the same range of Linux kernel
versions that are supported with the proprietary NVIDIA kernel modules.
This is currently Linux kernel 3.10 or newer.
This is currently Linux kernel 4.15 or newer.

## How to Contribute
@@ -188,7 +188,7 @@ encountered specific to them.
For details on feature support and limitations, see the NVIDIA GPU driver
end user README here:

https://us.download.nvidia.com/XFree86/Linux-x86_64/550.100/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/555.42.02/README/kernel_open.html

For vGPU support, please refer to the README.vgpu packaged in the vGPU Host
Package for more details.
@@ -757,8 +757,6 @@ Subsystem Device ID.
| NVIDIA H100 80GB HBM3 | 2330 10DE 16C0 |
| NVIDIA H100 80GB HBM3 | 2330 10DE 16C1 |
| NVIDIA H100 PCIe | 2331 10DE 1626 |
| NVIDIA H200 | 2335 10DE 18BE |
| NVIDIA H200 | 2335 10DE 18BF |
| NVIDIA H100 | 2339 10DE 17FC |
| NVIDIA H800 NVL | 233A 10DE 183A |
| NVIDIA GH200 120GB | 2342 10DE 16EB |
@@ -858,6 +856,7 @@ Subsystem Device ID.
| NVIDIA RTX A500 Embedded GPU | 25FB |
| NVIDIA GeForce RTX 4090 | 2684 |
| NVIDIA GeForce RTX 4090 D | 2685 |
| NVIDIA GeForce RTX 4070 Ti SUPER | 2689 |
| NVIDIA RTX 6000 Ada Generation | 26B1 1028 16A1 |
| NVIDIA RTX 6000 Ada Generation | 26B1 103C 16A1 |
| NVIDIA RTX 6000 Ada Generation | 26B1 10DE 16A1 |
@@ -875,7 +874,6 @@ Subsystem Device ID.
| NVIDIA L40S | 26B9 10DE 1851 |
| NVIDIA L40S | 26B9 10DE 18CF |
| NVIDIA L20 | 26BA 10DE 1957 |
| NVIDIA L20 | 26BA 10DE 1990 |
| NVIDIA GeForce RTX 4080 SUPER | 2702 |
| NVIDIA GeForce RTX 4080 | 2704 |
| NVIDIA GeForce RTX 4070 Ti SUPER | 2705 |
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.100\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"555.42.02\"

ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@@ -118,7 +118,7 @@ ifeq ($(ARCH),x86_64)
endif

ifeq ($(ARCH),powerpc)
EXTRA_CFLAGS += -mlittle-endian -mno-strict-align -mno-altivec
EXTRA_CFLAGS += -mlittle-endian -mno-strict-align
endif

EXTRA_CFLAGS += -DNV_UVM_ENABLE
@@ -172,6 +172,7 @@ NV_CFLAGS_FROM_CONFTEST := $(shell $(NV_CONFTEST_CMD) build_cflags)
NV_CONFTEST_CFLAGS = $(NV_CFLAGS_FROM_CONFTEST) $(EXTRA_CFLAGS) -fno-pie
NV_CONFTEST_CFLAGS += $(call cc-disable-warning,pointer-sign)
NV_CONFTEST_CFLAGS += $(call cc-option,-fshort-wchar,)
NV_CONFTEST_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types,)
NV_CONFTEST_CFLAGS += -Wno-error

NV_CONFTEST_COMPILE_TEST_HEADERS := $(obj)/conftest/macros.h
@@ -28,7 +28,7 @@ else
else
KERNEL_UNAME ?= $(shell uname -r)
KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME)
KERNEL_SOURCES := $(shell test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source || echo $(KERNEL_MODLIB)/build)
KERNEL_SOURCES := $(shell ((test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source) || (test -d $(KERNEL_MODLIB)/build/source && echo $(KERNEL_MODLIB)/build/source)) || echo $(KERNEL_MODLIB)/build)
endif

KERNEL_OUTPUT := $(KERNEL_SOURCES)
@@ -42,7 +42,11 @@ else
else
KERNEL_UNAME ?= $(shell uname -r)
KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME)
ifeq ($(KERNEL_SOURCES), $(KERNEL_MODLIB)/source)
# $(filter pattern...,text) - Returns all whitespace-separated words in text that
# do match any of the pattern words, removing any words that do not match.
# Set the KERNEL_OUTPUT only if either $(KERNEL_MODLIB)/source or
# $(KERNEL_MODLIB)/build/source path matches the KERNEL_SOURCES.
ifneq ($(filter $(KERNEL_SOURCES),$(KERNEL_MODLIB)/source $(KERNEL_MODLIB)/build/source),)
KERNEL_OUTPUT := $(KERNEL_MODLIB)/build
KBUILD_PARAMS := KBUILD_OUTPUT=$(KERNEL_OUTPUT)
endif
@@ -58,14 +58,10 @@
#include <linux/version.h>
#include <linux/utsname.h>

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
#error "This driver does not support kernels older than 2.6.32!"
#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 7, 0)
# define KERNEL_2_6
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
# define KERNEL_3
#else
#error "This driver does not support development kernels!"
#if LINUX_VERSION_CODE == KERNEL_VERSION(4, 4, 0)
// Version 4.4 is allowed, temporarily, although not officially supported.
#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)
#error "This driver does not support kernels older than Linux 4.15!"
#endif

#if defined (CONFIG_SMP) && !defined (__SMP__)
@@ -836,16 +832,16 @@ static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
#define NV_PRINT_AT(nv_debug_level,at) \
{ \
nv_printf(nv_debug_level, \
"NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, flags = 0x%08x, " \
"NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, " \
"page_table = 0x%p\n", __FUNCTION__, __LINE__, at, \
at->num_pages, NV_ATOMIC_READ(at->usage_count), \
at->flags, at->page_table); \
at->page_table); \
}

#define NV_PRINT_VMA(nv_debug_level,vma) \
{ \
nv_printf(nv_debug_level, \
"NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08x bytes @ 0x%016llx, 0x%p, 0x%p\n", \
"NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08lx bytes @ 0x%016llx, 0x%p, 0x%p\n", \
__FUNCTION__, __LINE__, vma->vm_start, vma->vm_end, NV_VMA_SIZE(vma), \
NV_VMA_OFFSET(vma), NV_VMA_PRIVATE(vma), NV_VMA_FILE(vma)); \
}
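One of the NV_PRINT_VMA changes above swaps %08x for %08lx because NV_VMA_SIZE() expands to an unsigned long (vm_end - vm_start are unsigned long). A standalone illustration of the format-specifier mismatch the fix avoids; this is generic C, not driver code:

```c
#include <stdio.h>

int main(void)
{
    unsigned long vma_size = 0x123456UL;

    /* Correct: %lx matches unsigned long. */
    printf("size = 0x%08lx\n", vma_size);

    /* A plain %x here would expect unsigned int, which is undefined
     * behaviour on LP64 targets where unsigned long is 64-bit; that is
     * exactly the mismatch the format-string fix above removes. */
    return 0;
}
```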
@@ -1078,6 +1074,8 @@ static inline void nv_kmem_ctor_dummy(void *arg)
kmem_cache_destroy(kmem_cache); \
}

#define NV_KMEM_CACHE_ALLOC_ATOMIC(kmem_cache) \
kmem_cache_alloc(kmem_cache, GFP_ATOMIC)
#define NV_KMEM_CACHE_ALLOC(kmem_cache) \
kmem_cache_alloc(kmem_cache, GFP_KERNEL)
#define NV_KMEM_CACHE_FREE(ptr, kmem_cache) \
@@ -1104,6 +1102,23 @@ static inline void *nv_kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
#endif
}

static inline int nv_kmem_cache_alloc_stack_atomic(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
sp = NV_KMEM_CACHE_ALLOC_ATOMIC(nvidia_stack_t_cache);
if (sp == NULL)
return -ENOMEM;
sp->size = sizeof(sp->stack);
sp->top = sp->stack + sp->size;
}
#endif
*stack = sp;
return 0;
}

static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
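The nv_kmem_cache_alloc_stack_atomic() helper added above parallels nv_kmem_cache_alloc_stack() but allocates with GFP_ATOMIC and only hands out an alternate stack when rm_is_altstack_in_use() says one is needed. A hypothetical caller sketch, assuming only the nvidia_stack_t_cache and NV_KMEM_CACHE_FREE definitions shown in this header; the real call sites live elsewhere in the driver:

```c
/*
 * Hypothetical caller: obtain an altstack in atomic context, use it for an
 * RM call, then return it to the cache. Error handling and the RM call
 * itself are illustrative only.
 */
static inline int example_atomic_rm_call(void)
{
    nvidia_stack_t *sp = NULL;
    int ret = nv_kmem_cache_alloc_stack_atomic(&sp);

    if (ret != 0)
        return ret;            /* -ENOMEM when the cache allocation fails */

    /* ... issue the RM call that needs an alternate stack here ... */

    if (sp != NULL)            /* sp stays NULL when no altstack is in use */
        NV_KMEM_CACHE_FREE(sp, nvidia_stack_t_cache);

    return 0;
}
```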
@@ -29,17 +29,17 @@
typedef int vm_fault_t;
#endif

/* pin_user_pages
/*
* pin_user_pages()
*
* Presence of pin_user_pages() also implies the presence of unpin_user_page().
* Both were added in the v5.6-rc1
* Both were added in the v5.6.
*
* pin_user_pages() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6-rc1 (2020-01-30)
*
* Removed vmas parameter from pin_user_pages() by commit 40896a02751
* ("mm/gup: remove vmas parameter from pin_user_pages()")
* in linux-next, expected in v6.5-rc1 (2023-05-17)
* pin_user_pages() was added by commit eddb1c228f79
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6.
*
* Removed vmas parameter from pin_user_pages() by commit 4c630f307455
* ("mm/gup: remove vmas parameter from pin_user_pages()") in v6.5.
*/

#include <linux/mm.h>
@@ -63,25 +63,28 @@ typedef int vm_fault_t;
#define NV_UNPIN_USER_PAGE put_page
#endif // NV_PIN_USER_PAGES_PRESENT

/* get_user_pages
/*
* get_user_pages()
*
* The 8-argument version of get_user_pages was deprecated by commit
* (2016 Feb 12: cde70140fed8429acf7a14e2e2cbd3e329036653) for the non-remote case
* The 8-argument version of get_user_pages() was deprecated by commit
* cde70140fed8 ("mm/gup: Overload get_user_pages() functions") in v4.6-rc1.
* (calling get_user_pages with current and current->mm).
*
* Completely moved to the 6 argument version of get_user_pages -
* 2016 Apr 4: c12d2da56d0e07d230968ee2305aaa86b93a6832
* Completely moved to the 6 argument version of get_user_pages() by
* commit c12d2da56d0e ("mm/gup: Remove the macro overload API migration
* helpers from the get_user*() APIs") in v4.6-rc4.
*
* write and force parameters were replaced with gup_flags by -
* 2016 Oct 12: 768ae309a96103ed02eb1e111e838c87854d8b51
* write and force parameters were replaced with gup_flags by
* commit 768ae309a961 ("mm: replace get_user_pages() write/force parameters
* with gup_flags") in v4.9.
*
* A 7-argument version of get_user_pages was introduced into linux-4.4.y by
* commit 8e50b8b07f462ab4b91bc1491b1c91bd75e4ad40 which cherry-picked the
* replacement of the write and force parameters with gup_flags
* commit 8e50b8b07f462 ("mm: replace get_user_pages() write/force parameters
* with gup_flags") which cherry-picked the replacement of the write and
* force parameters with gup_flags.
*
* Removed vmas parameter from get_user_pages() by commit 7bbf9c8c99
* ("mm/gup: remove unused vmas parameter from get_user_pages()")
* in linux-next, expected in v6.5-rc1 (2023-05-17)
* Removed vmas parameter from get_user_pages() by commit 54d020692b34
* ("mm/gup: remove unused vmas parameter from get_user_pages()") in v6.5.
*
*/

@@ -112,18 +115,19 @@ typedef int vm_fault_t;
}
#endif // NV_GET_USER_PAGES_HAS_ARGS_FLAGS

/* pin_user_pages_remote
/*
* pin_user_pages_remote()
*
* pin_user_pages_remote() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6 (2020-01-30)
* pin_user_pages_remote() was added by commit eddb1c228f79
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6.
*
* pin_user_pages_remote() removed 'tsk' parameter by commit
* 64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
* in v5.9-rc1 (2020-08-11). *
* in v5.9.
*
* Removed unused vmas parameter from pin_user_pages_remote() by commit
* 83bcc2e132 ("mm/gup: remove unused vmas parameter from pin_user_pages_remote()")
* in linux-next, expected in v6.5-rc1 (2023-05-14)
* 0b295316b3a9 ("mm/gup: remove unused vmas parameter from
* pin_user_pages_remote()") in v6.5.
*
*/

@@ -143,7 +147,7 @@ typedef int vm_fault_t;

/*
* get_user_pages_remote() was added by commit 1e9877902dc7
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6 (2016-02-12).
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6.
*
* Note that get_user_pages_remote() requires the caller to hold a reference on
* the task_struct (if non-NULL and if this API has tsk argument) and the mm_struct.
@@ -153,19 +157,17 @@ typedef int vm_fault_t;
*
* get_user_pages_remote() write/force parameters were replaced
* with gup_flags by commit 9beae1ea8930 ("mm: replace get_user_pages_remote()
* write/force parameters with gup_flags") in v4.9 (2016-10-13).
* write/force parameters with gup_flags") in v4.9.
*
* get_user_pages_remote() added 'locked' parameter by commit 5b56d49fc31d
* ("mm: add locked parameter to get_user_pages_remote()") in
* v4.10 (2016-12-14).
* ("mm: add locked parameter to get_user_pages_remote()") in v4.10.
*
* get_user_pages_remote() removed 'tsk' parameter by
* commit 64019a2e467a ("mm/gup: remove task_struct pointer for
* all gup code") in v5.9-rc1 (2020-08-11).
* all gup code") in v5.9.
*
* Removed vmas parameter from get_user_pages_remote() by commit a4bde14d549
* ("mm/gup: remove vmas parameter from get_user_pages_remote()")
* in linux-next, expected in v6.5-rc1 (2023-05-14)
* Removed vmas parameter from get_user_pages_remote() by commit ca5e863233e8
* ("mm/gup: remove vmas parameter from get_user_pages_remote()") in v6.5.
*
*/
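The comment blocks above track how the pin_user_pages()/get_user_pages() family changed signature across kernel releases, which is what the conftest-selected NV_PIN_USER_PAGES_PRESENT / NV_GET_USER_PAGES_HAS_ARGS_FLAGS macros key off. A minimal sketch of the v6.5+ calling convention those comments describe (no 'vmas' parameter, release with unpin_user_page()); it is illustrative and not the driver's actual wrapper:

```c
#include <linux/mm.h>

/*
 * Pin the pages backing a user buffer and release them again, using the
 * v6.5+ signatures documented in the comments above. Error handling is
 * abbreviated; a real caller would also validate the range.
 */
static inline long example_pin_user_buffer(unsigned long uaddr,
                                           unsigned long nr_pages,
                                           struct page **pages)
{
    long pinned;

    mmap_read_lock(current->mm);
    pinned = pin_user_pages(uaddr, nr_pages, FOLL_WRITE, pages);
    mmap_read_unlock(current->mm);

    return pinned; /* number of pages pinned, or a negative errno */
}

static inline void example_unpin_user_buffer(struct page **pages, long pinned)
{
    long i;

    for (i = 0; i < pinned; i++)
        unpin_user_page(pages[i]);
}
```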
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -609,6 +609,15 @@ typedef enum
NV_POWER_STATE_RUNNING
} nv_power_state_t;

typedef struct
{
const char *vidmem_power_status;
const char *dynamic_power_status;
const char *gc6_support;
const char *gcoff_support;
const char *s0ix_status;
} nv_power_info_t;

#define NV_PRIMARY_VGA(nv) ((nv)->primary_vga)

#define NV_IS_CTL_DEVICE(nv) ((nv)->flags & NV_FLAG_CONTROL)
@@ -778,7 +787,7 @@ nv_state_t* NV_API_CALL nv_get_ctl_state (void);

void NV_API_CALL nv_set_dma_address_size (nv_state_t *, NvU32 );

NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU64, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvU64, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_free_pages (nv_state_t *, NvU32, NvBool, NvU32, void *);

@@ -822,6 +831,7 @@ void NV_API_CALL nv_acpi_methods_init (NvU32 *);
void NV_API_CALL nv_acpi_methods_uninit (void);

NV_STATUS NV_API_CALL nv_acpi_method (NvU32, NvU32, NvU32, void *, NvU16, NvU32 *, void *, NvU16 *);
NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port (nv_state_t *, NvU8 *, NvU32, NvU32, NvU32 *);
NV_STATUS NV_API_CALL nv_acpi_dsm_method (nv_state_t *, NvU8 *, NvU32, NvBool, NvU32, void *, NvU16, NvU32 *, void *, NvU16 *);
NV_STATUS NV_API_CALL nv_acpi_ddc_method (nv_state_t *, void *, NvU32 *, NvBool);
NV_STATUS NV_API_CALL nv_acpi_dod_method (nv_state_t *, NvU32 *, NvU32 *);
@@ -990,10 +1000,10 @@ NV_STATUS NV_API_CALL rm_p2p_init_mapping (nvidia_stack_t *, NvU64, NvU6
NV_STATUS NV_API_CALL rm_p2p_destroy_mapping (nvidia_stack_t *, NvU64);
NV_STATUS NV_API_CALL rm_p2p_get_pages (nvidia_stack_t *, NvU64, NvU32, NvU64, NvU64, NvU64 *, NvU32 *, NvU32 *, NvU32 *, NvU8 **, void *);
NV_STATUS NV_API_CALL rm_p2p_get_gpu_info (nvidia_stack_t *, NvU64, NvU64, NvU8 **, void **);
NV_STATUS NV_API_CALL rm_p2p_get_pages_persistent (nvidia_stack_t *, NvU64, NvU64, void **, NvU64 *, NvU32 *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_get_pages_persistent (nvidia_stack_t *, NvU64, NvU64, void **, NvU64 *, NvU32 *, void *, void *, void **);
NV_STATUS NV_API_CALL rm_p2p_register_callback (nvidia_stack_t *, NvU64, NvU64, NvU64, void *, void (*)(void *), void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages (nvidia_stack_t *, NvU64, NvU32, NvU64, void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_dma_map_pages (nvidia_stack_t *, nv_dma_device_t *, NvU8 *, NvU64, NvU32, NvU64 *, void **);
NV_STATUS NV_API_CALL rm_dma_buf_dup_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle, NvHandle, void *, NvHandle, NvU64, NvU64, NvHandle *, void **);
void NV_API_CALL rm_dma_buf_undup_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle);
@@ -1027,9 +1037,7 @@ void NV_API_CALL rm_enable_dynamic_power_management(nvidia_stack_t *, nv_s
NV_STATUS NV_API_CALL rm_ref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
void NV_API_CALL rm_unref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
NV_STATUS NV_API_CALL rm_transition_dynamic_power(nvidia_stack_t *, nv_state_t *, NvBool, NvBool *);
const char* NV_API_CALL rm_get_vidmem_power_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_dynamic_power_management_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_gpu_gcx_support(nvidia_stack_t *, nv_state_t *, NvBool);
void NV_API_CALL rm_get_power_info(nvidia_stack_t *, nv_state_t *, nv_power_info_t *);

void NV_API_CALL rm_acpi_notify(nvidia_stack_t *, nv_state_t *, NvU32);
void NV_API_CALL rm_acpi_nvpcf_notify(nvidia_stack_t *);
@@ -1462,6 +1462,29 @@ NV_STATUS nvUvmInterfacePagingChannelPushStream(UvmGpuPagingChannelHandle channe
char *methodStream,
NvU32 methodStreamSize);

/*******************************************************************************
nvUvmInterfaceKeyRotationChannelDisable

This function notifies RM that the given channels are idle.

This function is called after RM has notified UVM that keys need to be rotated.
When called RM will disable the channels, rotate their keys, and then re-enable
the channels.

Locking: This function acquires an API and GPU lock.
Memory : This function dynamically allocates memory.

Arguments:
channelList[IN] - An array of channel handles whose channels are idle.
channelListCount[IN] - Number of channels in channelList. Its value must be
greater than 0.

Error codes:
NV_ERR_INVALID_ARGUMENT - channelList is NULL or channeListCount is 0.
*/
NV_STATUS nvUvmInterfaceKeyRotationChannelDisable(uvmGpuChannelHandle channelList[],
NvU32 channeListCount);

/*******************************************************************************
Cryptography Services Library (CSL) Interface
*/
@@ -1505,21 +1528,15 @@ NV_STATUS nvUvmInterfaceCslInitContext(UvmCslContext *uvmCslContext,
void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);

/*******************************************************************************
nvUvmInterfaceCslRotateKey
nvUvmInterfaceCslUpdateContext

Disables channels and rotates keys.
Updates contexts after a key rotation event and can only be called once per
key rotation event. Following a key rotation event, and before
nvUvmInterfaceCslUpdateContext is called, data encrypted by the GPU with the
previous key can be decrypted with nvUvmInterfaceCslDecrypt.

This function disables channels and rotates associated keys. The channels
associated with the given CSL contexts must be idled before this function is
called. To trigger key rotation all allocated channels for a given key must
be present in the list. If the function returns successfully then the CSL
contexts have been updated with the new key.

Locking: This function attempts to acquire the GPU lock. In case of failure
to acquire the return code is NV_ERR_STATE_IN_USE. The caller must
guarantee that no CSL function, including this one, is invoked
concurrently with the CSL contexts in contextList.
Memory : This function dynamically allocates memory.
Locking: This function acquires an API lock.
Memory : This function does not dynamically allocate memory.

Arguments:
contextList[IN/OUT] - An array of pointers to CSL contexts.
@@ -1527,12 +1544,8 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);
must be greater than 0.
Error codes:
NV_ERR_INVALID_ARGUMENT - contextList is NULL or contextListCount is 0.
NV_ERR_STATE_IN_USE - Unable to acquire lock / resource. Caller
can retry at a later time.
NV_ERR_GENERIC - A failure other than _STATE_IN_USE occurred
when attempting to acquire a lock.
*/
NV_STATUS nvUvmInterfaceCslRotateKey(UvmCslContext *contextList[],
NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *contextList[],
NvU32 contextListCount);

/*******************************************************************************
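The hunk above shows both the nvUvmInterfaceCslRotateKey and nvUvmInterfaceCslUpdateContext variants of this interface, which share the same argument list. A minimal caller sketch for the rotate-key form follows, assuming that variant of the header and the NV_ERR_STATE_IN_USE retry behaviour described in its comment; it is illustrative, not UVM's actual call site:

```c
/*
 * Hypothetical caller: ask CSL to rotate keys for the contexts of a set of
 * idle channels, retrying when the GPU lock could not be taken. Assumes
 * the nvUvmInterfaceCslRotateKey() variant of this header.
 */
static NV_STATUS example_rotate_keys(UvmCslContext *contextList[], NvU32 count)
{
    NV_STATUS status;

    do {
        status = nvUvmInterfaceCslRotateKey(contextList, count);
        /* NV_ERR_STATE_IN_USE means the lock was busy; per the comment
         * above, the caller may retry at a later time. */
    } while (status == NV_ERR_STATE_IN_USE);

    return status;
}
```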
@@ -1541,13 +1554,17 @@ NV_STATUS nvUvmInterfaceCslRotateKey(UvmCslContext *contextList[],
Rotates the IV for a given channel and operation.

This function will rotate the IV on both the CPU and the GPU.
For a given operation the channel must be idle before calling this function.
This function can be called regardless of the value of the IV's message counter.
Outstanding messages that have been encrypted by the GPU should first be
decrypted before calling this function with operation equal to
UVM_CSL_OPERATION_DECRYPT. Similarly, outstanding messages that have been
encrypted by the CPU should first be decrypted before calling this function
with operation equal to UVM_CSL_OPERATION_ENCRYPT. For a given operation
the channel must be idle before calling this function. This function can be
called regardless of the value of the IV's message counter.

Locking: This function attempts to acquire the GPU lock. In case of failure to
acquire the return code is NV_ERR_STATE_IN_USE. The caller must guarantee
that no CSL function, including this one, is invoked concurrently with
the same CSL context.
Locking: This function attempts to acquire the GPU lock.
In case of failure to acquire the return code
is NV_ERR_STATE_IN_USE.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1581,8 +1598,8 @@ NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
However, it is optional. If it is NULL, the next IV in line will be used.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1618,14 +1635,9 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
maximized when the input and output buffers are 16-byte aligned. This is
natural alignment for AES block.

During a key rotation event the previous key is stored in the CSL context.
This allows data encrypted by the GPU to be decrypted with the previous key.
The keyRotationId parameter identifies which key is used. The first key rotation
ID has a value of 0 that increments by one for each key rotation event.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1635,8 +1647,6 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
decryptIv[IN] - IV used to decrypt the ciphertext. Its value can either be given by
nvUvmInterfaceCslIncrementIv, or, if NULL, the CSL context's
internal counter is used.
keyRotationId[IN] - Specifies the key that is used for decryption.
A value of NV_U32_MAX specifies the current key.
inputBuffer[IN] - Address of ciphertext input buffer.
outputBuffer[OUT] - Address of plaintext output buffer.
addAuthData[IN] - Address of the plaintext additional authenticated data used to
@@ -1657,7 +1667,6 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
NvU32 bufferSize,
NvU8 const *inputBuffer,
UvmCslIv const *decryptIv,
NvU32 keyRotationId,
NvU8 *outputBuffer,
NvU8 const *addAuthData,
NvU32 addAuthDataSize,
@@ -1672,8 +1681,8 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
undefined behavior.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1701,8 +1710,8 @@ NV_STATUS nvUvmInterfaceCslSign(UvmCslContext *uvmCslContext,

Locking: This function does not acquire an API or GPU lock.
Memory : This function does not dynamically allocate memory.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.

Arguments:
uvmCslContext[IN/OUT] - The CSL context.
@@ -1727,8 +1736,8 @@ NV_STATUS nvUvmInterfaceCslQueryMessagePool(UvmCslContext *uvmCslContext,
the returned IV can be used in nvUvmInterfaceCslDecrypt.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1750,13 +1759,13 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
UvmCslIv *iv);

/*******************************************************************************
nvUvmInterfaceCslLogEncryption
nvUvmInterfaceCslLogExternalEncryption

Checks and logs information about encryptions associated with the given
CSL context.
Checks and logs information about non-CSL encryptions, such as those that
originate from the GPU.

For contexts associated with channels, this function does not modify elements of
the UvmCslContext, and must be called for every CPU/GPU encryption.
the UvmCslContext and must be called for each external encryption invocation.

For the context associated with fault buffers, bufferSize can encompass multiple
encryption invocations, and the UvmCslContext will be updated following a key
@@ -1766,25 +1775,19 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,

Locking: This function does not acquire an API or GPU lock.
Memory : This function does not dynamically allocate memory.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.

Arguments:
uvmCslContext[IN/OUT] - The CSL context.
operation[IN] - If the CSL context is associated with a fault
buffer, this argument is ignored. If it is
associated with a channel, it must be either
- UVM_CSL_OPERATION_ENCRYPT
- UVM_CSL_OPERATION_DECRYPT
bufferSize[IN] - The size of the buffer(s) encrypted by the
bufferSize[OUT] - The size of the buffer(s) encrypted by the
external entity in units of bytes.

Error codes:
NV_ERR_INSUFFICIENT_RESOURCES - The encryption would cause a counter
NV_ERR_INSUFFICIENT_RESOURCES - The device encryption would cause a counter
to overflow.
*/
NV_STATUS nvUvmInterfaceCslLogEncryption(UvmCslContext *uvmCslContext,
UvmCslOperation operation,
NV_STATUS nvUvmInterfaceCslLogExternalEncryption(UvmCslContext *uvmCslContext,
NvU32 bufferSize);

#endif // _NV_UVM_INTERFACE_H_
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -39,12 +39,12 @@
// are multiple BIG page sizes in RM. These defines are used as flags to "0"
// should be OK when user is not sure which pagesize allocation it wants
//
#define UVM_PAGE_SIZE_DEFAULT 0x0
#define UVM_PAGE_SIZE_4K 0x1000
#define UVM_PAGE_SIZE_64K 0x10000
#define UVM_PAGE_SIZE_128K 0x20000
#define UVM_PAGE_SIZE_2M 0x200000
#define UVM_PAGE_SIZE_512M 0x20000000
#define UVM_PAGE_SIZE_DEFAULT 0x0ULL
#define UVM_PAGE_SIZE_4K 0x1000ULL
#define UVM_PAGE_SIZE_64K 0x10000ULL
#define UVM_PAGE_SIZE_128K 0x20000ULL
#define UVM_PAGE_SIZE_2M 0x200000ULL
#define UVM_PAGE_SIZE_512M 0x20000000ULL

//
// When modifying flags, make sure they are compatible with the mirrored
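The page-size constants appear in this hunk both with and without ULL suffixes. The suffix matters once a constant is scaled or shifted before landing in an NvU64, because an unsuffixed hex literal of this size is a 32-bit int. A generic illustration, independent of the driver headers:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_512M_ULL 0x20000000ULL   /* 64-bit constant, like the ULL defines */

int main(void)
{
    /* 16 x 512M = 8 GiB: fine here because the constant is already 64-bit.
     * With a plain 0x20000000 (a 32-bit int) the multiplication would be
     * performed in 32-bit arithmetic and overflow before being widened,
     * which is exactly the hazard the ULL suffixes guard against. */
    uint64_t total = PAGE_SIZE_512M_ULL * 16;

    printf("total = 0x%llx bytes\n", (unsigned long long)total);
    return 0;
}
```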
@@ -605,8 +605,6 @@ typedef struct UvmGpuConfComputeCaps_tag
{
// Out: GPU's confidential compute mode
UvmGpuConfComputeMode mode;
// Is key rotation enabled for UVM keys
NvBool bKeyRotationEnabled;
} UvmGpuConfComputeCaps;

#define UVM_GPU_NAME_LENGTH 0x40
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -494,6 +494,23 @@ do \
//
#define NV_TWO_N_MINUS_ONE(n) (((1ULL<<(n/2))<<((n+1)/2))-1)

//
// Create a 64b bitmask with n bits set
// This is the same as ((1ULL<<n) - 1), but it doesn't overflow for n=64
//
// ...
// n=-1, 0x0000000000000000
// n=0, 0x0000000000000000
// n=1, 0x0000000000000001
// ...
// n=63, 0x7FFFFFFFFFFFFFFF
// n=64, 0xFFFFFFFFFFFFFFFF
// n=65, 0xFFFFFFFFFFFFFFFF
// n=66, 0xFFFFFFFFFFFFFFFF
// ...
//
#define NV_BITMASK64(n) ((n<1) ? 0ULL : (NV_U64_MAX>>((n>64) ? 0 : (64-n))))

#define DRF_READ_1WORD_BS(d,r,f,v) \
((DRF_EXTENT_MW(NV##d##r##f)<8)?DRF_READ_1BYTE_BS(NV##d##r##f,(v)): \
((DRF_EXTENT_MW(NV##d##r##f)<16)?DRF_READ_2BYTE_BS(NV##d##r##f,(v)): \
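The table in the NV_BITMASK64() comment can be checked directly. A small standalone program, with a stand-in for NV_U64_MAX so the macro compiles outside the tree:

```c
#include <stdio.h>

/* Stand-in for the driver's constant: the all-ones 64-bit value. */
#define NV_U64_MAX 0xFFFFFFFFFFFFFFFFULL
#define NV_BITMASK64(n) ((n<1) ? 0ULL : (NV_U64_MAX>>((n>64) ? 0 : (64-n))))

int main(void)
{
    /* Matches the table in the comment: clamped to 0 below n=1 and to
     * all-ones at and above n=64. */
    printf("n=0  -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(0));
    printf("n=1  -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(1));
    printf("n=63 -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(63));
    printf("n=64 -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(64));
    printf("n=65 -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(65));
    return 0;
}
```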
@@ -574,6 +591,13 @@ nvMaskPos32(const NvU32 mask, const NvU32 bitIdx)
n32 = BIT_IDX_32(LOWESTBIT(n32));\
}

// Destructive operation on n64
#define LOWESTBITIDX_64(n64) \
{ \
n64 = BIT_IDX_64(LOWESTBIT(n64));\
}

// Destructive operation on n32
#define HIGHESTBITIDX_32(n32) \
{ \
@@ -918,6 +942,11 @@ static NV_FORCEINLINE void *NV_NVUPTR_TO_PTR(NvUPtr address)
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_SET_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) |= NVBIT64(b); else (hi) |= NVBIT64( b & 0x3F ); }
//
// Clear the bit at pos (b) for U64 which is < 128.
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_CLEAR_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) &= ~NVBIT64(b); else (hi) &= ~NVBIT64( b & 0x3F ); }

// Get the number of elements the specified fixed-size array
#define NV_ARRAY_ELEMENTS(x) ((sizeof(x)/sizeof((x)[0])))
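NV_BIT_SET_128()/NV_BIT_CLEAR_128() treat a 128-bit value as a (lo, hi) pair of 64-bit words and pick the half from the bit position. A standalone sketch with stand-in nvAssert/NVBIT64 definitions (the real ones come from this header and may differ in detail):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins so the macros above can be exercised in isolation. */
#define nvAssert(x)   assert(x)
#define NVBIT64(b)    (1ULL << (b))
#define NV_BIT_SET_128(b, lo, hi)   { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) |= NVBIT64(b); else (hi) |= NVBIT64( b & 0x3F ); }
#define NV_BIT_CLEAR_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) &= ~NVBIT64(b); else (hi) &= ~NVBIT64( b & 0x3F ); }

int main(void)
{
    uint64_t lo = 0, hi = 0;

    NV_BIT_SET_128(5, lo, hi);     /* lands in lo: bit 5            */
    NV_BIT_SET_128(70, lo, hi);    /* lands in hi: bit 70 - 64 = 6  */
    printf("lo=0x%llx hi=0x%llx\n", (unsigned long long)lo, (unsigned long long)hi);

    NV_BIT_CLEAR_128(70, lo, hi);  /* clears bit 6 of hi again      */
    printf("lo=0x%llx hi=0x%llx\n", (unsigned long long)lo, (unsigned long long)hi);
    return 0;
}
```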
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -152,6 +152,7 @@ NV_STATUS_CODE(NV_ERR_FABRIC_MANAGER_NOT_PRESENT, 0x0000007A, "Fabric Manag
NV_STATUS_CODE(NV_ERR_ALREADY_SIGNALLED, 0x0000007B, "Semaphore Surface value already >= requested wait value")
NV_STATUS_CODE(NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE, 0x0000007C, "PMU RPC error due to no queue slot available for this event")
NV_STATUS_CODE(NV_ERR_KEY_ROTATION_IN_PROGRESS, 0x0000007D, "Operation not allowed as key rotation is in progress")
NV_STATUS_CODE(NV_ERR_TEST_ONLY_CODE_NOT_ENABLED, 0x0000007E, "Test-only code path not enabled")

// Warnings:
NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch")
@@ -152,6 +152,12 @@ typedef signed short NvS16; /* -32768 to 32767 */
(((NvU32)(c) & 0xff) << 8) | \
(((NvU32)(d) & 0xff))))

// Macro to build an NvU64 from two DWORDS, listed from msb to lsb
#define NvU64_BUILD(a, b) \
((NvU64)( \
(((NvU64)(a) & ~0U) << 32) | \
(((NvU64)(b) & ~0U))))

#if NVTYPES_USE_STDINT
typedef uint32_t NvV32; /* "void": enumerated or multiple fields */
typedef uint32_t NvU32; /* 0 to 4294967295 */
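NvU64_BUILD() packs two 32-bit words, most significant first. A quick standalone check, assuming NvU64/NvU32 are ordinary 64/32-bit unsigned integers as the surrounding typedefs suggest:

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the nvtypes typedefs so the macro can be tested standalone. */
typedef uint64_t NvU64;
typedef uint32_t NvU32;

#define NvU64_BUILD(a, b) \
    ((NvU64)( \
    (((NvU64)(a) & ~0U) << 32) | \
    (((NvU64)(b) & ~0U))))

int main(void)
{
    NvU32 hi = 0xDEADBEEFu, lo = 0x01234567u;
    NvU64 v  = NvU64_BUILD(hi, lo);

    printf("0x%016llx\n", (unsigned long long)v);  /* 0xdeadbeef01234567 */
    return 0;
}
```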
@@ -101,16 +101,17 @@ NV_STATUS NV_API_CALL rm_gpu_ops_paging_channels_map(nvidia_stack_t *, nvgpuAdd
void NV_API_CALL rm_gpu_ops_paging_channels_unmap(nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvU64, nvgpuDeviceHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_paging_channel_push_stream(nvidia_stack_t *, nvgpuPagingChannelHandle_t, char *, NvU32);

NV_STATUS NV_API_CALL rm_gpu_ops_key_rotation_channel_disable(nvidia_stack_t *, nvgpuChannelHandle_t [], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_init(nvidia_stack_t *, struct ccslContext_t **, nvgpuChannelHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_clear(nvidia_stack_t *, struct ccslContext_t *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_rotate_key(nvidia_stack_t *, UvmCslContext *[], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_update(nvidia_stack_t *, UvmCslContext *[], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_rotate_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_encrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_encrypt_with_iv(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8*, NvU8 *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU32, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_sign(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_query_message_pool(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_increment_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_log_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_log_device_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU32);

#endif
@@ -5252,23 +5252,25 @@ compile_test() {
compile_check_conftest "$CODE" "NV_PCI_CLASS_MULTIMEDIA_HD_AUDIO_PRESENT" "" "generic"
;;

follow_pfn)
unsafe_follow_pfn)
#
# Determine if follow_pfn() is present.
# Determine if unsafe_follow_pfn() is present.
#
# follow_pfn() was added by commit 3b6748e2dd69
# ("mm: introduce follow_pfn()") in v2.6.31-rc1, and removed
# by commit 233eb0bf3b94 ("mm: remove follow_pfn")
# from linux-next 233eb0bf3b94.
# unsafe_follow_pfn() was added by commit 69bacee7f9ad
# ("mm: Add unsafe_follow_pfn") in v5.13-rc1.
#
# Note: this commit never made it to the linux kernel, so
# unsafe_follow_pfn() never existed.
#
CODE="
#include <linux/mm.h>
void conftest_follow_pfn(void) {
follow_pfn();
void conftest_unsafe_follow_pfn(void) {
unsafe_follow_pfn();
}"

compile_check_conftest "$CODE" "NV_FOLLOW_PFN_PRESENT" "" "functions"
compile_check_conftest "$CODE" "NV_UNSAFE_FOLLOW_PFN_PRESENT" "" "functions"
;;

drm_plane_atomic_check_has_atomic_state_arg)
#
# Determine if drm_plane_helper_funcs::atomic_check takes 'state'
@@ -5554,7 +5556,8 @@ compile_test() {

of_dma_configure)
#
# Determine if of_dma_configure() function is present
# Determine if of_dma_configure() function is present, and how
# many arguments it takes.
#
# Added by commit 591c1ee465ce ("of: configure the platform
# device dma parameters") in v3.16. However, it was a static,
@@ -5564,17 +5567,69 @@ compile_test() {
# commit 1f5c69aa51f9 ("of: Move of_dma_configure() to device.c
# to help re-use") in v4.1.
#
CODE="
# It subsequently began taking a third parameter with commit
# 3d6ce86ee794 ("drivers: remove force dma flag from buses")
# in v4.18.
#

echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif

void conftest_of_dma_configure(void)
{
of_dma_configure();
}
"
" > conftest$$.c

compile_check_conftest "$CODE" "NV_OF_DMA_CONFIGURE_PRESENT" "" "functions"
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c

if [ -f conftest$$.o ]; then
rm -f conftest$$.o

echo "#undef NV_OF_DMA_CONFIGURE_PRESENT" | append_conftest "functions"
echo "#undef NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT" | append_conftest "functions"
else
echo "#define NV_OF_DMA_CONFIGURE_PRESENT" | append_conftest "functions"

echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif

void conftest_of_dma_configure(void) {
of_dma_configure(NULL, NULL, false);
}" > conftest$$.c

$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c

if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#define NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT 3" | append_conftest "functions"
return
fi

echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif

void conftest_of_dma_configure(void) {
of_dma_configure(NULL, NULL);
}" > conftest$$.c

$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c

if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#define NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT 2" | append_conftest "functions"
return
fi
fi
;;

icc_get)
@@ -6795,12 +6850,45 @@ compile_test() {
compile_check_conftest "$CODE" "NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG" "" "types"
;;

drm_syncobj_features_present)
# Determine if DRIVER_SYNCOBJ and DRIVER_SYNCOBJ_TIMELINE DRM
# driver features are present. Timeline DRM synchronization objects
# may only be used if both of these are supported by the driver.
#
# DRIVER_SYNCOBJ_TIMELINE Added by commit 060cebb20cdb ("drm:
# introduce a capability flag for syncobj timeline support") in
# v5.2
#
# DRIVER_SYNCOBJ Added by commit e9083420bbac ("drm: introduce
# sync objects (v4)") in v4.12
CODE="
#if defined(NV_DRM_DRM_DRV_H_PRESENT)
#include <drm/drm_drv.h>
#endif
int features = DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE;"

compile_check_conftest "$CODE" "NV_DRM_SYNCOBJ_FEATURES_PRESENT" "" "types"
;;

stack_trace)
# Determine if functions stack_trace_{save,print} are present.
# Added by commit e9b98e162 ("stacktrace: Provide helpers for
# common stack trace operations") in v5.2.
CODE="
#include <linux/stacktrace.h>
void conftest_stack_trace(void) {
stack_trace_save();
stack_trace_print();
}"

compile_check_conftest "$CODE" "NV_STACK_TRACE_PRESENT" "" "functions"
;;

drm_unlocked_ioctl_flag_present)
# Determine if DRM_UNLOCKED IOCTL flag is present.
#
# DRM_UNLOCKED was removed by commit 2798ffcc1d6a ("drm: Remove
# locking for legacy ioctls and DRM_UNLOCKED") in Linux
# next-20231208.
# locking for legacy ioctls and DRM_UNLOCKED") in v6.8.
#
# DRM_UNLOCKED definition was moved from drmP.h to drm_ioctl.h by
# commit 2640981f3600 ("drm: document drm_ioctl.[hc]") in v4.12.
@@ -52,6 +52,7 @@ NV_HEADER_PRESENCE_TESTS = \
linux/dma-resv.h \
soc/tegra/chip-id.h \
soc/tegra/fuse.h \
soc/tegra/fuse-helper.h \
soc/tegra/tegra_bpmp.h \
video/nv_internal.h \
linux/platform/tegra/dce/dce-client-ipc.h \
@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if (i == (attempts - 1))
if ((i == (attempts - 1)))
break;

// Get the NUMA node where the first page of the stack is resident. If
@@ -176,12 +176,10 @@ cursor_plane_req_config_update(struct drm_plane *plane,
return;
}

*req_config = (struct NvKmsKapiCursorRequestedConfig) {
.surface = to_nv_framebuffer(plane_state->fb)->pSurface,

.dstX = plane_state->crtc_x,
.dstY = plane_state->crtc_y,
};
memset(req_config, 0, sizeof(*req_config));
req_config->surface = to_nv_framebuffer(plane_state->fb)->pSurface;
req_config->dstX = plane_state->crtc_x;
req_config->dstY = plane_state->crtc_y;

#if defined(NV_DRM_ALPHA_BLENDING_AVAILABLE)
if (plane->blend_mode_property != NULL && plane->alpha_property != NULL) {
@@ -275,24 +273,22 @@ plane_req_config_update(struct drm_plane *plane,
return 0;
}

*req_config = (struct NvKmsKapiLayerRequestedConfig) {
.config = {
.surface = to_nv_framebuffer(plane_state->fb)->pSurface,
memset(req_config, 0, sizeof(*req_config));

req_config->config.surface = to_nv_framebuffer(plane_state->fb)->pSurface;

/* Source values are 16.16 fixed point */
.srcX = plane_state->src_x >> 16,
.srcY = plane_state->src_y >> 16,
.srcWidth = plane_state->src_w >> 16,
.srcHeight = plane_state->src_h >> 16,
req_config->config.srcX = plane_state->src_x >> 16;
req_config->config.srcY = plane_state->src_y >> 16;
req_config->config.srcWidth = plane_state->src_w >> 16;
req_config->config.srcHeight = plane_state->src_h >> 16;

.dstX = plane_state->crtc_x,
.dstY = plane_state->crtc_y,
.dstWidth = plane_state->crtc_w,
.dstHeight = plane_state->crtc_h,
req_config->config.dstX = plane_state->crtc_x;
req_config->config.dstY = plane_state->crtc_y;
req_config->config.dstWidth = plane_state->crtc_w;
req_config->config.dstHeight = plane_state->crtc_h;

.csc = old_config.csc
},
};
req_config->config.csc = old_config.csc;

#if defined(NV_DRM_ROTATION_AVAILABLE)
/*
@@ -688,9 +684,7 @@ static int nv_drm_plane_atomic_set_property(
to_nv_drm_plane_state(state);

if (property == nv_dev->nv_out_fence_property) {
#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
nv_drm_plane_state->fd_user_ptr = u64_to_user_ptr(val);
#endif
nv_drm_plane_state->fd_user_ptr = (void __user *)(uintptr_t)(val);
return 0;
} else if (property == nv_dev->nv_input_colorspace_property) {
nv_drm_plane_state->input_colorspace = val;
@@ -875,14 +869,12 @@ static inline void nv_drm_crtc_duplicate_req_head_modeset_config(
* there is no change in new configuration yet with respect
* to older one!
*/
*new = (struct NvKmsKapiHeadRequestedConfig) {
.modeSetConfig = old->modeSetConfig,
};
memset(new, 0, sizeof(*new));
new->modeSetConfig = old->modeSetConfig;

for (i = 0; i < ARRAY_SIZE(old->layerRequestedConfig); i++) {
new->layerRequestedConfig[i] = (struct NvKmsKapiLayerRequestedConfig) {
.config = old->layerRequestedConfig[i].config,
};
new->layerRequestedConfig[i].config =
old->layerRequestedConfig[i].config;
}
}
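Several hunks above rewrite designated-initializer compound-literal assignments as an explicit memset() followed by per-field assignments. Both forms leave the unnamed members zeroed; the change is structural. A generic illustration of the two equivalent styles (not driver code):

```c
#include <string.h>

/* A stand-in struct to illustrate the two initialization styles. */
struct example_config {
    int x;
    int y;
    int flags;
};

void init_with_compound_literal(struct example_config *cfg, int x, int y)
{
    /* Whole-struct assignment: unnamed members (flags) are zeroed implicitly. */
    *cfg = (struct example_config) {
        .x = x,
        .y = y,
    };
}

void init_with_memset(struct example_config *cfg, int x, int y)
{
    /* Same effect, written as an explicit clear plus field assignments. */
    memset(cfg, 0, sizeof(*cfg));
    cfg->x = x;
    cfg->y = y;
}
```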
@@ -373,18 +373,14 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
len++;
}

#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
if (!nv_dev->supportsSyncpts) {
return 0;
}

if (nv_dev->supportsSyncpts) {
nv_dev->nv_out_fence_property =
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_ATOMIC,
"NV_DRM_OUT_FENCE_PTR", 0, U64_MAX);
if (nv_dev->nv_out_fence_property == NULL) {
return -ENOMEM;
}
#endif
}

nv_dev->nv_input_colorspace_property =
drm_property_create_enum(nv_dev->dev, 0, "NV_INPUT_COLORSPACE",
@@ -480,6 +476,22 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags)
return -ENODEV;
}

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
/*
* If fbdev is enabled, take modeset ownership now before other DRM clients
* can take master (and thus NVKMS ownership).
*/
if (nv_drm_fbdev_module_param) {
if (!nvKms->grabOwnership(pDevice)) {
nvKms->freeDevice(pDevice);
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
return -EBUSY;
}

nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif

mutex_lock(&nv_dev->lock);

/* Set NvKmsKapiDevice */
@@ -590,6 +602,15 @@ static void __nv_drm_unload(struct drm_device *dev)
return;
}

/* Release modeset ownership if fbdev is enabled */

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (nv_dev->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(nv_dev->pDevice);
}
#endif

cancel_delayed_work_sync(&nv_dev->hotplug_event_work);
mutex_lock(&nv_dev->lock);

@@ -781,6 +802,14 @@ static int nv_drm_get_dev_info_ioctl(struct drm_device *dev,
return 0;
}

static int nv_drm_get_drm_file_unique_id_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
struct drm_nvidia_get_drm_file_unique_id_params *params = data;
params->id = (u64)(filep->driver_priv);
return 0;
}

static int nv_drm_dmabuf_supported_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
@@ -1279,6 +1308,17 @@ static void nv_drm_postclose(struct drm_device *dev, struct drm_file *filep)
}
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */

static int nv_drm_open(struct drm_device *dev, struct drm_file *filep)
{
_Static_assert(sizeof(filep->driver_priv) >= sizeof(u64),
"filep->driver_priv can not hold an u64");
static atomic64_t id = ATOMIC_INIT(0);

filep->driver_priv = (void *)atomic64_inc_return(&id);

return 0;
}

#if defined(NV_DRM_MASTER_HAS_LEASES)
static struct drm_master *nv_drm_find_lessee(struct drm_master *master,
int lessee_id)
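The new nv_drm_open() above tags each DRM file with a monotonically increasing 64-bit id taken from atomic64_inc_return(), which the NVIDIA_GET_DRM_FILE_UNIQUE_ID ioctl then reports. A stripped-down user-space analogue of that counter pattern, using C11 atomics; hypothetical, purely to show why concurrent opens can never receive the same id:

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Process-wide counter whose increment-and-read is a single atomic step,
 * mirroring the atomic64_inc_return() use in nv_drm_open(). */
static _Atomic uint64_t next_file_id = 0;

static uint64_t example_alloc_file_id(void)
{
    /* fetch_add returns the previous value, so add 1 to mirror inc_return. */
    return atomic_fetch_add(&next_file_id, 1) + 1;
}

int main(void)
{
    printf("%llu\n", (unsigned long long)example_alloc_file_id()); /* 1 */
    printf("%llu\n", (unsigned long long)example_alloc_file_id()); /* 2 */
    return 0;
}
```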
@@ -1522,6 +1562,9 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DEV_INFO,
nv_drm_get_dev_info_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DRM_FILE_UNIQUE_ID,
nv_drm_get_drm_file_unique_id_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),

#if defined(NV_DRM_FENCE_AVAILABLE)
DRM_IOCTL_DEF_DRV(NVIDIA_FENCE_SUPPORTED,
@@ -1604,6 +1647,9 @@ static struct drm_driver nv_drm_driver = {
.driver_features =
#if defined(NV_DRM_DRIVER_PRIME_FLAG_PRESENT)
DRIVER_PRIME |
#endif
#if defined(NV_DRM_SYNCOBJ_FEATURES_PRESENT)
DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE |
#endif
DRIVER_GEM | DRIVER_RENDER,

@@ -1615,14 +1661,14 @@ static struct drm_driver nv_drm_driver = {
.num_ioctls = ARRAY_SIZE(nv_drm_ioctls),

/*
* linux-next commit 71a7974ac701 ("drm/prime: Unexport helpers for fd/handle
* conversion") unexports drm_gem_prime_handle_to_fd() and
* Linux kernel v6.6 commit 71a7974ac701 ("drm/prime: Unexport helpers
* for fd/handle conversion") unexports drm_gem_prime_handle_to_fd() and
* drm_gem_prime_fd_to_handle().
*
* Prior linux-next commit 6b85aa68d9d5 ("drm: Enable PRIME import/export for
* all drivers") made these helpers the default when .prime_handle_to_fd /
* .prime_fd_to_handle are unspecified, so it's fine to just skip specifying
* them if the helpers aren't present.
* Prior Linux kernel v6.6 commit 6b85aa68d9d5 ("drm: Enable PRIME
* import/export for all drivers") made these helpers the default when
* .prime_handle_to_fd / .prime_fd_to_handle are unspecified, so it's fine
* to just skip specifying them if the helpers aren't present.
*/
#if NV_IS_EXPORT_SYMBOL_PRESENT_drm_gem_prime_handle_to_fd
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
@@ -1656,6 +1702,7 @@ static struct drm_driver nv_drm_driver = {
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
.postclose = nv_drm_postclose,
#endif
.open = nv_drm_open,

.fops = &nv_drm_fops,

@@ -1714,6 +1761,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
struct nv_drm_device *nv_dev = NULL;
struct drm_device *dev = NULL;
struct device *device = gpu_info->os_device_ptr;
bool bus_is_pci;

DRM_DEBUG(
"Registering device for NVIDIA GPU ID 0x08%x",
@@ -1747,7 +1795,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
dev->dev_private = nv_dev;
nv_dev->dev = dev;

bool bus_is_pci =
bus_is_pci =
#if defined(NV_LINUX)
device->bus == &pci_bus_type;
#elif defined(NV_BSD)
@@ -1771,11 +1819,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
if (nv_drm_fbdev_module_param &&
drm_core_check_feature(dev, DRIVER_MODESET)) {

if (!nvKms->grabOwnership(nv_dev->pDevice)) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
goto failed_grab_ownership;
}

if (bus_is_pci) {
struct pci_dev *pdev = to_pci_dev(device);

@@ -1786,8 +1829,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
#endif
}
drm_fbdev_generic_setup(dev, 32);

nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */

@@ -1798,12 +1839,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)

return; /* Success */

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
failed_grab_ownership:

drm_dev_unregister(dev);
#endif

failed_drm_register:

nv_drm_dev_free(dev);
@@ -1870,12 +1905,6 @@ void nv_drm_remove_devices(void)
struct nv_drm_device *next = dev_list->next;
struct drm_device *dev = dev_list->dev;

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (dev_list->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(dev_list->pDevice);
}
#endif
drm_dev_unregister(dev);
nv_drm_dev_free(dev);
@@ -293,14 +293,12 @@ __nv_drm_prime_fence_context_new(
* to check a return value.
*/

*nv_prime_fence_context = (struct nv_drm_prime_fence_context) {
.base.ops = &nv_drm_prime_fence_context_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pLinearAddress = pLinearAddress,
};
nv_prime_fence_context->base.ops = &nv_drm_prime_fence_context_ops;
nv_prime_fence_context->base.nv_dev = nv_dev;
nv_prime_fence_context->base.context = nv_dma_fence_context_alloc(1);
nv_prime_fence_context->base.fenceSemIndex = p->index;
nv_prime_fence_context->pSemSurface = pSemSurface;
nv_prime_fence_context->pLinearAddress = pLinearAddress;

INIT_LIST_HEAD(&nv_prime_fence_context->pending);

@@ -1261,18 +1259,16 @@ __nv_drm_semsurf_fence_ctx_new(
* to check a return value.
*/

*ctx = (struct nv_drm_semsurf_fence_ctx) {
.base.ops = &nv_drm_semsurf_fence_ctx_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pSemMapping.pVoid = semMapping,
.pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping,
.callback.local = NULL,
.callback.nvKms = NULL,
.current_wait_value = 0,
};
ctx->base.ops = &nv_drm_semsurf_fence_ctx_ops;
ctx->base.nv_dev = nv_dev;
ctx->base.context = nv_dma_fence_context_alloc(1);
ctx->base.fenceSemIndex = p->index;
ctx->pSemSurface = pSemSurface;
ctx->pSemMapping.pVoid = semMapping;
ctx->pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping;
ctx->callback.local = NULL;
ctx->callback.nvKms = NULL;
ctx->current_wait_value = 0;

spin_lock_init(&ctx->lock);
INIT_LIST_HEAD(&ctx->pending_fences);
@ -551,14 +551,12 @@ static struct drm_gem_object *__nv_drm_gem_nvkms_prime_dup(
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(dev);
|
||||
const struct nv_drm_device *nv_dev_src;
|
||||
const struct nv_drm_gem_nvkms_memory *nv_nvkms_memory_src;
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory;
|
||||
struct NvKmsKapiMemory *pMemory;
|
||||
|
||||
BUG_ON(nv_gem_src == NULL || nv_gem_src->ops != &nv_gem_nvkms_memory_ops);
|
||||
|
||||
nv_dev_src = to_nv_device(nv_gem_src->base.dev);
|
||||
nv_nvkms_memory_src = to_nv_nvkms_memory_const(nv_gem_src);
|
||||
|
||||
if ((nv_nvkms_memory =
|
||||
nv_drm_calloc(1, sizeof(*nv_nvkms_memory))) == NULL) {
|
||||
|
@ -45,8 +45,7 @@
|
||||
|
||||
/*
|
||||
* The inclusion of drm_framebuffer.h was removed from drm_crtc.h by commit
|
||||
* 720cf96d8fecde29b72e1101f8a567a0ce99594f ("drm: Drop drm_framebuffer.h from
|
||||
* drm_crtc.h") in linux-next, expected in v5.19-rc7.
|
||||
* 720cf96d8fec ("drm: Drop drm_framebuffer.h from drm_crtc.h") in v6.0.
|
||||
*
|
||||
* We only need drm_framebuffer.h for drm_framebuffer_put(), and it is always
|
||||
* present (v4.9+) when drm_framebuffer_{put,get}() is present (v4.12+), so it
|
||||
|
@ -613,8 +613,8 @@ static inline int nv_drm_format_num_planes(uint32_t format)
|
||||
#endif /* defined(NV_DRM_FORMAT_MODIFIERS_PRESENT) */
|
||||
|
||||
/*
|
||||
* DRM_UNLOCKED was removed with linux-next commit 2798ffcc1d6a ("drm: Remove
|
||||
* locking for legacy ioctls and DRM_UNLOCKED"), but it was previously made
|
||||
* DRM_UNLOCKED was removed with commit 2798ffcc1d6a ("drm: Remove locking for
|
||||
* legacy ioctls and DRM_UNLOCKED") in v6.8, but it was previously made
|
||||
* implicit for all non-legacy DRM driver IOCTLs since Linux v4.10 commit
|
||||
* fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions" (Linux v4.4
|
||||
* commit ea487835e887 "drm: Enforce unlocked ioctl operation for kms driver
|
||||
|
@ -52,6 +52,7 @@
|
||||
#define DRM_NVIDIA_SEMSURF_FENCE_CREATE 0x15
|
||||
#define DRM_NVIDIA_SEMSURF_FENCE_WAIT 0x16
|
||||
#define DRM_NVIDIA_SEMSURF_FENCE_ATTACH 0x17
|
||||
#define DRM_NVIDIA_GET_DRM_FILE_UNIQUE_ID 0x18
|
||||
|
||||
#define DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY \
|
||||
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_IMPORT_NVKMS_MEMORY), \
|
||||
@ -157,6 +158,11 @@
|
||||
DRM_NVIDIA_SEMSURF_FENCE_ATTACH), \
|
||||
struct drm_nvidia_semsurf_fence_attach_params)
|
||||
|
||||
#define DRM_IOCTL_NVIDIA_GET_DRM_FILE_UNIQUE_ID \
|
||||
DRM_IOWR((DRM_COMMAND_BASE + \
|
||||
DRM_NVIDIA_GET_DRM_FILE_UNIQUE_ID), \
|
||||
struct drm_nvidia_get_drm_file_unique_id_params)
|
||||
|
||||
struct drm_nvidia_gem_import_nvkms_memory_params {
|
||||
uint64_t mem_size; /* IN */
|
||||
|
||||
@ -385,4 +391,8 @@ struct drm_nvidia_semsurf_fence_attach_params {
|
||||
uint64_t wait_value; /* IN Semaphore value to reach before signal */
|
||||
};
|
||||
|
||||
struct drm_nvidia_get_drm_file_unique_id_params {
|
||||
uint64_t id; /* OUT Unique ID of the DRM file */
|
||||
};
|
||||
|
||||
#endif /* _UAPI_NVIDIA_DRM_IOCTL_H_ */
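/*
 * A minimal user-space sketch (not part of the diff) of how the new
 * GET_DRM_FILE_UNIQUE_ID ioctl added above could be called. The ioctl macro
 * and parameter struct come from the header changes; the header path, the
 * device node, and the error handling are assumptions for illustration only.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "nvidia-drm-ioctl.h" /* assumed include name for the UAPI header above */

static int query_drm_file_unique_id(const char *node)
{
    struct drm_nvidia_get_drm_file_unique_id_params params = { 0 };
    int fd = open(node, O_RDWR);

    if (fd < 0)
        return -1;

    /* The id field is an OUT parameter filled in by the driver. */
    if (ioctl(fd, DRM_IOCTL_NVIDIA_GET_DRM_FILE_UNIQUE_ID, &params) != 0) {
        close(fd);
        return -1;
    }

    printf("DRM file unique ID: %llu\n", (unsigned long long)params.id);
    close(fd);
    return 0;
}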

@ -587,6 +587,9 @@ int nv_drm_atomic_commit(struct drm_device *dev,
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Flip event timeout on head %u", nv_crtc->head);
while (!list_empty(&nv_crtc->flip_list)) {
__nv_drm_handle_flip_event(nv_crtc);
}
}
}
}

@ -128,4 +128,5 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
NV_CONFTEST_TYPE_COMPILE_TESTS += fence_ops_use_64bit_seqno
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers_has_driver_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_syncobj_features_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present

@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if (i == (attempts - 1))
if ((i == (attempts - 1)))
break;

// Get the NUMA node where the first page of the stack is resident. If

@ -77,10 +77,10 @@ module_param_named(disable_hdmi_frl, disable_hdmi_frl, bool, 0400);
static bool disable_vrr_memclk_switch = false;
module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);

static bool hdmi_deepcolor = false;
static bool hdmi_deepcolor = true;
module_param_named(hdmi_deepcolor, hdmi_deepcolor, bool, 0400);

static bool vblank_sem_control = false;
static bool vblank_sem_control = true;
module_param_named(vblank_sem_control, vblank_sem_control, bool, 0400);

static bool opportunistic_display_sync = true;
@ -139,6 +139,20 @@ NvBool nvkms_opportunistic_display_sync(void)
return opportunistic_display_sync;
}

NvBool nvkms_kernel_supports_syncpts(void)
{
/*
* Note this only checks that the kernel has the prerequisite
* support for syncpts; callers must also check that the hardware
* supports syncpts.
*/
#if (defined(CONFIG_TEGRA_GRHOST) || defined(NV_LINUX_HOST1X_NEXT_H_PRESENT))
return NV_TRUE;
#else
return NV_FALSE;
#endif
}
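/*
 * A minimal sketch of how a caller might combine the kernel-support check
 * above with the hardware check the comment asks for. The device handle and
 * device_hw_supports_syncpts() helper are hypothetical and stand in for
 * whatever per-device capability query the caller already has.
 */
static NvBool can_use_syncpts(void *device /* hypothetical device handle */)
{
    if (!nvkms_kernel_supports_syncpts())
        return NV_FALSE;

    /* Hypothetical per-device capability query; not part of this change. */
    return device_hw_supports_syncpts(device);
}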

#define NVKMS_SYNCPT_STUBS_NEEDED

/*************************************************************************
@ -1234,6 +1248,26 @@ void nvkms_close_from_kapi(struct nvkms_per_open *popen)
nvkms_close_pm_unlocked(popen);
}

NvBool nvkms_ioctl_from_kapi_try_pmlock
(
struct nvkms_per_open *popen,
NvU32 cmd, void *params_address, const size_t param_size
)
{
NvBool ret;

if (nvkms_read_trylock_pm_lock()) {
return NV_FALSE;
}

ret = nvkms_ioctl_common(popen,
cmd,
(NvU64)(NvUPtr)params_address, param_size) == 0;
nvkms_read_unlock_pm_lock();

return ret;
}

NvBool nvkms_ioctl_from_kapi
(
struct nvkms_per_open *popen,

@ -304,6 +304,11 @@ NvU32 nvkms_enumerate_gpus(nv_gpu_info_t *gpu_info);

NvBool nvkms_allow_write_combining(void);

/*!
* Check if OS supports syncpoints.
*/
NvBool nvkms_kernel_supports_syncpts(void);

/*!
* Checks whether the fd is associated with an nvidia character device.
*/
@ -328,6 +333,16 @@ NvBool nvkms_ioctl_from_kapi
NvU32 cmd, void *params_address, const size_t params_size
);

/*!
* Like nvkms_ioctl_from_kapi, but return NV_FALSE instead of waiting if the
* power management read lock cannot be acquired.
*/
NvBool nvkms_ioctl_from_kapi_try_pmlock
(
struct nvkms_per_open *popen,
NvU32 cmd, void *params_address, const size_t params_size
);
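/*
 * A minimal usage sketch, assuming a caller that must not block on the power
 * management read lock: when the try-variant returns NV_FALSE the request is
 * deferred instead of waiting. The request struct, its fields, and
 * defer_request() are hypothetical and only illustrate the intended pattern.
 */
struct my_request; /* hypothetical */

static void submit_or_defer(struct nvkms_per_open *popen,
                            NvU32 cmd, void *params, size_t params_size,
                            struct my_request *req)
{
    if (!nvkms_ioctl_from_kapi_try_pmlock(popen, cmd, params, params_size)) {
        /* PM read lock was not available; retry later rather than sleep. */
        defer_request(req); /* hypothetical deferral helper */
    }
}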

/*!
* APIs for locking.
*/

@ -105,3 +105,4 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_real_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += acpi_video_backlight_use_native
NV_CONFTEST_FUNCTION_COMPILE_TESTS += kernel_read_has_pointer_pos_arg

@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if (i == (attempts - 1))
if ((i == (attempts - 1)))
break;

// Get the NUMA node where the first page of the stack is resident. If

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2023 NVIDIA Corporation
Copyright (c) 2013-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -1448,9 +1448,7 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
// the CPU.
//
// Error codes:
// NV_ERR_INVALID_ADDRESS:
@ -1464,11 +1462,6 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
// The VA range exceeds the largest virtual address supported by the
// destination processor.
//
// NV_ERR_INVALID_ARGUMENT:
// preferredCpuMemoryNode is not a valid CPU NUMA node or it corresponds
// to a NUMA node ID for a registered GPU. If NUMA is disabled, it
// indicates that preferredCpuMemoryNode was not either 0 or -1.
//
// NV_ERR_INVALID_DEVICE:
// destinationUuid does not represent a valid processor such as a CPU or
// a GPU with a GPU VA space registered for it. Or destinationUuid is a
@ -1535,9 +1528,8 @@ NV_STATUS UvmMigrate(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
// the CPU. This argument is ignored if the given virtual address range
// corresponds to managed memory.
//
// semaphoreAddress: (INPUT)
// Base address of the semaphore.
@ -1594,8 +1586,8 @@ NV_STATUS UvmMigrateAsync(void *base,
//
// Migrates the backing of all virtual address ranges associated with the given
// range group to the specified destination processor. The behavior of this API
// is equivalent to calling UvmMigrate with preferredCpuMemoryNode = -1 on each
// VA range associated with this range group.
// is equivalent to calling UvmMigrate on each VA range associated with this
// range group.
//
// Any errors encountered during migration are returned immediately. No attempt
// is made to migrate the remaining unmigrated ranges and the ranges that are
@ -2177,8 +2169,7 @@ NV_STATUS UvmMapDynamicParallelismRegion(void *base,
//
// If any page in the VA range has a preferred location, then the migration and
// mapping policies associated with this API take precedence over those related
// to the preferred location. If the preferred location is a specific CPU NUMA
// node, that NUMA node will be used for a CPU-resident copy of the page.
// to the preferred location.
//
// If any pages in this VA range have any processors present in their
// accessed-by list, the migration and mapping policies associated with this
@ -2309,7 +2300,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// UvmPreventMigrationRangeGroups has not been called on the range group that
// those pages are associated with, then the migration and mapping policies
// associated with UvmEnableReadDuplication override the policies outlined
// above. Note that enabling read duplication on any pages in this VA range
// above. Note that enabling read duplication on on any pages in this VA range
// does not clear the state set by this API for those pages. It merely overrides
// the policies associated with this state until read duplication is disabled
// for those pages.
@ -2342,8 +2333,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if preferredLocationUuid is the
// UUID of the CPU. -1 is a special value which indicates all CPU nodes
// allowed by the global and thread memory policies. If NUMA is disabled
// only 0 and -1 are allowed.
// allowed by the global and thread memory policies.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@ -3473,8 +3463,7 @@ NV_STATUS UvmToolsDestroySession(UvmToolsSessionHandle session);
//

#if UVM_API_REV_IS_AT_MOST(10)
// This is deprecated and replaced by sizeof(UvmToolsEventControlData_V1) or
// sizeof(UvmToolsEventControlData_V2).
// This is deprecated and replaced by sizeof(UvmToolsEventControlData).
NvLength UvmToolsGetEventControlSize(void);

// This is deprecated and replaced by sizeof(UvmEventEntry_V1) or
@ -3498,8 +3487,6 @@ NvLength UvmToolsGetNumberOfCounters(void);
// version: (INPUT)
// Requested version for events or counters.
// See UvmEventEntry_V1 and UvmEventEntry_V2.
// UvmToolsEventControlData_V2::version records the entry version that
// will be generated.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
@ -3512,8 +3499,7 @@ NvLength UvmToolsGetNumberOfCounters(void);
//
// event_control (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold UvmToolsEventControlData_V1 if version is UvmEventEntry_V1 or
// UvmToolsEventControlData_V2 (although single page-size allocation
// hold UvmToolsEventControlData (although single page-size allocation
// should be more than enough). Gets pinned until queue is destroyed.
//
// queue: (OUTPUT)

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2023 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -205,7 +205,7 @@ void uvm_hal_ampere_host_clear_faulted_channel_sw_method(uvm_push_t *push,
CLEAR_FAULTED_B, HWVALUE(C076, CLEAR_FAULTED_B, INST_HI, instance_ptr_hi));
}

// Copy from Pascal, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
@ -216,6 +216,7 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;

UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);

@ -230,8 +231,8 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
pdb_lo = pdb.address & HWMASK(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);

// PDE3 is the highest level on Pascal, see the comment in uvm_pascal_mmu.c
// for details.
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;

@ -242,7 +243,12 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);

NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
@ -255,16 +261,18 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
MEM_OP_D, HWCONST(C56F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));

uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
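/*
 * A minimal sketch distilling the pattern introduced in the hunks above: the
 * SYSMEMBAR field of MEM_OP_A is chosen up front from the requested membar,
 * and only a GPU-scope membar still needs the explicit host_hal->membar_gpu()
 * method afterwards. This helper is hypothetical and assumes the same headers
 * and HW macros as the surrounding code.
 */
static NvU32 tlb_invalidate_sysmembar_value(uvm_membar_t membar)
{
    if (membar == UVM_MEMBAR_SYS)
        return HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);

    return HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
}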

// Copy from Volta, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
@ -272,6 +280,7 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 va_lo;
NvU32 va_hi;
NvU64 end;
@ -281,9 +290,9 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);

UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);

// The invalidation size must be a power-of-two number of pages containing
@ -325,7 +334,7 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
pdb_lo = pdb.address & HWMASK(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);

// PDE3 is the highest level on Pascal-Ampere , see the comment in
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
@ -337,10 +346,15 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);

NV_PUSH_4U(C56F, MEM_OP_A, HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
sysmembar_value |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C56F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
@ -352,21 +366,23 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
MEM_OP_D, HWCONST(C56F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));

uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
gpu->parent->host_hal->membar_gpu(push);
}

// Copy from Pascal, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
{
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 invalidate_gpc_value = 0;
NvU32 aperture_value = 0;
NvU32 pdb_lo = 0;
NvU32 pdb_hi = 0;
NvU32 page_table_level = 0;
uvm_membar_t membar;

UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
@ -381,7 +397,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);

if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
// PDE3 is the highest level on Pascal, see the comment in
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde3, params->page_table_level) - 1;
}
@ -393,6 +409,11 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

if (params->membar == UvmInvalidateTlbMemBarSys)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);

if (params->disable_gpc_invalidate)
invalidate_gpc_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
else
@ -403,9 +424,9 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,

NvU32 va_lo = va & HWMASK(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NvU32 va_hi = va >> HWSIZE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C56F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C56F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
@ -418,7 +439,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
else {
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
@ -432,12 +453,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}

if (params->membar == UvmInvalidateTlbMemBarSys)
membar = UVM_MEMBAR_SYS;
else if (params->membar == UvmInvalidateTlbMemBarLocal)
membar = UVM_MEMBAR_GPU;
else
membar = UVM_MEMBAR_NONE;

uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (params->membar == UvmInvalidateTlbMemBarLocal)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}

@ -51,7 +51,7 @@ uvm_mmu_engine_type_t uvm_hal_ampere_mmu_engine_id_to_type(NvU16 mmu_engine_id)
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}

static NvU32 page_table_depth_ampere(NvU32 page_size)
static NvU32 page_table_depth_ampere(NvU64 page_size)
{
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check
if (page_size == UVM_PAGE_SIZE_2M)
@ -62,14 +62,14 @@ static NvU32 page_table_depth_ampere(NvU32 page_size)
return 4;
}

static NvU32 page_sizes_ampere(void)
static NvU64 page_sizes_ampere(void)
{
return UVM_PAGE_SIZE_512M | UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}

static uvm_mmu_mode_hal_t ampere_mmu_mode_hal;

uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size)
{
static bool initialized = false;

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -29,10 +29,9 @@
#include "uvm_ats_ibm.h"
#include "nv_uvm_types.h"
#include "uvm_lock.h"
#include "uvm_ats_sva.h"

#include "uvm_ats_sva.h"

#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())

typedef struct
{

@ -855,7 +855,6 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@ -870,7 +869,6 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@ -881,7 +879,6 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@ -899,7 +896,6 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@ -963,7 +959,7 @@ static void gpu_encrypt(uvm_push_t *push,
i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
dst_cipher);

uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);
uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);

if (i > 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@ -1024,7 +1020,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
UvmCslIv *decrypt_iv = NULL;
UvmCslIv *encrypt_iv = NULL;
NvU32 key_version;
uvm_tracker_t tracker;
size_t src_plain_size;

@ -1094,11 +1089,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,

gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);

// There shouldn't be any key rotation between the end of the push and the
// CPU decryption(s), but it is more robust against test changes to force
// decryption to use the saved key.
key_version = uvm_channel_pool_key_version(push.channel->pool);

TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);
@ -1111,7 +1101,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),
@ -1122,7 +1111,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),

@ -38,32 +38,6 @@
#include "clb06f.h"
#include "uvm_conf_computing.h"

// WLC push is decrypted by SEC2 or CE (in WLC schedule).
// In sysmem it's followed by auth tag.
#define WLC_PUSHBUFFER_ALIGNMENT max3(UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, \
UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT, \
UVM_CONF_COMPUTING_BUF_ALIGNMENT)
#define WLC_ALIGNED_MAX_PUSH_SIZE UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, WLC_PUSHBUFFER_ALIGNMENT)

// WLC uses the following structures in unprotected sysmem:
// * Encrypted pushbuffer location. This gets populated via cpu_encrypt to
// launch work on a WLC channel.
// * Auth tag associated with the above encrypted (push)buffer
// * Another auth tag used to encrypt another channel's pushbuffer during
// indirect work launch. This can be allocated with the launched work
// but since WLC can oly launch one pushbuffer at a time it's easier
// to include it here.
#define WLC_SYSMEM_TOTAL_SIZE UVM_ALIGN_UP(WLC_ALIGNED_MAX_PUSH_SIZE + 2 * UVM_CONF_COMPUTING_AUTH_TAG_SIZE, \
WLC_PUSHBUFFER_ALIGNMENT)

#define WLC_SYSMEM_PUSHBUFFER_OFFSET 0
#define WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET (WLC_SYSMEM_PUSHBUFFER_OFFSET + WLC_ALIGNED_MAX_PUSH_SIZE)
#define WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET (WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET + UVM_CONF_COMPUTING_AUTH_TAG_SIZE)

// LCIC pushbuffer is populated by SEC2
#define LCIC_PUSHBUFFER_ALIGNMENT UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT
#define LCIC_ALIGNED_PUSH_SIZE UVM_ALIGN_UP(UVM_LCIC_PUSH_SIZE, LCIC_PUSHBUFFER_ALIGNMENT)

static unsigned uvm_channel_num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT;

#define UVM_CHANNEL_GPFIFO_LOC_DEFAULT "auto"
@ -306,16 +280,16 @@ static void unlock_channel_for_push(uvm_channel_t *channel)
index = uvm_channel_index_in_pool(channel);

uvm_channel_pool_assert_locked(channel->pool);
UVM_ASSERT(test_bit(index, channel->pool->conf_computing.push_locks));
UVM_ASSERT(test_bit(index, channel->pool->push_locks));

__clear_bit(index, channel->pool->conf_computing.push_locks);
uvm_up_out_of_order(&channel->pool->conf_computing.push_sem);
__clear_bit(index, channel->pool->push_locks);
uvm_up_out_of_order(&channel->pool->push_sem);
}

bool uvm_channel_is_locked_for_push(uvm_channel_t *channel)
{
if (g_uvm_global.conf_computing_enabled)
return test_bit(uvm_channel_index_in_pool(channel), channel->pool->conf_computing.push_locks);
return test_bit(uvm_channel_index_in_pool(channel), channel->pool->push_locks);

// For CE and proxy channels, we always return that the channel is locked,
// which has no functional impact in the UVM channel code-flow, this is only
@ -329,21 +303,19 @@ static void lock_channel_for_push(uvm_channel_t *channel)

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
uvm_channel_pool_assert_locked(channel->pool);
UVM_ASSERT(!test_bit(index, channel->pool->conf_computing.push_locks));
UVM_ASSERT(!test_bit(index, channel->pool->push_locks));

__set_bit(index, channel->pool->conf_computing.push_locks);
__set_bit(index, channel->pool->push_locks);
}

static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
{
NvU32 index = uvm_channel_index_in_pool(channel);

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
uvm_channel_pool_assert_locked(channel->pool);

// Already locked by someone else
if (uvm_channel_is_locked_for_push(channel))
return false;

if (try_claim_channel_locked(channel, num_gpfifo_entries)) {
if (!test_bit(index, channel->pool->push_locks) && try_claim_channel_locked(channel, num_gpfifo_entries)) {
lock_channel_for_push(channel);
return true;
}
@ -351,112 +323,6 @@ static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo
return false;
}

// Reserve, or release, all channels in the given pool.
//
// One scenario where reservation of the entire pool is useful is key rotation,
// because the reservation blocks addition of new work to the pool while
// rotation is in progress.
static void channel_pool_reserve_release_all_channels(uvm_channel_pool_t *pool, bool reserve)
{
NvU32 i;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);

// Disable lock tracking: a single thread is acquiring multiple locks of
// the same order
uvm_thread_context_lock_disable_tracking();

for (i = 0; i < pool->num_channels; i++) {
if (reserve)
uvm_down(&pool->conf_computing.push_sem);
else
uvm_up(&pool->conf_computing.push_sem);
}

uvm_thread_context_lock_enable_tracking();
}

static void channel_pool_reserve_all_channels(uvm_channel_pool_t *pool)
{
channel_pool_reserve_release_all_channels(pool, true);
}

static void channel_pool_release_all_channels(uvm_channel_pool_t *pool)
{
channel_pool_reserve_release_all_channels(pool, false);
}

static NV_STATUS channel_pool_rotate_key_locked(uvm_channel_pool_t *pool)
{
uvm_channel_t *channel;

// A rotation is not necessarily pending, because UVM can trigger rotations
// at will.
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));

uvm_assert_mutex_locked(&pool->conf_computing.key_rotation.mutex);

uvm_for_each_channel_in_pool(channel, pool) {
NV_STATUS status = uvm_channel_wait(channel);
if (status != NV_OK)
return status;

if (uvm_channel_pool_is_wlc(pool)) {
uvm_spin_loop_t spin;
uvm_channel_t *lcic_channel = uvm_channel_wlc_get_paired_lcic(channel);

// LCIC pushes don't exist as such. Rely on the tracking semaphore
// to determine completion, instead of uvm_channel_wait
UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin);
}
}

return uvm_conf_computing_rotate_pool_key(pool);
}

static NV_STATUS channel_pool_rotate_key(uvm_channel_pool_t *pool, bool force_rotation)
{
NV_STATUS status = NV_OK;

uvm_mutex_lock(&pool->conf_computing.key_rotation.mutex);

if (force_rotation || uvm_conf_computing_is_key_rotation_pending_in_pool(pool)) {
channel_pool_reserve_all_channels(pool);

status = channel_pool_rotate_key_locked(pool);

channel_pool_release_all_channels(pool);
}

uvm_mutex_unlock(&pool->conf_computing.key_rotation.mutex);

return status;
}

static NV_STATUS channel_pool_rotate_key_if_pending(uvm_channel_pool_t *pool)
{
NV_STATUS status;
bool force_rotation = false;

if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
return NV_OK;

status = channel_pool_rotate_key(pool, force_rotation);

// RM couldn't acquire the locks it needed, so UVM will try again later.
if (status == NV_ERR_STATE_IN_USE)
status = NV_OK;

return status;
}

NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool)
{
bool force_rotation = true;

return channel_pool_rotate_key(pool, force_rotation);
}

// Reserve a channel in the specified pool. The channel is locked until the push
// ends
static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out)
@ -464,28 +330,20 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
uvm_channel_t *channel;
uvm_spin_loop_t spin;
NvU32 index;
NV_STATUS status;

UVM_ASSERT(pool);
UVM_ASSERT(g_uvm_global.conf_computing_enabled);

// LCIC channels are reserved directly during GPU initialization.
UVM_ASSERT(!uvm_channel_pool_is_lcic(pool));

status = channel_pool_rotate_key_if_pending(pool);
if (status != NV_OK)
return status;

// This semaphore is uvm_up() in unlock_channel_for_push() as part of the
// uvm_channel_end_push() routine.
uvm_down(&pool->conf_computing.push_sem);
uvm_down(&pool->push_sem);

// At least one channel is unlocked. We check if any unlocked channel is
// available, i.e., if it has free GPFIFO entries.

channel_pool_lock(pool);

for_each_clear_bit(index, pool->conf_computing.push_locks, pool->num_channels) {
for_each_clear_bit(index, pool->push_locks, pool->num_channels) {
channel = &pool->channels[index];
if (try_claim_channel_locked(channel, 1)) {
lock_channel_for_push(channel);
@ -500,7 +358,10 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
uvm_spin_loop_init(&spin);
while (1) {
uvm_for_each_channel_in_pool(channel, pool) {
NV_STATUS status;

uvm_channel_update_progress(channel);
index = uvm_channel_index_in_pool(channel);

channel_pool_lock(pool);

@ -511,7 +372,7 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_

status = uvm_channel_check_errors(channel);
if (status != NV_OK) {
uvm_up(&pool->conf_computing.push_sem);
uvm_up(&pool->push_sem);
return status;
}

@ -629,47 +490,31 @@ static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel)
return push_info - channel->push_infos;
}

static unsigned channel_pool_num_gpfifo_entries(uvm_channel_pool_t *pool)
{
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));

// WLC benefits from larger number of entries since more available entries
// result in less frequent calls to uvm_channel_update_progress. 16 is the
// maximum size that can re-use static pb preallocated memory when uploading
// the WLC schedule.
if (uvm_channel_pool_is_wlc(pool))
return 16;

// Every channel needs at least 3 entries; 1 for sentinel and 2 for
// submitting GPFIFO control entries. The number also has to be power of 2,
// as the HW stores the size as log2 value. LCIC does not accept external
// pushes, uvm_channel_update_progress is not a concern.
if (uvm_channel_pool_is_lcic(pool))
return 4;

return pool->manager->conf.num_gpfifo_entries;
}

static void channel_semaphore_gpu_encrypt_payload(uvm_push_t *push, NvU64 semaphore_va)
{
NvU32 iv_index;
uvm_gpu_address_t notifier_gpu_va;
uvm_gpu_address_t auth_tag_gpu_va;
uvm_gpu_address_t semaphore_gpu_va;
uvm_gpu_address_t encrypted_payload_gpu_va;
uvm_gpu_t *gpu = push->gpu;
uvm_channel_t *channel = push->channel;
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;
uvm_gpu_address_t notifier_gpu_va = uvm_gpu_semaphore_get_notifier_gpu_va(semaphore);
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(semaphore);
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(semaphore);
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);
UvmCslIv *iv_cpu_addr = semaphore->conf_computing.ivs;
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
uvm_gpu_semaphore_notifier_t *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;
NvU32 payload_size = sizeof(*semaphore->payload);
NvU32 *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

encrypted_payload_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.encrypted_payload, gpu, false);
notifier_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.notifier, gpu, false);
auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.auth_tag, gpu, false);
semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);

iv_index = ((*last_pushed_notifier + 2) / 2) % channel->num_gpfifo_entries;

uvm_conf_computing_log_gpu_encryption(channel, payload_size, &iv_cpu_addr[iv_index]);
uvm_conf_computing_log_gpu_encryption(channel, &iv_cpu_addr[iv_index]);

gpu->parent->ce_hal->memset_4(push, notifier_gpu_va, ++(*last_pushed_notifier), sizeof(*last_pushed_notifier));
gpu->parent->ce_hal->encrypt(push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va);
@ -690,35 +535,18 @@ static void push_reserve_csl_sign_buf(uvm_push_t *push)
UVM_ASSERT((buf - UVM_METHOD_SIZE / sizeof(*buf)) == push->begin);
}

static uvm_channel_pool_t *get_paired_pool(uvm_channel_pool_t *pool)
{
uvm_channel_type_t paired_channel_type;
uvm_channel_pool_t *paired_pool;

UVM_ASSERT(pool);
UVM_ASSERT(uvm_channel_pool_is_wlc(pool) || uvm_channel_pool_is_lcic(pool));

paired_channel_type = uvm_channel_pool_is_wlc(pool) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC;
paired_pool = pool->manager->pool_to_use.default_for_type[paired_channel_type];

// Prevent accessing a non-existing paired pool. This can happen if, for
// example, the function is invoked when the WLC pool exists, but the LCIC
// doesn't (it hasn't been created yet, or it has been already destroyed).
UVM_ASSERT(paired_pool);

return paired_pool;
}

static uvm_channel_t *get_paired_channel(uvm_channel_t *channel)
{
uvm_channel_pool_t *paired_pool;
unsigned index;
uvm_channel_pool_t *paired_pool;
uvm_channel_type_t paired_channel_type;

UVM_ASSERT(channel);
UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel));

paired_pool = get_paired_pool(channel->pool);
index = uvm_channel_index_in_pool(channel);

paired_channel_type = uvm_channel_is_wlc(channel) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC;
paired_pool = channel->pool->manager->pool_to_use.default_for_type[paired_channel_type];
return paired_pool->channels + index;
}

@ -738,101 +566,6 @@ uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel)
|
||||
return get_paired_channel(wlc_channel);
|
||||
}
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel)
|
||||
{
|
||||
unsigned channel_index;
|
||||
NvU64 pool_vidmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel));
|
||||
|
||||
channel_index = uvm_channel_index_in_pool(channel);
|
||||
pool_vidmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_vidmem,
|
||||
uvm_channel_get_gpu(channel));
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
return pool_vidmem_base + channel_index * LCIC_ALIGNED_PUSH_SIZE;
|
||||
|
||||
return pool_vidmem_base + 2 * channel_index * WLC_ALIGNED_MAX_PUSH_SIZE;
|
||||
}
|
||||
|
||||
static NvU64 get_channel_unprotected_sysmem_gpu_va(uvm_channel_t *channel)
|
||||
{
|
||||
unsigned channel_index;
|
||||
NvU64 pool_sysmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(uvm_channel_is_wlc(channel));
|
||||
|
||||
channel_index = uvm_channel_index_in_pool(channel);
|
||||
pool_sysmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_sysmem,
|
||||
uvm_channel_get_gpu(channel));
|
||||
|
||||
return pool_sysmem_base + (channel_index * WLC_SYSMEM_TOTAL_SIZE);
|
||||
}
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel)
|
||||
{
|
||||
return get_channel_unprotected_sysmem_gpu_va(channel) + WLC_SYSMEM_PUSHBUFFER_OFFSET;
|
||||
}
|
||||
|
||||
static char* get_channel_unprotected_sysmem_cpu(uvm_channel_t *channel)
|
||||
{
|
||||
unsigned channel_index;
|
||||
char* pool_sysmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(uvm_channel_is_wlc(channel));
|
||||
|
||||
channel_index = uvm_channel_index_in_pool(channel);
|
||||
pool_sysmem_base = uvm_rm_mem_get_cpu_va(channel->pool->conf_computing.pool_sysmem);
|
||||
|
||||
return pool_sysmem_base + (channel_index * WLC_SYSMEM_TOTAL_SIZE);
|
||||
}
|
||||
|
||||
char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel)
|
||||
{
|
||||
return get_channel_unprotected_sysmem_cpu(channel) + WLC_SYSMEM_PUSHBUFFER_OFFSET;
|
||||
}
|
||||
|
||||
char *uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(uvm_channel_t *channel, unsigned tag_index)
|
||||
{
|
||||
char *pool_sysmem_base;
|
||||
unsigned index;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(!uvm_channel_is_wlc(channel));
|
||||
UVM_ASSERT(!uvm_channel_is_lcic(channel));
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
UVM_ASSERT(channel->num_gpfifo_entries == channel_pool_num_gpfifo_entries(channel->pool));
|
||||
UVM_ASSERT(tag_index < channel->num_gpfifo_entries);
|
||||
|
||||
index = uvm_channel_index_in_pool(channel) * channel->num_gpfifo_entries + tag_index;
|
||||
pool_sysmem_base = uvm_rm_mem_get_cpu_va(channel->pool->conf_computing.pool_sysmem);
|
||||
|
||||
return pool_sysmem_base + index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
|
||||
static NvU64 get_push_crypto_bundle_auth_tags_gpu_va(uvm_channel_t *channel, unsigned tag_index)
|
||||
{
|
||||
unsigned index;
|
||||
NvU64 pool_sysmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(!uvm_channel_is_wlc(channel));
|
||||
UVM_ASSERT(!uvm_channel_is_lcic(channel));
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
UVM_ASSERT(channel->num_gpfifo_entries == channel_pool_num_gpfifo_entries(channel->pool));
|
||||
UVM_ASSERT(tag_index < channel->num_gpfifo_entries);
|
||||
|
||||
index = uvm_channel_index_in_pool(channel) * channel->num_gpfifo_entries + tag_index;
|
||||
pool_sysmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_sysmem,
|
||||
uvm_channel_get_gpu(channel));
|
||||
|
||||
|
||||
return pool_sysmem_base + index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_rotate_and_reserve_launch_channel(uvm_channel_t *channel, uvm_channel_t **launch_channel)
|
||||
{
|
||||
uvm_channel_manager_t *manager = channel->pool->manager;
|
||||
@ -1008,52 +741,16 @@ static void uvm_channel_tracking_semaphore_release(uvm_push_t *push, NvU64 semap
|
||||
channel_semaphore_gpu_encrypt_payload(push, semaphore_va);
|
||||
}
|
||||
|
||||
static uvm_gpu_semaphore_notifier_t *lcic_static_entry_notifier_cpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_semaphore_notifier_t *notifier_base;
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_lcic(lcic));
|
||||
|
||||
notifier_base = uvm_rm_mem_get_cpu_va(lcic->pool->conf_computing.pool_sysmem);
|
||||
return notifier_base + uvm_channel_index_in_pool(lcic) * 2;
|
||||
}
|
||||
|
||||
static uvm_gpu_semaphore_notifier_t *lcic_static_exit_notifier_cpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
return lcic_static_entry_notifier_cpu_va(lcic) + 1;
|
||||
}
|
||||
|
||||
static uvm_gpu_address_t lcic_static_entry_notifier_gpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
NvU64 notifier_base;
|
||||
const NvU64 offset = uvm_channel_index_in_pool(lcic) * 2 * sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_lcic(lcic));
|
||||
|
||||
notifier_base = uvm_rm_mem_get_gpu_uvm_va(lcic->pool->conf_computing.pool_sysmem, uvm_channel_get_gpu(lcic));
|
||||
return uvm_gpu_address_virtual_unprotected(notifier_base + offset);
|
||||
}
|
||||
|
||||
static uvm_gpu_address_t lcic_static_exit_notifier_gpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_address_t notifier_address = lcic_static_entry_notifier_gpu_va(lcic);
|
||||
|
||||
notifier_address.address += sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
return notifier_address;
|
||||
}
|
||||
|
||||
static void internal_channel_submit_work_wlc(uvm_push_t *push)
|
||||
{
|
||||
size_t payload_size;
|
||||
uvm_channel_t *wlc_channel = push->channel;
|
||||
uvm_channel_t *lcic_channel = uvm_channel_wlc_get_paired_lcic(wlc_channel);
|
||||
uvm_gpu_semaphore_t *lcic_semaphore = &lcic_channel->tracking_sem.semaphore;
|
||||
UvmCslIv *iv_cpu_addr = lcic_semaphore->conf_computing.ivs;
|
||||
uvm_gpu_semaphore_notifier_t *last_pushed_notifier;
|
||||
UvmCslIv *iv_cpu_addr = lcic_channel->tracking_sem.semaphore.conf_computing.ivs;
|
||||
NvU32 *last_pushed_notifier;
|
||||
NvU32 iv_index;
|
||||
uvm_spin_loop_t spin;
|
||||
void* auth_tag_cpu = get_channel_unprotected_sysmem_cpu(wlc_channel) + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET;
|
||||
|
||||
UVM_ASSERT(lcic_channel);
|
||||
|
||||
// Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2
|
||||
// and a WLC doorbell ring is enough to start work.
|
||||
@ -1069,21 +766,19 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push)
|
||||
|
||||
// Handles the CPU part of the setup for the LCIC to be able to do GPU
|
||||
// encryption of its tracking semaphore value. See setup_lcic_schedule().
|
||||
last_pushed_notifier = &lcic_semaphore->conf_computing.last_pushed_notifier;
|
||||
*lcic_static_entry_notifier_cpu_va(lcic_channel) = ++(*last_pushed_notifier);
|
||||
*lcic_static_exit_notifier_cpu_va(lcic_channel) = ++(*last_pushed_notifier);
|
||||
last_pushed_notifier = &lcic_channel->tracking_sem.semaphore.conf_computing.last_pushed_notifier;
|
||||
*lcic_channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu = ++(*last_pushed_notifier);
|
||||
*lcic_channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu = ++(*last_pushed_notifier);
|
||||
iv_index = (*last_pushed_notifier / 2) % lcic_channel->num_gpfifo_entries;
|
||||
|
||||
payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
|
||||
uvm_conf_computing_log_gpu_encryption(lcic_channel, payload_size, &iv_cpu_addr[iv_index]);
|
||||
uvm_conf_computing_log_gpu_encryption(lcic_channel, &iv_cpu_addr[iv_index]);
|
||||
|
||||
// Move push data
|
||||
uvm_conf_computing_cpu_encrypt(wlc_channel,
|
||||
uvm_channel_get_static_pb_unprotected_sysmem_cpu(wlc_channel),
|
||||
wlc_channel->conf_computing.static_pb_unprotected_sysmem_cpu,
|
||||
push->begin,
|
||||
&push->launch_iv,
|
||||
UVM_MAX_WLC_PUSH_SIZE,
|
||||
auth_tag_cpu);
|
||||
wlc_channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu);
|
||||
|
||||
// Make sure all encrypted data is observable before ringing the doorbell.
|
||||
wmb();
|
||||
@ -1103,7 +798,7 @@ static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 ol
|
||||
|
||||
void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push);
|
||||
NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
|
||||
void *push_enc_auth_tag_cpu;
|
||||
void *push_enc_auth_tag;
|
||||
uvm_gpu_address_t push_enc_auth_tag_gpu;
|
||||
NvU64 gpfifo_gpu_va = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);
|
||||
|
||||
@ -1127,16 +822,15 @@ static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 ol
|
||||
|
||||
// Move over the pushbuffer data
|
||||
// WLC channels use a static preallocated space for launch auth tags
|
||||
push_enc_auth_tag_cpu = get_channel_unprotected_sysmem_cpu(indirect_push.channel) + WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET;
|
||||
push_enc_auth_tag_gpu = uvm_gpu_address_virtual_unprotected(
|
||||
get_channel_unprotected_sysmem_gpu_va(indirect_push.channel) + WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET);
|
||||
push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
|
||||
push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);
|
||||
|
||||
uvm_conf_computing_cpu_encrypt(indirect_push.channel,
|
||||
push_enc_cpu,
|
||||
push->begin,
|
||||
NULL,
|
||||
uvm_push_get_size(push),
|
||||
push_enc_auth_tag_cpu);
|
||||
push_enc_auth_tag);
|
||||
|
||||
uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
|
||||
@ -1382,13 +1076,14 @@ static void encrypt_push(uvm_push_t *push)
|
||||
{
|
||||
NvU64 push_protected_gpu_va;
|
||||
NvU64 push_unprotected_gpu_va;
|
||||
NvU64 auth_tag_gpu_va;
|
||||
uvm_gpu_address_t auth_tag_gpu_va;
|
||||
uvm_channel_t *channel = push->channel;
|
||||
uvm_push_crypto_bundle_t *crypto_bundle;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
NvU32 push_size = uvm_push_get_size(push);
|
||||
uvm_push_info_t *push_info = uvm_push_info_from_push(push);
|
||||
uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel);
|
||||
unsigned auth_tag_offset = UVM_CONF_COMPUTING_AUTH_TAG_SIZE * push->push_info_index;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
@@ -1407,20 +1102,19 @@ static void encrypt_push(uvm_push_t *push)
UVM_ASSERT(channel->conf_computing.push_crypto_bundles != NULL);

crypto_bundle = channel->conf_computing.push_crypto_bundles + push->push_info_index;
auth_tag_gpu_va = get_push_crypto_bundle_auth_tags_gpu_va(channel, push->push_info_index);
auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(channel->conf_computing.push_crypto_bundle_auth_tags, gpu, false);
auth_tag_gpu_va.address += auth_tag_offset;

crypto_bundle->push_size = push_size;
push_protected_gpu_va = uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push);
push_unprotected_gpu_va = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);

uvm_conf_computing_log_gpu_encryption(channel, push_size, &crypto_bundle->iv);
crypto_bundle->key_version = uvm_channel_pool_key_version(channel->pool);

uvm_conf_computing_log_gpu_encryption(channel, &crypto_bundle->iv);
gpu->parent->ce_hal->encrypt(push,
uvm_gpu_address_virtual_unprotected(push_unprotected_gpu_va),
uvm_gpu_address_virtual(push_protected_gpu_va),
push_size,
uvm_gpu_address_virtual_unprotected(auth_tag_gpu_va));
auth_tag_gpu_va);
}
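
A compact sketch of the per-push bookkeeping visible above, with hypothetical names and a stand-in tag size: an IV is logged for the upcoming GPU-side encryption, the pool's current key version is recorded in the crypto bundle, and the authentication tag for this push lives at a fixed per-push-index offset inside the channel's tag buffer.

    #include <stddef.h>
    #include <stdint.h>

    #define TOY_AUTH_TAG_SIZE 32u /* stand-in for UVM_CONF_COMPUTING_AUTH_TAG_SIZE */

    /* Hypothetical mirror of the per-push crypto bundle tracked by the channel. */
    struct toy_crypto_bundle {
        uint32_t push_size;   /* bytes the CE will encrypt into unprotected sysmem */
        uint32_t key_version; /* key version to use when the CPU decrypts later */
        uint64_t iv;          /* simplified IV; the driver keeps a larger structure */
    };

    /* Offset of the auth tag reserved for the push at 'push_index'. */
    static size_t toy_auth_tag_offset(unsigned push_index)
    {
        return (size_t)push_index * TOY_AUTH_TAG_SIZE;
    }
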
void uvm_channel_end_push(uvm_push_t *push)
|
||||
@ -1435,6 +1129,7 @@ void uvm_channel_end_push(uvm_push_t *push)
|
||||
NvU32 push_size;
|
||||
NvU32 cpu_put;
|
||||
NvU32 new_cpu_put;
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
bool needs_sec2_work_submit = false;
|
||||
|
||||
channel_pool_lock(channel->pool);
|
||||
@ -1448,7 +1143,6 @@ void uvm_channel_end_push(uvm_push_t *push)
|
||||
uvm_channel_tracking_semaphore_release(push, semaphore_va, new_payload);
|
||||
|
||||
if (uvm_channel_is_wlc(channel) && uvm_channel_manager_is_wlc_ready(channel_manager)) {
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
uvm_channel_t *paired_lcic = uvm_channel_wlc_get_paired_lcic(channel);
|
||||
|
||||
gpu->parent->ce_hal->semaphore_reduction_inc(push,
|
||||
@ -1743,16 +1437,9 @@ NV_STATUS uvm_channel_write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_
|
||||
|
||||
static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_spin_loop_t spin;
|
||||
uvm_channel_pool_t *pool = channel->pool;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
status = channel_pool_rotate_key_if_pending(pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// This semaphore is uvm_up() in unlock_channel_for_push() as part of the
|
||||
// uvm_channel_end_push() routine. Note that different than in
|
||||
// channel_reserve_and_lock_in_pool, we cannot pick an unlocked channel from
|
||||
@ -1760,7 +1447,7 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
|
||||
// Not a concern given that uvm_channel_reserve() is not the common-case for
|
||||
// channel reservation, and only used for channel initialization, GPFIFO
|
||||
// control work submission, and testing.
|
||||
uvm_down(&pool->conf_computing.push_sem);
|
||||
uvm_down(&pool->push_sem);
|
||||
|
||||
channel_pool_lock(pool);
|
||||
|
||||
@ -1771,6 +1458,8 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
|
||||
|
||||
uvm_spin_loop_init(&spin);
|
||||
while (1) {
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_channel_update_progress(channel);
|
||||
|
||||
channel_pool_lock(pool);
|
||||
@ -1782,7 +1471,7 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
|
||||
|
||||
status = uvm_channel_check_errors(channel);
|
||||
if (status != NV_OK) {
|
||||
uvm_up(&pool->conf_computing.push_sem);
|
||||
uvm_up(&pool->push_sem);
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -1852,14 +1541,14 @@ static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *ch
|
||||
NV_STATUS uvm_channel_get_status(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
NvNotification *errorNotifier;
|
||||
NvNotification *error_notifier;
|
||||
|
||||
if (uvm_channel_is_proxy(channel))
|
||||
errorNotifier = channel->proxy.channel_info.shadowErrorNotifier;
|
||||
error_notifier = channel->proxy.channel_info.shadowErrorNotifier;
|
||||
else
|
||||
errorNotifier = channel->channel_info.errorNotifier;
|
||||
error_notifier = channel->channel_info.errorNotifier;
|
||||
|
||||
if (errorNotifier->status == 0)
|
||||
if (error_notifier->status == 0)
|
||||
return NV_OK;
|
||||
|
||||
// In case we hit a channel error, check the ECC error notifier as well so
|
||||
@ -1972,8 +1661,6 @@ NV_STATUS uvm_channel_wait(uvm_channel_t *channel)
|
||||
static NV_STATUS csl_init(uvm_channel_t *channel)
|
||||
{
|
||||
NV_STATUS status;
|
||||
unsigned context_index = uvm_channel_index_in_pool(channel);
|
||||
uvm_channel_pool_t *pool = channel->pool;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
@ -1990,38 +1677,17 @@ static NV_STATUS csl_init(uvm_channel_t *channel)
|
||||
uvm_mutex_init(&channel->csl.ctx_lock, UVM_LOCK_ORDER_CSL_CTX);
|
||||
channel->csl.is_ctx_initialized = true;
|
||||
|
||||
if (uvm_channel_is_lcic(channel)) {
|
||||
pool = get_paired_pool(pool);
|
||||
context_index += pool->num_channels;
|
||||
}
|
||||
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
|
||||
|
||||
pool->conf_computing.key_rotation.csl_contexts[context_index] = &channel->csl.ctx;
|
||||
|
||||
return NV_OK;
|
||||
}

static void csl_destroy(uvm_channel_t *channel)
{
uvm_channel_pool_t *pool = channel->pool;
unsigned context_index = uvm_channel_index_in_pool(channel);

if (!channel->csl.is_ctx_initialized)
return;

uvm_assert_mutex_unlocked(&channel->csl.ctx_lock);
UVM_ASSERT(!uvm_channel_is_locked_for_push(channel));

if (uvm_channel_is_lcic(channel)) {
pool = get_paired_pool(pool);
context_index += pool->num_channels;
}

UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);

pool->conf_computing.key_rotation.csl_contexts[context_index] = NULL;

uvm_rm_locked_call_void(nvUvmInterfaceDeinitCslContext(&channel->csl.ctx));
channel->csl.is_ctx_initialized = false;
}
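
The index arithmetic above (and its mirror in csl_init) relies on the paired WLC pool owning one contiguous CSL-context array for both channel kinds: slots [0, num_channels) hold the WLC contexts and slots [num_channels, 2*num_channels) hold the paired LCIC contexts. A small sketch of that indexing with invented names:

    /* Hypothetical mirror of the per-pool CSL context registry. */
    struct toy_ctx_registry {
        void **csl_contexts;   /* length 2 * num_channels for WLC pools */
        unsigned num_channels; /* channels in the WLC pool itself */
    };

    /* Slot used by a WLC channel. */
    static unsigned toy_wlc_slot(unsigned index_in_pool)
    {
        return index_in_pool;
    }

    /* Slot used by the paired LCIC channel with the same index in its pool. */
    static unsigned toy_lcic_slot(const struct toy_ctx_registry *r, unsigned index_in_pool)
    {
        return index_in_pool + r->num_channels;
    }
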
@ -2031,45 +1697,187 @@ static void free_conf_computing_buffers(uvm_channel_t *channel)
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
|
||||
uvm_rm_mem_free(channel->conf_computing.static_pb_protected_vidmem);
|
||||
uvm_rm_mem_free(channel->conf_computing.static_pb_unprotected_sysmem);
|
||||
uvm_rm_mem_free(channel->conf_computing.static_notifier_unprotected_sysmem);
|
||||
uvm_rm_mem_free(channel->conf_computing.push_crypto_bundle_auth_tags);
|
||||
uvm_kvfree(channel->conf_computing.static_pb_protected_sysmem);
|
||||
channel->conf_computing.static_pb_protected_sysmem = NULL;
|
||||
|
||||
uvm_kvfree(channel->conf_computing.push_crypto_bundles);
|
||||
channel->conf_computing.static_pb_protected_vidmem = NULL;
|
||||
channel->conf_computing.static_pb_unprotected_sysmem = NULL;
|
||||
channel->conf_computing.static_notifier_unprotected_sysmem = NULL;
|
||||
channel->conf_computing.push_crypto_bundle_auth_tags = NULL;
|
||||
channel->conf_computing.static_pb_protected_sysmem = NULL;
|
||||
channel->conf_computing.push_crypto_bundles = NULL;
|
||||
|
||||
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.encrypted_payload);
|
||||
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.notifier);
|
||||
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.auth_tag);
|
||||
uvm_kvfree(channel->tracking_sem.semaphore.conf_computing.ivs);
|
||||
channel->tracking_sem.semaphore.conf_computing.encrypted_payload = NULL;
|
||||
channel->tracking_sem.semaphore.conf_computing.notifier = NULL;
|
||||
channel->tracking_sem.semaphore.conf_computing.auth_tag = NULL;
|
||||
channel->tracking_sem.semaphore.conf_computing.ivs = NULL;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel)
|
||||
static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
|
||||
semaphore->conf_computing.ivs =
|
||||
uvm_kvmalloc(sizeof(*semaphore->conf_computing.ivs) * channel->num_gpfifo_entries);
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(semaphore->conf_computing.last_pushed_notifier),
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&semaphore->conf_computing.notifier);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(*channel->tracking_sem.semaphore.payload),
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&semaphore->conf_computing.encrypted_payload);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&semaphore->conf_computing.auth_tag);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
semaphore->conf_computing.ivs = uvm_kvmalloc_zero(sizeof(*semaphore->conf_computing.ivs)
|
||||
* channel->num_gpfifo_entries);
|
||||
|
||||
if (!semaphore->conf_computing.ivs)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
if (uvm_channel_is_wlc(channel)) {
|
||||
channel->conf_computing.static_pb_protected_sysmem =
|
||||
uvm_kvmalloc(UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_PAGE_SIZE_4K));
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
size_t aligned_wlc_push_size = UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
|
||||
NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
|
||||
PAGE_SIZE,
|
||||
&channel->conf_computing.static_pb_unprotected_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// Both pushes will be targets for SEC2 decrypt operations and have to
|
||||
// be aligned for SEC2. The first push location will also be a target
|
||||
// for CE decrypt operation and has to be aligned for CE decrypt.
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.static_pb_protected_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
channel->conf_computing.static_pb_unprotected_sysmem_cpu =
|
||||
uvm_rm_mem_get_cpu_va(channel->conf_computing.static_pb_unprotected_sysmem);
|
||||
channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu =
|
||||
(char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size;
|
||||
|
||||
// The location below is only used for launch pushes but reuses
|
||||
// the same sysmem allocation
|
||||
channel->conf_computing.launch_auth_tag_cpu =
|
||||
(char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu +
|
||||
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
channel->conf_computing.launch_auth_tag_gpu_va =
|
||||
uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_unprotected_sysmem, gpu) +
|
||||
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
|
||||
channel->conf_computing.static_pb_protected_sysmem = uvm_kvmalloc(UVM_MAX_WLC_PUSH_SIZE + UVM_PAGE_SIZE_4K);
|
||||
if (!channel->conf_computing.static_pb_protected_sysmem)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
else if (!uvm_channel_is_lcic(channel)) {
|
||||
channel->conf_computing.push_crypto_bundles =
|
||||
uvm_kvmalloc(sizeof(*channel->conf_computing.push_crypto_bundles) * channel->num_gpfifo_entries);
|
||||
|
||||
if (!channel->conf_computing.push_crypto_bundles)
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_conf_computing_buffers_lcic(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
const size_t notifier_size = sizeof(*channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
|
||||
NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
notifier_size * 2,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.static_notifier_unprotected_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
UVM_LCIC_PUSH_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.static_pb_protected_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu =
|
||||
uvm_rm_mem_get_cpu_va(channel->conf_computing.static_notifier_unprotected_sysmem);
|
||||
channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu =
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu + 1;
|
||||
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va =
|
||||
uvm_rm_mem_get_gpu_va(channel->conf_computing.static_notifier_unprotected_sysmem, gpu, false);
|
||||
channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va =
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va.address += notifier_size;
|
||||
|
||||
return status;
|
||||
}

static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel)
{
NV_STATUS status;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

status = alloc_conf_computing_buffers_semaphore(channel);
if (status != NV_OK)
return status;

if (uvm_channel_is_wlc(channel)) {
status = alloc_conf_computing_buffers_wlc(channel);
}
else if (uvm_channel_is_lcic(channel)) {
status = alloc_conf_computing_buffers_lcic(channel);
}
else {
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
void *push_crypto_bundles = uvm_kvmalloc_zero(sizeof(*channel->conf_computing.push_crypto_bundles) *
channel->num_gpfifo_entries);

if (push_crypto_bundles == NULL)
return NV_ERR_NO_MEMORY;

channel->conf_computing.push_crypto_bundles = push_crypto_bundles;

status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
channel->num_gpfifo_entries * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&channel->conf_computing.push_crypto_bundle_auth_tags);
}

return NV_OK;
return status;
}
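
For the plain CE branch above, the allocations scale with the GPFIFO ring: one CPU-side crypto bundle and one authentication tag per GPFIFO entry, presumably because at most one push can be outstanding per entry. A sketch of that sizing with assumed stand-in constants (the real values live in the driver headers):

    #include <stddef.h>

    #define TOY_AUTH_TAG_SIZE      32u /* stand-in for UVM_CONF_COMPUTING_AUTH_TAG_SIZE */
    #define TOY_CRYPTO_BUNDLE_SIZE 48u /* assumed sizeof of a per-push crypto bundle */

    struct toy_ce_channel_bufs {
        size_t bundles_bytes;   /* CPU array of per-push crypto bundles */
        size_t auth_tags_bytes; /* unprotected sysmem area for per-push auth tags */
    };

    static struct toy_ce_channel_bufs toy_ce_sizes(unsigned num_gpfifo_entries)
    {
        struct toy_ce_channel_bufs b;

        b.bundles_bytes = (size_t)num_gpfifo_entries * TOY_CRYPTO_BUNDLE_SIZE;
        b.auth_tags_bytes = (size_t)num_gpfifo_entries * TOY_AUTH_TAG_SIZE;
        return b;
    }
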
static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
|
||||
@ -2117,6 +1925,36 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
|
||||
pool->num_channels--;
|
||||
}

static unsigned channel_pool_type_num_gpfifo_entries(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
{
switch (pool_type) {
case UVM_CHANNEL_POOL_TYPE_CE:
case UVM_CHANNEL_POOL_TYPE_CE_PROXY:
return manager->conf.num_gpfifo_entries;
case UVM_CHANNEL_POOL_TYPE_SEC2:
return manager->conf.num_gpfifo_entries;
case UVM_CHANNEL_POOL_TYPE_WLC: {
// WLC benefits from larger number of entries since more available
// entries result in less frequent calls to
// uvm_channel_update_progress 16 is the maximum size that can
// re-use static pb preallocated memory when uploading the WLC
// schedule.
return 16;
}
case UVM_CHANNEL_POOL_TYPE_LCIC: {
// Every channel needs at least 3 entries; 1 for sentinel and 2 more
// for submitting GPFIFO control entries. The number also has to be
// power of 2, as the HW stores the size as log2 value.
// LCIC does not accept external pushes, uvm_channel_update_progress
// is not a concern.
return 4;
}
default:
UVM_ASSERT_MSG(0, "Unhandled pool type: %d", pool_type);
return 0;
}
}
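
The LCIC value above is the smallest power of two that fits the stated minimum of three entries (one sentinel plus two GPFIFO control entries), since the hardware stores the ring size as a log2. A generic, illustrative helper capturing that rounding rule:

    #include <assert.h>
    #include <stdint.h>

    /* Smallest power of two >= n, for n >= 1. */
    static uint32_t toy_round_up_pow2(uint32_t n)
    {
        uint32_t p = 1;

        assert(n >= 1);
        while (p < n)
            p <<= 1;
        return p;
    }

    /* toy_round_up_pow2(3) == 4, matching the LCIC GPFIFO ring size above. */
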
// Returns the TSG for a given channel.
|
||||
static uvmGpuTsgHandle channel_get_tsg(uvm_channel_t *channel)
|
||||
{
|
||||
@ -2144,7 +1982,7 @@ static NV_STATUS internal_channel_create(uvm_channel_t *channel)
|
||||
uvm_channel_manager_t *manager = channel->pool->manager;
|
||||
|
||||
memset(&channel_alloc_params, 0, sizeof(channel_alloc_params));
|
||||
channel_alloc_params.numGpFifoEntries = channel_pool_num_gpfifo_entries(channel->pool);
|
||||
channel_alloc_params.numGpFifoEntries = channel_pool_type_num_gpfifo_entries(manager, channel->pool->pool_type);
|
||||
channel_alloc_params.gpFifoLoc = manager->conf.gpfifo_loc;
|
||||
channel_alloc_params.gpPutLoc = manager->conf.gpput_loc;
|
||||
|
||||
@ -2248,7 +2086,7 @@ static NV_STATUS channel_create(uvm_channel_pool_t *pool, uvm_channel_t *channel
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
channel->num_gpfifo_entries = channel_pool_num_gpfifo_entries(pool);
|
||||
channel->num_gpfifo_entries = channel_pool_type_num_gpfifo_entries(manager, pool->pool_type);
|
||||
channel->gpfifo_entries = uvm_kvmalloc_zero(sizeof(*channel->gpfifo_entries) * channel->num_gpfifo_entries);
|
||||
if (channel->gpfifo_entries == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
@ -2328,8 +2166,8 @@ static NV_STATUS channel_init(uvm_channel_t *channel)
|
||||
|
||||
if (uvm_channel_is_sec2(channel))
|
||||
pb_base = uvm_pushbuffer_get_sec2_gpu_va_base(pushbuffer);
|
||||
else if (uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel))
|
||||
pb_base = uvm_channel_get_static_pb_protected_vidmem_gpu_va(channel);
|
||||
else if (channel->conf_computing.static_pb_protected_vidmem)
|
||||
pb_base = uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
|
||||
gpu->parent->host_hal->set_gpfifo_pushbuffer_segment_base(&gpfifo_entry, pb_base);
|
||||
write_ctrl_gpfifo(channel, gpfifo_entry);
|
||||
@ -2369,68 +2207,34 @@ static bool channel_manager_uses_proxy_pool(uvm_channel_manager_t *manager)
|
||||
}
|
||||
|
||||
// Number of channels to create in a pool of the given type.
|
||||
static unsigned channel_manager_num_channels(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
|
||||
//
|
||||
// TODO: Bug 1764958: Tweak this function after benchmarking real workloads.
|
||||
static unsigned channel_pool_type_num_channels(uvm_channel_pool_type_t pool_type)
|
||||
{
|
||||
unsigned num_channels;
|
||||
|
||||
// In the common case, create two channels per pool.
|
||||
//
|
||||
// TODO: Bug 1764958: Tweak this number after benchmarking real workloads.
|
||||
const unsigned channel_pool_type_ce_num_channels = 2;
|
||||
|
||||
UVM_ASSERT(uvm_pool_type_is_valid(pool_type));
|
||||
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY) {
|
||||
|
||||
// TODO: Bug 3387454: The vGPU plugin implementation supports a single
|
||||
// proxy channel per GPU
|
||||
num_channels = 1;
|
||||
}
|
||||
else if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2) {
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY)
|
||||
return 1;
|
||||
|
||||
// Not all GPU architectures support more than 1 channel per TSG. Since
|
||||
// SEC2 is not in UVM critical path for performance, conservatively
|
||||
// create a pool/TSG with a single channel.
|
||||
num_channels = 1;
|
||||
}
|
||||
else if ((pool_type == UVM_CHANNEL_POOL_TYPE_WLC) || (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)) {
|
||||
unsigned max_concurrent_ce_pushes;
|
||||
unsigned num_used_ces = bitmap_weight(manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
|
||||
// Not all GPU architectures support more than 1 channel per TSG. Since SEC2
|
||||
// is not in UVM critical path for performance, we conservatively create a
|
||||
// pool/TSG with a single channel.
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2)
|
||||
return 1;
|
||||
|
||||
// CE selection should happen before this function is invoked.
|
||||
UVM_ASSERT(num_used_ces > 0);
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC || pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)
|
||||
return UVM_PUSH_MAX_CONCURRENT_PUSHES;

// Create as many WLC and LCIC channels as concurrent, ongoing, pushes
// of interest are allowed. In the general case, this number of pushes
// is capped by UVM_PUSH_MAX_CONCURRENT_PUSHES. But in Confidential
// Computing there is at most one ongoing push per channel, so the
// number of WLC/LCIC channels is also limited by the number of CE
// channels.
//
// The calculation only considers channels mapped to the
// UVM_CHANNEL_POOL_TYPE_CE type, because WLC and LCIC channels are
// created to enable work launch exclusively in those other channels.
max_concurrent_ce_pushes = num_used_ces * channel_pool_type_ce_num_channels;
num_channels = min(max_concurrent_ce_pushes, (unsigned) UVM_PUSH_MAX_CONCURRENT_PUSHES);
}
else {
UVM_ASSERT(pool_type == UVM_CHANNEL_POOL_TYPE_CE);

num_channels = channel_pool_type_ce_num_channels;
}

UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);

return num_channels;
return 2;
}
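
Put as one expression, the WLC/LCIC branch above sizes the pool as the number of usable CEs times the two channels each CE pool gets, capped by the maximum number of concurrent pushes. A standalone sketch with an assumed cap value (the real constant is UVM_PUSH_MAX_CONCURRENT_PUSHES, whose value is not shown here):

    #define TOY_MAX_CONCURRENT_PUSHES 128u /* assumed cap, stand-in only */
    #define TOY_CE_CHANNELS_PER_POOL  2u   /* matches channel_pool_type_ce_num_channels above */

    static unsigned toy_num_wlc_channels(unsigned num_used_ces)
    {
        unsigned max_concurrent_ce_pushes = num_used_ces * TOY_CE_CHANNELS_PER_POOL;

        return max_concurrent_ce_pushes < TOY_MAX_CONCURRENT_PUSHES ?
               max_concurrent_ce_pushes : TOY_MAX_CONCURRENT_PUSHES;
    }
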
// Number of TSGs to create in a pool of a given type.
|
||||
static unsigned channel_manager_num_tsgs(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
|
||||
static unsigned channel_pool_type_num_tsgs(uvm_channel_pool_type_t pool_type)
|
||||
{
|
||||
// For WLC and LCIC channels, we create one TSG per WLC/LCIC channel pair.
|
||||
// The TSG is stored in the WLC pool.
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC)
|
||||
return channel_manager_num_channels(manager, pool_type);
|
||||
return channel_pool_type_num_channels(pool_type);
|
||||
else if (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)
|
||||
return 0;
|
||||
|
||||
@ -2486,164 +2290,17 @@ static void channel_pool_destroy(uvm_channel_pool_t *pool)
|
||||
|
||||
while (pool->num_channels > 0)
|
||||
channel_destroy(pool, pool->channels + pool->num_channels - 1);
|
||||
|
||||
uvm_kvfree(pool->channels);
|
||||
pool->channels = NULL;
|
||||
|
||||
while (pool->num_tsgs > 0)
|
||||
tsg_destroy(pool, *(pool->tsg_handles + pool->num_tsgs - 1));
|
||||
|
||||
uvm_kvfree(pool->tsg_handles);
|
||||
pool->tsg_handles = NULL;
|
||||
|
||||
uvm_kvfree(pool->conf_computing.key_rotation.csl_contexts);
|
||||
pool->conf_computing.key_rotation.csl_contexts = NULL;
|
||||
|
||||
uvm_rm_mem_free(pool->conf_computing.pool_sysmem);
|
||||
uvm_rm_mem_free(pool->conf_computing.pool_vidmem);
|
||||
|
||||
pool->manager->num_channel_pools--;
|
||||
}

static void channel_pool_initialize_locks(uvm_channel_pool_t *pool, unsigned num_channels)
{
uvm_lock_order_t order;

channel_pool_lock_init(pool);

if (!g_uvm_global.conf_computing_enabled)
return;

// Use different order lock for SEC2 and WLC channels.
// This allows reserving a SEC2 or WLC channel for indirect work
// submission while holding a reservation for a channel.
if (uvm_channel_pool_is_sec2(pool))
order = UVM_LOCK_ORDER_CSL_SEC2_PUSH;
else if (uvm_channel_pool_is_wlc(pool))
order = UVM_LOCK_ORDER_CSL_WLC_PUSH;
else
order = UVM_LOCK_ORDER_CSL_PUSH;

uvm_sema_init(&pool->conf_computing.push_sem, num_channels, order);

if (uvm_channel_pool_is_wlc(pool))
order = UVM_LOCK_ORDER_KEY_ROTATION_WLC;
else
order = UVM_LOCK_ORDER_KEY_ROTATION;

uvm_mutex_init(&pool->conf_computing.key_rotation.mutex, order);
}
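
The push semaphore above is a counting semaphore with one permit per channel in the pool, and its lock order depends on the pool type so that a SEC2 or WLC channel can still be reserved for indirect work submission while a regular CE channel reservation is already held. A reduced sketch of the order selection with invented enum values:

    /* Hypothetical lock orders; a higher order may be taken while a lower one is held. */
    enum toy_lock_order {
        TOY_ORDER_CSL_PUSH,      /* regular CE pools */
        TOY_ORDER_CSL_WLC_PUSH,  /* WLC pools, usable under a held CE reservation */
        TOY_ORDER_CSL_SEC2_PUSH, /* SEC2 pools, usable under a held CE reservation */
    };

    enum toy_pool_kind { TOY_POOL_CE, TOY_POOL_WLC, TOY_POOL_SEC2 };

    static enum toy_lock_order toy_push_sem_order(enum toy_pool_kind kind)
    {
        switch (kind) {
        case TOY_POOL_SEC2: return TOY_ORDER_CSL_SEC2_PUSH;
        case TOY_POOL_WLC:  return TOY_ORDER_CSL_WLC_PUSH;
        default:            return TOY_ORDER_CSL_PUSH;
        }
    }
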
static NV_STATUS channel_pool_alloc_key_rotation_data(uvm_channel_pool_t *pool, unsigned num_channels)
|
||||
{
|
||||
size_t csl_contexts_size;
|
||||
|
||||
// uvm_conf_computing_is_key_rotation_enabled_in_pool cannot be used to
|
||||
// skip key rotation data initialization, because during GPU initialization
|
||||
// the function always returns false.
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
// CSL contexts associated with LCIC channels are saved in the WLC context
|
||||
// array, not in the LCIC context array, so all the underlying engine
|
||||
// contexts are stored contiguously.
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
return NV_OK;
|
||||
|
||||
if (uvm_channel_pool_is_wlc(pool)) {
|
||||
UVM_ASSERT(channel_manager_num_channels(pool->manager, UVM_CHANNEL_POOL_TYPE_WLC) == num_channels);
|
||||
UVM_ASSERT(channel_manager_num_channels(pool->manager, UVM_CHANNEL_POOL_TYPE_LCIC) == num_channels);
|
||||
|
||||
num_channels *= 2;
|
||||
}
|
||||
|
||||
csl_contexts_size = sizeof(*pool->conf_computing.key_rotation.csl_contexts) * num_channels;
|
||||
pool->conf_computing.key_rotation.csl_contexts = uvm_kvmalloc_zero(csl_contexts_size);
|
||||
|
||||
if (pool->conf_computing.key_rotation.csl_contexts == NULL)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
pool->conf_computing.key_rotation.num_csl_contexts = num_channels;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_pool_alloc_conf_computing_buffers(uvm_channel_pool_t *pool, unsigned num_channels)
|
||||
{
|
||||
uvm_gpu_t *gpu = pool->manager->gpu;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
if (uvm_channel_pool_is_wlc(pool)) {
|
||||
|
||||
// Allocate unprotected sysmem buffers for WLC channels.
|
||||
// The use/substructures are described by WLC_SYSMEM_TOTAL_SIZE
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
WLC_SYSMEM_TOTAL_SIZE * num_channels,
|
||||
WLC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// WLC stores two pushbuffers used by its static schedule in vidmem.
|
||||
// See setup_wlc_schedule for the expected use of each of the static
|
||||
// pushbuffers.
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
WLC_ALIGNED_MAX_PUSH_SIZE * 2 * num_channels,
|
||||
WLC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
else if (uvm_channel_pool_is_lcic(pool)) {
|
||||
|
||||
// LCIC uses only static schedule so in order to use dynamic values
|
||||
// for entry/exit notifiers for its tracking semaphore they need
|
||||
// to be populated in a pre-defined sysmem location, before invoking
|
||||
// the LCIC schedule.
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(uvm_gpu_semaphore_notifier_t) * 2 * num_channels,
|
||||
0,
|
||||
&pool->conf_computing.pool_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// LCIC static schedule pushbuffer is in vidmem
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
LCIC_ALIGNED_PUSH_SIZE * num_channels,
|
||||
LCIC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
else if (uvm_channel_pool_is_ce(pool)) {

// General CE channels need to provide bi-directional communication
// using the pushbuffer. Encrypting an updated push from vidmem
// to sysmem still needs a place for auth tag in sysmem.
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE * num_channels *
channel_pool_num_gpfifo_entries(pool),
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&pool->conf_computing.pool_sysmem);
if (status != NV_OK)
return status;
}

status = channel_pool_alloc_key_rotation_data(pool, num_channels);
if (status != NV_OK)
return status;

return NV_OK;
}
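
For WLC pools, the vidmem allocated earlier in this function reserves two aligned pushbuffer slots per channel, which lines up with the static schedule built later in setup_wlc_schedule(): the mutable run push at the channel's base and the constant decrypt push one aligned slot above it. A sketch of that layout arithmetic; the per-channel slicing helper is not shown in this hunk, so treat the offsets below as an assumption:

    #include <stdint.h>

    #define TOY_WLC_ALIGNED_MAX_PUSH_SIZE 4096u /* assumed stand-in for WLC_ALIGNED_MAX_PUSH_SIZE */

    /* Assumed base of the two-slot region owned by WLC channel 'index' in pool vidmem. */
    static uint64_t toy_wlc_vidmem_base(uint64_t pool_vidmem_gpu_va, unsigned index)
    {
        return pool_vidmem_gpu_va + (uint64_t)index * 2u * TOY_WLC_ALIGNED_MAX_PUSH_SIZE;
    }

    /* The decrypt push lives one aligned slot above the run push. */
    static uint64_t toy_wlc_decrypt_push_va(uint64_t channel_vidmem_base)
    {
        return channel_vidmem_base + TOY_WLC_ALIGNED_MAX_PUSH_SIZE;
    }
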
static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
uvm_channel_pool_type_t pool_type,
|
||||
unsigned engine_index,
|
||||
@ -2664,7 +2321,7 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
pool->engine_index = engine_index;
|
||||
pool->pool_type = pool_type;
|
||||
|
||||
num_tsgs = channel_manager_num_tsgs(channel_manager, pool_type);
|
||||
num_tsgs = channel_pool_type_num_tsgs(pool_type);
|
||||
if (num_tsgs != 0) {
|
||||
pool->tsg_handles = uvm_kvmalloc_zero(sizeof(*pool->tsg_handles) * num_tsgs);
|
||||
if (!pool->tsg_handles) {
|
||||
@ -2681,13 +2338,21 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
}
|
||||
}
|
||||
|
||||
num_channels = channel_manager_num_channels(channel_manager, pool_type);
|
||||
channel_pool_lock_init(pool);
|
||||
|
||||
channel_pool_initialize_locks(pool, num_channels);
|
||||
num_channels = channel_pool_type_num_channels(pool_type);
|
||||
UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
|
||||
status = channel_pool_alloc_conf_computing_buffers(pool, num_channels);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
// Use different order lock for SEC2 and WLC channels.
|
||||
// This allows reserving a SEC2 or WLC channel for indirect work
|
||||
// submission while holding a reservation for a channel.
|
||||
uvm_lock_order_t order = uvm_channel_pool_is_sec2(pool) ? UVM_LOCK_ORDER_CSL_SEC2_PUSH :
|
||||
(uvm_channel_pool_is_wlc(pool) ? UVM_LOCK_ORDER_CSL_WLC_PUSH :
|
||||
UVM_LOCK_ORDER_CSL_PUSH);
|
||||
|
||||
uvm_sema_init(&pool->push_sem, num_channels, order);
|
||||
}
|
||||
|
||||
pool->channels = uvm_kvmalloc_zero(sizeof(*pool->channels) * num_channels);
|
||||
if (!pool->channels) {
|
||||
@ -2715,41 +2380,24 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
return status;
|
||||
}
|
||||
|
||||
static bool ce_is_usable(const UvmGpuCopyEngineCaps *cap)
|
||||
static bool ce_usable_for_channel_type(uvm_channel_type_t type, const UvmGpuCopyEngineCaps *cap)
|
||||
{
|
||||
return cap->supported && !cap->grce;
|
||||
}
|
||||
if (!cap->supported || cap->grce)
|
||||
return false;
|
||||
|
||||
// Check that all asynchronous CEs are usable, and that there is at least one
|
||||
// such CE.
|
||||
static NV_STATUS ces_validate(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ces_caps)
|
||||
{
|
||||
unsigned ce;
|
||||
bool found_usable_ce = false;
|
||||
|
||||
for (ce = 0; ce < UVM_COPY_ENGINE_COUNT_MAX; ++ce) {
|
||||
const UvmGpuCopyEngineCaps *ce_caps = ces_caps + ce;
|
||||
|
||||
if (!ce_is_usable(ce_caps))
|
||||
continue;
|
||||
|
||||
found_usable_ce = true;
|
||||
|
||||
// All channels may need to release their semaphore to sysmem.
|
||||
// All CEs are expected to have the sysmem flag set.
|
||||
if (!ce_caps->sysmem)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
// While P2P capabilities are only required for transfers between GPUs,
|
||||
// in practice all CEs are expected to have the corresponding flag set.
|
||||
if (!ce_caps->p2p)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
switch (type) {
|
||||
case UVM_CHANNEL_TYPE_CPU_TO_GPU:
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_CPU:
|
||||
return cap->sysmem;
|
||||
case UVM_CHANNEL_TYPE_GPU_INTERNAL:
|
||||
case UVM_CHANNEL_TYPE_MEMOPS:
|
||||
return true;
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_GPU:
|
||||
return cap->p2p;
|
||||
default:
|
||||
UVM_ASSERT_MSG(false, "Unexpected channel type 0x%x\n", type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!found_usable_ce)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static unsigned ce_usage_count(NvU32 ce, const unsigned *preferred_ce)
|
||||
@ -2778,13 +2426,15 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
const UvmGpuCopyEngineCaps *cap0 = ce_caps + ce_index0;
|
||||
const UvmGpuCopyEngineCaps *cap1 = ce_caps + ce_index1;
|
||||
|
||||
UVM_ASSERT(ce_usable_for_channel_type(type, cap0));
|
||||
UVM_ASSERT(ce_usable_for_channel_type(type, cap1));
|
||||
UVM_ASSERT(ce_index0 < UVM_COPY_ENGINE_COUNT_MAX);
|
||||
UVM_ASSERT(ce_index1 < UVM_COPY_ENGINE_COUNT_MAX);
|
||||
UVM_ASSERT(ce_index0 != ce_index1);
|
||||
|
||||
switch (type) {
|
||||
// For CPU to GPU fast sysmem read is the most important
|
||||
case UVM_CHANNEL_TYPE_CPU_TO_GPU:
|
||||
// For CPU to GPU fast sysmem read is the most important
|
||||
if (cap0->sysmemRead != cap1->sysmemRead)
|
||||
return cap1->sysmemRead - cap0->sysmemRead;
|
||||
|
||||
@ -2794,8 +2444,8 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||
|
||||
// For GPU to CPU fast sysmem write is the most important
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_CPU:
|
||||
// For GPU to CPU fast sysmem write is the most important
|
||||
if (cap0->sysmemWrite != cap1->sysmemWrite)
|
||||
return cap1->sysmemWrite - cap0->sysmemWrite;
|
||||
|
||||
@ -2805,8 +2455,8 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||
|
||||
// For GPU to GPU prefer the LCE with the most PCEs
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_GPU:
|
||||
// Prefer the LCE with the most PCEs
|
||||
{
|
||||
int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask);
|
||||
|
||||
@ -2816,10 +2466,10 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||
|
||||
// For GPU_INTERNAL we want the max possible bandwidth for CEs. For now
|
||||
// assume that the number of PCEs is a good measure.
|
||||
// TODO: Bug 1735254: Add a direct CE query for local FB bandwidth
|
||||
case UVM_CHANNEL_TYPE_GPU_INTERNAL:
|
||||
// We want the max possible bandwidth for CEs used for GPU_INTERNAL,
|
||||
// for now assume that the number of PCEs is a good measure.
|
||||
// TODO: Bug 1735254: Add a direct CE query for local FB bandwidth
|
||||
{
|
||||
int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask);
|
||||
|
||||
@ -2833,15 +2483,11 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||

// For MEMOPS we mostly care about latency which should be better with
// less used CEs (although we only know about our own usage and not
// system-wide) so just break out to get the default ordering which
// prioritizes usage count.
case UVM_CHANNEL_TYPE_MEMOPS:
// For WLC we only care about using a dedicated CE, which requires
// knowing the global CE mappings. For now just rely on the default
// ordering, which results on selecting an unused CE (if available).
case UVM_CHANNEL_TYPE_WLC:
// For MEMOPS we mostly care about latency which should be better
// with less used CEs (although we only know about our own usage and
// not system-wide) so just break out to get the default ordering
// which prioritizes usage count.
break;

default:
@@ -2864,104 +2510,54 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
return ce_index0 - ce_index1;
}
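
The comparator above orders two candidate CEs for a channel type in three stages: a type-specific primary metric (sysmem read bandwidth for CPU_TO_GPU, sysmem write bandwidth for GPU_TO_CPU, PCE count for GPU_TO_GPU and GPU_INTERNAL), then fewer existing assignments, then the lower engine index. A simplified standalone comparator showing the same ordering, with invented capability fields:

    /* Hypothetical subset of the capability data used for ordering. */
    struct toy_ce_caps {
        int primary_metric; /* e.g. sysmem read/write speed or PCE count */
        int usage_count;    /* how many channel types already picked this CE */
        int index;          /* engine index, final tie-breaker */
    };

    /* Negative result means 'a' is the better choice, mirroring the comparator above. */
    static int toy_compare_ce(const struct toy_ce_caps *a, const struct toy_ce_caps *b)
    {
        if (a->primary_metric != b->primary_metric)
            return b->primary_metric - a->primary_metric; /* higher metric wins */
        if (a->usage_count != b->usage_count)
            return a->usage_count - b->usage_count;       /* less used wins */
        return a->index - b->index;                       /* lower index wins */
    }
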
// Select the preferred CE for the given channel types.
|
||||
static void pick_ces_for_channel_types(uvm_channel_manager_t *manager,
|
||||
// Identify usable CEs, and select the preferred CE for a given channel type.
|
||||
static NV_STATUS pick_ce_for_channel_type(uvm_channel_manager_t *manager,
|
||||
const UvmGpuCopyEngineCaps *ce_caps,
|
||||
uvm_channel_type_t *channel_types,
|
||||
unsigned num_channel_types,
|
||||
uvm_channel_type_t type,
|
||||
unsigned *preferred_ce)
|
||||
{
|
||||
unsigned i;
|
||||
NvU32 i;
|
||||
NvU32 best_ce = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
|
||||
// In Confidential Computing, do not mark all usable CEs, only the preferred
|
||||
// ones, because non-preferred CE channels are guaranteed to not be used.
|
||||
bool mark_all_usable_ces = !g_uvm_global.conf_computing_enabled;
|
||||
UVM_ASSERT(type < UVM_CHANNEL_TYPE_CE_COUNT);
|
||||
|
||||
for (i = 0; i < num_channel_types; ++i) {
|
||||
unsigned ce;
|
||||
unsigned best_ce = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
uvm_channel_type_t type = channel_types[i];
|
||||
for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) {
|
||||
const UvmGpuCopyEngineCaps *cap = ce_caps + i;
|
||||
|
||||
for (ce = 0; ce < UVM_COPY_ENGINE_COUNT_MAX; ++ce) {
|
||||
if (!ce_is_usable(ce_caps + ce))
|
||||
if (!ce_usable_for_channel_type(type, cap))
|
||||
continue;
|
||||
|
||||
if (mark_all_usable_ces)
|
||||
__set_bit(ce, manager->ce_mask);
|
||||
__set_bit(i, manager->ce_mask);
|
||||
|
||||
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
|
||||
best_ce = ce;
|
||||
best_ce = i;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (compare_ce_for_channel_type(ce_caps, type, ce, best_ce, preferred_ce) < 0)
|
||||
best_ce = ce;
|
||||
if (compare_ce_for_channel_type(ce_caps, type, i, best_ce, preferred_ce) < 0)
|
||||
best_ce = i;
|
||||
}

UVM_ASSERT(best_ce != UVM_COPY_ENGINE_COUNT_MAX);
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
UVM_ERR_PRINT("Failed to find a suitable CE for channel type %s\n", uvm_channel_type_to_string(type));
return NV_ERR_NOT_SUPPORTED;
}

preferred_ce[type] = best_ce;

// Preferred CEs are always marked as usable.
if (type < UVM_CHANNEL_TYPE_CE_COUNT)
__set_bit(best_ce, manager->ce_mask);
}
return NV_OK;
}
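
The "default ordering which prioritizes usage count" mentioned in the MEMOPS and WLC comments is driven by counting how many channel types have already been assigned to a given CE, so ties fall to an engine that is still unused. A small sketch of such a count over a preferred-CE table; slots not yet assigned are assumed to hold an out-of-range sentinel and therefore never match a real engine index:

    /* How many entries of preferred_ce[] currently point at 'ce'. */
    static unsigned toy_ce_usage_count(unsigned ce,
                                       const unsigned *preferred_ce,
                                       unsigned num_channel_types)
    {
        unsigned i;
        unsigned count = 0;

        for (i = 0; i < num_channel_types; i++) {
            if (preferred_ce[i] == ce)
                count++;
        }
        return count;
    }
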
static void pick_ces(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ce_caps, unsigned *preferred_ce)
|
||||
static NV_STATUS channel_manager_pick_copy_engines(uvm_channel_manager_t *manager, unsigned *preferred_ce)
|
||||
{
|
||||
// The order of picking CEs for each type matters as it's affected by
|
||||
// the usage count of each CE and it increases every time a CE
|
||||
// is selected. MEMOPS has the least priority as it only cares about
|
||||
// low usage of the CE to improve latency
|
||||
NV_STATUS status;
|
||||
unsigned i;
|
||||
UvmGpuCopyEnginesCaps *ces_caps;
|
||||
uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_CPU,
|
||||
UVM_CHANNEL_TYPE_GPU_INTERNAL,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_GPU,
|
||||
UVM_CHANNEL_TYPE_MEMOPS};
|
||||
|
||||
UVM_ASSERT(!g_uvm_global.conf_computing_enabled);
|
||||
|
||||
pick_ces_for_channel_types(manager, ce_caps, types, ARRAY_SIZE(types), preferred_ce);
|
||||
}
|
||||
|
||||
static void pick_ces_conf_computing(uvm_channel_manager_t *manager,
|
||||
const UvmGpuCopyEngineCaps *ce_caps,
|
||||
unsigned *preferred_ce)
|
||||
{
|
||||
unsigned best_wlc_ce;
|
||||
|
||||
// The WLC type must go last so an unused CE is chosen, if available
|
||||
uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_CPU,
|
||||
UVM_CHANNEL_TYPE_GPU_INTERNAL,
|
||||
UVM_CHANNEL_TYPE_MEMOPS,
|
||||
UVM_CHANNEL_TYPE_WLC};
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
pick_ces_for_channel_types(manager, ce_caps, types, ARRAY_SIZE(types), preferred_ce);
|
||||
|
||||
// Direct transfers between GPUs are disallowed in Confidential Computing,
|
||||
// but the preferred CE is still set to an arbitrary value for consistency.
|
||||
preferred_ce[UVM_CHANNEL_TYPE_GPU_TO_GPU] = preferred_ce[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
|
||||
best_wlc_ce = preferred_ce[UVM_CHANNEL_TYPE_WLC];
|
||||
|
||||
// TODO: Bug 4576908: in HCC, the WLC type should not share a CE with any
|
||||
// channel type other than LCIC. The assertion should be a check instead.
|
||||
UVM_ASSERT(ce_usage_count(best_wlc_ce, preferred_ce) == 0);
|
||||
}
|
||||
|
||||
static NV_STATUS channel_manager_pick_ces(uvm_channel_manager_t *manager, unsigned *preferred_ce)
|
||||
{
|
||||
NV_STATUS status;
|
||||
UvmGpuCopyEnginesCaps *ces_caps;
|
||||
uvm_channel_type_t type;
|
||||
|
||||
for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; type++)
|
||||
preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
|
||||
ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps));
|
||||
if (!ces_caps)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
@ -2970,14 +2566,16 @@ static NV_STATUS channel_manager_pick_ces(uvm_channel_manager_t *manager, unsign
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
status = ces_validate(manager, ces_caps->copyEngineCaps);
|
||||
// The order of picking CEs for each type matters as it's affected by the
|
||||
// usage count of each CE and it increases every time a CE is selected.
|
||||
// MEMOPS has the least priority as it only cares about low usage of the
|
||||
// CE to improve latency
|
||||
for (i = 0; i < ARRAY_SIZE(types); ++i) {
|
||||
status = pick_ce_for_channel_type(manager, ces_caps->copyEngineCaps, types[i], preferred_ce);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled)
|
||||
pick_ces_conf_computing(manager, ces_caps->copyEngineCaps, preferred_ce);
|
||||
else
|
||||
pick_ces(manager, ces_caps->copyEngineCaps, preferred_ce);
|
||||
out:
|
||||
uvm_kvfree(ces_caps);
|
||||
|
||||
@ -2986,16 +2584,18 @@ out:
|
||||
|
||||
// Return the pool corresponding to the given CE index
|
||||
//
|
||||
// This function cannot be used to access the proxy pool in SR-IOV heavy.
|
||||
// Used to retrieve pools of type UVM_CHANNEL_POOL_TYPE_CE only.
|
||||
static uvm_channel_pool_t *channel_manager_ce_pool(uvm_channel_manager_t *manager, NvU32 ce)
|
||||
{
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_channel_pool_t *pool = uvm_channel_pool_first(manager, UVM_CHANNEL_POOL_TYPE_CE);
|
||||
|
||||
UVM_ASSERT(pool != NULL);
|
||||
UVM_ASSERT(test_bit(ce, manager->ce_mask));
|
||||
|
||||
// The index of the pool associated with 'ce' is the number of usable CEs
|
||||
// in [0, ce)
|
||||
pool = manager->channel_pools + bitmap_weight(manager->ce_mask, ce);
|
||||
// Pools of type UVM_CHANNEL_POOL_TYPE_CE are stored contiguously. The
|
||||
// offset of the pool associated with 'ce' is the number of usable CEs in
|
||||
// [0, ce).
|
||||
pool += bitmap_weight(manager->ce_mask, ce);
|
||||
|
||||
UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE);
|
||||
UVM_ASSERT(pool->engine_index == ce);
|
||||
@ -3041,7 +2641,7 @@ static const char *buffer_location_to_string(UVM_BUFFER_LOCATION loc)
|
||||
else if (loc == UVM_BUFFER_LOCATION_DEFAULT)
|
||||
return "auto";
|
||||
|
||||
UVM_ASSERT_MSG(false, "Invalid buffer location value %d\n", loc);
|
||||
UVM_ASSERT_MSG(false, "Invalid buffer locationvalue %d\n", loc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -3213,27 +2813,28 @@ static unsigned channel_manager_get_max_pools(uvm_channel_manager_t *manager)
|
||||
static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce)
|
||||
{
|
||||
unsigned ce;
|
||||
unsigned type;
|
||||
|
||||
// A pool is created for each usable CE, even if it has not been selected as
|
||||
// the preferred CE for any type, because as more information is discovered
|
||||
// (for example, a pair of peer GPUs is added) we may start using the
|
||||
// previously idle pools. Configurations where non-preferred CEs are
|
||||
// guaranteed to remain unused are allowed to avoid marking those engines as
|
||||
// usable.
|
||||
// previously idle pools.
|
||||
for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) {
|
||||
NV_STATUS status;
|
||||
unsigned type;
|
||||
uvm_channel_pool_t *pool = NULL;
|
||||
|
||||
status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) {
|
||||
// Set pool type if it hasn't been set before.
|
||||
if (preferred_ce[type] == ce && manager->pool_to_use.default_for_type[type] == NULL)
|
||||
manager->pool_to_use.default_for_type[type] = pool;
|
||||
}
|
||||
// Avoid overwriting previously set defaults.
|
||||
if (manager->pool_to_use.default_for_type[type] != NULL)
|
||||
continue;
|
||||
|
||||
ce = preferred_ce[type];
|
||||
manager->pool_to_use.default_for_type[type] = channel_manager_ce_pool(manager, ce);
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
@ -3242,8 +2843,11 @@ static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager,
|
||||
static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(wlc);
|
||||
NvU64 protected_vidmem_gpu_va = uvm_channel_get_static_pb_protected_vidmem_gpu_va(wlc);
|
||||
NvU64 unprotected_sysmem_gpu_va = get_channel_unprotected_sysmem_gpu_va(wlc);
|
||||
NvU64 protected_vidmem = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
NvU64 unprotected_sysmem_gpu = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
|
||||
void *unprotected_sysmem_cpu = wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
NvU64 tag_offset = (uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu -
|
||||
(uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
|
||||
NvU64 *wlc_gpfifo_entries;
|
||||
uvm_push_t wlc_decrypt_push, sec2_push;
|
||||
@ -3251,30 +2855,21 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
int i;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
// "gpfifo" is the representation of GPFIFO copied to gpFifoGpuVa.
|
||||
// Resuse static pushbuffer sysmem location for uploading GPFIFO schedule
|
||||
// "gpfifo" is the representation of GPFIFO copied to gpFifoGpu
|
||||
const size_t gpfifo_size = wlc->num_gpfifo_entries * sizeof(*wlc_gpfifo_entries);
|
||||
NvU64 gpfifo_unprotected_gpu_va = unprotected_sysmem_gpu_va;
|
||||
void *gpfifo_unprotected_cpu = get_channel_unprotected_sysmem_cpu(wlc);
|
||||
void *gpfifo_unprotected_cpu = unprotected_sysmem_cpu;
|
||||
NvU64 gpfifo_unprotected_gpu = unprotected_sysmem_gpu;
|
||||
|
||||
// "run_push" represents mutable push location used by WLC. This is the
|
||||
// first part of the WLC schedule, commands are decrypted as part of the
|
||||
// launch sequence to protected_vidmem_gpu_va + 0.
|
||||
// These locations are used in the static part ("decrypt_push") of the WLC schedule.
|
||||
uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem_gpu_va);
|
||||
uvm_gpu_address_t run_push_unprotected_gpu =
|
||||
uvm_gpu_address_virtual_unprotected(unprotected_sysmem_gpu_va + WLC_SYSMEM_PUSHBUFFER_OFFSET);
|
||||
uvm_gpu_address_t run_push_unprotected_auth_tag_gpu =
|
||||
uvm_gpu_address_virtual_unprotected(unprotected_sysmem_gpu_va + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET);
|
||||
// "run_push" represents mutable push location used by WLC
|
||||
uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem);
|
||||
uvm_gpu_address_t run_push_unprotected_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu);
|
||||
uvm_gpu_address_t run_push_unprotected_auth_tag_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu + tag_offset);
|
||||
|
||||
// "decrypt_push" represents WLC decrypt push, constructed using fake_push.
|
||||
// Copied to protected_vidmem_gpu_va + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
|
||||
// Copied to wlc_pb_base + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
|
||||
// pushes that make the WLC fixed schedule.
|
||||
NvU64 decrypt_push_protected_gpu_va = protected_vidmem_gpu_va + WLC_ALIGNED_MAX_PUSH_SIZE;
|
||||
|
||||
// Similar to gpfifo, uploading the "decrypt_push" reuses static sysmem
|
||||
// locations later used for "run_push" when the WLC/LCIC schedule is active
|
||||
NvU64 decrypt_push_unprotected_gpu_va = gpfifo_unprotected_gpu_va + gpfifo_size;
|
||||
NvU64 decrypt_push_protected_gpu = UVM_ALIGN_UP(protected_vidmem + UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT);
|
||||
NvU64 decrypt_push_unprotected_gpu = unprotected_sysmem_gpu + gpfifo_size;
|
||||
void *decrypt_push_unprotected_cpu = (char*)gpfifo_unprotected_cpu + gpfifo_size;
|
||||
|
||||
// Tags for upload via SEC2
|
||||
@ -3284,6 +2879,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
BUILD_BUG_ON(sizeof(*wlc_gpfifo_entries) != sizeof(*wlc->channel_info.gpFifoEntries));
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_wlc(wlc));
|
||||
UVM_ASSERT(tag_offset == UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
|
||||
|
||||
// WLC schedule consists of two parts, the number of entries needs to be even.
|
||||
// This also guarantees that the size is 16B aligned
|
||||
@ -3330,7 +2926,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
for (i = 0; i < wlc->num_gpfifo_entries; ++i) {
|
||||
if (i % 2 == wlc->cpu_put % 2) {
|
||||
gpu->parent->host_hal->set_gpfifo_entry(wlc_gpfifo_entries + i,
|
||||
decrypt_push_protected_gpu_va,
|
||||
decrypt_push_protected_gpu,
|
||||
decrypt_push_size,
|
||||
UVM_GPFIFO_SYNC_PROCEED);
|
||||
}
|
||||
@ -3368,8 +2964,8 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
decrypt_push_size,
|
||||
decrypt_push_auth_tag);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
decrypt_push_protected_gpu_va,
|
||||
decrypt_push_unprotected_gpu_va,
|
||||
decrypt_push_protected_gpu,
|
||||
decrypt_push_unprotected_gpu,
|
||||
decrypt_push_size,
|
||||
decrypt_push_auth_tag_gpu.address);
|
||||
|
||||
@ -3382,7 +2978,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
gpfifo_auth_tag);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
wlc->channel_info.gpFifoGpuVa,
|
||||
gpfifo_unprotected_gpu_va,
|
||||
gpfifo_unprotected_gpu,
|
||||
gpfifo_size,
|
||||
gpfifo_auth_tag_gpu.address);
|
||||
|
||||
@ -3404,22 +3000,23 @@ free_gpfifo_entries:
|
||||
static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(lcic);
|
||||
NvU64 lcic_pb_base = uvm_channel_get_static_pb_protected_vidmem_gpu_va(lcic);
|
||||
NvU64 lcic_pb_base = uvm_rm_mem_get_gpu_uvm_va(lcic->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
|
||||
// Reuse WLC sysmem allocation
|
||||
NvU64 gpu_unprotected = get_channel_unprotected_sysmem_gpu_va(paired_wlc);
|
||||
char *cpu_unprotected = get_channel_unprotected_sysmem_cpu(paired_wlc);
|
||||
|
||||
uvm_gpu_semaphore_t *lcic_semaphore = &lcic->tracking_sem.semaphore;
|
||||
|
||||
uvm_gpu_address_t notifier_src_entry_addr = lcic_static_entry_notifier_gpu_va(lcic);
|
||||
uvm_gpu_address_t notifier_src_exit_addr = lcic_static_exit_notifier_gpu_va(lcic);
|
||||
uvm_gpu_address_t notifier_dst_addr = uvm_gpu_semaphore_get_notifier_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(lcic_semaphore);
|
||||
NvU64 gpu_unprotected = uvm_rm_mem_get_gpu_uvm_va(paired_wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
|
||||
char *cpu_unprotected = paired_wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
uvm_gpu_semaphore_t *lcic_gpu_semaphore = &lcic->tracking_sem.semaphore;
|
||||
uvm_gpu_address_t notifier_src_entry_addr = lcic->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t notifier_src_exit_addr = lcic->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t notifier_dst_addr = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.notifier,
|
||||
gpu,
|
||||
false);
|
||||
uvm_gpu_address_t encrypted_payload_gpu_va =
|
||||
uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.encrypted_payload, gpu, false);
|
||||
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(uvm_channel_tracking_semaphore_get_gpu_va(lcic));
|
||||
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
|
||||
NvU32 notifier_size = sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
uvm_gpu_address_t auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.auth_tag, gpu, false);
|
||||
NvU32 payload_size = sizeof(*lcic->tracking_sem.semaphore.payload);
|
||||
NvU32 notifier_size = sizeof(*lcic->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
|
||||
|
||||
NvU64 *lcic_gpfifo_entries;
|
||||
uvm_push_t lcic_push, sec2_push;
|
||||
@ -3475,11 +3072,7 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
0xffffffff);
|
||||
|
||||
gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_entry_addr, notifier_size);
|
||||
|
||||
// This CE encryption does not need to be logged, it will be logged on every
|
||||
// push_end instead
|
||||
gpu->parent->ce_hal->encrypt(&lcic_push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va);
|
||||
|
||||
gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_exit_addr, notifier_size);
|
||||
|
||||
// End LCIC push
|
||||
@ -3553,7 +3146,6 @@ static NV_STATUS channel_manager_setup_wlc_lcic(uvm_channel_pool_t *wlc_pool, uv
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(wlc_pool->manager == lcic_pool->manager);
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(wlc_pool->manager));
|
||||
UVM_ASSERT(wlc_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL);
|
||||
UVM_ASSERT(lcic_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] == NULL);
|
||||
UVM_ASSERT(wlc_pool->num_channels == lcic_pool->num_channels);
|
||||
@ -3602,8 +3194,12 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
|
||||
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_SEC2] = sec2_pool;
|
||||
|
||||
// WLC and LCIC must use the same engine for the fixed schedule to work.
|
||||
wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_WLC];
|
||||
// Use the same CE as CPU TO GPU channels for WLC/LCIC
|
||||
// Both need to use the same engine for the fixed schedule to work.
|
||||
// TODO: Bug 3981928: [hcc][uvm] Optimize parameters of WLC/LCIC secure
|
||||
// work launch
|
||||
// Find a metric to select the best CE to use
|
||||
wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
|
||||
// Create WLC/LCIC pools. This should be done early, CE channels use
|
||||
// them for secure launch. The WLC pool must be created before the LCIC.
|
||||
@ -3626,19 +3222,20 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
|
||||
// are ready to be used for secure work submission.
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = lcic_pool;
|
||||
|
||||
// WLC and LCIC pools are ready
|
||||
manager->conf_computing.wlc_ready = true;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_manager_create_pools(uvm_channel_manager_t *manager)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_type_t type;
|
||||
unsigned max_channel_pools;
|
||||
unsigned preferred_ce[UVM_CHANNEL_TYPE_COUNT];
|
||||
unsigned preferred_ce[UVM_CHANNEL_TYPE_CE_COUNT];
|
||||
|
||||
status = channel_manager_pick_ces(manager, preferred_ce);
|
||||
for (type = 0; type < ARRAY_SIZE(preferred_ce); type++)
|
||||
preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
|
||||
status = channel_manager_pick_copy_engines(manager, preferred_ce);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
@ -3681,8 +3278,6 @@ NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **cha
|
||||
if (!channel_manager)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
*channel_manager_out = channel_manager;
|
||||
|
||||
channel_manager->gpu = gpu;
|
||||
init_channel_manager_conf(channel_manager);
|
||||
status = uvm_pushbuffer_create(channel_manager, &channel_manager->pushbuffer);
|
||||
@ -3701,18 +3296,12 @@ NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **cha
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
// Key rotation is enabled only after all the channels have been created:
|
||||
// RM does not support channel allocation on an engine if key rotation is
|
||||
// pending on that engine. This can become a problem during testing if
|
||||
// key rotation thresholds are very low.
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
*channel_manager_out = channel_manager;
|
||||
|
||||
return NV_OK;
|
||||
return status;
|
||||
|
||||
error:
|
||||
*channel_manager_out = NULL;
|
||||
uvm_channel_manager_destroy(channel_manager);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -3763,7 +3352,8 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)
|
||||
if (status != NV_OK)
|
||||
UVM_ERR_PRINT_NV_STATUS("Failed to end stop push for WLC", status);
|
||||
|
||||
manager->conf_computing.wlc_ready = false;
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] = NULL;
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = NULL;
|
||||
}
|
||||
|
||||
void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
|
||||
@ -3785,14 +3375,6 @@ void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
|
||||
uvm_kvfree(channel_manager);
|
||||
}
|
||||
|
||||
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
pool = get_paired_pool(pool);
|
||||
|
||||
return pool->conf_computing.key_rotation.version;
|
||||
}
|
||||
|
||||
bool uvm_channel_is_privileged(uvm_channel_t *channel)
|
||||
{
|
||||
if (uvm_parent_gpu_is_virt_mode_sriov_heavy(uvm_channel_get_gpu(channel)->parent))
|
||||
@ -3914,7 +3496,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "get %u\n", channel->gpu_get);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "put %u\n", channel->cpu_put);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA 0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)uvm_gpu_semaphore_get_cpu_va(&channel->tracking_sem.semaphore));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);
|
||||
|
||||
channel_pool_unlock(channel->pool);
|
||||
}
|
||||
|
@ -228,65 +228,21 @@ typedef struct
|
||||
// variant is required when the thread holding the pool lock must sleep
|
||||
// (ex: acquire another mutex) deeper in the call stack, either in UVM or
|
||||
// RM.
|
||||
union
|
||||
{
|
||||
union {
|
||||
uvm_spinlock_t spinlock;
|
||||
uvm_mutex_t mutex;
|
||||
};
|
||||
|
||||
struct
|
||||
{
|
||||
// Secure operations require that uvm_push_begin order matches
|
||||
// uvm_push_end order, because the engine's state is used in its
|
||||
// internal operation and each push may modify this state.
|
||||
// push_locks is protected by the channel pool lock.
|
||||
// uvm_push_end order, because the engine's state is used in its internal
|
||||
// operation and each push may modify this state. push_locks is protected by
|
||||
// the channel pool lock.
|
||||
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
|
||||
// Counting semaphore for available and unlocked channels, it must be
|
||||
// acquired before submitting work to a channel when the Confidential
|
||||
// Computing feature is enabled.
|
||||
uvm_semaphore_t push_sem;
|
||||
|
||||
// Per channel buffers in unprotected sysmem.
|
||||
uvm_rm_mem_t *pool_sysmem;
|
||||
|
||||
// Per channel buffers in protected vidmem.
|
||||
uvm_rm_mem_t *pool_vidmem;
|
||||
|
||||
struct
|
||||
{
|
||||
// Current encryption key version, incremented upon key rotation.
|
||||
// While there are separate keys for encryption and decryption, the
|
||||
// two keys are rotated at once, so the versioning applies to both.
|
||||
NvU32 version;
|
||||
|
||||
// Lock used to ensure mutual exclusion during key rotation.
|
||||
uvm_mutex_t mutex;
|
||||
|
||||
// CSL contexts passed to RM for key rotation. This is usually an
|
||||
// array containing the CSL contexts associated with the channels in
|
||||
// the pool. In the case of the WLC pool, the array also includes
|
||||
// CSL contexts associated with LCIC channels.
|
||||
UvmCslContext **csl_contexts;
|
||||
|
||||
// Number of elements in the CSL context array.
|
||||
unsigned num_csl_contexts;
|
||||
|
||||
// Number of bytes encrypted, or decrypted, on the engine associated
|
||||
// with the pool since the last key rotation. Only used during
|
||||
// testing, to force key rotations after a certain encryption size,
|
||||
// see UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD.
|
||||
//
|
||||
// Encryptions on a LCIC pool are accounted for in the paired WLC
|
||||
// pool.
|
||||
//
|
||||
// TODO: Bug 4612912: these accounting variables can be removed once
|
||||
// RM exposes an API to set the key rotation lower threshold.
|
||||
atomic64_t encrypted;
|
||||
atomic64_t decrypted;
|
||||
} key_rotation;
|
||||
|
||||
} conf_computing;
|
||||
} uvm_channel_pool_t;
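// Illustrative sketch, not part of this change: how the push_locks/push_sem
// pair described above could gate secure work submission when Confidential
// Computing is enabled. The helper names uvm_down() and channel_pool_lock()/
// channel_pool_unlock() are assumed here for illustration only.
static uvm_channel_t *sketch_reserve_secure_channel(uvm_channel_pool_t *pool)
{
    NvU32 i;

    // Block until at least one channel is both available and unlocked
    uvm_down(&pool->conf_computing.push_sem);

    channel_pool_lock(pool);

    // Take the first channel whose push lock is free, so that the
    // uvm_push_begin/uvm_push_end ordering required by the engine state is
    // preserved on that channel.
    for (i = 0; i < pool->num_channels; i++) {
        if (!test_bit(i, pool->conf_computing.push_locks)) {
            __set_bit(i, pool->conf_computing.push_locks);
            channel_pool_unlock(pool);
            return pool->channels + i;
        }
    }

    // Unreachable if the push_sem accounting is correct
    channel_pool_unlock(pool);
    return NULL;
}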
struct uvm_channel_struct
|
||||
@ -366,14 +322,43 @@ struct uvm_channel_struct
|
||||
// work launches to match the order of push end-s that triggered them.
|
||||
volatile NvU32 gpu_put;
|
||||
|
||||
// Protected sysmem location makes WLC independent from the pushbuffer
|
||||
// allocator. Unprotected sysmem and protected vidmem counterparts
|
||||
// are allocated from the channel pool (sysmem, vidmem).
|
||||
// Static pushbuffer for channels with static schedule (WLC/LCIC)
|
||||
uvm_rm_mem_t *static_pb_protected_vidmem;
|
||||
|
||||
// Static pushbuffer staging buffer for WLC
|
||||
uvm_rm_mem_t *static_pb_unprotected_sysmem;
|
||||
void *static_pb_unprotected_sysmem_cpu;
|
||||
void *static_pb_unprotected_sysmem_auth_tag_cpu;
|
||||
|
||||
// The above static locations are required by the WLC (and LCIC)
|
||||
// schedule. Protected sysmem location completes WLC's independence
|
||||
// from the pushbuffer allocator.
|
||||
void *static_pb_protected_sysmem;
|
||||
|
||||
// Static tracking semaphore notifier values
|
||||
// Because of LCIC's fixed schedule, the secure semaphore release
|
||||
// mechanism uses two additional static locations for incrementing the
|
||||
// notifier values. See:
|
||||
// . channel_semaphore_secure_release()
|
||||
// . setup_lcic_schedule()
|
||||
// . internal_channel_submit_work_wlc()
|
||||
uvm_rm_mem_t *static_notifier_unprotected_sysmem;
|
||||
NvU32 *static_notifier_entry_unprotected_sysmem_cpu;
|
||||
NvU32 *static_notifier_exit_unprotected_sysmem_cpu;
|
||||
uvm_gpu_address_t static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
|
||||
// Explicit location for push launch tag used by WLC.
|
||||
// Encryption auth tags have to be located in unprotected sysmem.
|
||||
void *launch_auth_tag_cpu;
|
||||
NvU64 launch_auth_tag_gpu_va;
|
||||
|
||||
// Used to decrypt the push back to protected sysmem.
|
||||
// This happens when profilers register callbacks for migration data.
|
||||
uvm_push_crypto_bundle_t *push_crypto_bundles;
|
||||
|
||||
// Accompanying authentication tags for the crypto bundles
|
||||
uvm_rm_mem_t *push_crypto_bundle_auth_tags;
|
||||
} conf_computing;
|
||||
|
||||
// RM channel information
|
||||
@ -433,7 +418,7 @@ struct uvm_channel_manager_struct
|
||||
unsigned num_channel_pools;
|
||||
|
||||
// Mask containing the indexes of the usable Copy Engines. Each usable CE
|
||||
// has at least one pool of type UVM_CHANNEL_POOL_TYPE_CE associated with it
|
||||
// has at least one pool associated with it.
|
||||
DECLARE_BITMAP(ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
|
||||
|
||||
struct
|
||||
@ -466,16 +451,6 @@ struct uvm_channel_manager_struct
|
||||
UVM_BUFFER_LOCATION gpput_loc;
|
||||
UVM_BUFFER_LOCATION pushbuffer_loc;
|
||||
} conf;
|
||||
|
||||
struct
|
||||
{
|
||||
// Flag indicating that the WLC/LCIC mechanism is ready/setup; should
|
||||
// only be false during (de)initialization.
|
||||
bool wlc_ready;
|
||||
|
||||
// True indicates that key rotation is enabled (UVM-wise).
|
||||
bool key_rotation_enabled;
|
||||
} conf_computing;
|
||||
};
|
||||
|
||||
// Create a channel manager for the GPU
|
||||
@ -526,14 +501,6 @@ uvm_channel_t *uvm_channel_lcic_get_paired_wlc(uvm_channel_t *lcic_channel);
|
||||
|
||||
uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel);
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel);
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel);
|
||||
|
||||
char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel);
|
||||
|
||||
char *uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(uvm_channel_t *channel, unsigned tag_index);
|
||||
|
||||
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));
|
||||
@ -565,17 +532,6 @@ static uvm_channel_type_t uvm_channel_proxy_channel_type(void)
|
||||
return UVM_CHANNEL_TYPE_MEMOPS;
|
||||
}
|
||||
|
||||
// Force key rotation in the engine associated with the given channel pool.
|
||||
// Rotation may still not happen if RM cannot acquire the necessary locks (in
|
||||
// which case the function returns NV_ERR_STATE_IN_USE).
|
||||
//
|
||||
// This function should only be invoked in pools in which key rotation is
// enabled.
|
||||
NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool);
|
||||
|
||||
// Retrieve the current encryption key version associated with the channel pool.
|
||||
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool);
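// Illustrative usage sketch, not part of this change, for the pool-level key
// rotation API declared above: force one rotation and observe the version
// bump. NV_ERR_STATE_IN_USE only means RM could not take the required locks
// right now, so callers are expected to retry later rather than fail.
static NV_STATUS sketch_rotate_pool_key_once(uvm_channel_pool_t *pool)
{
    NvU32 old_version = uvm_channel_pool_key_version(pool);
    NV_STATUS status = uvm_channel_pool_rotate_key(pool);

    if (status == NV_ERR_STATE_IN_USE)
        return NV_OK; // benign: rotation postponed, try again later

    if (status != NV_OK)
        return status;

    // A successful rotation is reflected in a newer key version for the pool
    UVM_ASSERT(uvm_channel_pool_key_version(pool) > old_version);

    return NV_OK;
}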
// Privileged channels support all the Host and engine methods, while
|
||||
// non-privileged channels don't support privileged methods.
|
||||
//
|
||||
@ -623,9 +579,12 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
|
||||
// beginning.
|
||||
NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager);
|
||||
|
||||
// Check if WLC/LCIC mechanism is ready/setup
|
||||
// Should only return false during initialization
|
||||
static bool uvm_channel_manager_is_wlc_ready(uvm_channel_manager_t *manager)
|
||||
{
|
||||
return manager->conf_computing.wlc_ready;
|
||||
return (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL) &&
|
||||
(manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] != NULL);
|
||||
}
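// Illustrative sketch, not part of this change: callers that want the WLC/LCIC
// fixed-schedule launch path are expected to consult the query above; it is
// false only while the channel manager is being (de)initialized. The SEC2
// fallback below is an assumption made for illustration, not a statement about
// the driver's actual selection logic.
static uvm_channel_type_t sketch_pick_secure_launch_type(uvm_channel_manager_t *manager)
{
    if (uvm_channel_manager_is_wlc_ready(manager))
        return UVM_CHANNEL_TYPE_WLC;

    // During (de)initialization secure work has to be launched another way
    return UVM_CHANNEL_TYPE_SEC2;
}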
// Get the GPU VA of semaphore_channel's tracking semaphore within the VA space
|
||||
// associated with access_channel.
|
||||
|
@ -796,8 +796,11 @@ done:
|
||||
NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_push_t *pushes = NULL;
|
||||
uvm_gpu_t *gpu = NULL;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_push_t *pushes;
|
||||
uvm_gpu_t *gpu;
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
@ -807,19 +810,9 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
uvm_channel_type_t channel_type;
|
||||
|
||||
// Key rotation is disabled because this test relies on nested pushes,
|
||||
// which is illegal. If any push other than the first one triggers key
|
||||
// rotation, the test won't complete. This is because key rotation
|
||||
// depends on waiting for ongoing pushes to end, which doesn't happen
|
||||
// if those pushes are ended after the current one begins.
|
||||
uvm_conf_computing_disable_key_rotation(gpu);
|
||||
|
||||
for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
uvm_channel_pool_t *pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
|
||||
TEST_CHECK_GOTO(pool != NULL, error);
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
TEST_CHECK_RET(pool != NULL);
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
@ -831,7 +824,7 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);
|
||||
|
||||
pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
|
||||
TEST_CHECK_GOTO(pushes != NULL, error);
|
||||
TEST_CHECK_RET(pushes != NULL);
|
||||
|
||||
for (i = 0; i < num_pushes; i++) {
|
||||
uvm_push_t *push = &pushes[i];
|
||||
@ -848,18 +841,12 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
|
||||
uvm_kvfree(pushes);
|
||||
}
|
||||
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
}
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
|
||||
return status;
|
||||
|
||||
error:
|
||||
if (gpu != NULL)
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
uvm_kvfree(pushes);
|
||||
|
||||
@ -961,318 +948,6 @@ release:
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS force_key_rotations(uvm_channel_pool_t *pool, unsigned num_rotations)
|
||||
{
|
||||
unsigned num_tries;
|
||||
unsigned max_num_tries = 20;
|
||||
unsigned num_rotations_completed = 0;
|
||||
|
||||
if (num_rotations == 0)
|
||||
return NV_OK;
|
||||
|
||||
// The number of accepted rotations is kept low, so failed rotation
|
||||
// invocations due to RM not acquiring the necessary locks (which imply a
|
||||
// sleep in the test) do not balloon the test execution time.
|
||||
UVM_ASSERT(num_rotations <= 10);
|
||||
|
||||
for (num_tries = 0; (num_tries < max_num_tries) && (num_rotations_completed < num_rotations); num_tries++) {
|
||||
// Force key rotation, irrespective of encryption usage.
|
||||
NV_STATUS status = uvm_channel_pool_rotate_key(pool);
|
||||
|
||||
// Key rotation may not be able to complete due to RM failing to acquire
|
||||
// the necessary locks. Detect the situation, sleep for a bit, and then
|
||||
// try again
|
||||
//
|
||||
// The maximum time spent sleeping in a single rotation call is
|
||||
// (max_num_tries * max_sleep_us)
|
||||
if (status == NV_ERR_STATE_IN_USE) {
|
||||
NvU32 min_sleep_us = 1000;
|
||||
NvU32 max_sleep_us = 10000;
|
||||
|
||||
usleep_range(min_sleep_us, max_sleep_us);
|
||||
continue;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_RET(status);
|
||||
|
||||
num_rotations_completed++;
|
||||
}
|
||||
|
||||
// If not a single key rotation occurred, the dependent tests still pass,
// but there is not much value to them. Instead, return an error so the
// maximum number of tries, or the maximum sleep time, can be adjusted to
// ensure that at least one rotation completes.
|
||||
if (num_rotations_completed > 0)
|
||||
return NV_OK;
|
||||
else
|
||||
return NV_ERR_STATE_IN_USE;
|
||||
}
|
||||
|
||||
static NV_STATUS force_key_rotation(uvm_channel_pool_t *pool)
|
||||
{
|
||||
return force_key_rotations(pool, 1);
|
||||
}
|
||||
|
||||
// Test key rotation in all pools. This is useful because key rotation may not
|
||||
// happen otherwise on certain engines during UVM test execution. For example,
|
||||
// if the MEMOPS channel type is mapped to a CE not shared with any other
|
||||
// channel type, then the only encryption taking place in the engine is due to
|
||||
// semaphore releases (4 bytes each). This small encryption size makes it
|
||||
// unlikely to exceed even small rotation thresholds.
|
||||
static NV_STATUS test_channel_key_rotation_basic(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
uvm_for_each_pool(pool, gpu->channel_manager) {
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
continue;
|
||||
|
||||
TEST_NV_CHECK_RET(force_key_rotation(pool));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
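// Illustrative arithmetic for the note above test_channel_key_rotation_basic():
// with the 8MB testing threshold (UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
// and 4-byte semaphore release encryptions, an engine would need about
// 8 * 1024 * 1024 / 4 = 2,097,152 releases before rotation triggered on its
// own, which is why these tests force rotations explicitly.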
// Interleave GPU encryptions and decryptions, and their CPU counterparts, with
|
||||
// key rotations.
|
||||
static NV_STATUS test_channel_key_rotation_interleave(uvm_gpu_t *gpu)
|
||||
{
|
||||
int i;
|
||||
uvm_channel_pool_t *gpu_to_cpu_pool;
|
||||
uvm_channel_pool_t *cpu_to_gpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
void *initial_plain_cpu = NULL;
|
||||
void *final_plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
|
||||
|
||||
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
|
||||
|
||||
initial_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (initial_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
final_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (final_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
memset(initial_plain_cpu, 1, size);
|
||||
|
||||
for (i = 0; i < 5; i++) {
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
plain_gpu_address,
|
||||
initial_plain_cpu,
|
||||
size,
|
||||
NULL,
|
||||
"CPU > GPU"),
|
||||
out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
|
||||
final_plain_cpu,
|
||||
plain_gpu_address,
|
||||
size,
|
||||
NULL,
|
||||
"GPU > CPU"),
|
||||
out);
|
||||
|
||||
TEST_CHECK_GOTO(!memcmp(initial_plain_cpu, final_plain_cpu, size), out);
|
||||
|
||||
memset(final_plain_cpu, 0, size);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(final_plain_cpu);
|
||||
uvm_kvfree(initial_plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS memset_vidmem(uvm_mem_t *mem, NvU8 val)
|
||||
{
|
||||
uvm_push_t push;
|
||||
uvm_gpu_address_t gpu_address;
|
||||
uvm_gpu_t *gpu = mem->backing_gpu;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));
|
||||
|
||||
gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
|
||||
gpu->parent->ce_hal->memset_1(&push, gpu_address, val, mem->size);
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Custom version of uvm_conf_computing_util_memcopy_gpu_to_cpu that allows
// testing to insert key rotations in between the push end and the CPU
// decryption.
|
||||
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
unsigned num_rotations_to_insert)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
uvm_channel_t *channel;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Small GPU > CPU encryption");
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
channel = push.channel;
|
||||
uvm_conf_computing_log_gpu_encryption(channel, size, dma_buffer->decrypt_iv);
|
||||
dma_buffer->key_version[0] = uvm_channel_pool_key_version(channel->pool);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
TEST_NV_CHECK_GOTO(force_key_rotations(channel->pool, num_rotations_to_insert), out);
|
||||
|
||||
// If num_rotations_to_insert is not zero, the current encryption key will
|
||||
// be different from the one used during CE encryption.
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
dma_buffer->decrypt_iv,
|
||||
dma_buffer->key_version[0],
|
||||
size,
|
||||
auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_channel_key_rotation_cpu_decryption(uvm_gpu_t *gpu,
|
||||
unsigned num_repetitions,
|
||||
unsigned num_rotations_to_insert)
|
||||
{
|
||||
unsigned i;
|
||||
uvm_channel_pool_t *gpu_to_cpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
NvU8 *plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
|
||||
|
||||
plain_cpu = (NvU8 *) uvm_kvmalloc_zero(size);
|
||||
if (plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
TEST_NV_CHECK_GOTO(memset_vidmem(plain_gpu, 1), out);
|
||||
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
for (i = 0; i < num_repetitions; i++) {
|
||||
unsigned j;
|
||||
|
||||
TEST_NV_CHECK_GOTO(encrypted_memcopy_gpu_to_cpu(gpu,
|
||||
plain_cpu,
|
||||
plain_gpu_address,
|
||||
size,
|
||||
num_rotations_to_insert),
|
||||
out);
|
||||
|
||||
for (j = 0; j < size; j++)
|
||||
TEST_CHECK_GOTO(plain_cpu[j] == 1, out);
|
||||
|
||||
memset(plain_cpu, 0, size);
|
||||
|
||||
}
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Test that CPU decryptions can use old keys, i.e., previous versions of the
// keys that are no longer the current key due to key rotation. Given that SEC2
// does not expose encryption capabilities, the "decrypt-after-rotation" problem
// is exclusive to CE encryptions.
|
||||
static NV_STATUS test_channel_key_rotation_decrypt_after_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
// Instruct encrypted_memcopy_gpu_to_cpu to insert several key rotations
// between the GPU encryption and the associated CPU decryption.
|
||||
unsigned num_rotations_to_insert = 8;
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_cpu_decryption(gpu, 1, num_rotations_to_insert));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_channel_key_rotation(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
break;
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_basic(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_interleave(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_decrypt_after_key_rotation(gpu));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
@ -1528,10 +1203,6 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
status = test_channel_key_rotation(va_space);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
// The following tests have side effects, they reset the GPU's
|
||||
// channel_manager.
|
||||
status = test_channel_pushbuffer_extension_base(va_space);
|
||||
@ -1667,126 +1338,6 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_cpu_encryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
int i;
|
||||
uvm_channel_pool_t *cpu_to_gpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
void *initial_plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU);
|
||||
|
||||
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
|
||||
|
||||
initial_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (initial_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
memset(initial_plain_cpu, 1, size);
|
||||
|
||||
for (i = 0; i < params->iterations; i++) {
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
plain_gpu_address,
|
||||
initial_plain_cpu,
|
||||
size,
|
||||
NULL,
|
||||
"CPU > GPU"),
|
||||
out);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(initial_plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_cpu_decryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
unsigned num_rotations_to_insert = 0;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU);
|
||||
|
||||
return test_channel_key_rotation_cpu_decryption(gpu, params->iterations, num_rotations_to_insert);
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_rotate(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE);
|
||||
|
||||
for (i = 0; i < params->iterations; ++i) {
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_channel_type_t type;
|
||||
|
||||
if ((i % 3) == 0)
|
||||
type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
|
||||
else if ((i % 3) == 1)
|
||||
type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
|
||||
else
|
||||
type = UVM_CHANNEL_TYPE_WLC;
|
||||
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[type];
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
return NV_ERR_INVALID_STATE;
|
||||
|
||||
status = force_key_rotation(pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// The objective of this test is documented in the user-level function
|
||||
static NV_STATUS uvm_test_channel_stress_key_rotation(uvm_va_space_t *va_space, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
uvm_test_rng_t rng;
|
||||
uvm_gpu_t *gpu;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
uvm_test_rng_init(&rng, params->seed);
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
|
||||
// Key rotation should be enabled, or disabled, in all GPUs. Pick a random
|
||||
// one.
|
||||
gpu = random_va_space_gpu(&rng, va_space);
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
goto out;
|
||||
|
||||
if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU)
|
||||
status = channel_stress_key_rotation_cpu_encryption(gpu, params);
|
||||
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU)
|
||||
status = channel_stress_key_rotation_cpu_decryption(gpu, params);
|
||||
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE)
|
||||
status = channel_stress_key_rotation_rotate(gpu, params);
|
||||
else
|
||||
status = NV_ERR_INVALID_PARAMETER;
|
||||
|
||||
out:
|
||||
uvm_va_space_up_read(va_space);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
@ -1798,8 +1349,6 @@ NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct
|
||||
return uvm_test_channel_stress_update_channels(va_space, params);
|
||||
case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH:
|
||||
return uvm_test_channel_noop_push(va_space, params);
|
||||
case UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION:
|
||||
return uvm_test_channel_stress_key_rotation(va_space, params);
|
||||
default:
|
||||
return NV_ERR_INVALID_PARAMETER;
|
||||
}
|
||||
|
@ -33,15 +33,6 @@
|
||||
#include "nv_uvm_interface.h"
|
||||
#include "uvm_va_block.h"
|
||||
|
||||
// Amount of encrypted data on a given engine that triggers key rotation. This
// is a UVM internal threshold, different from that of RM, and used only during
// testing.
//
// Key rotation is triggered when the total encryption size, or the total
// decryption size (whichever comes first), reaches this lower threshold on the
// engine.
#define UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD (UVM_SIZE_1MB * 8)
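// Illustrative numbers, not part of this change: 8 * UVM_SIZE_1MB is 8,388,608
// bytes. Assuming a 4KB PAGE_SIZE and the per-PAGE_SIZE encryption granularity
// used by the Confidential Computing DMA buffers (one authentication tag per
// page), roughly 2048 page-sized encryptions (or decryptions) on an engine are
// enough for UVM's test-only accounting to consider key rotation pending.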
// The maximum number of secure operations per push is:
|
||||
// UVM_MAX_PUSH_SIZE / min(CE encryption size, CE decryption size)
|
||||
// + 1 (tracking semaphore) = 128 * 1024 / 56 + 1 = 2342
|
||||
@ -361,19 +352,6 @@ error:
|
||||
return status;
|
||||
}
|
||||
|
||||
// The production key rotation defaults are such that key rotations rarely
// happen. During UVM testing more frequent rotations are triggered by relying
// on internal encryption usage accounting. When key rotations are triggered by
// UVM, the driver does not rely on channel key rotation notifiers.
|
||||
//
|
||||
// TODO: Bug 4612912: UVM should be able to programmatically set the rotation
|
||||
// lower threshold. This function, and all the metadata associated with it
|
||||
// (per-pool encryption accounting, for example) can be removed at that point.
|
||||
static bool key_rotation_is_notifier_driven(void)
|
||||
{
|
||||
return !uvm_enable_builtin_tests;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
@ -416,35 +394,17 @@ void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu)
|
||||
conf_computing_dma_buffer_pool_deinit(&gpu->conf_computing.dma_buffer_pool);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv)
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
|
||||
else
|
||||
pool = channel->pool;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
|
||||
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
|
||||
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (!key_rotation_is_notifier_driven())
|
||||
atomic64_add(size, &pool->conf_computing.key_rotation.encrypted);
|
||||
}
|
||||
|
||||
status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// IV rotation is done preemptively as needed, so the above
|
||||
// call cannot return failure.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
@ -468,46 +428,27 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
void *auth_tag_buffer)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
UVM_ASSERT(size);
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
|
||||
else
|
||||
pool = channel->pool;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
|
||||
status = nvUvmInterfaceCslEncrypt(&channel->csl.ctx,
|
||||
size,
|
||||
(NvU8 const *) src_plain,
|
||||
encrypt_iv,
|
||||
(NvU8 *) dst_cipher,
|
||||
(NvU8 *) auth_tag_buffer);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// IV rotation is done preemptively as needed, so the above
|
||||
// call cannot return failure.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
|
||||
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (!key_rotation_is_notifier_driven())
|
||||
atomic64_add(size, &pool->conf_computing.key_rotation.decrypted);
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
NvU32 key_version,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer)
|
||||
{
|
||||
@ -528,19 +469,10 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
size,
|
||||
(const NvU8 *) src_cipher,
|
||||
src_iv,
|
||||
key_version,
|
||||
(NvU8 *) dst_plain,
|
||||
NULL,
|
||||
0,
|
||||
(const NvU8 *) auth_tag_buffer);
|
||||
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, channel %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
channel->name,
|
||||
uvm_gpu_name(uvm_channel_get_gpu(channel)));
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
return status;
|
||||
@ -553,8 +485,6 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU8 valid)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
|
||||
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
|
||||
|
||||
// There is no dedicated lock for the CSL context associated with replayable
|
||||
// faults. The mutual exclusion required by the RM CSL API is enforced by
|
||||
@ -564,48 +494,36 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
status = nvUvmInterfaceCslDecrypt(csl_context,
|
||||
fault_entry_size,
|
||||
status = nvUvmInterfaceCslDecrypt(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
|
||||
parent_gpu->fault_buffer_hal->entry_size(parent_gpu),
|
||||
(const NvU8 *) src_cipher,
|
||||
NULL,
|
||||
NV_U32_MAX,
|
||||
(NvU8 *) dst_plain,
|
||||
&valid,
|
||||
sizeof(valid),
|
||||
(const NvU8 *) auth_tag_buffer);
|
||||
|
||||
if (status != NV_OK) {
|
||||
if (status != NV_OK)
|
||||
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_parent_gpu_name(parent_gpu));
|
||||
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu)
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
|
||||
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
|
||||
|
||||
// See comment in uvm_conf_computing_fault_decrypt
|
||||
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
status = nvUvmInterfaceCslIncrementIv(csl_context, UVM_CSL_OPERATION_DECRYPT, 1, NULL);
|
||||
status = nvUvmInterfaceCslIncrementIv(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
|
||||
UVM_CSL_OPERATION_DECRYPT,
|
||||
increment,
|
||||
NULL);
|
||||
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
}
|
||||
@ -707,231 +625,3 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
|
||||
{
|
||||
return uvm_conf_computing_rotate_channel_ivs_below_limit(channel, uvm_conf_computing_channel_iv_rotation_limit, true);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
|
||||
|
||||
// Key rotation cannot be enabled on UVM if it is disabled on RM
|
||||
if (!gpu->parent->rm_info.gpuConfComputeCaps.bKeyRotationEnabled)
|
||||
return;
|
||||
|
||||
gpu->channel_manager->conf_computing.key_rotation_enabled = true;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
|
||||
|
||||
gpu->channel_manager->conf_computing.key_rotation_enabled = false;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu)
|
||||
{
|
||||
return gpu->channel_manager->conf_computing.key_rotation_enabled;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(pool->manager->gpu))
|
||||
return false;
|
||||
|
||||
// TODO: Bug 4586447: key rotation must be disabled in the SEC2 engine,
|
||||
// because currently the encryption key is shared between UVM and RM, but
|
||||
// UVM is not able to idle SEC2 channels owned by RM.
|
||||
if (uvm_channel_pool_is_sec2(pool))
|
||||
return false;
|
||||
|
||||
// Key rotation happens as part of channel reservation, and LCIC channels
|
||||
// are never reserved directly. Rotation of keys in LCIC channels happens
|
||||
// as the result of key rotation in WLC channels.
|
||||
//
|
||||
// Return false even if there is nothing fundamentally prohibiting direct key
// rotation on LCIC pools.
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool conf_computing_is_key_rotation_pending_use_stats(uvm_channel_pool_t *pool)
|
||||
{
|
||||
NvU64 decrypted, encrypted;
|
||||
|
||||
UVM_ASSERT(!key_rotation_is_notifier_driven());
|
||||
|
||||
decrypted = atomic64_read(&pool->conf_computing.key_rotation.decrypted);
|
||||
|
||||
if (decrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
|
||||
return true;
|
||||
|
||||
encrypted = atomic64_read(&pool->conf_computing.key_rotation.encrypted);
|
||||
|
||||
if (encrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool conf_computing_is_key_rotation_pending_use_notifier(uvm_channel_pool_t *pool)
|
||||
{
|
||||
// If key rotation is pending for the pool's engine, then the key rotation
|
||||
// notifier in any of the engine channels can be used by UVM to detect the
|
||||
// situation. Note that RM doesn't update all the notifiers in a single
|
||||
// atomic operation, so it is possible that the channel read by UVM (the
|
||||
// first one in the pool) indicates that a key rotation is pending, but
|
||||
// another channel in the pool (temporarily) indicates the opposite, or vice
|
||||
// versa.
|
||||
uvm_channel_t *first_channel = pool->channels;
|
||||
|
||||
UVM_ASSERT(key_rotation_is_notifier_driven());
|
||||
UVM_ASSERT(first_channel != NULL);
|
||||
|
||||
return first_channel->channel_info.keyRotationNotifier->status == UVM_KEY_ROTATION_STATUS_PENDING;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
return false;
|
||||
|
||||
if (key_rotation_is_notifier_driven())
|
||||
return conf_computing_is_key_rotation_pending_use_notifier(pool);
|
||||
else
|
||||
return conf_computing_is_key_rotation_pending_use_stats(pool);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.num_csl_contexts > 0);
|
||||
|
||||
// NV_ERR_STATE_IN_USE indicates that RM was not able to acquire the
|
||||
// required locks at this time. This status is not interpreted as an error,
|
||||
// but as a sign for UVM to try again later. This is the same "protocol"
|
||||
// used in IV rotation.
|
||||
status = nvUvmInterfaceCslRotateKey(pool->conf_computing.key_rotation.csl_contexts,
|
||||
pool->conf_computing.key_rotation.num_csl_contexts);
|
||||
|
||||
if (status == NV_OK) {
|
||||
pool->conf_computing.key_rotation.version++;
|
||||
|
||||
if (!key_rotation_is_notifier_driven()) {
|
||||
atomic64_set(&pool->conf_computing.key_rotation.decrypted, 0);
|
||||
atomic64_set(&pool->conf_computing.key_rotation.encrypted, 0);
|
||||
}
|
||||
}
|
||||
else if (status != NV_ERR_STATE_IN_USE) {
|
||||
UVM_DBG_PRINT("nvUvmInterfaceCslRotateKey() failed in engine %u: %s\n",
|
||||
pool->engine_index,
|
||||
nvstatusToString(status));
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
|
||||
void *dst_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
|
||||
|
||||
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
uvm_conf_computing_log_gpu_encryption(push.channel, size, dma_buffer->decrypt_iv);
|
||||
dma_buffer->key_version[0] = uvm_channel_pool_key_version(push.channel->pool);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(push.channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
dma_buffer->decrypt_iv,
|
||||
dma_buffer->key_version[0],
|
||||
size,
|
||||
auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
@ -87,9 +87,9 @@ typedef struct
|
||||
// a free buffer.
|
||||
uvm_tracker_t tracker;
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, the
|
||||
// engine (CE or SEC2) writes the authentication tag here. When the buffer
|
||||
// is decrypted on the CPU the authentication tag is used by CSL to verify
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, SEC2
|
||||
// writes the authentication tag here. Later when the buffer is decrypted
|
||||
// on the CPU the authentication tag is used again (read) for CSL to verify
|
||||
// the authenticity. The allocation is big enough for one authentication
|
||||
// tag per PAGE_SIZE page in the alloc buffer.
|
||||
uvm_mem_t *auth_tag;
|
||||
@ -98,12 +98,7 @@ typedef struct
|
||||
// to the authentication tag. The allocation is big enough for one IV per
|
||||
// PAGE_SIZE page in the alloc buffer. The granularity between the decrypt
|
||||
// IV and authentication tag must match.
|
||||
UvmCslIv decrypt_iv[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, the
|
||||
// key version used during GPU encryption of each PAGE_SIZE page can be
|
||||
// saved here, so CPU decryption uses the correct decryption key.
|
||||
NvU32 key_version[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
|
||||
UvmCslIv decrypt_iv[(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE)];
|
||||
|
||||
// Bitmap of the encrypted pages in the backing allocation
|
||||
uvm_page_mask_t encrypted_page_mask;
|
||||
@ -152,7 +147,7 @@ NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu);
|
||||
|
||||
// Logs encryption information from the GPU and returns the IV.
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv);
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
|
||||
// Acquires next CPU encryption IV and returns it.
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
@ -172,14 +167,10 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
// CPU side decryption helper. Decrypts data from src_cipher and writes the
// plain text in dst_plain. src_cipher and dst_plain can't overlap. IV obtained
// from uvm_conf_computing_log_gpu_encryption() needs to be passed to src_iv.
//
// The caller must indicate which key to use for decryption by passing the
// appropriate key version number.
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
NvU32 key_version,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer);
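// Illustrative call sequence, not part of this change, mirroring the pattern
// used by uvm_conf_computing_util_memcopy_gpu_to_cpu(): log the GPU encryption
// to obtain the decrypt IV, remember the pool's key version, let the CE
// encrypt into unprotected sysmem, wait for the push, then decrypt on the CPU
// with that same IV and key version. Buffer names below are placeholders:
//
//     uvm_conf_computing_log_gpu_encryption(push.channel, size, decrypt_iv);
//     key_version = uvm_channel_pool_key_version(push.channel->pool);
//     gpu->parent->ce_hal->encrypt(&push, cipher_gpu_va, src_gpu_va, size, auth_tag_gpu_va);
//     status = uvm_push_end_and_wait(&push);
//     ...
//     status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher,
//                                             decrypt_iv, key_version, size, auth_tag);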
@ -200,12 +191,12 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU8 valid);
|
||||
|
||||
// Increment the CPU-side decrypt IV of the CSL context associated with
|
||||
// replayable faults.
|
||||
// replayable faults. The function is a no-op if the given increment is zero.
|
||||
//
|
||||
// The IV associated with a fault CSL context is a 64-bit counter.
|
||||
//
|
||||
// Locking: this function must be invoked while holding the replayable ISR lock.
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment);
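// Illustrative sketch, not part of this change: when several replayable fault
// entries are skipped without being decrypted, the CPU-side decrypt IV has to
// be advanced by the same count so it stays in sync with the encryption IV on
// the GSP-RM side. The batch-skip caller below is hypothetical; the caller in
// this change advances the IV one entry at a time.
//
//     // Called with the replayable fault service lock held
//     if (g_uvm_global.conf_computing_enabled && num_skipped_entries > 0)
//         uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, num_skipped_entries);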
// Query the number of remaining messages before IV needs to be rotated.
|
||||
void uvm_conf_computing_query_message_pools(uvm_channel_t *channel,
|
||||
@ -223,71 +214,4 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
|
||||
// Check if there are fewer than 'limit' messages available in either direction
|
||||
// and rotate if not.
|
||||
NV_STATUS uvm_conf_computing_rotate_channel_ivs_below_limit(uvm_channel_t *channel, NvU64 limit, bool retry_if_busy);
|
||||
|
||||
// Rotate the engine key associated with the given channel pool.
|
||||
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool);
|
||||
|
||||
// Returns true if key rotation is allowed in the channel pool.
|
||||
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool);
|
||||
|
||||
// Returns true if key rotation is pending in the channel pool.
|
||||
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool);
|
||||
|
||||
// Enable/disable key rotation in the passed GPU. Note that UVM enablement is
|
||||
// dependent on RM enablement: key rotation may still be disabled upon calling
|
||||
// this function, if it is disabled in RM. On the other hand, key rotation can
|
||||
// be disabled in UVM, even if it is enabled in RM.
|
||||
//
|
||||
// Enablement/Disablement affects only kernel key rotation in keys owned by UVM.
// It doesn't affect user key rotation (CUDA, Video...), nor does it affect RM
// kernel key rotation.
|
||||
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu);
|
||||
|
||||
// Returns true if key rotation is enabled on UVM in the given GPU. Key rotation
// can be enabled on the GPU but disabled on some of the GPU engines (LCEs or
// SEC2), see uvm_conf_computing_is_key_rotation_enabled_in_pool.
|
||||
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu);
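// Illustrative sketch, not part of this change, of the enablement contract
// described above: asking for enablement does not guarantee the feature is on,
// so callers re-check the query after the call.
static void sketch_try_enable_key_rotation(uvm_gpu_t *gpu)
{
    uvm_conf_computing_enable_key_rotation(gpu);

    // Still false if key rotation is disabled on the RM side for this GPU
    if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
        return;

    // Even then, individual engines may opt out (SEC2, LCIC), so per-pool
    // checks via uvm_conf_computing_is_key_rotation_enabled_in_pool() are
    // still needed before forcing a rotation on a given pool.
}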
// Launch a synchronous, encrypted copy between CPU and GPU.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The source CPU buffer pointed by src_plain contains the unencrypted (plain
|
||||
// text) contents; the function internally performs a CPU-side encryption step
|
||||
// before launching the GPU-side CE decryption. The source buffer can be in
|
||||
// protected or unprotected sysmem, while the destination buffer must be in
|
||||
// protected vidmem.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...);
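// Illustrative usage sketch, not part of this change: stage a small plain-text
// buffer into protected vidmem through the helper declared above. The staging
// helpers (uvm_kvmalloc_zero/uvm_kvfree) appear elsewhere in this driver;
// dst_gpu_address is assumed to point at protected vidmem already mapped for
// the GPU.
static NV_STATUS sketch_stage_to_vidmem(uvm_gpu_t *gpu,
                                        uvm_gpu_address_t dst_gpu_address,
                                        const void *data,
                                        size_t size)
{
    NV_STATUS status;
    void *plain;

    // The helper cannot copy more than one DMA buffer's worth of data at once
    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);

    plain = uvm_kvmalloc_zero(size);
    if (plain == NULL)
        return NV_ERR_NO_MEMORY;

    memcpy(plain, data, size);

    // CPU-side encryption followed by a CE decryption into dst_gpu_address,
    // performed synchronously (no tracker dependencies).
    status = uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
                                                        dst_gpu_address,
                                                        plain,
                                                        size,
                                                        NULL,
                                                        "sketch CPU > GPU");

    uvm_kvfree(plain);

    return status;
}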
// Launch a synchronous, encrypted copy between GPU and CPU.
//
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
//
// The source GPU buffer pointed by src_gpu_address contains the unencrypted
// (plain text) contents; the function internally performs a GPU-side CE
// encryption into an unprotected staging buffer, followed by a CPU-side
// decryption that writes the plain text into the destination buffer pointed by
// dst_plain.
//
// The input tracker, if not NULL, is internally acquired by the push
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...);
|
||||
#endif // __UVM_CONF_COMPUTING_H__
|
||||
|
@ -218,8 +218,9 @@ static NV_STATUS alloc_and_init_address_space(uvm_gpu_t *gpu)
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
|
||||
UVM_ASSERT(gpu_address_space_info.bigPageSize <= NV_U32_MAX);
|
||||
|
||||
gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
|
||||
gpu->time.time0_register = gpu_address_space_info.time0Offset;
|
||||
gpu->time.time1_register = gpu_address_space_info.time1Offset;
|
||||
|
||||
@ -458,6 +459,7 @@ static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
|
||||
|
||||
static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
|
||||
{
|
||||
|
||||
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7);
|
||||
|
||||
switch (link_type) {
|
||||
@ -1082,9 +1084,6 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu)
|
||||
gpu->parent->rm_va_size,
|
||||
va_per_entry);
|
||||
|
||||
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->big_page.internal_size));
|
||||
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->mem_info.max_vidmem_page_size));
|
||||
|
||||
tree_alloc = uvm_page_tree_pdb(&gpu->address_space_tree);
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu->rm_address_space,
|
||||
tree_alloc->addr.address,
|
||||
@ -2364,9 +2363,7 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
|
||||
|
||||
// check for peer-to-peer compatibility (PCI-E or NvLink).
|
||||
peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
|
||||
if (peer_caps->link_type == UVM_GPU_LINK_INVALID
|
||||
|| peer_caps->link_type == UVM_GPU_LINK_C2C
|
||||
)
|
||||
if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_C2C)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps;
|
||||
@ -3296,7 +3293,10 @@ void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64
|
||||
atomic64_sub(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out)
|
||||
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu,
|
||||
struct page *page,
|
||||
size_t size,
|
||||
NvU64 *dma_address_out)
|
||||
{
|
||||
NvU64 dma_addr;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
|
@ -591,7 +591,7 @@ static void fault_buffer_skip_replayable_entry(uvm_parent_gpu_t *parent_gpu, NvU
|
||||
// replayable faults still requires manual adjustment so it is kept in sync
|
||||
// with the encryption IV on the GSP-RM's side.
|
||||
if (g_uvm_global.conf_computing_enabled)
|
||||
uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu);
|
||||
uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, 1);
|
||||
|
||||
parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index);
|
||||
}
|
||||
|
@ -60,17 +60,6 @@ struct uvm_gpu_semaphore_pool_page_struct
|
||||
// Allocation backing the page
|
||||
uvm_rm_mem_t *memory;
|
||||
|
||||
struct {
|
||||
// Unprotected sysmem storing encrypted value of semaphores
|
||||
uvm_rm_mem_t *encrypted_payload_memory;
|
||||
|
||||
// Unprotected sysmem storing encryption auth tags
|
||||
uvm_rm_mem_t *auth_tag_memory;
|
||||
|
||||
// Unprotected sysmem storing plain text notifier values
|
||||
uvm_rm_mem_t *notifier_memory;
|
||||
} conf_computing;
|
||||
|
||||
// Pool the page is part of
|
||||
uvm_gpu_semaphore_pool_t *pool;
|
||||
|
||||
@ -91,6 +80,26 @@ static bool gpu_semaphore_is_secure(uvm_gpu_semaphore_t *semaphore)
|
||||
return gpu_semaphore_pool_is_secure(semaphore->page->pool);
|
||||
}
|
||||
|
||||
static NvU32 get_index(uvm_gpu_semaphore_t *semaphore)
{
    NvU32 offset;
    NvU32 index;

    if (gpu_semaphore_is_secure(semaphore))
        return semaphore->conf_computing.index;

    UVM_ASSERT(semaphore->payload != NULL);
    UVM_ASSERT(semaphore->page != NULL);

    offset = (char*)semaphore->payload - (char*)uvm_rm_mem_get_cpu_va(semaphore->page->memory);
    UVM_ASSERT(offset % UVM_SEMAPHORE_SIZE == 0);

    index = offset / UVM_SEMAPHORE_SIZE;
    UVM_ASSERT(index < UVM_SEMAPHORE_COUNT_PER_PAGE);

    return index;
}
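The non-secure branch of get_index() recovers the slot of a semaphore purely from pointer arithmetic: the byte offset of its payload inside the pool page must be a multiple of UVM_SEMAPHORE_SIZE, and dividing by that stride gives the index. A stand-alone illustration of the same arithmetic; the stride and slot count below are stand-ins, not the driver's constants.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SEM_SIZE  4u     /* stand-in for UVM_SEMAPHORE_SIZE */
#define SEM_COUNT 16u    /* stand-in for UVM_SEMAPHORE_COUNT_PER_PAGE */

static unsigned index_from_payload(const uint8_t *page_base, const uint32_t *payload)
{
    size_t offset = (const uint8_t *)payload - page_base;

    // The payload must sit on a slot boundary inside the page.
    assert(offset % SEM_SIZE == 0);

    unsigned index = (unsigned)(offset / SEM_SIZE);
    assert(index < SEM_COUNT);
    return index;
}

int main(void)
{
    uint32_t page[SEM_COUNT] = {0};
    const uint32_t *payload = &page[5];

    printf("slot %u\n", index_from_payload((const uint8_t *)page, payload)); // prints "slot 5"
    return 0;
}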
|
||||
|
||||
// Use canary values on debug builds to catch semaphore use-after-free. We can
|
||||
// catch release-after-free by simply setting the payload to a known value at
|
||||
// free then checking it on alloc or pool free, but catching acquire-after-free
|
||||
@ -141,83 +150,34 @@ static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
|
||||
return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
|
||||
}
|
||||
|
||||
static void pool_page_free_buffers(uvm_gpu_semaphore_pool_page_t *page)
|
||||
{
|
||||
uvm_rm_mem_free(page->memory);
|
||||
page->memory = NULL;
|
||||
|
||||
if (gpu_semaphore_pool_is_secure(page->pool)) {
|
||||
uvm_rm_mem_free(page->conf_computing.encrypted_payload_memory);
|
||||
uvm_rm_mem_free(page->conf_computing.auth_tag_memory);
|
||||
uvm_rm_mem_free(page->conf_computing.notifier_memory);
|
||||
|
||||
page->conf_computing.encrypted_payload_memory = NULL;
|
||||
page->conf_computing.auth_tag_memory = NULL;
|
||||
page->conf_computing.notifier_memory = NULL;
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(!page->conf_computing.encrypted_payload_memory);
|
||||
UVM_ASSERT(!page->conf_computing.auth_tag_memory);
|
||||
UVM_ASSERT(!page->conf_computing.notifier_memory);
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS pool_page_alloc_buffers(uvm_gpu_semaphore_pool_page_t *page)
|
||||
// Secure semaphore pools are allocated in the CPR of vidmem and only mapped to
// the owning GPU, as no other processor has access to it.
|
||||
static NV_STATUS pool_alloc_secure_page(uvm_gpu_semaphore_pool_t *pool,
|
||||
uvm_gpu_semaphore_pool_page_t *pool_page,
|
||||
uvm_rm_mem_type_t memory_type)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_gpu_semaphore_pool_t *pool = page->pool;
|
||||
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
|
||||
size_t align = 0;
|
||||
bool map_all = true;
|
||||
align = gpu_semaphore_pool_is_secure(pool) ? UVM_CONF_COMPUTING_BUF_ALIGNMENT : 0;
|
||||
map_all = gpu_semaphore_pool_is_secure(pool) ? false : true;
|
||||
|
||||
if (map_all)
|
||||
status = uvm_rm_mem_alloc_and_map_all(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
|
||||
else
|
||||
status = uvm_rm_mem_alloc(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
if (!gpu_semaphore_pool_is_secure(pool))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_ASSERT(gpu_semaphore_pool_is_secure(pool));
|
||||
status = uvm_rm_mem_alloc(pool->gpu,
|
||||
memory_type,
|
||||
UVM_SEMAPHORE_PAGE_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&page->conf_computing.encrypted_payload_memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
&pool_page->memory);
|
||||
|
||||
BUILD_BUG_ON(UVM_CONF_COMPUTING_AUTH_TAG_SIZE % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_SEMAPHORE_COUNT_PER_PAGE * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
|
||||
&page->conf_computing.auth_tag_memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_SEMAPHORE_COUNT_PER_PAGE * sizeof(NvU32),
|
||||
0,
|
||||
&page->conf_computing.notifier_memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
return status;
|
||||
|
||||
return NV_OK;
|
||||
error:
|
||||
pool_page_free_buffers(page);
|
||||
return status;
|
||||
}
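The allocation paths above funnel every failure to a single error label and free whatever was set up so far, which works because the cleanup helper tolerates members that are still NULL. A self-contained sketch of that unwind idiom, with plain malloc/free as stand-ins for the uvm_rm_mem allocations.

#include <stdlib.h>

struct buffers {
    void *encrypted_payload;
    void *auth_tag;
    void *notifier;
};

static void buffers_free(struct buffers *b)
{
    // free(NULL) is a no-op, so partially initialized structs are fine here.
    free(b->encrypted_payload);
    free(b->auth_tag);
    free(b->notifier);
    *b = (struct buffers){0};
}

static int buffers_alloc(struct buffers *b, size_t payload_size, size_t tag_size)
{
    *b = (struct buffers){0};

    b->encrypted_payload = malloc(payload_size);
    if (!b->encrypted_payload)
        goto error;

    b->auth_tag = malloc(tag_size);
    if (!b->auth_tag)
        goto error;

    b->notifier = malloc(sizeof(unsigned));
    if (!b->notifier)
        goto error;

    return 0;

error:
    buffers_free(b);   // single unwind point, mirroring the goto error paths above
    return -1;
}

int main(void)
{
    struct buffers b;
    int ret = buffers_alloc(&b, 4, 16);

    if (ret == 0)
        buffers_free(&b);
    return ret;
}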
|
||||
|
||||
static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_gpu_semaphore_pool_page_t *pool_page;
|
||||
NvU32 *payloads;
|
||||
size_t i;
|
||||
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
|
||||
|
||||
uvm_assert_mutex_locked(&pool->mutex);
|
||||
|
||||
@ -228,9 +188,24 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
|
||||
|
||||
pool_page->pool = pool;
|
||||
|
||||
status = pool_page_alloc_buffers(pool_page);
|
||||
// Whenever the Confidential Computing feature is enabled, engines can
|
||||
// access semaphores only in the CPR of vidmem. Mapping to other GPUs is
|
||||
// also disabled.
|
||||
if (gpu_semaphore_pool_is_secure(pool)) {
|
||||
status = pool_alloc_secure_page(pool, pool_page, memory_type);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
else {
|
||||
status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
|
||||
memory_type,
|
||||
UVM_SEMAPHORE_PAGE_SIZE,
|
||||
0,
|
||||
&pool_page->memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
|
||||
// Verify the GPU can access the semaphore pool.
|
||||
UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
|
||||
@ -242,9 +217,7 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
|
||||
pool->free_semaphores_count += UVM_SEMAPHORE_COUNT_PER_PAGE;
|
||||
|
||||
if (semaphore_uses_canary(pool)) {
|
||||
size_t i;
|
||||
NvU32 *payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
|
||||
|
||||
payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
|
||||
for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
|
||||
payloads[i] = make_canary(0);
|
||||
}
|
||||
@ -280,7 +253,7 @@ static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)
|
||||
|
||||
pool->free_semaphores_count -= UVM_SEMAPHORE_COUNT_PER_PAGE;
|
||||
list_del(&page->all_pages_node);
|
||||
pool_page_free_buffers(page);
|
||||
uvm_rm_mem_free(page->memory);
|
||||
uvm_kvfree(page);
|
||||
}
|
||||
|
||||
@ -300,22 +273,19 @@ NV_STATUS uvm_gpu_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_semaph
|
||||
goto done;
|
||||
|
||||
list_for_each_entry(page, &pool->pages, all_pages_node) {
|
||||
const NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
|
||||
|
||||
UVM_ASSERT(semaphore_index <= UVM_SEMAPHORE_COUNT_PER_PAGE);
|
||||
|
||||
NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
|
||||
if (semaphore_index == UVM_SEMAPHORE_COUNT_PER_PAGE)
|
||||
continue;
|
||||
|
||||
semaphore->page = page;
|
||||
semaphore->index = semaphore_index;
|
||||
|
||||
if (gpu_semaphore_pool_is_secure(pool)) {
|
||||
|
||||
// Reset the notifier to prevent detection of a false attack when
// checking for an updated value
|
||||
*uvm_gpu_semaphore_get_notifier_cpu_va(semaphore) = semaphore->conf_computing.last_observed_notifier;
|
||||
semaphore->conf_computing.index = semaphore_index;
|
||||
}
|
||||
else {
|
||||
semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) +
|
||||
semaphore_index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
semaphore->page = page;
|
||||
|
||||
if (semaphore_uses_canary(pool))
|
||||
UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));
|
||||
@ -341,6 +311,7 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
uvm_gpu_semaphore_pool_page_t *page;
|
||||
uvm_gpu_semaphore_pool_t *pool;
|
||||
NvU32 index;
|
||||
|
||||
UVM_ASSERT(semaphore);
|
||||
|
||||
@ -352,6 +323,7 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
|
||||
return;
|
||||
|
||||
pool = page->pool;
|
||||
index = get_index(semaphore);
|
||||
|
||||
// Write a known value lower than the current payload in an attempt to catch
// release-after-free and acquire-after-free.
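Writing a recognizable value into the payload at free time turns a later release-after-free or acquire-after-free into something an assertion can catch, which is what the make_canary()/is_canary() helpers used elsewhere in this file are for. A stand-alone sketch of the idea; the canary encoding and the single global slot are illustrative, not the driver's layout.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define CANARY_BASE 0xdeadbe00u

static inline uint32_t make_canary(uint32_t tag) { return CANARY_BASE | (tag & 0xff); }
static inline bool     is_canary(uint32_t v)     { return (v & 0xffffff00u) == CANARY_BASE; }

static uint32_t g_payload;          // stands in for the semaphore payload slot

static void sem_free(void)
{
    // Leave a marker behind so a stray release after this point is recognizable.
    g_payload = make_canary(0);
}

static void sem_alloc(void)
{
    // A freshly allocated slot should still hold the canary written at free time.
    assert(is_canary(g_payload));
    g_payload = 0;
}

int main(void)
{
    g_payload = make_canary(0);
    sem_alloc();
    sem_free();
    assert(is_canary(g_payload));
    return 0;
}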
|
||||
@ -361,9 +333,10 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
|
||||
uvm_mutex_lock(&pool->mutex);
|
||||
|
||||
semaphore->page = NULL;
|
||||
semaphore->payload = NULL;
|
||||
|
||||
++pool->free_semaphores_count;
|
||||
__set_bit(semaphore->index, page->free_semaphores);
|
||||
__set_bit(index, page->free_semaphores);
|
||||
|
||||
uvm_mutex_unlock(&pool->mutex);
|
||||
}
|
||||
@ -476,72 +449,18 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
|
||||
|
||||
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space)
|
||||
{
|
||||
NvU32 index = get_index(semaphore);
|
||||
NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space).address;
|
||||
|
||||
return base_va + semaphore->index * UVM_SEMAPHORE_SIZE;
|
||||
}
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
char *base_va;
|
||||
|
||||
if (gpu_semaphore_is_secure(semaphore))
|
||||
return &semaphore->conf_computing.cached_payload;
|
||||
|
||||
base_va = uvm_rm_mem_get_cpu_va(semaphore->page->memory);
|
||||
return (NvU32*)(base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
char *encrypted_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.encrypted_payload_memory);
|
||||
|
||||
return (NvU32*)(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
NvU64 encrypted_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.encrypted_payload_memory,
|
||||
semaphore->page->pool->gpu);
|
||||
|
||||
return uvm_gpu_address_virtual_unprotected(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
uvm_gpu_semaphore_notifier_t *notifier_base_va =
|
||||
uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);
|
||||
|
||||
return notifier_base_va + semaphore->index;
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
NvU64 notifier_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.notifier_memory,
|
||||
semaphore->page->pool->gpu);
|
||||
|
||||
return uvm_gpu_address_virtual_unprotected(notifier_base_va +
|
||||
semaphore->index * sizeof(uvm_gpu_semaphore_notifier_t));
|
||||
}
|
||||
|
||||
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
char *auth_tag_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.auth_tag_memory);
|
||||
|
||||
return (void*)(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
NvU64 auth_tag_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.auth_tag_memory,
|
||||
semaphore->page->pool->gpu);
|
||||
|
||||
return uvm_gpu_address_virtual_unprotected(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
|
||||
return base_va + UVM_SEMAPHORE_SIZE * index;
|
||||
}
|
||||
|
||||
NvU32 uvm_gpu_semaphore_get_payload(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
return UVM_GPU_READ_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore));
|
||||
if (gpu_semaphore_is_secure(semaphore))
|
||||
return UVM_GPU_READ_ONCE(semaphore->conf_computing.cached_payload);
|
||||
|
||||
return UVM_GPU_READ_ONCE(*semaphore->payload);
|
||||
}
|
||||
|
||||
void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload)
|
||||
@ -558,7 +477,10 @@ void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload
|
||||
// the GPU correctly even on non-SMP).
|
||||
mb();
|
||||
|
||||
UVM_GPU_WRITE_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore), payload);
|
||||
if (gpu_semaphore_is_secure(semaphore))
|
||||
UVM_GPU_WRITE_ONCE(semaphore->conf_computing.cached_payload, payload);
|
||||
else
|
||||
UVM_GPU_WRITE_ONCE(*semaphore->payload, payload);
|
||||
}
|
||||
|
||||
// This function is intended to catch channels which have been left dangling in
|
||||
@ -624,11 +546,22 @@ void uvm_gpu_tracking_semaphore_free(uvm_gpu_tracking_semaphore_t *tracking_sem)
|
||||
uvm_gpu_semaphore_free(&tracking_sem->semaphore);
|
||||
}
|
||||
|
||||
static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
|
||||
static bool should_skip_secure_semaphore_update(NvU32 last_observed_notifier, NvU32 gpu_notifier)
|
||||
{
|
||||
// There is no new value, or the GPU is currently writing the new encrypted
// material, in which case reading it now would return corrupted data.
return (last_observed_notifier == gpu_notifier) || (gpu_notifier % 2);
|
||||
}
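should_skip_secure_semaphore_update() packs two conditions into one counter: an odd notifier means the GPU is still writing the new encrypted material, and a notifier equal to the last one consumed means there is nothing new to decrypt. A few worked cases under that same even/odd convention; the function name below is a stand-in.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool should_skip(uint32_t last_observed, uint32_t gpu_notifier)
{
    return (last_observed == gpu_notifier) || (gpu_notifier % 2);
}

int main(void)
{
    assert(should_skip(4, 4));   // no new value since the last decryption
    assert(should_skip(4, 5));   // odd: the GPU is still writing, skip for now
    assert(!should_skip(4, 6));  // even and newer: safe to decrypt the payload
    return 0;
}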
|
||||
|
||||
static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
UvmCslIv local_iv;
|
||||
NvU32 local_payload;
|
||||
uvm_gpu_semaphore_notifier_t gpu_notifier;
|
||||
uvm_gpu_semaphore_notifier_t new_gpu_notifier = 0;
|
||||
NvU32 new_sem_value;
|
||||
NvU32 gpu_notifier;
|
||||
NvU32 last_observed_notifier;
|
||||
NvU32 new_gpu_notifier = 0;
|
||||
NvU32 iv_index = 0;
|
||||
|
||||
// A channel can have multiple entries pending and the tracking semaphore
|
||||
// update of each entry can race with this function. Since the semaphore
|
||||
@ -637,72 +570,64 @@ static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_g
|
||||
unsigned tries_left = channel->num_gpfifo_entries;
|
||||
NV_STATUS status = NV_OK;
|
||||
NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
|
||||
uvm_gpu_semaphore_notifier_t *semaphore_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);
|
||||
UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
|
||||
void *auth_tag_cpu_addr = uvm_rm_mem_get_cpu_va(semaphore->conf_computing.auth_tag);
|
||||
NvU32 *gpu_notifier_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.notifier);
|
||||
NvU32 *payload_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.encrypted_payload);
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
|
||||
do {
|
||||
gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
|
||||
last_observed_notifier = semaphore->conf_computing.last_observed_notifier;
|
||||
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
|
||||
UVM_ASSERT(last_observed_notifier <= gpu_notifier);
|
||||
|
||||
UVM_ASSERT(gpu_notifier >= semaphore->conf_computing.last_observed_notifier);
|
||||
if (should_skip_secure_semaphore_update(last_observed_notifier, gpu_notifier))
|
||||
return;
|
||||
|
||||
do {
|
||||
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
|
||||
|
||||
// Odd notifier value means there's an update in progress.
|
||||
if (gpu_notifier % 2)
|
||||
continue;
|
||||
|
||||
// There's no change since last time
|
||||
if (gpu_notifier == semaphore->conf_computing.last_observed_notifier)
|
||||
return;
|
||||
|
||||
// Make sure no memory accesses happen before we read the notifier
|
||||
smp_mb__after_atomic();
|
||||
|
||||
memcpy(local_auth_tag, uvm_gpu_semaphore_get_auth_tag_cpu_va(semaphore), sizeof(local_auth_tag));
|
||||
local_payload = UVM_READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
|
||||
iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
|
||||
memcpy(local_auth_tag, auth_tag_cpu_addr, sizeof(local_auth_tag));
|
||||
local_payload = UVM_READ_ONCE(*payload_cpu_addr);
|
||||
memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));
|
||||
|
||||
// Make sure the second read of notifier happens after
|
||||
// all memory accesses.
|
||||
smp_mb__before_atomic();
|
||||
new_gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
|
||||
new_gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
|
||||
tries_left--;
|
||||
} while ((tries_left > 0) && ((gpu_notifier != new_gpu_notifier) || (gpu_notifier % 2)));
|
||||
|
||||
if (!tries_left) {
|
||||
status = NV_ERR_INVALID_STATE;
|
||||
goto error;
|
||||
}
|
||||
else {
|
||||
NvU32 key_version;
|
||||
const NvU32 iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
|
||||
NvU32 new_semaphore_value;
|
||||
|
||||
UVM_ASSERT(gpu_notifier == new_gpu_notifier);
|
||||
UVM_ASSERT(gpu_notifier % 2 == 0);
|
||||
|
||||
// CPU decryption is guaranteed to use the same key version as the
|
||||
// associated GPU encryption, because if there was any key rotation in
|
||||
// between, then key rotation waited for all channels to complete before
|
||||
// proceeding. The wait implies that the semaphore value matches the
|
||||
// last one encrypted on the GPU, so this CPU decryption should happen
|
||||
// before the key is rotated.
|
||||
key_version = uvm_channel_pool_key_version(channel->pool);
|
||||
|
||||
if (gpu_notifier == new_gpu_notifier) {
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
&new_semaphore_value,
|
||||
&new_sem_value,
|
||||
&local_payload,
|
||||
&semaphore->conf_computing.ivs[iv_index],
|
||||
key_version,
|
||||
sizeof(new_semaphore_value),
|
||||
&local_iv,
|
||||
sizeof(new_sem_value),
|
||||
&local_auth_tag);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
uvm_gpu_semaphore_set_payload(semaphore, new_semaphore_value);
|
||||
uvm_gpu_semaphore_set_payload(semaphore, new_sem_value);
|
||||
UVM_WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
error:
|
||||
// Decryption failure is a fatal error, as is running out of retry attempts.
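The update loop above is essentially a bounded sequence-counter read: snapshot the notifier, copy out the encrypted payload, auth tag and IV, then re-read the notifier and trust the copies only if both reads are equal and even, giving up after num_gpfifo_entries attempts. A stand-alone sketch of that retry shape; the payload layout and the omission of the memory barriers the kernel code inserts are simplifications.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct shared {
    volatile uint32_t notifier;   // even: stable, odd: writer in progress
    uint8_t payload[16];          // stands in for encrypted payload + auth tag + IV
};

// Copy a consistent snapshot of s->payload into out, trying at most max_tries times.
// The real code brackets the copies with smp_mb barriers; they are elided here.
static bool read_snapshot(const struct shared *s, uint8_t *out, unsigned max_tries)
{
    while (max_tries--) {
        uint32_t before = s->notifier;

        if (before % 2)
            continue;                       // writer in progress, try again

        memcpy(out, s->payload, sizeof(s->payload));

        uint32_t after = s->notifier;
        if (after == before)
            return true;                    // nothing changed under us: snapshot is usable
    }

    return false;                           // bounded retries exhausted, treat as fatal
}

int main(void)
{
    struct shared s = { .notifier = 6, .payload = { 1, 2, 3 } };
    uint8_t copy[sizeof(s.payload)];

    return read_snapshot(&s, copy, 4) ? 0 : 1;
}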
|
||||
@ -725,11 +650,11 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
|
||||
else
|
||||
uvm_assert_spinlock_locked(&tracking_semaphore->s_lock);
|
||||
|
||||
if (gpu_semaphore_is_secure(&tracking_semaphore->semaphore)) {
|
||||
if (tracking_semaphore->semaphore.conf_computing.encrypted_payload) {
|
||||
// TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
|
||||
// mechanism to all semaphore
|
||||
uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
|
||||
gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
|
||||
uvm_gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
|
||||
}
|
||||
|
||||
new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);
|
||||
@ -765,7 +690,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
|
||||
UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
|
||||
"GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
|
||||
uvm_gpu_name(tracking_semaphore->semaphore.page->pool->gpu),
|
||||
(NvU64)(uintptr_t)uvm_gpu_semaphore_get_cpu_va(&tracking_semaphore->semaphore),
|
||||
(NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
|
||||
old_value, new_value);
|
||||
|
||||
// Use an atomic write even though the lock is held so that the value can
|
||||
|
@ -29,8 +29,6 @@
|
||||
#include "uvm_rm_mem.h"
|
||||
#include "uvm_linux.h"
|
||||
|
||||
typedef NvU32 uvm_gpu_semaphore_notifier_t;
|
||||
|
||||
// A GPU semaphore is a memory location accessible by the GPUs and the CPU
|
||||
// that's used for synchronization among them.
|
||||
// The GPU has primitives to acquire (wait for) and release (set) 4-byte memory
|
||||
@ -47,15 +45,17 @@ struct uvm_gpu_semaphore_struct
|
||||
// The semaphore pool page the semaphore came from
|
||||
uvm_gpu_semaphore_pool_page_t *page;
|
||||
|
||||
// Index of the semaphore in semaphore page
|
||||
NvU16 index;
|
||||
|
||||
// Pointer to the memory location
|
||||
NvU32 *payload;
|
||||
struct {
|
||||
UvmCslIv *ivs;
|
||||
NvU16 index;
|
||||
NvU32 cached_payload;
|
||||
|
||||
uvm_gpu_semaphore_notifier_t last_pushed_notifier;
|
||||
uvm_gpu_semaphore_notifier_t last_observed_notifier;
|
||||
uvm_rm_mem_t *encrypted_payload;
|
||||
uvm_rm_mem_t *notifier;
|
||||
uvm_rm_mem_t *auth_tag;
|
||||
UvmCslIv *ivs;
|
||||
NvU32 last_pushed_notifier;
|
||||
NvU32 last_observed_notifier;
|
||||
} conf_computing;
|
||||
};
|
||||
|
||||
@ -151,17 +151,6 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
|
||||
|
||||
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space);
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
// Read the 32-bit payload of the semaphore
|
||||
// Notably doesn't provide any memory ordering guarantees and needs to be used with
|
||||
// care. For an example of what needs to be considered see
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -251,6 +251,9 @@ static uvm_hal_class_ops_t host_table[] =
|
||||
.semaphore_release = uvm_hal_turing_host_semaphore_release,
|
||||
.clear_faulted_channel_method = uvm_hal_turing_host_clear_faulted_channel_method,
|
||||
.set_gpfifo_entry = uvm_hal_turing_host_set_gpfifo_entry,
|
||||
.tlb_invalidate_all = uvm_hal_turing_host_tlb_invalidate_all,
|
||||
.tlb_invalidate_va = uvm_hal_turing_host_tlb_invalidate_va,
|
||||
.tlb_invalidate_test = uvm_hal_turing_host_tlb_invalidate_test,
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -632,13 +635,19 @@ NV_STATUS uvm_hal_init_table(void)
|
||||
return status;
|
||||
}
|
||||
|
||||
status = ops_init_from_parent(host_table, ARRAY_SIZE(host_table), HOST_OP_COUNT, offsetof(uvm_hal_class_ops_t, u.host_ops));
|
||||
status = ops_init_from_parent(host_table,
|
||||
ARRAY_SIZE(host_table),
|
||||
HOST_OP_COUNT,
|
||||
offsetof(uvm_hal_class_ops_t, u.host_ops));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("ops_init_from_parent(host_table) failed: %s\n", nvstatusToString(status));
|
||||
return status;
|
||||
}
|
||||
|
||||
status = ops_init_from_parent(arch_table, ARRAY_SIZE(arch_table), ARCH_OP_COUNT, offsetof(uvm_hal_class_ops_t, u.arch_ops));
|
||||
status = ops_init_from_parent(arch_table,
|
||||
ARRAY_SIZE(arch_table),
|
||||
ARCH_OP_COUNT,
|
||||
offsetof(uvm_hal_class_ops_t, u.arch_ops));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("ops_init_from_parent(arch_table) failed: %s\n", nvstatusToString(status));
|
||||
return status;
|
||||
@ -932,14 +941,16 @@ const char *uvm_mmu_engine_type_string(uvm_mmu_engine_type_t mmu_engine_type)
|
||||
void uvm_hal_print_fault_entry(const uvm_fault_buffer_entry_t *entry)
|
||||
{
|
||||
UVM_DBG_PRINT("fault_address: 0x%llx\n", entry->fault_address);
|
||||
UVM_DBG_PRINT(" fault_instance_ptr: {0x%llx:%s}\n", entry->instance_ptr.address,
|
||||
UVM_DBG_PRINT(" fault_instance_ptr: {0x%llx:%s}\n",
|
||||
entry->instance_ptr.address,
|
||||
uvm_aperture_string(entry->instance_ptr.aperture));
|
||||
UVM_DBG_PRINT(" fault_type: %s\n", uvm_fault_type_string(entry->fault_type));
|
||||
UVM_DBG_PRINT(" fault_access_type: %s\n", uvm_fault_access_type_string(entry->fault_access_type));
|
||||
UVM_DBG_PRINT(" is_replayable: %s\n", entry->is_replayable? "true": "false");
|
||||
UVM_DBG_PRINT(" is_virtual: %s\n", entry->is_virtual? "true": "false");
|
||||
UVM_DBG_PRINT(" in_protected_mode: %s\n", entry->in_protected_mode? "true": "false");
|
||||
UVM_DBG_PRINT(" fault_source.client_type: %s\n", uvm_fault_client_type_string(entry->fault_source.client_type));
|
||||
UVM_DBG_PRINT(" fault_source.client_type: %s\n",
|
||||
uvm_fault_client_type_string(entry->fault_source.client_type));
|
||||
UVM_DBG_PRINT(" fault_source.client_id: %d\n", entry->fault_source.client_id);
|
||||
UVM_DBG_PRINT(" fault_source.gpc_id: %d\n", entry->fault_source.gpc_id);
|
||||
UVM_DBG_PRINT(" fault_source.mmu_engine_id: %d\n", entry->fault_source.mmu_engine_id);
|
||||
@ -962,12 +973,14 @@ const char *uvm_access_counter_type_string(uvm_access_counter_type_t access_coun
|
||||
void uvm_hal_print_access_counter_buffer_entry(const uvm_access_counter_buffer_entry_t *entry)
|
||||
{
|
||||
if (!entry->address.is_virtual) {
|
||||
UVM_DBG_PRINT("physical address: {0x%llx:%s}\n", entry->address.address,
|
||||
UVM_DBG_PRINT("physical address: {0x%llx:%s}\n",
|
||||
entry->address.address,
|
||||
uvm_aperture_string(entry->address.aperture));
|
||||
}
|
||||
else {
|
||||
UVM_DBG_PRINT("virtual address: 0x%llx\n", entry->address.address);
|
||||
UVM_DBG_PRINT(" instance_ptr {0x%llx:%s}\n", entry->virtual_info.instance_ptr.address,
|
||||
UVM_DBG_PRINT(" instance_ptr {0x%llx:%s}\n",
|
||||
entry->virtual_info.instance_ptr.address,
|
||||
uvm_aperture_string(entry->virtual_info.instance_ptr.aperture));
|
||||
UVM_DBG_PRINT(" mmu_engine_type %s\n", uvm_mmu_engine_type_string(entry->virtual_info.mmu_engine_type));
|
||||
UVM_DBG_PRINT(" mmu_engine_id %u\n", entry->virtual_info.mmu_engine_id);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -112,6 +112,10 @@ void uvm_hal_pascal_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_turing_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
@ -149,42 +153,49 @@ typedef void (*uvm_hal_host_tlb_invalidate_va_t)(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_turing_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
|
||||
typedef void (*uvm_hal_host_tlb_invalidate_test_t)(uvm_push_t *push,
|
||||
@ -196,6 +207,9 @@ void uvm_hal_maxwell_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
void uvm_hal_pascal_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
|
||||
void uvm_hal_turing_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
|
||||
void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
|
||||
@ -445,15 +459,15 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
|
||||
|
||||
// Retrieve the page-tree HAL for a given big page size
|
||||
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU32 big_page_size);
|
||||
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU64 big_page_size);
|
||||
typedef void (*uvm_hal_mmu_enable_prefetch_faults_t)(uvm_parent_gpu_t *parent_gpu);
|
||||
typedef void (*uvm_hal_mmu_disable_prefetch_faults_t)(uvm_parent_gpu_t *parent_gpu);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size);
|
||||
void uvm_hal_maxwell_mmu_enable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_maxwell_mmu_disable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_pascal_mmu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
|
||||
|
@ -284,10 +284,8 @@ static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
|
||||
|
||||
// Reset preferred location and accessed-by of policy nodes if needed.
|
||||
uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
|
||||
if (uvm_va_policy_preferred_location_equal(&node->policy, gpu->id, NUMA_NO_NODE)) {
|
||||
if (uvm_id_equal(node->policy.preferred_location, gpu->id))
|
||||
node->policy.preferred_location = UVM_ID_INVALID;
|
||||
node->policy.preferred_nid = NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
|
||||
}
|
||||
@ -1601,7 +1599,7 @@ static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
|
||||
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
|
||||
|
||||
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2020-2022 NVIDIA Corporation
|
||||
Copyright (c) 2020-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -157,6 +157,7 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
|
||||
@ -183,7 +184,12 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
@ -196,7 +202,9 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
MEM_OP_D, HWCONST(C86F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
uvm_hal_tlb_invalidate_membar(push, membar);
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
@ -204,7 +212,7 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
@ -212,6 +220,7 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 va_lo;
|
||||
NvU32 va_hi;
|
||||
NvU64 end;
|
||||
@ -221,9 +230,9 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 log2_invalidation_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
|
||||
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
|
||||
|
||||
// The invalidation size must be a power-of-two number of pages containing
|
||||
@ -277,8 +286,13 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
|
||||
HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C86F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
@ -292,7 +306,9 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
MEM_OP_D, HWCONST(C86F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
uvm_hal_tlb_invalidate_membar(push, membar);
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
gpu->parent->host_hal->membar_gpu(push);
|
||||
}
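The targeted invalidate has to describe its range as log2 of a power-of-two number of pages whose naturally aligned region contains [base, base + size). One way to derive that value, sketched stand-alone; the driver's own computation may differ in detail, and base is assumed page-aligned as asserted above.

#include <assert.h>
#include <stdint.h>

// Smallest n such that the naturally aligned region of (1 << n) pages of
// page_size that contains base also contains base + size.
static uint32_t log2_invalidation_size(uint64_t base, uint64_t size, uint64_t page_size)
{
    uint32_t n = 0;

    while ((base % (((uint64_t)1 << n) * page_size)) + size > ((uint64_t)1 << n) * page_size)
        n++;

    return n;
}

int main(void)
{
    // 3 x 4KiB pages from offset 0 fit in a 4-page region (log2 = 2), but the
    // same 2-page range starting at 0x3000 straddles alignment boundaries and
    // needs an 8-page region (log2 = 3).
    assert(log2_invalidation_size(0x0000, 3 * 0x1000, 0x1000) == 2);
    assert(log2_invalidation_size(0x3000, 2 * 0x1000, 0x1000) == 3);
    return 0;
}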
|
||||
|
||||
void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
@ -300,12 +316,12 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
|
||||
{
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 invalidate_gpc_value = 0;
|
||||
NvU32 aperture_value = 0;
|
||||
NvU32 pdb_lo = 0;
|
||||
NvU32 pdb_hi = 0;
|
||||
NvU32 page_table_level = 0;
|
||||
uvm_membar_t membar;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
@ -332,6 +348,11 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (params->membar == UvmInvalidateTlbMemBarSys)
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
if (params->disable_gpc_invalidate)
|
||||
invalidate_gpc_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
|
||||
else
|
||||
@ -343,7 +364,7 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
NvU32 va_lo = va & HWMASK(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
NvU32 va_hi = va >> HWSIZE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
|
||||
HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C86F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
@ -358,7 +379,7 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
else {
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
@ -372,14 +393,9 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
|
||||
if (params->membar == UvmInvalidateTlbMemBarSys)
|
||||
membar = UVM_MEMBAR_SYS;
|
||||
else if (params->membar == UvmInvalidateTlbMemBarLocal)
|
||||
membar = UVM_MEMBAR_GPU;
|
||||
else
|
||||
membar = UVM_MEMBAR_NONE;
|
||||
|
||||
uvm_hal_tlb_invalidate_membar(push, membar);
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (params->membar == UvmInvalidateTlbMemBarLocal)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_host_set_gpfifo_pushbuffer_segment_base(NvU64 *fifo_entry, NvU64 pushbuffer_va)
|
||||
|
@ -61,7 +61,7 @@ uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
|
||||
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
|
||||
}
|
||||
|
||||
static NvU32 page_table_depth_hopper(NvU32 page_size)
|
||||
static NvU32 page_table_depth_hopper(NvU64 page_size)
|
||||
{
|
||||
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check
|
||||
if (page_size == UVM_PAGE_SIZE_2M)
|
||||
@ -79,7 +79,7 @@ static NvU32 entries_per_index_hopper(NvU32 depth)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvLength entry_offset_hopper(NvU32 depth, NvU32 page_size)
|
||||
static NvLength entry_offset_hopper(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 6);
|
||||
if ((page_size == UVM_PAGE_SIZE_4K) && (depth == 4))
|
||||
@ -92,7 +92,7 @@ static NvLength entry_size_hopper(NvU32 depth)
|
||||
return entries_per_index_hopper(depth) * 8;
|
||||
}
|
||||
|
||||
static NvU32 index_bits_hopper(NvU32 depth, NvU32 page_size)
|
||||
static NvU32 index_bits_hopper(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
static const NvU32 bit_widths[] = {1, 9, 9, 9, 8};
|
||||
|
||||
@ -120,7 +120,7 @@ static NvU32 num_va_bits_hopper(void)
|
||||
return 57;
|
||||
}
|
||||
|
||||
static NvLength allocation_size_hopper(NvU32 depth, NvU32 page_size)
|
||||
static NvLength allocation_size_hopper(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 6);
|
||||
if (depth == 5 && page_size == UVM_PAGE_SIZE_64K)
|
||||
@ -233,7 +233,7 @@ static NvU64 make_sparse_pte_hopper(void)
|
||||
HWCONST64(_MMU_VER3, PTE, PCF, SPARSE);
|
||||
}
|
||||
|
||||
static NvU64 unmapped_pte_hopper(NvU32 page_size)
|
||||
static NvU64 unmapped_pte_hopper(NvU64 page_size)
|
||||
{
|
||||
// Setting PCF to NO_VALID_4KB_PAGE on an otherwise-zeroed big PTE causes
|
||||
// the corresponding 4k PTEs to be ignored. This allows the invalidation of
|
||||
@ -490,7 +490,7 @@ static void make_pde_hopper(void *entry,
|
||||
|
||||
static uvm_mmu_mode_hal_t hopper_mmu_mode_hal;
|
||||
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU32 big_page_size)
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size)
|
||||
{
|
||||
static bool initialized = false;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2013-2023 NVidia Corporation
|
||||
Copyright (c) 2013-2024 NVidia Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -494,7 +494,7 @@ typedef struct
|
||||
NvU64 base NV_ALIGN_BYTES(8); // IN
|
||||
NvU64 length NV_ALIGN_BYTES(8); // IN
|
||||
NvU64 offset NV_ALIGN_BYTES(8); // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
|
||||
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
|
||||
NvS32 rmCtrlFd; // IN
|
||||
NvU32 hClient; // IN
|
||||
@ -952,7 +952,6 @@ typedef struct
|
||||
NvU32 version; // OUT
|
||||
} UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS;
|
||||
|
||||
|
||||
//
|
||||
// UvmMapDynamicParallelismRegion
|
||||
//
|
||||
@ -995,7 +994,7 @@ typedef struct
|
||||
{
|
||||
NvU64 base NV_ALIGN_BYTES(8); // IN
|
||||
NvU64 length NV_ALIGN_BYTES(8); // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
|
||||
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
|
||||
NV_STATUS rmStatus; // OUT
|
||||
} UVM_ALLOC_SEMAPHORE_POOL_PARAMS;
|
||||
|
@ -27,7 +27,7 @@
|
||||
|
||||
const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
|
||||
{
|
||||
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 36);
|
||||
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 34);
|
||||
|
||||
switch (lock_order) {
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_INVALID);
|
||||
@ -48,9 +48,7 @@ const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHUNK_MAPPING);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PAGE_TREE);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION_WLC);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_WLC_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_SEC2_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PUSH);
|
||||
|
@ -322,15 +322,6 @@
|
||||
// Operations not allowed while holding this lock
|
||||
// - GPU memory allocation which can evict
|
||||
//
|
||||
// - Channel pool key rotation lock
|
||||
// Order: UVM_LOCK_ORDER_KEY_ROTATION
|
||||
// Condition: Confidential Computing is enabled
|
||||
// Mutex per channel pool
|
||||
//
|
||||
// The lock ensures mutual exclusion during key rotation affecting all the
|
||||
// channels in the associated pool. Key rotation in WLC pools is handled
|
||||
// using a separate lock order, see UVM_LOCK_ORDER_KEY_ROTATION_WLC below.
|
||||
//
|
||||
// - CE channel CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_CSL_PUSH
|
||||
// Condition: The Confidential Computing feature is enabled
|
||||
@ -347,15 +338,6 @@
|
||||
// Operations allowed while holding this lock
|
||||
// - Pushing work to CE channels (except for WLC channels)
|
||||
//
|
||||
// - WLC channel pool key rotation lock
|
||||
// Order: UVM_LOCK_ORDER_KEY_ROTATION_WLC
|
||||
// Condition: Confidential Computing is enabled
|
||||
// Mutex of WLC channel pool
|
||||
//
|
||||
// The lock has the same purpose as the regular channel pool key rotation
|
||||
// lock. Using a different order lock for WLC channels allows key rotation
|
||||
// on those channels during indirect work submission.
|
||||
//
|
||||
// - WLC CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_CSL_WLC_PUSH
|
||||
// Condition: The Confidential Computing feature is enabled
|
||||
@ -502,9 +484,7 @@ typedef enum
|
||||
UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL,
|
||||
UVM_LOCK_ORDER_CHUNK_MAPPING,
|
||||
UVM_LOCK_ORDER_PAGE_TREE,
|
||||
UVM_LOCK_ORDER_KEY_ROTATION,
|
||||
UVM_LOCK_ORDER_CSL_PUSH,
|
||||
UVM_LOCK_ORDER_KEY_ROTATION_WLC,
|
||||
UVM_LOCK_ORDER_CSL_WLC_PUSH,
|
||||
UVM_LOCK_ORDER_CSL_SEC2_PUSH,
|
||||
UVM_LOCK_ORDER_PUSH,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -61,7 +61,7 @@ typedef struct
|
||||
size_t buffer_size;
|
||||
|
||||
// Page size in bytes
|
||||
NvU32 page_size;
|
||||
NvU64 page_size;
|
||||
|
||||
// Size of a single PTE in bytes
|
||||
NvU32 pte_size;
|
||||
@ -91,7 +91,7 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
|
||||
uvm_gpu_t *gpu,
|
||||
const uvm_map_rm_params_t *map_rm_params,
|
||||
NvU64 length,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_pte_buffer_t *pte_buffer)
|
||||
{
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, gpu);
|
||||
@ -650,9 +650,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
return NV_OK;
|
||||
}
|
||||
// This is a local or peer allocation, so the owning GPU must have been
|
||||
// registered.
|
||||
// This also checks for if EGM owning GPU is registered.
|
||||
|
||||
// registered. This also checks whether the EGM owning GPU is registered.
|
||||
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
|
||||
if (!owning_gpu)
|
||||
return NV_ERR_INVALID_DEVICE;
|
||||
@ -665,7 +663,6 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
// semantics of sysmem allocations.
|
||||
|
||||
// Check if peer access for peer memory is enabled.
|
||||
// This path also handles EGM allocations.
|
||||
if (owning_gpu != mapping_gpu && (!mem_info->sysmem || mem_info->egm)) {
|
||||
// TODO: Bug 1757136: In SLI, the returned UUID may be different but a
|
||||
// local mapping must be used. We need to query SLI groups to know
|
||||
@ -856,9 +853,10 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
|
||||
UvmGpuMemoryInfo mem_info;
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
|
||||
NvU32 mapping_page_size;
|
||||
NvU64 mapping_page_size;
|
||||
NvU64 biggest_mapping_page_size;
|
||||
NvU64 alignments;
|
||||
NvU32 smallest_alignment;
|
||||
NvU64 smallest_alignment;
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_assert_rwsem_locked_read(&va_space->lock);
|
||||
@ -947,9 +945,11 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
|
||||
// Check for the maximum page size for the mapping of vidmem allocations,
|
||||
// the vMMU segment size may limit the range of page sizes.
|
||||
biggest_mapping_page_size = uvm_mmu_biggest_page_size_up_to(&gpu_va_space->page_tables,
|
||||
mapping_gpu->mem_info.max_vidmem_page_size);
|
||||
if (!ext_gpu_map->is_sysmem && (ext_gpu_map->gpu == ext_gpu_map->owning_gpu) &&
|
||||
(mapping_page_size > mapping_gpu->mem_info.max_vidmem_page_size))
|
||||
mapping_page_size = mapping_gpu->mem_info.max_vidmem_page_size;
|
||||
(mapping_page_size > biggest_mapping_page_size))
|
||||
mapping_page_size = biggest_mapping_page_size;
|
||||
|
||||
mem_info.pageSize = mapping_page_size;
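uvm_mmu_biggest_page_size_up_to() is used above to cap the mapping page size at what the vMMU segment size allows. The underlying selection, the largest supported page size not exceeding a limit taken from a bitmask of power-of-two sizes, can be sketched stand-alone; the mask values are examples only.

#include <assert.h>
#include <stdint.h>

#define SZ_4K   0x1000ull
#define SZ_64K  0x10000ull
#define SZ_2M   0x200000ull

// Return the largest page size in 'mask' that is <= 'limit', or 0 if none fits.
// Every bit in 'mask' is assumed to be a power-of-two page size.
static uint64_t biggest_page_size_up_to(uint64_t mask, uint64_t limit)
{
    uint64_t best = 0;

    for (uint64_t candidate = mask; candidate; candidate &= candidate - 1) {
        uint64_t size = candidate & ~(candidate - 1);   // lowest set bit

        if (size <= limit && size > best)
            best = size;
    }

    return best;
}

int main(void)
{
    uint64_t mask = SZ_4K | SZ_64K | SZ_2M;

    assert(biggest_page_size_up_to(mask, SZ_2M) == SZ_2M);
    assert(biggest_page_size_up_to(mask, SZ_64K) == SZ_64K);
    assert(biggest_page_size_up_to(mask, SZ_4K - 1) == 0);
    return 0;
}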
|
||||
|
||||
@ -986,7 +986,7 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E
|
||||
if (uvm_api_range_invalid_4k(params->base, params->length))
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS_V2)
|
||||
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS)
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
|
||||
mapped_gpus = uvm_processor_mask_cache_alloc();
|
||||
|
@ -108,7 +108,7 @@ void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
// No per VA invalidate on Maxwell, redirect to invalidate all.
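Maxwell has no targeted invalidate, so the per-VA entry point just forwards to the invalidate-all path. A hedged sketch of what that redirect amounts to; the exact callee (assumed here to be uvm_hal_maxwell_host_tlb_invalidate_all) and the body are illustrative, not the shipped implementation.

// Sketch of the redirect described above; the real function lives in the
// Maxwell host HAL and may differ in detail.
void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
                                            uvm_gpu_phys_address_t pdb,
                                            NvU32 depth,
                                            NvU64 base,
                                            NvU64 size,
                                            NvU64 page_size,
                                            uvm_membar_t membar)
{
    // The per-VA arguments cannot be honored, so invalidate everything under pdb.
    (void)base;
    (void)size;
    (void)page_size;

    uvm_hal_maxwell_host_tlb_invalidate_all(push, pdb, depth, membar);
}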
|
||||
|
@ -52,7 +52,7 @@ static NvU32 entries_per_index_maxwell(NvU32 depth)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvLength entry_offset_maxwell(NvU32 depth, NvU32 page_size)
|
||||
static NvLength entry_offset_maxwell(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 2);
|
||||
if (page_size == UVM_PAGE_SIZE_4K && depth == 0)
|
||||
@ -128,7 +128,7 @@ static NvLength entry_size_maxwell(NvU32 depth)
|
||||
return 8;
|
||||
}
|
||||
|
||||
static NvU32 index_bits_maxwell_64(NvU32 depth, NvU32 page_size)
|
||||
static NvU32 index_bits_maxwell_64(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 2);
|
||||
UVM_ASSERT(page_size == UVM_PAGE_SIZE_4K ||
|
||||
@ -146,7 +146,7 @@ static NvU32 index_bits_maxwell_64(NvU32 depth, NvU32 page_size)
|
||||
}
|
||||
}
|
||||
|
||||
static NvU32 index_bits_maxwell_128(NvU32 depth, NvU32 page_size)
|
||||
static NvU32 index_bits_maxwell_128(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 2);
|
||||
UVM_ASSERT(page_size == UVM_PAGE_SIZE_4K ||
|
||||
@ -169,32 +169,32 @@ static NvU32 num_va_bits_maxwell(void)
|
||||
return 40;
|
||||
}
|
||||
|
||||
static NvLength allocation_size_maxwell_64(NvU32 depth, NvU32 page_size)
|
||||
static NvLength allocation_size_maxwell_64(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
return entry_size_maxwell(depth) << index_bits_maxwell_64(depth, page_size);
|
||||
}
|
||||
|
||||
static NvLength allocation_size_maxwell_128(NvU32 depth, NvU32 page_size)
|
||||
static NvLength allocation_size_maxwell_128(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
return entry_size_maxwell(depth) << index_bits_maxwell_128(depth, page_size);
|
||||
}
|
||||
|
||||
static NvU32 page_table_depth_maxwell(NvU32 page_size)
|
||||
static NvU32 page_table_depth_maxwell(NvU64 page_size)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvU32 page_sizes_maxwell_128(void)
|
||||
static NvU64 page_sizes_maxwell_128(void)
|
||||
{
|
||||
return UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_4K;
|
||||
}
|
||||
|
||||
static NvU32 page_sizes_maxwell_64(void)
|
||||
static NvU64 page_sizes_maxwell_64(void)
|
||||
{
|
||||
return UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
|
||||
}
|
||||
|
||||
static NvU64 unmapped_pte_maxwell(NvU32 page_size)
|
||||
static NvU64 unmapped_pte_maxwell(NvU64 page_size)
|
||||
{
|
||||
// Setting the privilege bit on an otherwise-zeroed big PTE causes the
|
||||
// corresponding 4k PTEs to be ignored. This allows the invalidation of a
|
||||
@ -356,7 +356,7 @@ static uvm_mmu_mode_hal_t maxwell_128_mmu_mode_hal =
|
||||
.page_sizes = page_sizes_maxwell_128
|
||||
};
|
||||
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU32 big_page_size)
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU64 big_page_size)
|
||||
{
|
||||
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);
|
||||
if (big_page_size == UVM_PAGE_SIZE_64K)
|
||||
|
@ -290,15 +290,15 @@ uvm_chunk_sizes_mask_t uvm_mem_kernel_chunk_sizes(uvm_gpu_t *gpu)
|
||||
// Get the mmu mode hal directly as the internal address space tree has not
|
||||
// been created yet.
|
||||
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(gpu->big_page.internal_size);
|
||||
NvU32 page_sizes = hal->page_sizes();
|
||||
NvU64 page_sizes = hal->page_sizes();
|
||||
|
||||
return (uvm_chunk_sizes_mask_t)(page_sizes & UVM_CHUNK_SIZES_MASK);
|
||||
}
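uvm_mem_kernel_chunk_sizes() above reduces the HAL's 64-bit page-size bitmask to the chunk sizes the allocator can serve by intersecting it with UVM_CHUNK_SIZES_MASK. A small stand-alone illustration of that intersection; the mask values are examples, not the driver's.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_4K   0x1000ull
#define PAGE_64K  0x10000ull
#define PAGE_2M   0x200000ull
#define PAGE_512M 0x20000000ull

// Example stand-in for UVM_CHUNK_SIZES_MASK: chunks from 4K up to 2M.
#define CHUNK_SIZES_MASK (PAGE_4K | PAGE_64K | PAGE_2M)

int main(void)
{
    // Page sizes reported by a hypothetical MMU HAL, including one (512M)
    // that is not a valid chunk size and so gets masked out.
    uint64_t hal_page_sizes = PAGE_4K | PAGE_64K | PAGE_2M | PAGE_512M;

    uint64_t chunk_sizes = hal_page_sizes & CHUNK_SIZES_MASK;

    assert(chunk_sizes == (PAGE_4K | PAGE_64K | PAGE_2M));
    printf("chunk size mask: 0x%llx\n", (unsigned long long)chunk_sizes);
    return 0;
}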
|
||||
|
||||
static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
static NvU64 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
{
|
||||
NvU32 biggest_page_size;
|
||||
NvU32 chunk_size;
|
||||
NvU64 biggest_page_size;
|
||||
NvU64 chunk_size;
|
||||
|
||||
if (uvm_mem_is_sysmem(mem))
|
||||
return PAGE_SIZE;
|
||||
@ -315,12 +315,12 @@ static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
// When UVM_PAGE_SIZE_DEFAULT is used on NUMA-enabled GPUs, we force
|
||||
// chunk_size to be PAGE_SIZE at least, to allow CPU mappings.
|
||||
if (mem->backing_gpu->mem_info.numa.enabled)
|
||||
chunk_size = max(chunk_size, (NvU32)PAGE_SIZE);
|
||||
chunk_size = max(chunk_size, (NvU64)PAGE_SIZE);
|
||||
|
||||
return chunk_size;
|
||||
}
|
||||
|
||||
static NvU32 mem_pick_gpu_page_size(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_tree_t *gpu_page_tree)
|
||||
static NvU64 mem_pick_gpu_page_size(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_tree_t *gpu_page_tree)
|
||||
{
|
||||
if (uvm_mem_is_vidmem(mem)) {
|
||||
// For vidmem allocations the chunk size is picked out of the supported
|
||||
@ -467,7 +467,7 @@ static NV_STATUS mem_alloc_sysmem_dma_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
|
||||
NvU64 *dma_addrs;
|
||||
|
||||
UVM_ASSERT_MSG(mem->chunk_size == PAGE_SIZE,
|
||||
"mem->chunk_size is 0x%x. PAGE_SIZE is only supported.",
|
||||
"mem->chunk_size is 0x%llx. PAGE_SIZE is only supported.",
|
||||
mem->chunk_size);
|
||||
UVM_ASSERT(uvm_mem_is_sysmem_dma(mem));
|
||||
|
||||
@ -528,10 +528,9 @@ static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
|
||||
|
||||
// In case of failure, the caller is required to handle cleanup by calling
|
||||
// uvm_mem_free
|
||||
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unprotected)
|
||||
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_pmm_gpu_memory_type_t mem_type;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
@ -548,23 +547,15 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unpr
|
||||
if (!mem->vidmem.chunks)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
// When CC is disabled the behavior is identical to that of PMM, and the
|
||||
// protection flag is ignored (squashed by PMM internally).
|
||||
if (is_unprotected)
|
||||
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED;
|
||||
else
|
||||
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED;
|
||||
|
||||
status = uvm_pmm_gpu_alloc(&mem->backing_gpu->pmm,
|
||||
status = uvm_pmm_gpu_alloc_kernel(&mem->backing_gpu->pmm,
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
mem_type,
|
||||
UVM_PMM_ALLOC_FLAGS_NONE,
|
||||
mem->vidmem.chunks,
|
||||
NULL);
|
||||
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("uvm_pmm_gpu_alloc (count=%zd, size=0x%x) failed: %s\n",
|
||||
UVM_ERR_PRINT("uvm_pmm_gpu_alloc_kernel (count=%zd, size=0x%llx) failed: %s\n",
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
nvstatusToString(status));
|
||||
@ -574,7 +565,7 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unpr
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero, bool is_unprotected)
|
||||
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero)
|
||||
{
|
||||
if (uvm_mem_is_sysmem(mem)) {
|
||||
gfp_t gfp_flags;
|
||||
@ -596,7 +587,7 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
|
||||
return status;
|
||||
}
|
||||
|
||||
return mem_alloc_vidmem_chunks(mem, zero, is_unprotected);
|
||||
return mem_alloc_vidmem_chunks(mem, zero);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_mem_map_kernel(uvm_mem_t *mem, const uvm_processor_mask_t *mask)
|
||||
@ -626,7 +617,6 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
|
||||
NV_STATUS status;
|
||||
NvU64 physical_size;
|
||||
uvm_mem_t *mem = NULL;
|
||||
bool is_unprotected = false;
|
||||
|
||||
UVM_ASSERT(params->size > 0);
|
||||
|
||||
@ -648,12 +638,7 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
|
||||
physical_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
|
||||
mem->chunks_count = physical_size / mem->chunk_size;
|
||||
|
||||
if (params->is_unprotected)
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
is_unprotected = params->is_unprotected;
|
||||
|
||||
status = mem_alloc_chunks(mem, params->mm, params->zero, is_unprotected);
|
||||
status = mem_alloc_chunks(mem, params->mm, params->zero);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
@ -1050,7 +1035,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
|
||||
uvm_page_table_range_vec_t **range_vec)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 page_size;
|
||||
NvU64 page_size;
|
||||
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
|
||||
|
||||
uvm_mem_pte_maker_data_t pte_maker_data = {
|
||||
@ -1059,7 +1044,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
|
||||
};
|
||||
|
||||
page_size = mem_pick_gpu_page_size(mem, gpu, tree);
|
||||
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x\n", page_size);
|
||||
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%llx\n", page_size);
|
||||
|
||||
// When the Confidential Computing feature is enabled, DMA allocations are
|
||||
// majoritarily allocated and managed by a per-GPU DMA buffer pool
|
||||
|
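As context for the uvm_mem_alloc() hunk above, the number of backing chunks comes from rounding the requested size up to the chunk size and dividing. A standalone sketch of that arithmetic, assuming a power-of-two chunk size and using local stand-ins for UVM_ALIGN_UP:

// Illustrative only: how a backing allocation is sized from a requested size
// and a chunk size, mirroring the physical_size/chunks_count computation above.
#include <stdint.h>
#include <stdio.h>

static uint64_t align_up(uint64_t x, uint64_t a)
{
    return (x + a - 1) & ~(a - 1);   // requires a to be a power of two
}

int main(void)
{
    uint64_t size = 5ULL << 20;          // 5 MiB requested
    uint64_t chunk_size = 2ULL << 20;    // 2 MiB chunks
    uint64_t physical_size = align_up(size, chunk_size);
    uint64_t chunks_count = physical_size / chunk_size;

    printf("physical_size=0x%llx chunks=%llu\n",
           (unsigned long long)physical_size,
           (unsigned long long)chunks_count);   // 0x600000, 3 chunks
    return 0;
}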
@ -126,12 +126,7 @@ typedef struct
//
// CPU mappings will always use PAGE_SIZE, so the physical allocation chunk
// has to be aligned to PAGE_SIZE.
NvU32 page_size;

// The protection flag is only observed for vidmem allocations when CC is
// enabled. If set to true, the allocation returns unprotected vidmem;
// otherwise, the allocation returns protected vidmem.
bool is_unprotected;
NvU64 page_size;

// If true, the allocation is zeroed (scrubbed).
bool zero;
@ -199,7 +194,7 @@ struct uvm_mem_struct
size_t chunks_count;

// Size of each physical chunk (vidmem) or CPU page (sysmem)
NvU32 chunk_size;
NvU64 chunk_size;

// Size of the allocation
NvU64 size;

@ -153,7 +153,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)

for (i = 0; i < verif_size / sizeof(*sys_verif); ++i) {
if (sys_verif[i] != mem->size + i) {
UVM_TEST_PRINT("Verif failed for %zd = 0x%llx instead of 0x%llx, verif_size=0x%llx mem(size=0x%llx, page_size=%u, processor=%u)\n",
UVM_TEST_PRINT("Verif failed for %zd = 0x%llx instead of 0x%llx, verif_size=0x%llx mem(size=0x%llx, page_size=%llu, processor=%u)\n",
i,
sys_verif[i],
(NvU64)(verif_size + i),
@ -241,7 +241,7 @@ static NV_STATUS test_map_cpu(uvm_mem_t *mem)
return NV_OK;
}

static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU32 page_size, size_t size, uvm_mem_t **mem_out)
static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU64 page_size, size_t size, uvm_mem_t **mem_out)
{
NV_STATUS status;
uvm_mem_t *mem;
@ -299,7 +299,7 @@ error:
return status;
}

static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU32 page_size, size_t size, uvm_mem_t **mem_out)
static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU64 page_size, size_t size, uvm_mem_t **mem_out)
{
NV_STATUS status;
uvm_mem_t *mem;
@ -334,7 +334,7 @@ error:
return status;
}

static bool should_test_page_size(size_t alloc_size, NvU32 page_size)
static bool should_test_page_size(size_t alloc_size, NvU64 page_size)
{
if (g_uvm_global.num_simulated_devices == 0)
return true;
@ -359,7 +359,7 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
// size on pre-Pascal GPUs with 128K big page size.
// Ampere+ also supports 512M PTEs, but since UVM's maximum chunk size is
// 2M, we don't test for this page size.
static const NvU32 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;
static const NvU64 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;

// All supported page sizes will be tested, CPU has the most with 4 and +1
// for the default.
@ -494,41 +494,6 @@ done:
return status;
}

static NV_STATUS test_basic_vidmem_unprotected(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_mem_t *mem = NULL;

uvm_mem_alloc_params_t params = { 0 };
params.size = UVM_PAGE_SIZE_4K;
params.backing_gpu = gpu;
params.page_size = UVM_PAGE_SIZE_4K;

// If CC is enabled, the protection flag is observed. Because currently all
// vidmem is in the protected region, the allocation should succeed.
//
// If CC is disabled, the protection flag is ignored.
params.is_unprotected = false;
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));

uvm_mem_free(mem);
mem = NULL;

// If CC is enabled, the allocation should fail because currently the
// unprotected region is empty.
//
// If CC is disabled, the behavior should be identical to that of a
// protected allocation.
params.is_unprotected = true;
if (g_uvm_global.conf_computing_enabled)
TEST_CHECK_RET(uvm_mem_alloc(&params, &mem) == NV_ERR_NO_MEMORY);
else
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));

uvm_mem_free(mem);
return status;
}

static NV_STATUS test_basic_sysmem(void)
{
NV_STATUS status = NV_OK;
@ -613,7 +578,6 @@ static NV_STATUS test_basic(uvm_va_space_t *va_space)
for_each_va_space_gpu(gpu, va_space) {
TEST_NV_CHECK_RET(test_basic_vidmem(gpu));
TEST_NV_CHECK_RET(test_basic_sysmem_dma(gpu));
TEST_NV_CHECK_RET(test_basic_vidmem_unprotected(gpu));
TEST_NV_CHECK_RET(test_basic_dma_pool(gpu));
}

@ -589,7 +589,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
skipped_migrate = true;
}
else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
!uvm_va_policy_preferred_location_equal(policy, dest_id, NUMA_NO_NODE)) {
!uvm_id_equal(dest_id, policy->preferred_location)) {
// Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
// unless it's the preferred location
status = NV_ERR_INVALID_DEVICE;

@ -153,20 +153,17 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
// - UVM_APERTURE_VID biggest page size on vidmem mappings
// - UVM_APERTURE_SYS biggest page size on sysmem mappings
// - UVM_APERTURE_PEER_0-7 biggest page size on peer mappings
static NvU32 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
static NvU64 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
{
UVM_ASSERT(aperture < UVM_APERTURE_DEFAULT);

// There may be scenarios where the GMMU must use a subset of the supported
// page sizes, e.g., to comply with the vMMU supported page sizes due to
// segmentation sizes.
if (aperture == UVM_APERTURE_VID) {
UVM_ASSERT(tree->gpu->mem_info.max_vidmem_page_size <= NV_U32_MAX);
return (NvU32) tree->gpu->mem_info.max_vidmem_page_size;
}
else {
return 1 << __fls(tree->hal->page_sizes());
}
if (aperture == UVM_APERTURE_VID)
return uvm_mmu_biggest_page_size_up_to(tree, tree->gpu->mem_info.max_vidmem_page_size);

return 1ULL << __fls(tree->hal->page_sizes());
}

static NV_STATUS phys_mem_allocate_vidmem(uvm_page_tree_t *tree,
@ -254,7 +251,7 @@ static void phys_mem_deallocate(uvm_page_tree_t *tree, uvm_mmu_page_table_alloc_
}

static void page_table_range_init(uvm_page_table_range_t *range,
NvU32 page_size,
NvU64 page_size,
uvm_page_directory_t *dir,
NvU32 start_index,
NvU32 end_index)
@ -444,9 +441,9 @@ static void pde_fill(uvm_page_tree_t *tree,
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}

static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
static void phys_mem_init(uvm_page_tree_t *tree, NvU64 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU64 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;

// Passing in NULL for the phys_allocs will mark the child entries as
@ -497,7 +494,7 @@ static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_direc
}

static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU32 depth,
uvm_pmm_alloc_flags_t pmm_flags)
{
@ -546,7 +543,7 @@ static inline NvU32 entry_index_from_vaddr(NvU64 vaddr, NvU32 addr_bit_shift, Nv
return (NvU32)((vaddr >> addr_bit_shift) & mask);
}

static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU32 page_size)
static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU64 page_size)
{
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
@ -583,7 +580,7 @@ static void pde_write(uvm_page_tree_t *tree,
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
}

static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU64 page_size)
{
UVM_ASSERT(dir->ref_count > 0);

@ -594,35 +591,38 @@ static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU
static void pde_clear(uvm_page_tree_t *tree,
uvm_page_directory_t *dir,
NvU32 entry_index,
NvU32 page_size,
NvU64 page_size,
uvm_push_t *push)
{
host_pde_clear(tree, dir, entry_index, page_size);
pde_write(tree, dir, entry_index, false, push);
}

static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU64 big_page_size)
{
uvm_chunk_sizes_mask_t alloc_sizes = 0;
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);
unsigned long page_sizes, page_size_log2;
uvm_chunk_sizes_mask_t alloc_sizes;

if (hal == NULL)
return 0;

page_sizes = hal->page_sizes();
alloc_sizes = 0;

if (hal != NULL) {
unsigned long page_size_log2;
unsigned long page_sizes = hal->page_sizes();
BUILD_BUG_ON(sizeof(hal->page_sizes()) > sizeof(page_sizes));

for_each_set_bit(page_size_log2, &page_sizes, BITS_PER_LONG) {
NvU32 i;
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
NvU64 page_size = 1ULL << page_size_log2;
for (i = 0; i <= hal->page_table_depth(page_size); i++)
alloc_sizes |= hal->allocation_size(i, page_size);
}
}

return alloc_sizes;
}

static NvU32 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
static NvU64 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU64 big_page_size)
{
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);

@ -662,7 +662,7 @@ static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
}

static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -713,7 +713,7 @@ static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
}

static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -805,7 +805,7 @@ static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,

// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -842,7 +842,7 @@ static void free_unused_directories(uvm_page_tree_t *tree,
}
}

static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU64 page_size, uvm_mmu_page_table_alloc_t *out)
{
NvU32 depth = tree->hal->page_table_depth(page_size);
NvLength alloc_size = tree->hal->allocation_size(depth, page_size);
@ -871,7 +871,7 @@ static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
{
NV_STATUS status;
NvU64 min_va_upper, max_va_lower;
NvU32 page_size;
NvU64 page_size;

if (!page_tree_ats_init_required(tree))
return NV_OK;
@ -1090,7 +1090,7 @@ static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t locatio
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
uvm_gpu_va_space_t *gpu_va_space,
uvm_page_tree_type_t type,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_aperture_t location,
uvm_page_tree_t *tree)
{
@ -1110,7 +1110,7 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
tree->gpu_va_space = gpu_va_space;
tree->big_page_size = big_page_size;

UVM_ASSERT(gpu->mem_info.max_vidmem_page_size & tree->hal->page_sizes());
UVM_ASSERT(uvm_mmu_page_size_supported(tree, big_page_size));

page_tree_set_location(tree, location);

@ -1347,7 +1347,7 @@ NV_STATUS uvm_page_tree_wait(uvm_page_tree_t *tree)
}

static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range,
@ -1379,7 +1379,7 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
// This algorithm will work with unaligned ranges, but the caller's intent
// is unclear
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0,
"start 0x%llx size 0x%zx page_size 0x%x\n",
"start 0x%llx size 0x%zx page_size 0x%llx\n",
start,
(size_t)size,
page_size);
@ -1448,7 +1448,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
{
NV_STATUS status;
uvm_push_t push;
NvU32 page_sizes;
NvU64 page_sizes;
uvm_mmu_page_table_alloc_t *phys_alloc[1];

// TODO: Bug 2734399
@ -1460,7 +1460,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
status = page_tree_begin_acquire(tree,
&tree->tracker,
&push,
"map remap: [0x%llx, 0x%llx), page_size: %d",
"map remap: [0x%llx, 0x%llx), page_size: %lld",
start,
start + size,
range->page_size);
@ -1500,7 +1500,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
}

NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -1545,7 +1545,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
}

NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -1596,7 +1596,7 @@ void uvm_page_table_range_shrink(uvm_page_tree_t *tree, uvm_page_table_range_t *
}

NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single)
@ -1621,7 +1621,7 @@ void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *sing
static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
uvm_page_directory_t *pte_dir,
uvm_page_directory_t *parent,
NvU32 page_size)
NvU64 page_size)
{
NV_STATUS status;
uvm_push_t push;
@ -1633,7 +1633,7 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
// The flat mappings should always be set up when executing this path
UVM_ASSERT(!uvm_mmu_use_cpu(tree));

status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %llu", page_size);
if (status != NV_OK)
return status;

@ -1660,7 +1660,7 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
}

NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
@ -1768,7 +1768,7 @@ static size_t range_vec_calc_range_index(uvm_page_table_range_vec_t *range_vec,
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t *range_vec)
{
@ -1776,8 +1776,8 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
size_t i;

UVM_ASSERT(size != 0);
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%x\n", start, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%llx\n", start, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);

range_vec->tree = tree;
range_vec->page_size = page_size;
@ -1826,7 +1826,7 @@ out:
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec_out)
{
@ -1952,7 +1952,7 @@ static NV_STATUS uvm_page_table_range_vec_clear_ptes_gpu(uvm_page_table_range_ve
size_t i;
uvm_page_tree_t *tree = range_vec->tree;
uvm_gpu_t *gpu = tree->gpu;
NvU32 page_size = range_vec->page_size;
NvU64 page_size = range_vec->page_size;
NvU32 entry_size = uvm_mmu_pte_size(tree, page_size);
NvU64 invalid_pte = 0;
uvm_push_t push;
@ -2237,7 +2237,7 @@ static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
NvU64 size,
uvm_aperture_t aperture,
NvU64 phys_offset,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags)
{
NV_STATUS status;
@ -2312,7 +2312,7 @@ bool uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(uvm_parent_gpu_t *parent_gp

NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
{
NvU32 page_size;
NvU64 page_size;
NvU64 size;
uvm_aperture_t aperture = UVM_APERTURE_VID;
NvU64 phys_offset = 0;
@ -2351,7 +2351,7 @@ static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)

NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
{
NvU32 page_size;
NvU64 page_size;
NvU64 size;
uvm_aperture_t aperture;
NvU64 phys_offset;
@ -2535,7 +2535,7 @@ static void root_chunk_mapping_destroy(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mappin
uvm_push_t push;
NvU32 entry_size;
uvm_pte_batch_t pte_batch;
NvU32 page_size;
NvU64 page_size;
NvU64 size;
NvU64 invalid_pte;
uvm_page_table_range_t *range = root_chunk_mapping->range;
@ -2585,7 +2585,7 @@ static NV_STATUS root_chunk_mapping_create(uvm_gpu_t *gpu, uvm_gpu_root_chunk_ma
uvm_push_t push;
NvU64 pte_bits;
NvU32 entry_size;
NvU32 page_size = UVM_CHUNK_SIZE_MAX;
NvU64 page_size = UVM_CHUNK_SIZE_MAX;
NvU64 size = UVM_CHUNK_SIZE_MAX;

range = uvm_kvmalloc_zero(sizeof(*range));
@ -2852,7 +2852,7 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
if (sysmem_mapping->range_vec == NULL) {
uvm_gpu_address_t virtual_address = uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, curr_pa);
NvU64 phys_offset = curr_pa;
NvU32 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
NvU64 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
uvm_pmm_alloc_flags_t pmm_flags;

// No eviction is requested when allocating the page tree storage,

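The rewritten allocation_sizes_for_big_page_size() above copies the 64-bit page_sizes() mask into an unsigned long so for_each_set_bit() can walk it, guarded by a BUILD_BUG_ON on the type width. A standalone sketch of walking each supported page size out of such a mask (illustrative constants, not driver code):

// Illustrative only: iterating the set bits of a 64-bit page-size mask,
// equivalent in effect to the for_each_set_bit() loop above.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_sizes = (1ULL << 12) | (1ULL << 16) | (1ULL << 21); // 4K, 64K, 2M

    for (unsigned bit = 0; bit < 64; bit++) {
        if (page_sizes & (1ULL << bit)) {
            uint64_t page_size = 1ULL << bit;
            printf("supported page size: 0x%llx\n", (unsigned long long)page_size);
        }
    }
    return 0;
}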
@ -208,7 +208,7 @@ struct uvm_mmu_mode_hal_struct
// This is an optimization which reduces TLB pressure, reduces the number of
// TLB invalidates we must issue, and means we don't have to initialize the
// 4k PTEs which are covered by big PTEs since the MMU will never read them.
NvU64 (*unmapped_pte)(NvU32 page_size);
NvU64 (*unmapped_pte)(NvU64 page_size);

// Bit pattern used for debug purposes to clobber PTEs which ought to be
// unused. In practice this will generate a PRIV violation or a physical
@ -234,23 +234,23 @@ struct uvm_mmu_mode_hal_struct
// For dual PDEs, this is ether 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
NvLength (*entry_offset)(NvU32 depth, NvU64 page_size);

// number of virtual address bits used to index the directory/table at a
// given depth
NvU32 (*index_bits)(NvU32 depth, NvU32 page_size);
NvU32 (*index_bits)(NvU32 depth, NvU64 page_size);

// total number of bits that represent the virtual address space
NvU32 (*num_va_bits)(void);

// the size, in bytes, of a directory/table at a given depth.
NvLength (*allocation_size)(NvU32 depth, NvU32 page_size);
NvLength (*allocation_size)(NvU32 depth, NvU64 page_size);

// the depth which corresponds to the page tables
NvU32 (*page_table_depth)(NvU32 page_size);
NvU32 (*page_table_depth)(NvU64 page_size);

// bitwise-or of supported page sizes
NvU32 (*page_sizes)(void);
NvU64 (*page_sizes)(void);
};

struct uvm_page_table_range_struct
@ -258,7 +258,7 @@ struct uvm_page_table_range_struct
uvm_page_directory_t *table;
NvU32 start_index;
NvU32 entry_count;
NvU32 page_size;
NvU64 page_size;
};

typedef enum
@ -275,7 +275,7 @@ struct uvm_page_tree_struct
uvm_page_directory_t *root;
uvm_mmu_mode_hal_t *hal;
uvm_page_tree_type_t type;
NvU32 big_page_size;
NvU64 big_page_size;

// Pointer to the GPU VA space containing the page tree.
// This pointer is set only for page trees of type
@ -325,7 +325,7 @@ struct uvm_page_table_range_vec_struct
NvU64 size;

// Page size used for all the page table ranges
NvU32 page_size;
NvU64 page_size;

// Page table ranges covering the VA
uvm_page_table_range_t *ranges;
@ -352,7 +352,7 @@ void uvm_mmu_init_gpu_peer_addresses(uvm_gpu_t *gpu);
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
uvm_gpu_va_space_t *gpu_va_space,
uvm_page_tree_type_t type,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_aperture_t location,
uvm_page_tree_t *tree_out);

@ -374,7 +374,7 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -384,7 +384,7 @@ NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -395,7 +395,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
@ -426,7 +426,7 @@ void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *sing
// It is the caller's responsibility to initialize the returned table before
// calling uvm_page_tree_write_pde.
NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children);
@ -480,7 +480,7 @@ static uvm_mmu_page_table_alloc_t *uvm_page_tree_pdb(uvm_page_tree_t *tree)
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t *range_vec);

@ -489,7 +489,7 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec_out);

@ -601,12 +601,12 @@ void uvm_mmu_chunk_unmap(uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker);
// uvm_parent_gpu_map_cpu_pages for the given GPU.
NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size);

static NvU64 uvm_mmu_page_tree_entries(uvm_page_tree_t *tree, NvU32 depth, NvU32 page_size)
static NvU64 uvm_mmu_page_tree_entries(uvm_page_tree_t *tree, NvU32 depth, NvU64 page_size)
{
return 1ull << tree->hal->index_bits(depth, page_size);
}

static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU64 page_size)
{
NvU32 depth = tree->hal->page_table_depth(page_size);
return uvm_mmu_page_tree_entries(tree, depth, page_size) * page_size;
@ -615,21 +615,21 @@ static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
// Page sizes supported by the GPU. Use uvm_mmu_biggest_page_size() to retrieve
// the largest page size supported in a given system, which considers the GMMU
// and vMMU page sizes and segment sizes.
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU32 page_size)
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU64 page_size)
{
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%x\n", page_size);
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%llx\n", page_size);

return (tree->hal->page_sizes() & page_size) != 0;
}

static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_page_size)
static NvU64 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU64 max_page_size)
{
NvU32 gpu_page_sizes = tree->hal->page_sizes();
NvU32 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU32 page_sizes;
NvU32 page_size;
NvU64 gpu_page_sizes = tree->hal->page_sizes();
NvU64 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU64 page_sizes;
NvU64 page_size;

UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%x\n", max_page_size);
UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%llx\n", max_page_size);

if (max_page_size < smallest_gpu_page_size)
return 0;
@ -638,14 +638,14 @@ static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_pa
page_sizes = gpu_page_sizes & (max_page_size | (max_page_size - 1));

// And pick the biggest one of them
page_size = 1 << __fls(page_sizes);
page_size = 1ULL << __fls(page_sizes);

UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x", page_size);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%llx", page_size);

return page_size;
}

static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU32 page_size)
static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU64 page_size)
{
return tree->hal->entry_size(tree->hal->page_table_depth(page_size));
}

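uvm_mmu_biggest_page_size_up_to() above clamps the supported-size mask to the limit with max | (max - 1) and then takes the highest remaining bit. A standalone sketch of the same selection, assuming a power-of-two limit as the original asserts (illustrative stand-ins, not driver code):

// Illustrative only: "biggest page size up to a limit" selection.
#include <stdint.h>
#include <stdio.h>

static uint64_t biggest_up_to(uint64_t supported_mask, uint64_t max_page_size)
{
    // Drop everything above the limit...
    uint64_t eligible = supported_mask & (max_page_size | (max_page_size - 1));
    if (eligible == 0)
        return 0;

    // ...then take the highest remaining bit (1ULL << __fls() in the driver).
    uint64_t bit = 1ULL << 63;
    while (!(eligible & bit))
        bit >>= 1;
    return bit;
}

int main(void)
{
    uint64_t mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 21);  // 4K, 64K, 2M
    // Limit of 128K: the biggest eligible size is 64K (prints 0x10000).
    printf("0x%llx\n", (unsigned long long)biggest_up_to(mask, 1ULL << 17));
    return 0;
}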
@ -96,7 +96,7 @@ typedef struct
{
NvU64 base;
NvU64 size;
NvU32 page_size;
NvU64 page_size;
NvU32 depth;
uvm_membar_t membar;
} fake_tlb_invalidate_t;
@ -153,7 +153,7 @@ static void fake_tlb_invalidate_va(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
@ -249,7 +249,11 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}

static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
NvU64 base, NvU64 size, NvU32 page_size, NvU32 expected_depth, bool expected_membar)
NvU64 base,
NvU64 size,
NvU64 page_size,
NvU32 expected_depth,
bool expected_membar)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);

@ -271,7 +275,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return false;
}
if (inval->page_size != page_size && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected page size %u, got %u instead\n", page_size, inval->page_size);
UVM_TEST_PRINT("Expected page size %llu, got %llu instead\n", page_size, inval->page_size);
return false;
}

@ -280,7 +284,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,

static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
@ -325,7 +329,7 @@ static NV_STATUS test_page_tree_init_kernel(uvm_gpu_t *gpu, NvU32 big_page_size,
}

static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range)
@ -341,7 +345,7 @@ static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
}

static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_page_table_range_t *single)
{
@ -355,14 +359,14 @@ static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
}

static NV_STATUS test_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
{
return uvm_page_tree_alloc_table(tree, page_size, UVM_PMM_ALLOC_FLAGS_NONE, single, children);
}

static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start)
static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start)
{
uvm_page_table_range_t entry;
bool result = true;
@ -378,7 +382,7 @@ static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU32 page_size, N
return assert_no_invalidate() && result;
}

static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvU32 depth, bool membar)
static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start, NvU32 depth, bool membar)
{
uvm_page_table_range_t entry;
bool result = true;
@ -932,8 +936,8 @@ static NV_STATUS split_and_free(uvm_gpu_t *gpu)

static NV_STATUS check_sizes(uvm_gpu_t *gpu)
{
NvU32 user_sizes = UVM_PAGE_SIZE_2M;
NvU32 kernel_sizes = UVM_PAGE_SIZE_4K | 256;
NvU64 user_sizes = UVM_PAGE_SIZE_2M;
NvU64 kernel_sizes = UVM_PAGE_SIZE_4K | 256;

if (UVM_PAGE_SIZE_64K >= PAGE_SIZE)
user_sizes |= UVM_PAGE_SIZE_64K;
@ -1161,7 +1165,7 @@ static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
return status;
}

static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU32 *page_sizes, const NvU32 page_sizes_count)
static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU64 *page_sizes, const NvU32 page_sizes_count)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@ -1177,8 +1181,8 @@ static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU32 *page_si
for (min_index = 0; min_index < page_sizes_count; ++min_index) {
for (max_index = min_index; max_index < page_sizes_count; ++max_index) {
for (size_index = 0; size_index < ARRAY_SIZE(sizes_in_max_pages); ++size_index) {
NvU32 min_page_size = page_sizes[min_index];
NvU32 max_page_size = page_sizes[max_index];
NvU64 min_page_size = page_sizes[min_index];
NvU64 max_page_size = page_sizes[max_index];
NvU64 size = (NvU64)sizes_in_max_pages[size_index] * max_page_size;

TEST_CHECK_GOTO(test_tlb_batch_invalidates_case(&tree,
@ -1282,7 +1286,7 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
@ -1303,7 +1307,7 @@ static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
// Test page table range vector APIs.
// Notably the test leaks the page_tree and range_vec on error as it's hard to
// clean up on failure and the destructors would likely assert.
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU64 page_size)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@ -1511,7 +1515,7 @@ static uvm_mmu_page_table_alloc_t fake_table_alloc(uvm_aperture_t aperture, NvU6
// Queries the supported page sizes of the GPU(uvm_gpu_t) and fills the
// page_sizes array up to MAX_NUM_PAGE_SIZE. Returns the number of elements in
// page_sizes;
size_t get_page_sizes(uvm_gpu_t *gpu, NvU32 *page_sizes)
size_t get_page_sizes(uvm_gpu_t *gpu, NvU64 *page_sizes)
{
unsigned long page_size_log2;
unsigned long page_sizes_bitvec;
@ -1524,7 +1528,7 @@ size_t get_page_sizes(uvm_gpu_t *gpu, NvU32 *page_sizes)
page_sizes_bitvec = hal->page_sizes();

for_each_set_bit(page_size_log2, &page_sizes_bitvec, BITS_PER_LONG) {
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
NvU64 page_size = 1ULL << page_size_log2;
UVM_ASSERT(count < MAX_NUM_PAGE_SIZES);
page_sizes[count++] = page_size;
}
@ -1572,7 +1576,7 @@ typedef NV_STATUS (*entry_test_page_size_func)(uvm_gpu_t *gpu, size_t page_size)

static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
{
static const NvU32 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 pde_bits;
uvm_mmu_page_table_alloc_t *phys_allocs[2];
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
@ -1663,7 +1667,7 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)

static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
@ -1759,7 +1763,7 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent

static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
@ -1833,7 +1837,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr

static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU32 i, num_page_sizes;

num_page_sizes = get_page_sizes(gpu, page_sizes);
@ -1847,7 +1851,7 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
@ -2290,8 +2294,8 @@ static NV_STATUS fake_gpu_init_hopper(uvm_gpu_t *fake_gpu)
static NV_STATUS maxwell_test_page_tree(uvm_gpu_t *maxwell)
{
// create a fake Maxwell GPU for this test.
static const NvU32 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU32 i, j, big_page_size, page_size;
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 i, j, big_page_size, page_size;

TEST_CHECK_RET(fake_gpu_init_maxwell(maxwell) == NV_OK);

@ -2320,7 +2324,7 @@ static NV_STATUS pascal_test_page_tree(uvm_gpu_t *pascal)
// create a fake Pascal GPU for this test.
NvU32 tlb_batch_saved_max_pages;
NvU32 i;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;

TEST_CHECK_RET(fake_gpu_init_pascal(pascal) == NV_OK);
@ -2381,7 +2385,7 @@ static NV_STATUS volta_test_page_tree(uvm_gpu_t *volta)
static NV_STATUS ampere_test_page_tree(uvm_gpu_t *ampere)
{
NvU32 i, tlb_batch_saved_max_pages;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;

TEST_CHECK_RET(fake_gpu_init_ampere(ampere) == NV_OK);

@ -92,7 +92,13 @@ void uvm_hal_pascal_host_tlb_invalidate_all(uvm_push_t *push, uvm_gpu_phys_addre
uvm_hal_tlb_invalidate_membar(push, membar);
}

void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb, NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
@ -127,9 +133,9 @@ void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_addres
ack_value = HWCONST(C06F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);

base >>= 12;

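The invalidate path above checks that page_size, base, and size are mutually aligned and then shifts base by 12 bits before programming the invalidate. A standalone sketch of those checks with a local IS_ALIGNED stand-in and illustrative values:

// Illustrative only: alignment checks plus the base >>= 12 conversion to 4K
// units, mirroring the asserts in the hunk above.
#include <stdint.h>
#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

int main(void)
{
    uint64_t base = 0x200000;         // start of the range to invalidate
    uint64_t size = 0x40000;          // range size
    uint64_t page_size = 1ULL << 16;  // 64K pages

    if (!IS_ALIGNED(page_size, 1 << 12) ||
        !IS_ALIGNED(base, page_size) ||
        !IS_ALIGNED(size, page_size) ||
        size == 0) {
        fprintf(stderr, "unaligned invalidate parameters\n");
        return 1;
    }

    base >>= 12;  // same shift as the original: base expressed in 4K units
    printf("base in 4K units: 0x%llx\n", (unsigned long long)base); // 0x200
    return 0;
}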
@ -54,7 +54,7 @@ static NvU32 entries_per_index_pascal(NvU32 depth)
return 1;
}

static NvLength entry_offset_pascal(NvU32 depth, NvU32 page_size)
static NvLength entry_offset_pascal(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (page_size == UVM_PAGE_SIZE_4K && depth == 3)
@ -178,7 +178,7 @@ static NvLength entry_size_pascal(NvU32 depth)
return 8;
}

static NvU32 index_bits_pascal(NvU32 depth, NvU32 page_size)
static NvU32 index_bits_pascal(NvU32 depth, NvU64 page_size)
{
static const NvU32 bit_widths[] = {2, 9, 9, 8};
// some code paths keep on querying this until they get a 0, meaning only the page offset remains.
@ -204,7 +204,7 @@ static NvU32 num_va_bits_pascal(void)
return 49;
}

static NvLength allocation_size_pascal(NvU32 depth, NvU32 page_size)
static NvLength allocation_size_pascal(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (depth == 4 && page_size == UVM_PAGE_SIZE_64K)
@ -213,7 +213,7 @@ static NvLength allocation_size_pascal(NvU32 depth, NvU32 page_size)
return 4096;
}

static NvU32 page_table_depth_pascal(NvU32 page_size)
static NvU32 page_table_depth_pascal(NvU64 page_size)
{
if (page_size == UVM_PAGE_SIZE_2M)
return 3;
@ -221,12 +221,12 @@ static NvU32 page_table_depth_pascal(NvU32 page_size)
return 4;
}

static NvU32 page_sizes_pascal(void)
static NvU64 page_sizes_pascal(void)
{
return UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}

static NvU64 unmapped_pte_pascal(NvU32 page_size)
static NvU64 unmapped_pte_pascal(NvU64 page_size)
{
// Setting the privilege bit on an otherwise-zeroed big PTE causes the
// corresponding 4k PTEs to be ignored. This allows the invalidation of a
@ -362,7 +362,7 @@ static uvm_mmu_mode_hal_t pascal_mmu_mode_hal =
.page_sizes = page_sizes_pascal
};

uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU64 big_page_size)
{
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);

@ -162,7 +162,7 @@ static void grow_fault_granularity_if_no_thrashing(uvm_perf_prefetch_bitmap_tree
}

static void grow_fault_granularity(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_va_block_region_t big_pages_region,
uvm_va_block_region_t max_prefetch_region,
const uvm_page_mask_t *faulted_pages,
@ -245,7 +245,7 @@ static void update_bitmap_tree_from_va_block(uvm_perf_prefetch_bitmap_tree_t *bi
uvm_va_block_region_t max_prefetch_region)

{
NvU32 big_page_size;
NvU64 big_page_size;
uvm_va_block_region_t big_pages_region;
uvm_va_space_t *va_space;
const uvm_page_mask_t *thrashing_pages;

@ -1987,21 +1987,12 @@ NV_STATUS uvm_perf_thrashing_init(void)
UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT,
UVM_PERF_THRASHING_PIN_THRESHOLD_MAX);


// In Confidential Computing, the DMA path is slower due to cryptographic
// operations & other associated overhead. Enforce a larger window to allow
// the thrashing mitigation mechanisms to work properly.
if (g_uvm_global.conf_computing_enabled)
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 10);
else
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT);

INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_nap,
UVM_PERF_THRASHING_NAP_DEFAULT,
UVM_PERF_THRASHING_NAP_MAX);


INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_epoch, UVM_PERF_THRASHING_EPOCH_DEFAULT);

INIT_THRASHING_PARAMETER(uvm_perf_thrashing_pin, UVM_PERF_THRASHING_PIN_DEFAULT);

@ -1890,8 +1890,11 @@ static uvm_gpu_chunk_t *claim_free_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_
if (!chunk)
goto out;

UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size, "chunk size %u expected %u\n",
uvm_gpu_chunk_get_size(chunk), chunk_size);
UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size,
"chunk size %u expected %u\n",
uvm_gpu_chunk_get_size(chunk),
chunk_size);

UVM_ASSERT(chunk->type == type);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));
@ -2756,7 +2759,7 @@ static bool uvm_pmm_should_inject_pma_eviction_error(uvm_pmm_gpu_t *pmm)
// See the documentation of pmaEvictPagesCb_t in pma.h for details of the
// expected semantics.
static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
NvU32 page_size,
NvU64 page_size,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,
@ -2861,7 +2864,7 @@ error:
}

static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
NvU32 page_size,
NvU64 page_size,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -65,30 +65,30 @@

typedef enum
{
UVM_CHUNK_SIZE_1 = 1ULL,
UVM_CHUNK_SIZE_2 = 2ULL,
UVM_CHUNK_SIZE_4 = 4ULL,
UVM_CHUNK_SIZE_8 = 8ULL,
UVM_CHUNK_SIZE_16 = 16ULL,
UVM_CHUNK_SIZE_32 = 32ULL,
UVM_CHUNK_SIZE_64 = 64ULL,
UVM_CHUNK_SIZE_128 = 128ULL,
UVM_CHUNK_SIZE_256 = 256ULL,
UVM_CHUNK_SIZE_512 = 512ULL,
UVM_CHUNK_SIZE_1K = 1024ULL,
UVM_CHUNK_SIZE_2K = 2*1024ULL,
UVM_CHUNK_SIZE_4K = 4*1024ULL,
UVM_CHUNK_SIZE_8K = 8*1024ULL,
UVM_CHUNK_SIZE_16K = 16*1024ULL,
UVM_CHUNK_SIZE_32K = 32*1024ULL,
UVM_CHUNK_SIZE_64K = 64*1024ULL,
UVM_CHUNK_SIZE_128K = 128*1024ULL,
UVM_CHUNK_SIZE_256K = 256*1024ULL,
UVM_CHUNK_SIZE_512K = 512*1024ULL,
UVM_CHUNK_SIZE_1M = 1024*1024ULL,
UVM_CHUNK_SIZE_2M = 2*1024*1024ULL,
UVM_CHUNK_SIZE_1 = 1,
UVM_CHUNK_SIZE_2 = 2,
UVM_CHUNK_SIZE_4 = 4,
UVM_CHUNK_SIZE_8 = 8,
UVM_CHUNK_SIZE_16 = 16,
UVM_CHUNK_SIZE_32 = 32,
UVM_CHUNK_SIZE_64 = 64,
UVM_CHUNK_SIZE_128 = 128,
UVM_CHUNK_SIZE_256 = 256,
UVM_CHUNK_SIZE_512 = 512,
UVM_CHUNK_SIZE_1K = 1024,
UVM_CHUNK_SIZE_2K = 2*1024,
UVM_CHUNK_SIZE_4K = 4*1024,
UVM_CHUNK_SIZE_8K = 8*1024,
UVM_CHUNK_SIZE_16K = 16*1024,
UVM_CHUNK_SIZE_32K = 32*1024,
UVM_CHUNK_SIZE_64K = 64*1024,
UVM_CHUNK_SIZE_128K = 128*1024,
UVM_CHUNK_SIZE_256K = 256*1024,
UVM_CHUNK_SIZE_512K = 512*1024,
UVM_CHUNK_SIZE_1M = 1024*1024,
UVM_CHUNK_SIZE_2M = 2*1024*1024,
UVM_CHUNK_SIZE_MAX = UVM_CHUNK_SIZE_2M,
UVM_CHUNK_SIZE_INVALID = UVM_CHUNK_SIZE_MAX * 2ULL
UVM_CHUNK_SIZE_INVALID = UVM_CHUNK_SIZE_MAX * 2
} uvm_chunk_size_t;

#define UVM_CHUNK_SIZES_MASK (uvm_chunk_sizes_mask_t)(UVM_CHUNK_SIZE_MAX | (UVM_CHUNK_SIZE_MAX-1))

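A small standalone check of the mask defined above: UVM_CHUNK_SIZE_MAX | (UVM_CHUNK_SIZE_MAX - 1) sets every bit at or below the 2M bit, so it admits exactly the 22 power-of-two chunk sizes enumerated in uvm_chunk_size_t (stand-alone arithmetic, not driver code):

// Illustrative only: evaluate the chunk-size mask and count the sizes it covers.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t chunk_size_max = 2ULL * 1024 * 1024;            // 2M, like UVM_CHUNK_SIZE_MAX
    uint64_t mask = chunk_size_max | (chunk_size_max - 1);   // 0x3fffff: every power of two <= 2M

    unsigned count = 0;
    for (unsigned bit = 0; bit < 64; bit++) {
        if (mask & (1ULL << bit))
            count++;
    }

    printf("mask=0x%llx, power-of-two sizes covered=%u\n",
           (unsigned long long)mask, count);                 // 0x3fffff, 22
    return 0;
}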
@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -43,7 +43,7 @@ NV_STATUS uvm_pmm_sysmem_init(void)
// Ensure that only supported CPU chunk sizes are enabled.
uvm_cpu_chunk_allocation_sizes &= UVM_CPU_CHUNK_SIZES;
if (!uvm_cpu_chunk_allocation_sizes || !(uvm_cpu_chunk_allocation_sizes & PAGE_SIZE)) {
pr_info("Invalid value for uvm_cpu_chunk_allocation_sizes = 0x%x, using 0x%lx instead\n",
pr_info("Invalid value for uvm_cpu_chunk_allocation_sizes = 0x%x, using 0x%llx instead\n",
uvm_cpu_chunk_allocation_sizes,
UVM_CPU_CHUNK_SIZES);
uvm_cpu_chunk_allocation_sizes = UVM_CPU_CHUNK_SIZES;
@ -126,7 +126,7 @@ NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
NvU64 remove_key;

for (remove_key = base_key; remove_key < key; ++remove_key)
(void)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
(void *)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);

kmem_cache_free(g_reverse_page_map_cache, new_reverse_map);
status = errno_to_nv_status(ret);
@ -461,69 +461,12 @@ static NvU32 compute_gpu_mappings_entry_index(uvm_parent_processor_mask_t *dma_a
return uvm_parent_processor_mask_get_gpu_count(&subset_mask);
}

static void cpu_chunk_release(nv_kref_t *kref)
{
uvm_cpu_chunk_t *chunk = container_of(kref, uvm_cpu_chunk_t, refcount);
uvm_parent_processor_mask_t *mapping_mask;
uvm_parent_processor_id_t id;
uvm_cpu_physical_chunk_t *phys_chunk = NULL;
uvm_cpu_logical_chunk_t *logical_chunk = NULL;

if (uvm_cpu_chunk_is_physical(chunk)) {
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
uvm_assert_mutex_unlocked(&phys_chunk->lock);
mapping_mask = &phys_chunk->gpu_mappings.dma_addrs_mask;
}
else {
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
mapping_mask = &logical_chunk->mapped_gpus;
}

for_each_parent_id_in_mask(id, mapping_mask) {
uvm_parent_gpu_t *parent_gpu = uvm_parent_gpu_get(id);
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, parent_gpu);
}

if (uvm_cpu_chunk_is_physical(chunk)) {
if (phys_chunk->gpu_mappings.max_entries > 1)
uvm_kvfree(phys_chunk->gpu_mappings.dynamic_entries);

if (uvm_cpu_chunk_get_size(chunk) > PAGE_SIZE &&
!bitmap_empty(phys_chunk->dirty_bitmap, uvm_cpu_chunk_num_pages(chunk)))
SetPageDirty(phys_chunk->common.page);

uvm_kvfree(phys_chunk->dirty_bitmap);

if (chunk->type != UVM_CPU_CHUNK_TYPE_HMM)
put_page(phys_chunk->common.page);
}
else {
uvm_cpu_chunk_free(logical_chunk->parent);
}

uvm_kvfree(chunk);
}

static void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
nv_kref_get(&chunk->refcount);
}

void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk)
{
if (!chunk)
return;

nv_kref_put(&chunk->refcount, cpu_chunk_release);
}

static uvm_cpu_physical_chunk_t *get_physical_parent(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);

while (!uvm_cpu_chunk_is_physical(chunk))
while (uvm_cpu_chunk_is_logical(chunk))
chunk = uvm_cpu_chunk_to_logical(chunk)->parent;

return uvm_cpu_chunk_to_physical(chunk);
@ -581,6 +524,7 @@ static uvm_cpu_phys_mapping_t *chunk_phys_mapping_alloc(uvm_cpu_physical_chunk_t
static uvm_cpu_phys_mapping_t *chunk_phys_mapping_get(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gpu_id_t id)
{
uvm_assert_mutex_locked(&chunk->lock);

if (uvm_parent_processor_mask_test(&chunk->gpu_mappings.dma_addrs_mask, id)) {
if (chunk->gpu_mappings.max_entries == 1) {
return &chunk->gpu_mappings.static_entry;
@ -598,7 +542,6 @@ static void chunk_inc_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
{
uvm_cpu_phys_mapping_t *mapping;

uvm_assert_mutex_locked(&chunk->lock);
mapping = chunk_phys_mapping_get(chunk, id);
UVM_ASSERT(mapping);
mapping->map_count++;
@ -608,7 +551,6 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
{
uvm_cpu_phys_mapping_t *mapping;

uvm_assert_mutex_locked(&chunk->lock);
mapping = chunk_phys_mapping_get(chunk, id);
UVM_ASSERT(mapping);
UVM_ASSERT(mapping->dma_addr && mapping->map_count);
@ -616,6 +558,8 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
if (mapping->map_count == 0) {
uvm_parent_gpu_t *parent_gpu = uvm_parent_gpu_get(id);

UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));

uvm_parent_gpu_unmap_cpu_pages(parent_gpu, mapping->dma_addr, uvm_cpu_chunk_get_size(&chunk->common));
mapping->dma_addr = 0;
if (chunk->gpu_mappings.max_entries > 1) {
@ -631,7 +575,7 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
}
}

NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
uvm_cpu_phys_mapping_t *mapping;
@ -641,36 +585,41 @@ NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
uvm_cpu_logical_chunk_t *logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
|
||||
if (!uvm_parent_processor_mask_test(&logical_chunk->mapped_gpus, parent_gpu->id))
|
||||
if (!uvm_processor_mask_test(&logical_chunk->mapped_gpus, gpu->id))
|
||||
return 0;
|
||||
|
||||
parent_offset = cpu_chunk_get_phys_index(logical_chunk);
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
mapping = chunk_phys_mapping_get(phys_chunk, parent_gpu->id);
|
||||
if (mapping)
|
||||
mapping = chunk_phys_mapping_get(phys_chunk, gpu->parent->id);
|
||||
if (mapping &&
|
||||
(uvm_cpu_chunk_is_logical(chunk) ||
|
||||
uvm_sub_processor_mask_test(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id))))
|
||||
dma_addr = mapping->dma_addr + (parent_offset * PAGE_SIZE);
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
return dma_addr;
|
||||
}
|
||||
|
||||
// Create a DMA mapping for the chunk on the given parent GPU. This will map the
|
||||
// entire parent physical chunk on the GPU.
|
||||
// Create a DMA mapping for the chunk on the given GPU. This will map the
|
||||
// entire physical chunk on the parent GPU and record that a given MIG
|
||||
// partition is using the mapping.
|
||||
//
|
||||
// Returns NV_OK on success. On error, any of the errors returned by
|
||||
// uvm_parent_gpu_map_cpu_pages() can be returned. In the case that the DMA
|
||||
// mapping structure could not be allocated, NV_ERR_NO_MEMORY is returned.
|
||||
static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
|
||||
static NV_STATUS cpu_chunk_map_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *parent_gpu = gpu->parent;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
|
||||
uvm_cpu_phys_mapping_t *mapping;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
if (uvm_parent_processor_mask_test(&logical_chunk->mapped_gpus, parent_gpu->id))
|
||||
if (uvm_processor_mask_test(&logical_chunk->mapped_gpus, gpu->id))
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -679,7 +628,6 @@ static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_paren
|
||||
|
||||
if (!uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id)) {
|
||||
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(&phys_chunk->common);
|
||||
uvm_cpu_phys_mapping_t *mapping;
|
||||
NvU64 dma_addr;
|
||||
|
||||
status = uvm_parent_gpu_map_cpu_pages(parent_gpu, phys_chunk->common.page, chunk_size, &dma_addr);
|
||||
@ -695,39 +643,59 @@ static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_paren
|
||||
|
||||
mapping->dma_addr = dma_addr;
|
||||
mapping->map_count = 1;
|
||||
uvm_sub_processor_mask_zero(&mapping->sub_processors);
|
||||
if (!logical_chunk)
|
||||
uvm_sub_processor_mask_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id));
|
||||
|
||||
uvm_parent_processor_mask_set(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id);
|
||||
}
|
||||
else {
|
||||
// The mapping count on the physical chunk is only increased when
|
||||
// mapping logical chunks.
|
||||
if (uvm_cpu_chunk_is_logical(chunk))
|
||||
chunk_inc_gpu_mapping(phys_chunk, parent_gpu->id);
|
||||
mapping = chunk_phys_mapping_get(phys_chunk, parent_gpu->id);
|
||||
UVM_ASSERT(mapping);
|
||||
|
||||
// Increment the map_count for logical chunks or the first time a
|
||||
// MIG partition is sharing a physical chunk.
|
||||
if (logical_chunk ||
|
||||
!uvm_sub_processor_mask_test_and_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id)))
|
||||
mapping->map_count++;
|
||||
}
|
||||
|
||||
if (logical_chunk) {
|
||||
uvm_processor_mask_set(&logical_chunk->mapped_gpus, gpu->id);
|
||||
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(!uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
UVM_ASSERT(uvm_sub_processor_mask_get_count(&mapping->sub_processors) == mapping->map_count);
|
||||
}
|
||||
|
||||
done:
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
if (status == NV_OK && uvm_cpu_chunk_is_logical(chunk))
|
||||
uvm_parent_processor_mask_set(&logical_chunk->mapped_gpus, parent_gpu->id);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_cpu_chunk_unmap_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
|
||||
static void cpu_chunk_unmap_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_gpu_id_t gpu_id)
|
||||
{
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
uvm_cpu_logical_chunk_t *logical_chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
|
||||
uvm_parent_gpu_id_t id = uvm_parent_gpu_id_from_gpu_id(gpu_id);
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
if (!uvm_parent_processor_mask_test_and_clear(&logical_chunk->mapped_gpus, parent_gpu->id))
|
||||
return;
|
||||
}
|
||||
uvm_processor_mask_t *mapping_mask = &uvm_cpu_chunk_to_logical(chunk)->mapped_gpus;
|
||||
|
||||
phys_chunk = get_physical_parent(chunk);
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
if (uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id))
|
||||
chunk_dec_gpu_mapping(phys_chunk, parent_gpu->id);
|
||||
if (uvm_processor_mask_test_and_clear(mapping_mask, gpu_id))
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
}
|
||||
else {
|
||||
if (uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, id)) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, id);
|
||||
|
||||
if (uvm_sub_processor_mask_test_and_clear(&mapping->sub_processors, uvm_id_sub_processor_index(gpu_id)))
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
}
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
}
|
||||
@ -737,17 +705,112 @@ NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
NV_STATUS status;
|
||||
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
|
||||
|
||||
status = cpu_chunk_map_parent_gpu_phys(chunk, gpu->parent);
|
||||
status = cpu_chunk_map_gpu_phys(chunk, gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_mmu_sysmem_map(gpu, uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent), chunk_size);
|
||||
status = uvm_mmu_sysmem_map(gpu, uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu), chunk_size);
|
||||
if (status != NV_OK)
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
cpu_chunk_unmap_gpu_phys(chunk, gpu->id);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_cpu_chunk_unmap_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
cpu_chunk_unmap_gpu_phys(chunk, gpu->id);
|
||||
|
||||
// Note: there is no corresponding uvm_mmu_sysmem_unmap() for
|
||||
// uvm_mmu_sysmem_map().
|
||||
}
|
||||
|
||||
static void cpu_logical_chunk_release(uvm_cpu_logical_chunk_t *logical_chunk)
|
||||
{
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(logical_chunk->parent);
|
||||
uvm_processor_id_t gpu_id;
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus)
|
||||
chunk_dec_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
uvm_cpu_chunk_free(logical_chunk->parent);
|
||||
}
|
||||
|
||||
static void cpu_physical_chunk_release(uvm_cpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = uvm_cpu_chunk_to_physical(chunk);
|
||||
uvm_parent_processor_id_t id;
|
||||
|
||||
uvm_assert_mutex_unlocked(&phys_chunk->lock);
|
||||
|
||||
// There should be no other threads using this chunk but we lock it because
|
||||
// of assertions in chunk_phys_mapping_get() and chunk_dec_gpu_mapping().
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
for_each_parent_id_in_mask(id, &phys_chunk->gpu_mappings.dma_addrs_mask) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, id);
|
||||
NvU32 count;
|
||||
|
||||
UVM_ASSERT(mapping);
|
||||
UVM_ASSERT(!uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
|
||||
// Get a count of set bits in the sub_processors mask then clear it so
|
||||
// that chunk_dec_gpu_mapping() sees an empty mask when map_count == 0.
|
||||
// Using for_each_sub_processor_in_mask could try to dereference
|
||||
// mapping after map_count == 0 in the loop below.
|
||||
count = uvm_sub_processor_mask_get_count(&mapping->sub_processors);
|
||||
uvm_sub_processor_mask_zero(&mapping->sub_processors);
|
||||
|
||||
for (; count; count--)
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
UVM_ASSERT(uvm_parent_processor_mask_empty(&phys_chunk->gpu_mappings.dma_addrs_mask));
|
||||
|
||||
if (phys_chunk->gpu_mappings.max_entries > 1)
|
||||
uvm_kvfree(phys_chunk->gpu_mappings.dynamic_entries);
|
||||
|
||||
if (uvm_cpu_chunk_get_size(chunk) > PAGE_SIZE &&
|
||||
!bitmap_empty(phys_chunk->dirty_bitmap, uvm_cpu_chunk_num_pages(chunk)))
|
||||
SetPageDirty(chunk->page);
|
||||
|
||||
uvm_kvfree(phys_chunk->dirty_bitmap);
|
||||
|
||||
if (chunk->type != UVM_CPU_CHUNK_TYPE_HMM)
|
||||
put_page(chunk->page);
|
||||
}
|
||||
|
||||
static void cpu_chunk_release(nv_kref_t *kref)
{
    uvm_cpu_chunk_t *chunk = container_of(kref, uvm_cpu_chunk_t, refcount);

    if (uvm_cpu_chunk_is_logical(chunk))
        cpu_logical_chunk_release(uvm_cpu_chunk_to_logical(chunk));
    else
        cpu_physical_chunk_release(chunk);

    uvm_kvfree(chunk);
}

static void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(chunk);
    nv_kref_get(&chunk->refcount);
}

void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk)
{
    if (!chunk)
        return;

    nv_kref_put(&chunk->refcount, cpu_chunk_release);
}

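// Aside (minimal sketch, not driver code): nv_kref_t wraps the same pattern as
// the upstream kernel's struct kref used below, where the release callback runs
// exactly once when the last reference is dropped. The "demo_chunk" object is
// hypothetical.
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_chunk {
    struct kref refcount;
};

static void demo_chunk_release(struct kref *kref)
{
    // Reached only when the final reference goes away.
    struct demo_chunk *chunk = container_of(kref, struct demo_chunk, refcount);

    kfree(chunk);
}

static void demo_chunk_get(struct demo_chunk *chunk)
{
    kref_get(&chunk->refcount);
}

static void demo_chunk_put(struct demo_chunk *chunk)
{
    if (chunk)
        kref_put(&chunk->refcount, demo_chunk_release);
}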
static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
|
||||
int nid,
|
||||
uvm_cpu_chunk_alloc_flags_t alloc_flags)
|
||||
@ -876,14 +939,37 @@ int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk)
|
||||
return page_to_nid(chunk->page);
|
||||
}
|
||||
|
||||
// Convert the mask of DMA mapped parent GPUs and the sub-processor mask into
|
||||
// one uvm_processor_mask_t in 'dma_map_mask'.
|
||||
static void get_dma_map_mask(uvm_cpu_physical_chunk_t *chunk, uvm_processor_mask_t *dma_map_mask)
|
||||
{
|
||||
uvm_parent_processor_id_t id;
|
||||
NvU32 sub_index;
|
||||
|
||||
uvm_assert_mutex_locked(&chunk->lock);
|
||||
|
||||
for_each_parent_id_in_mask(id, &chunk->gpu_mappings.dma_addrs_mask) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(chunk, id);
|
||||
|
||||
for_each_sub_processor_index_in_mask(sub_index, &mapping->sub_processors) {
|
||||
uvm_processor_id_t gpu_id = uvm_gpu_id_from_sub_processor(id, sub_index);
|
||||
|
||||
uvm_sub_processor_mask_clear(&mapping->sub_processors, sub_index);
|
||||
uvm_processor_mask_set(dma_map_mask, gpu_id);
|
||||
}
|
||||
|
||||
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
}
|
||||
}
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_cpu_logical_chunk_t *new_chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
|
||||
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
|
||||
uvm_parent_processor_id_t id;
|
||||
uvm_parent_processor_mask_t *dma_map_mask;
|
||||
uvm_processor_id_t gpu_id;
|
||||
uvm_processor_mask_t *dma_map_mask = NULL;
|
||||
uvm_chunk_size_t new_size;
|
||||
size_t num_new_chunks;
|
||||
size_t num_subchunk_pages;
|
||||
@ -902,21 +988,20 @@ NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chun
|
||||
|
||||
// Get the largest size below the size of the input chunk.
|
||||
new_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), uvm_cpu_chunk_get_size(chunk));
|
||||
UVM_ASSERT(new_size);
|
||||
UVM_ASSERT(new_size != UVM_CHUNK_SIZE_INVALID);
|
||||
num_new_chunks = uvm_cpu_chunk_get_size(chunk) / new_size;
|
||||
num_subchunk_pages = new_size / PAGE_SIZE;
|
||||
|
||||
if (uvm_cpu_chunk_is_physical(chunk)) {
|
||||
dma_map_mask = &phys_chunk->gpu_mappings.dma_addrs_mask;
|
||||
}
|
||||
else {
|
||||
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
dma_map_mask = &logical_chunk->mapped_gpus;
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
for (i = 0; i < num_new_chunks; i++) {
|
||||
new_chunk = uvm_kvmalloc_zero(sizeof(*logical_chunk));
|
||||
new_chunk = uvm_kvmalloc_zero(sizeof(*new_chunk));
|
||||
if (!new_chunk) {
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
@ -929,19 +1014,25 @@ NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chun
|
||||
nv_kref_init(&new_chunk->common.refcount);
|
||||
new_chunk->parent = chunk;
|
||||
uvm_cpu_chunk_get(new_chunk->parent);
|
||||
for_each_parent_id_in_mask(id, dma_map_mask)
|
||||
chunk_inc_gpu_mapping(phys_chunk, id);
|
||||
uvm_parent_processor_mask_copy(&new_chunk->mapped_gpus, dma_map_mask);
|
||||
if (i == 0 && !logical_chunk) {
|
||||
dma_map_mask = &new_chunk->mapped_gpus;
|
||||
get_dma_map_mask(phys_chunk, dma_map_mask);
|
||||
}
|
||||
else {
|
||||
uvm_processor_mask_copy(&new_chunk->mapped_gpus, dma_map_mask);
|
||||
}
|
||||
for_each_id_in_mask(gpu_id, dma_map_mask)
|
||||
chunk_inc_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
new_chunks[i] = &new_chunk->common;
|
||||
}
|
||||
|
||||
// Release the references that are held by the chunk being split.
|
||||
for_each_parent_id_in_mask(id, dma_map_mask)
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
for_each_id_in_mask(gpu_id, dma_map_mask)
|
||||
chunk_dec_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
// If the chunk being split is a logical chunk, clear its mapped_gpus mask.
|
||||
if (uvm_cpu_chunk_is_logical(chunk))
|
||||
uvm_parent_processor_mask_zero(&logical_chunk->mapped_gpus);
|
||||
if (logical_chunk)
|
||||
uvm_processor_mask_zero(&logical_chunk->mapped_gpus);
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
@ -963,7 +1054,7 @@ static bool verify_merging_chunks(uvm_cpu_chunk_t **chunks, size_t num_chunks)
|
||||
{
|
||||
uvm_cpu_logical_chunk_t *logical_chunk;
|
||||
uvm_cpu_chunk_t *first_chunk_parent;
|
||||
uvm_parent_processor_mask_t *first_chunk_mapped_gpus;
|
||||
uvm_processor_mask_t *first_chunk_mapped_gpus;
|
||||
uvm_chunk_size_t first_chunk_size;
|
||||
size_t i;
|
||||
|
||||
@ -994,7 +1085,7 @@ static bool verify_merging_chunks(uvm_cpu_chunk_t **chunks, size_t num_chunks)
|
||||
// 2.1 All mappings to GPUs in each of child chunks' masks that are
|
||||
// not also present in the parent chunk's mask are destroyed.
|
||||
// 2.2 mapped_gpus mask of the parent chunk remains unmodified.
|
||||
UVM_ASSERT(uvm_parent_processor_mask_equal(&logical_chunk->mapped_gpus, first_chunk_mapped_gpus));
|
||||
UVM_ASSERT(uvm_processor_mask_equal(&logical_chunk->mapped_gpus, first_chunk_mapped_gpus));
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -1005,14 +1096,14 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks)
|
||||
uvm_cpu_chunk_t *parent;
|
||||
uvm_cpu_logical_chunk_t *logical_chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
uvm_parent_processor_id_t id;
|
||||
uvm_processor_id_t gpu_id;
|
||||
uvm_chunk_size_t chunk_size;
|
||||
uvm_chunk_size_t parent_chunk_size;
|
||||
size_t num_merge_chunks;
|
||||
size_t i;
|
||||
|
||||
UVM_ASSERT(chunks);
|
||||
UVM_ASSERT(!uvm_cpu_chunk_is_physical(chunks[0]));
|
||||
UVM_ASSERT(uvm_cpu_chunk_is_logical(chunks[0]));
|
||||
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunks[0]);
|
||||
parent = logical_chunk->parent;
|
||||
@ -1033,11 +1124,22 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks)
|
||||
phys_chunk = get_physical_parent(chunks[0]);
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
for_each_parent_id_in_mask(id, &logical_chunk->mapped_gpus)
|
||||
chunk_inc_gpu_mapping(phys_chunk, id);
|
||||
|
||||
if (!uvm_cpu_chunk_is_physical(parent))
|
||||
uvm_parent_processor_mask_copy(&uvm_cpu_chunk_to_logical(parent)->mapped_gpus, &logical_chunk->mapped_gpus);
|
||||
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus)
|
||||
chunk_inc_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
if (uvm_cpu_chunk_is_logical(parent)) {
|
||||
uvm_processor_mask_copy(&uvm_cpu_chunk_to_logical(parent)->mapped_gpus, &logical_chunk->mapped_gpus);
|
||||
}
|
||||
else {
|
||||
// Restore the mapping->sub_processors mask for each mapped GPU.
|
||||
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
UVM_ASSERT(mapping);
|
||||
uvm_sub_processor_mask_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu_id));
|
||||
}
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2017-2023 NVIDIA Corporation
|
||||
Copyright (c) 2017-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -246,8 +246,19 @@ struct uvm_cpu_chunk_struct

typedef struct
{
    // Physical GPU DMA address of the CPU chunk.
    NvU64 dma_addr;

    // Reference count of all sub_processors using this mapping across logical
    // and physical chunks.
    NvU32 map_count;

    // Mask of MIG instances or physical GPU.
    // This is only valid for physical CPU chunks that have not been split into
    // logical chunks. When the chunk is split, all the
    // uvm_cpu_logical_chunk_t::mapped_gpus masks have a bit set for each
    // count in map_count and sub_processors is set to zero.
    uvm_sub_processor_mask_t sub_processors;
} uvm_cpu_phys_mapping_t;

typedef struct
@ -304,7 +315,9 @@ typedef struct

    // Pointer to the parent chunk (which could also be a logical chunk).
    uvm_cpu_chunk_t *parent;
    uvm_parent_processor_mask_t mapped_gpus;

    // This is a reference per bit but also recorded in mapping->map_count.
    uvm_processor_mask_t mapped_gpus;
} uvm_cpu_logical_chunk_t;

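// Aside (toy model, not driver code): while a physical chunk is unsplit, each
// MIG instance that maps it sets one bit in sub_processors and adds one
// reference to map_count, so the bit count and map_count stay equal (the same
// invariant cpu_chunk_map_gpu_phys() asserts). Names here are hypothetical.
#include <assert.h>
#include <stdint.h>

struct toy_phys_mapping {
    uint32_t map_count;
    uint8_t sub_processors; // one bit per MIG instance of the parent GPU
};

static void toy_map_mig_instance(struct toy_phys_mapping *m, unsigned int sub_index)
{
    if (!(m->sub_processors & (1u << sub_index))) {
        m->sub_processors |= 1u << sub_index;
        m->map_count++;
    }

    assert(m->map_count == (uint32_t)__builtin_popcount(m->sub_processors));
}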
// Return the set of allowed CPU chunk allocation sizes.
|
||||
@ -417,15 +430,15 @@ void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk);
|
||||
// For more details see uvm_mmu_sysmem_map().
|
||||
NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
|
||||
|
||||
// Destroy a CPU chunk's DMA mapping for the parent GPU.
|
||||
// Destroy a CPU chunk's DMA mapping for the given GPU.
|
||||
// If chunk is a logical chunk, this call may not necessarily destroy the DMA
|
||||
// mapping of the parent physical chunk since all logical chunks share the
|
||||
// parent's DMA mapping.
|
||||
void uvm_cpu_chunk_unmap_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
|
||||
// mapping of the parent physical chunk since all logical chunks and MIG
|
||||
// partitions share the parent's DMA mapping.
|
||||
void uvm_cpu_chunk_unmap_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
|
||||
|
||||
// Get the CPU chunk's DMA mapping address for the specified GPU ID.
|
||||
// If there is no mapping for the GPU, 0 is returned.
|
||||
NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
|
||||
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
|
||||
|
||||
// Split a CPU chunk into a set of CPU chunks of the next size down from the set
|
||||
// of enabled CPU chunk sizes.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2017-2023 NVIDIA Corporation
|
||||
Copyright (c) 2017-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -626,7 +626,7 @@ static NV_STATUS test_cpu_chunk_mapping_access(uvm_cpu_chunk_t *chunk, uvm_gpu_t
|
||||
TEST_NV_CHECK_RET(cpu_chunk_map_on_cpu(chunk, (void **)&cpu_addr));
|
||||
memset(cpu_addr, 0, chunk_size);
|
||||
|
||||
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
gpu_addr = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr));
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
|
||||
@ -733,21 +733,21 @@ static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
|
||||
// - no GPU mapping address.
|
||||
TEST_CHECK_GOTO(phys_chunk->gpu_mappings.max_entries == 1, done);
|
||||
TEST_CHECK_GOTO(uvm_parent_processor_mask_get_gpu_count(&phys_chunk->gpu_mappings.dma_addrs_mask) == 0, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0, done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
|
||||
|
||||
// Test basic access.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
|
||||
|
||||
// Test double map is harmless.
|
||||
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == dma_addr, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == dma_addr, done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
|
||||
|
||||
// Test unmap, remap.
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0, done);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0, done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
|
||||
|
||||
@ -768,6 +768,39 @@ static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_allo
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// TODO: Bug 4351121: This won't actually test anything until uvm_test
|
||||
// enumerates multiple MIG instances.
|
||||
static NV_STATUS test_cpu_chunk_mig(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_cpu_chunk_t *chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
NvU64 dma_addr_gpu0;
|
||||
|
||||
UVM_ASSERT(gpu0->parent == gpu1->parent);
|
||||
|
||||
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
|
||||
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
|
||||
// MIG instances in the same physical GPU share the same DMA addresses.
|
||||
dma_addr_gpu0 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu0);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1) == dma_addr_gpu0, done);
|
||||
|
||||
// Unmapping one GPU shouldn't affect the other.
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu0);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu0) == 0, done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
|
||||
done:
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
@ -783,8 +816,8 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1,
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
|
||||
dma_addr_gpu1 = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu1->parent);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu2->parent);
|
||||
dma_addr_gpu1 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu2);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
|
||||
@ -798,7 +831,9 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1,
|
||||
// GPU1. It's true that we may get a false negative if both addresses
|
||||
// happened to alias and we had a bug in how the addresses are shifted in
|
||||
// the dense array, but that's better than intermittent failure.
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu1->parent) == dma_addr_gpu1, done);
|
||||
// Also note that multiple MIG instances in the same physical GPU share the
|
||||
// parent's physical DMA mapping.
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1) == dma_addr_gpu1, done);
|
||||
|
||||
done:
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
@ -828,7 +863,7 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done_free);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
|
||||
@ -845,13 +880,14 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
|
||||
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == 1, done_free);
|
||||
|
||||
// Since all logical chunks were mapped, the entire merged chunk should
|
||||
// be accessible without needing to map it.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu), done_free);
|
||||
|
||||
// Test that GPU mappings are transferred after a split
|
||||
phys_dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
phys_dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
|
||||
@ -859,9 +895,9 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
NvU64 dma_addr;
|
||||
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu), done);
|
||||
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[i], gpu->parent);
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu);
|
||||
TEST_CHECK_GOTO(dma_addr == phys_dma_addr + (i * split_size), done);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(split_chunks[i], gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu);
|
||||
}
|
||||
|
||||
// Test that mapping one logical chunk does not affect others.
|
||||
@ -871,7 +907,7 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
|
||||
for (i = 0; i < num_split_chunks; i++) {
|
||||
if (i != map_chunk)
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[i], gpu->parent) == 0, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu) == 0, done);
|
||||
}
|
||||
|
||||
if (split_size > PAGE_SIZE) {
|
||||
@ -927,6 +963,118 @@ static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS do_test_cpu_chunk_split_and_merge_2(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
|
||||
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
|
||||
size_t num_split_chunks;
|
||||
uvm_cpu_chunk_t **split_chunks;
|
||||
uvm_cpu_chunk_t *merged_chunk;
|
||||
uvm_chunk_size_t split_size;
|
||||
size_t i;
|
||||
|
||||
split_size = uvm_chunk_find_prev_size(alloc_sizes, size);
|
||||
UVM_ASSERT(split_size != UVM_CHUNK_SIZE_INVALID);
|
||||
num_split_chunks = size / split_size;
|
||||
split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
|
||||
|
||||
if (!split_chunks)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
// Map both GPUs.
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done_free);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done_free);
|
||||
|
||||
// Then split.
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
|
||||
|
||||
// Unmap gpu0 from all split chunks.
|
||||
for (i = 0; i < num_split_chunks; i++) {
|
||||
TEST_CHECK_GOTO(split_chunks[i], done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_is_logical(split_chunks[i]), done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[i]) == split_size, done);
|
||||
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu0);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu0) == 0, done);
|
||||
|
||||
// Test that gpu1 still has access.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu1), done);
|
||||
}
|
||||
|
||||
// Test CPU chunk merging.
|
||||
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
|
||||
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == 1, done_free);
|
||||
|
||||
// Since all logical chunks were mapped, the entire merged chunk should
|
||||
// be accessible without needing to map it.
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(merged_chunk, gpu0) == 0, done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu1), done_free);
|
||||
|
||||
// Unmap gpu1 so we start with a fully unmapped physical chunk.
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu1);
|
||||
|
||||
// Split the physical chunk.
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
|
||||
// Now map everything.
|
||||
for (i = 0; i < num_split_chunks; i++) {
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu1), done);
|
||||
}
|
||||
|
||||
// Test CPU chunk merging with everything mapped.
|
||||
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
|
||||
|
||||
// At this point, all split chunks have been merged.
|
||||
num_split_chunks = 0;
|
||||
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
|
||||
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
|
||||
|
||||
// Since all logical chunks were mapped, the entire merged chunk should
|
||||
// be accessible without needing to map it.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu0), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu1), done_free);
|
||||
|
||||
done:
|
||||
for (i = 0; i < num_split_chunks; i++)
|
||||
uvm_cpu_chunk_free(split_chunks[i]);
|
||||
|
||||
done_free:
|
||||
uvm_kvfree(split_chunks);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_chunk_split_and_merge_2(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
|
||||
uvm_chunk_size_t size;
|
||||
|
||||
size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
|
||||
for_each_chunk_size_from(size, alloc_sizes) {
|
||||
uvm_cpu_chunk_t *chunk;
|
||||
NV_STATUS status;
|
||||
|
||||
// It is possible that the allocation fails due to lack of large pages
|
||||
// rather than an API issue, which will result in a false negative.
|
||||
// However, that should be very rare.
|
||||
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
|
||||
status = do_test_cpu_chunk_split_and_merge_2(chunk, gpu0, gpu1);
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_chunk_dirty_split(uvm_cpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
|
||||
@ -1072,7 +1220,9 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
|
||||
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk,
|
||||
uvm_va_space_t *va_space,
|
||||
const uvm_processor_mask_t *test_gpus)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_cpu_chunk_t **split_chunks;
|
||||
@ -1099,7 +1249,7 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
|
||||
chunk = NULL;
|
||||
|
||||
// Map every other chunk.
|
||||
// The call to uvm_cpu_chunk_unmap_parent_gpu_phys() is here in case this
|
||||
// The call to uvm_cpu_chunk_unmap_gpu() is here in case this
|
||||
// is part of a double split (see below). In that case, the parent chunk
|
||||
// would be either mapped or unmapped.
|
||||
//
|
||||
@ -1111,7 +1261,7 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
|
||||
if (i & (1 << uvm_id_gpu_index(gpu->id)))
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu), done);
|
||||
else
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(split_chunks[i], gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1147,9 +1297,9 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[j]) == split_size, done);
|
||||
for_each_va_space_gpu_in_mask(gpu, va_space, test_gpus) {
|
||||
if (j & (1 << uvm_id_gpu_index(gpu->id)))
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[j], gpu->parent), done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu), done);
|
||||
else
|
||||
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[j], gpu->parent), done);
|
||||
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu), done);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1168,7 +1318,8 @@ done_free:
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
|
||||
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space,
|
||||
const uvm_processor_mask_t *test_gpus)
|
||||
{
|
||||
uvm_cpu_chunk_t *chunk;
|
||||
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
|
||||
@ -1204,6 +1355,50 @@ static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static uvm_gpu_t *find_first_parent_gpu(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_va_space_t *va_space)
|
||||
{
|
||||
return uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
|
||||
}
|
||||
|
||||
static uvm_gpu_t *find_next_parent_gpu(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_va_space_t *va_space,
|
||||
uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_gpu_t *next_gpu = gpu;
|
||||
|
||||
while (next_gpu) {
|
||||
next_gpu = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, next_gpu);
|
||||
if (!next_gpu || next_gpu->parent != gpu->parent)
|
||||
break;
|
||||
}
|
||||
|
||||
return next_gpu;
|
||||
}
|
||||
|
||||
static void find_shared_gpu_pair(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_va_space_t *va_space,
|
||||
uvm_gpu_t **out_gpu0,
|
||||
uvm_gpu_t **out_gpu1)
|
||||
{
|
||||
uvm_gpu_t *gpu0 = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
|
||||
uvm_gpu_t *gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
|
||||
|
||||
while (gpu1) {
|
||||
if (gpu0->parent == gpu1->parent) {
|
||||
*out_gpu0 = gpu0;
|
||||
*out_gpu1 = gpu1;
|
||||
return;
|
||||
}
|
||||
|
||||
gpu0 = gpu1;
|
||||
gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
|
||||
}
|
||||
|
||||
*out_gpu0 = NULL;
|
||||
*out_gpu1 = NULL;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
@ -1228,15 +1423,31 @@ NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct f
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, test_gpus), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done);
|
||||
|
||||
if (uvm_processor_mask_get_gpu_count(test_gpus) >= 3) {
|
||||
uvm_gpu_t *gpu2, *gpu3;
|
||||
if (uvm_processor_mask_get_gpu_count(test_gpus) >= 2) {
|
||||
uvm_gpu_t *gpu2, *gpu3 = NULL;
|
||||
|
||||
gpu = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
|
||||
gpu2 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu);
|
||||
gpu3 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu2);
|
||||
// Look for a pair of GPUs that don't share a common parent.
|
||||
gpu = find_first_parent_gpu(test_gpus, va_space);
|
||||
gpu2 = find_next_parent_gpu(test_gpus, va_space, gpu);
|
||||
if (gpu2) {
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge_2(gpu, gpu2), done);
|
||||
|
||||
// Look for a third physical GPU.
|
||||
gpu3 = find_next_parent_gpu(test_gpus, va_space, gpu2);
|
||||
|
||||
if (gpu3)
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_array(gpu, gpu2, gpu3), done);
|
||||
}
|
||||
|
||||
// Look for a pair of GPUs that share a common parent.
|
||||
find_shared_gpu_pair(test_gpus, va_space, &gpu, &gpu2);
|
||||
if (gpu) {
|
||||
// Test MIG instances within the same parent GPU.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge_2(gpu, gpu2), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mig(gpu, gpu2), done);
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
uvm_va_space_up_read(va_space);
|
||||
uvm_processor_mask_cache_free(test_gpus);
|
||||
|
@ -671,9 +671,6 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
|
||||
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
|
||||
// Force CPU page residency to be on the preferred NUMA node.
|
||||
va_block_context->make_resident.dest_nid = uvm_va_range_get_policy(va_block->va_range)->preferred_nid;
|
||||
|
||||
for_each_id_in_mask(src_id, &va_block->resident) {
|
||||
NV_STATUS status;
|
||||
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2023 NVIDIA Corporation
|
||||
Copyright (c) 2023-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -30,6 +30,8 @@ const uvm_processor_mask_t g_uvm_processor_mask_empty = { };
|
||||
|
||||
NV_STATUS uvm_processor_mask_cache_init(void)
|
||||
{
|
||||
BUILD_BUG_ON((8 * sizeof(((uvm_sub_processor_mask_t *)0)->bitmap)) < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
|
||||
|
||||
g_uvm_processor_mask_cache = NV_KMEM_CACHE_CREATE("uvm_processor_mask_t", uvm_processor_mask_t);
|
||||
if (!g_uvm_processor_mask_cache)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
@ -100,8 +102,16 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
|
||||
|
||||
bool uvm_numa_id_eq(int nid0, int nid1)
{
    UVM_ASSERT(nid0 >= NUMA_NO_NODE && nid0 < MAX_NUMNODES);
    UVM_ASSERT(nid1 >= NUMA_NO_NODE && nid1 < MAX_NUMNODES);
    UVM_ASSERT(nid0 == -1 || nid0 < MAX_NUMNODES);
    UVM_ASSERT(nid1 == -1 || nid1 < MAX_NUMNODES);

    if ((nid0 == NUMA_NO_NODE || nid1 == NUMA_NO_NODE) && nodes_weight(node_possible_map) == 1) {
        if (nid0 == NUMA_NO_NODE)
            nid0 = first_node(node_possible_map);

        if (nid1 == NUMA_NO_NODE)
            nid1 = first_node(node_possible_map);
    }

    return nid0 == nid1;
}
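// Aside (illustration only): on a single-node system NUMA_NO_NODE compares
// equal to the one possible node, while on multi-node systems it only matches
// itself. The check below is hypothetical.
static void numa_id_eq_example(void)
{
    if (nodes_weight(node_possible_map) == 1)
        UVM_ASSERT(uvm_numa_id_eq(NUMA_NO_NODE, first_node(node_possible_map)));

    UVM_ASSERT(uvm_numa_id_eq(0, 0));
}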
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -277,8 +277,6 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
|
||||
#define UVM_PARENT_ID_MAX_GPUS NV_MAX_DEVICES
|
||||
#define UVM_PARENT_ID_MAX_PROCESSORS (UVM_PARENT_ID_MAX_GPUS + 1)
|
||||
|
||||
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
|
||||
|
||||
#define UVM_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
|
||||
#define UVM_ID_MAX_PROCESSORS (UVM_ID_MAX_GPUS + 1)
|
||||
#define UVM_MAX_UNIQUE_GPU_PAIRS SUM_FROM_0_TO_N(UVM_ID_MAX_GPUS - 1)
|
||||
@ -292,6 +290,9 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
|
||||
|
||||
#define UVM_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_ID_MAX_PROCESSORS, "id %u\n", id.val)
|
||||
|
||||
#define UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index) \
|
||||
UVM_ASSERT_MSG((sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS, "sub_index %u\n", (sub_index))
|
||||
|
||||
static int uvm_parent_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
|
||||
{
|
||||
UVM_PARENT_ID_CHECK_BOUNDS(id1);
|
||||
@ -493,11 +494,16 @@ static uvm_gpu_id_t uvm_gpu_id_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
|
||||
static uvm_gpu_id_t uvm_gpu_id_from_sub_processor_index(NvU32 index, NvU32 sub_index)
|
||||
{
|
||||
UVM_ASSERT(index < UVM_PARENT_ID_MAX_GPUS);
|
||||
UVM_ASSERT(sub_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
return uvm_gpu_id_from_index(index * UVM_PARENT_ID_MAX_SUB_PROCESSORS + sub_index);
|
||||
}
|
||||
|
||||
static uvm_gpu_id_t uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_t id, NvU32 sub_index)
|
||||
{
|
||||
return uvm_gpu_id_from_sub_processor_index(uvm_parent_id_gpu_index(id), sub_index);
|
||||
}
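// Aside (worked example, illustration only): with UVM_PARENT_ID_MAX_SUB_PROCESSORS
// equal to 8, parent GPU index 2 and sub-processor index 3 pack into GPU index
// 2 * 8 + 3 == 19, and division/modulo by 8 recover the pair.
static void gpu_id_packing_example(void)
{
    NvU32 index = 2 * UVM_PARENT_ID_MAX_SUB_PROCESSORS + 3;

    UVM_ASSERT(index == 19);
    UVM_ASSERT(index / UVM_PARENT_ID_MAX_SUB_PROCESSORS == 2);
    UVM_ASSERT(index % UVM_PARENT_ID_MAX_SUB_PROCESSORS == 3);
}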
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_gpu_id(const uvm_gpu_id_t id)
|
||||
{
|
||||
UVM_ASSERT(UVM_ID_IS_GPU(id));
|
||||
@ -525,6 +531,71 @@ UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
|
||||
extern const uvm_processor_mask_t g_uvm_processor_mask_cpu;
|
||||
extern const uvm_processor_mask_t g_uvm_processor_mask_empty;
|
||||
|
||||
// This is similar to uvm_parent_processor_mask_t and uvm_processor_mask_t
// but defined as a NvU8 in order to save memory since DECLARE_BITMAP() uses
// unsigned long. It also means we need to define our own bitops.
// Note that these are not atomic operations.
typedef struct
{
    NvU8 bitmap;
} uvm_sub_processor_mask_t;

static bool uvm_sub_processor_mask_test(const uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
return mask->bitmap & (1 << sub_index);
|
||||
}
|
||||
|
||||
static void uvm_sub_processor_mask_set(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
mask->bitmap |= 1 << sub_index;
|
||||
}
|
||||
|
||||
static void uvm_sub_processor_mask_clear(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
mask->bitmap &= ~(1 << sub_index);
|
||||
}
|
||||
|
||||
static bool uvm_sub_processor_mask_test_and_set(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
bool result = uvm_sub_processor_mask_test(mask, sub_index);
|
||||
|
||||
if (!result)
|
||||
uvm_sub_processor_mask_set(mask, sub_index);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static bool uvm_sub_processor_mask_test_and_clear(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
bool result = uvm_sub_processor_mask_test(mask, sub_index);
|
||||
|
||||
if (result)
|
||||
uvm_sub_processor_mask_clear(mask, sub_index);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void uvm_sub_processor_mask_zero(uvm_sub_processor_mask_t *mask)
|
||||
{
|
||||
mask->bitmap = 0;
|
||||
}
|
||||
|
||||
static bool uvm_sub_processor_mask_empty(const uvm_sub_processor_mask_t *mask)
|
||||
{
|
||||
return mask->bitmap == 0;
|
||||
}
|
||||
|
||||
static NvU32 uvm_sub_processor_mask_get_count(const uvm_sub_processor_mask_t *mask)
|
||||
{
|
||||
return hweight8(mask->bitmap);
|
||||
}
|
||||
|
||||
// Like uvm_processor_mask_subset() but ignores the CPU in the subset mask.
|
||||
// Returns whether the GPUs in subset are a subset of the GPUs in mask.
|
||||
bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset,
|
||||
@ -571,8 +642,28 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
|
||||
i = uvm_gpu_id_next(i))
|
||||
|
||||
// Helper to iterate over all sub processor indexes.
#define for_each_sub_processor_index(i) \
    for (i = 0; i < UVM_PARENT_ID_MAX_SUB_PROCESSORS; i++)
#define for_each_sub_processor_index(sub_index) \
    for ((sub_index) = 0; (sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS; (sub_index)++)

static NvU32 uvm_sub_processor_mask_find_first_index(const uvm_sub_processor_mask_t *mask)
{
    unsigned long bitmap = mask->bitmap;

    return find_first_bit(&bitmap, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
}

static NvU32 uvm_sub_processor_mask_find_next_index(const uvm_sub_processor_mask_t *mask, NvU32 min_index)
{
    unsigned long bitmap = mask->bitmap;

    return find_next_bit(&bitmap, UVM_PARENT_ID_MAX_SUB_PROCESSORS, min_index);
}

// Helper to iterate over all sub processor indexes in a given mask.
#define for_each_sub_processor_index_in_mask(sub_index, sub_mask) \
    for ((sub_index) = uvm_sub_processor_mask_find_first_index((sub_mask)); \
         (sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS; \
         (sub_index) = uvm_sub_processor_mask_find_next_index((sub_mask), (sub_index) + 1))

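// Aside (usage sketch, illustration only): the helpers above make
// uvm_sub_processor_mask_t behave like a tiny processor mask.
static void sub_processor_mask_usage_example(void)
{
    uvm_sub_processor_mask_t mask;
    NvU32 sub_index;

    uvm_sub_processor_mask_zero(&mask);
    uvm_sub_processor_mask_set(&mask, 0);
    uvm_sub_processor_mask_set(&mask, 3);

    UVM_ASSERT(uvm_sub_processor_mask_get_count(&mask) == 2);

    // Visits sub_index 0, then 3.
    for_each_sub_processor_index_in_mask(sub_index, &mask)
        ;
}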
// Helper to iterate over all valid processor ids.
|
||||
#define for_each_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))
|
||||
|
@ -65,12 +65,9 @@ typedef enum
|
||||
} uvm_push_flag_t;
|
||||
|
||||
struct uvm_push_crypto_bundle_struct {
|
||||
// Initialization vector used to decrypt the push on the CPU
|
||||
// Initialization vector used to decrypt the push
|
||||
UvmCslIv iv;
|
||||
|
||||
// Key version used to decrypt the push on the CPU
|
||||
NvU32 key_version;
|
||||
|
||||
// Size of the pushbuffer that is encrypted/decrypted
|
||||
NvU32 push_size;
|
||||
};
|
||||
|
@ -451,6 +451,7 @@ static uvm_pushbuffer_chunk_t *gpfifo_to_chunk(uvm_pushbuffer_t *pushbuffer, uvm
|
||||
static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 auth_tag_offset;
|
||||
void *auth_tag_cpu_va;
|
||||
void *push_protected_cpu_va;
|
||||
void *push_unprotected_cpu_va;
|
||||
@ -469,15 +470,16 @@ static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
|
||||
UVM_ASSERT(!uvm_channel_is_wlc(channel));
|
||||
UVM_ASSERT(!uvm_channel_is_lcic(channel));
|
||||
|
||||
push_protected_cpu_va = get_base_cpu_va(pushbuffer) + pushbuffer_offset;
|
||||
push_protected_cpu_va = (char *)get_base_cpu_va(pushbuffer) + pushbuffer_offset;
|
||||
push_unprotected_cpu_va = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem) + pushbuffer_offset;
|
||||
auth_tag_cpu_va = uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(channel, push_info_index);
|
||||
auth_tag_offset = push_info_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
auth_tag_cpu_va = (char *)uvm_rm_mem_get_cpu_va(channel->conf_computing.push_crypto_bundle_auth_tags) +
|
||||
auth_tag_offset;
|
||||
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
push_protected_cpu_va,
|
||||
push_unprotected_cpu_va,
|
||||
&crypto_bundle->iv,
|
||||
crypto_bundle->key_version,
|
||||
crypto_bundle->push_size,
|
||||
auth_tag_cpu_va);
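// Aside (minimal sketch): the (char *) casts above make the byte-offset
// arithmetic explicit instead of relying on the GCC extension that allows
// arithmetic on void pointers. Illustration only; the helper is hypothetical.
static void *offset_by_bytes(void *base, size_t offset)
{
    return (char *)base + offset;
}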
|
||||
|
||||
@ -556,7 +558,7 @@ NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_
|
||||
if (uvm_channel_is_wlc(push->channel) || uvm_channel_is_lcic(push->channel)) {
|
||||
// We need to use the same static locations for PB as the fixed
|
||||
// schedule because that's what the channels are initialized to use.
|
||||
return uvm_channel_get_static_pb_protected_vidmem_gpu_va(push->channel);
|
||||
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
}
|
||||
else if (uvm_channel_is_sec2(push->channel)) {
|
||||
// SEC2 PBs are in unprotected sysmem
|
||||
@ -573,7 +575,7 @@ void *uvm_pushbuffer_get_unprotected_cpu_va_for_push(uvm_pushbuffer_t *pushbuffe
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// Reuse existing WLC static pb for initialization
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
|
||||
return uvm_channel_get_static_pb_unprotected_sysmem_cpu(push->channel);
|
||||
return push->channel->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
}
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem);
|
||||
@ -588,8 +590,8 @@ NvU64 uvm_pushbuffer_get_unprotected_gpu_va_for_push(uvm_pushbuffer_t *pushbuffe
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// Reuse existing WLC static pb for initialization
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
|
||||
|
||||
return uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(push->channel);
|
||||
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_unprotected_sysmem,
|
||||
uvm_push_get_gpu(push));
|
||||
}
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory_unprotected_sysmem, uvm_push_get_gpu(push));
|
||||
|
@@ -322,7 +322,6 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
                             uvm_mem_t *dst_mem,
                             uvm_mem_t *src_mem,
                             UvmCslIv *decrypt_iv,
                             NvU32 key_version,
                             uvm_mem_t *auth_tag_mem,
                             size_t size,
                             size_t copy_size)
@@ -339,7 +338,6 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
                                                  dst_plain,
                                                  src_cipher,
                                                  &decrypt_iv[i],
                                                  key_version,
                                                  copy_size,
                                                  auth_tag_buffer));

@@ -370,7 +368,7 @@ static void gpu_encrypt(uvm_push_t *push,
    uvm_gpu_address_t auth_tag_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);

    for (i = 0; i < num_iterations; i++) {
        uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);
        uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);

        if (i > 0)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@@ -429,7 +427,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    uvm_push_t push;
    UvmCslIv *decrypt_iv;
    NvU32 key_version;

    decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!decrypt_iv)
@@ -459,11 +456,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz

    gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);

    // There shouldn't be any key rotation between the end of the push and the
    // CPU decryption(s), but it is more robust against test changes to force
    // decryption to use the saved key.
    key_version = uvm_channel_pool_key_version(push.channel->pool);

    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

    TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher), out);
@@ -473,7 +465,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
                                   dst_plain_cpu,
                                   dst_cipher,
                                   decrypt_iv,
                                   key_version,
                                   auth_tag_mem,
                                   size,
                                   copy_size),
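The roundtrip test above decrypts the buffer chunk by chunk, indexing a per-chunk IV and auth tag while reusing one key version that was saved before the decryptions start. A minimal standalone sketch of that loop shape follows; decrypt_one_chunk(), chunk_iv_t and AUTH_TAG_SIZE are illustrative placeholders rather than UVM APIs, and the XOR "cipher" is only a stand-in so the sketch compiles on its own.

#include <stddef.h>
#include <stdint.h>

#define AUTH_TAG_SIZE 32                            /* illustrative per-chunk tag size */

typedef struct { uint8_t bytes[16]; } chunk_iv_t;   /* illustrative IV type */

/* Stand-in for a real authenticated-decryption primitive; returns 0 on success. */
static int decrypt_one_chunk(uint8_t *dst, const uint8_t *src, size_t len,
                             const chunk_iv_t *iv, uint32_t key_version,
                             const uint8_t *auth_tag)
{
    size_t i;
    (void)key_version;
    (void)auth_tag;
    for (i = 0; i < len; i++)
        dst[i] = src[i] ^ iv->bytes[i % sizeof(iv->bytes)];
    return 0;
}

/* Decrypt size bytes in copy_size chunks: chunk i uses ivs[i] and the i-th
 * auth tag, but every chunk uses the same saved key_version so a key rotation
 * between producing and consuming the data cannot change which key is used. */
static int decrypt_buffer(uint8_t *dst, const uint8_t *src, size_t size, size_t copy_size,
                          const chunk_iv_t *ivs, uint32_t key_version, const uint8_t *auth_tags)
{
    size_t i;
    for (i = 0; i < size / copy_size; i++) {
        int err = decrypt_one_chunk(dst + i * copy_size,
                                    src + i * copy_size,
                                    copy_size,
                                    &ivs[i],
                                    key_version,
                                    auth_tags + i * AUTH_TAG_SIZE);
        if (err)
            return err;
    }
    return 0;
}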
@@ -124,23 +124,24 @@ static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS *params, struct file *filp)
{
    uvm_gpu_t *gpu;
    NV_STATUS status = NV_OK;
    NV_STATUS status;
    uvm_rm_user_object_t user_rm_va_space = {
        .rm_control_fd = -1,
        .user_client = params->client,
        .user_object = params->smc_part_ref
    };

    if (!UVM_THREAD_AFFINITY_SUPPORTED())
        return NV_ERR_NOT_SUPPORTED;

    uvm_mutex_lock(&g_uvm_global.global_lock);

    gpu = uvm_gpu_get_by_uuid(&params->gpu_uuid);
    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto unlock;
    }
    status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
    if (status != NV_OK)
        return status;

    // If the GPU is not attached to a NUMA node, there is nothing to do.
    if (gpu->parent->closest_cpu_numa_node == NUMA_NO_NODE) {
        status = NV_ERR_NOT_SUPPORTED;
        goto unlock;
        goto release;
    }

    if (gpu->parent->replayable_faults_supported) {
@@ -149,7 +150,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
                                            gpu->parent->closest_cpu_numa_node);
        uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
        if (status != NV_OK)
            goto unlock;
            goto release;

    if (gpu->parent->non_replayable_faults_supported) {
        uvm_parent_gpu_non_replayable_faults_isr_lock(gpu->parent);
@@ -157,7 +158,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
                                            gpu->parent->closest_cpu_numa_node);
        uvm_parent_gpu_non_replayable_faults_isr_unlock(gpu->parent);
        if (status != NV_OK)
            goto unlock;
            goto release;
    }

    if (gpu->parent->access_counters_supported) {
@@ -167,9 +168,8 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
        uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
    }
}

unlock:
    uvm_mutex_unlock(&g_uvm_global.global_lock);
release:
    uvm_gpu_release(gpu);
    return status;
}

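The rewritten test above switches from a plain lookup under the global lock to retaining the GPU up front and unwinding through the added release label. The goto-based unwind is a common kernel C pattern; below is a small generic sketch of it under that assumption, with acquire_a()/acquire_b() as made-up placeholders rather than UVM functions.

#include <errno.h>

/* Made-up resource helpers standing in for lock/retain primitives. */
static int acquire_a(void) { return 0; }
static void release_a(void) { }
static int acquire_b(void) { return 0; }
static void release_b(void) { }

static int do_work(void) { return -EINVAL; }

/* Each error path jumps to the label that releases exactly what has been
 * acquired so far, so every exit runs the right subset of cleanups. */
static int run(void)
{
    int err = acquire_a();
    if (err)
        return err;

    err = acquire_b();
    if (err)
        goto out_release_a;

    err = do_work();
    /* Success and failure both fall through: release b, then a. */

    release_b();
out_release_a:
    release_a();
    return err;
}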
@@ -1,5 +1,5 @@
/*******************************************************************************
    Copyright (c) 2015-2022 NVidia Corporation
    Copyright (c) 2015-2024 NVidia Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -191,7 +191,7 @@ typedef struct
    NvU32 read_duplication;                             // Out (UVM_TEST_READ_DUPLICATION_POLICY)
    NvProcessorUuid preferred_location;                 // Out
    NvS32 preferred_cpu_nid;                            // Out
    NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS_V2]; // Out
    NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS];    // Out
    NvU32 accessed_by_count;                            // Out
    NvU32 type;                                         // Out (UVM_TEST_VA_RANGE_TYPE)
    union
@@ -347,30 +347,20 @@ typedef enum
    UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH = 0,
    UVM_TEST_CHANNEL_STRESS_MODE_UPDATE_CHANNELS,
    UVM_TEST_CHANNEL_STRESS_MODE_STREAM,
    UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION,
} UVM_TEST_CHANNEL_STRESS_MODE;

typedef enum
{
    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU,
    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU,
    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE,
} UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION;

#define UVM_TEST_CHANNEL_STRESS UVM_TEST_IOCTL_BASE(15)
typedef struct
{
    NvU32 mode;                   // In, one of UVM_TEST_CHANNEL_STRESS_MODE
    NvU32 mode;                   // In

    // Number of iterations:
    // mode == NOOP_PUSH: number of noop pushes
    // mode == UPDATE_CHANNELS: number of updates
    // mode == STREAM: number of iterations per stream
    // mode == ROTATION: number of operations
    NvU32 iterations;

    NvU32 num_streams;            // In, used only if mode == STREAM
    NvU32 key_rotation_operation; // In, used only if mode == ROTATION
    NvU32 num_streams;            // In, used only for mode == UVM_TEST_CHANNEL_STRESS_MODE_STREAM
    NvU32 seed;                   // In
    NvU32 verbose;                // In
    NV_STATUS rmStatus;           // Out
@@ -634,7 +624,7 @@ typedef struct

    // Array of processors which have a resident copy of the page containing
    // lookup_address.
    NvProcessorUuid resident_on[UVM_MAX_PROCESSORS_V2];                        // Out
    NvProcessorUuid resident_on[UVM_MAX_PROCESSORS];                           // Out
    NvU32 resident_on_count;                                                   // Out

    // If the memory is resident on the CPU, the NUMA node on which the page
@@ -645,24 +635,24 @@ typedef struct
    // system-page-sized portion of this allocation which contains
    // lookup_address is guaranteed to be resident on the corresponding
    // processor.
    NvU32 resident_physical_size[UVM_MAX_PROCESSORS_V2];                       // Out
    NvU32 resident_physical_size[UVM_MAX_PROCESSORS];                          // Out

    // The physical address of the physical allocation backing lookup_address.
    NvU64 resident_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8);  // Out
    NvU64 resident_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8);     // Out

    // Array of processors which have a virtual mapping covering lookup_address.
    NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS_V2];                          // Out
    NvU32 mapping_type[UVM_MAX_PROCESSORS_V2];                                 // Out
    NvU64 mapping_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8);   // Out
    NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS];                             // Out
    NvU32 mapping_type[UVM_MAX_PROCESSORS];                                    // Out
    NvU64 mapping_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8);      // Out
    NvU32 mapped_on_count;                                                     // Out

    // The size of the virtual mapping covering lookup_address on each
    // mapped_on processor.
    NvU32 page_size[UVM_MAX_PROCESSORS_V2];                                    // Out
    NvU32 page_size[UVM_MAX_PROCESSORS];                                       // Out

    // Array of processors which have physical memory populated that would back
    // lookup_address if it was resident.
    NvProcessorUuid populated_on[UVM_MAX_PROCESSORS_V2];                       // Out
    NvProcessorUuid populated_on[UVM_MAX_PROCESSORS];                          // Out
    NvU32 populated_on_count;                                                  // Out

    NV_STATUS rmStatus;                                                        // Out
@@ -1220,6 +1210,8 @@ typedef struct
typedef struct
{
    NvProcessorUuid gpu_uuid;   // In
    NvHandle client;            // In
    NvHandle smc_part_ref;      // In

    NV_STATUS rmStatus;         // Out
} UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS;

@@ -30,18 +30,18 @@ void uvm_tlb_batch_begin(uvm_page_tree_t *tree, uvm_tlb_batch_t *batch)
    batch->tree = tree;
}

static NvU32 smallest_page_size(NvU32 page_sizes)
static NvU64 smallest_page_size(NvU64 page_sizes)
{
    UVM_ASSERT(page_sizes != 0);

    return 1u << __ffs(page_sizes);
    return 1ULL << __ffs(page_sizes);
}

static NvU32 biggest_page_size(NvU32 page_sizes)
static NvU64 biggest_page_size(NvU64 page_sizes)
{
    UVM_ASSERT(page_sizes != 0);

    return 1u << __fls(page_sizes);
    return 1ULL << __fls(page_sizes);
}

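The two helpers above now take and return NvU64 and shift 1ULL instead of 1u. A small userspace sketch of the same bit tricks, using the GCC/Clang builtins in place of the kernel's __ffs()/__fls(), shows why the widening matters once page sizes of 4GB or larger can appear in the OR'd mask.

#include <stdint.h>
#include <stdio.h>

/* page_sizes is an OR of power-of-two page sizes, e.g. (4K | 64K | 2M). */
static uint64_t smallest_page_size_example(uint64_t page_sizes)
{
    return 1ULL << __builtin_ctzll(page_sizes);          /* lowest set bit */
}

static uint64_t biggest_page_size_example(uint64_t page_sizes)
{
    return 1ULL << (63 - __builtin_clzll(page_sizes));   /* highest set bit */
}

int main(void)
{
    uint64_t mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 21);  /* 4K | 64K | 2M */

    printf("smallest: 0x%llx\n", (unsigned long long)smallest_page_size_example(mask)); /* 0x1000 */
    printf("biggest:  0x%llx\n", (unsigned long long)biggest_page_size_example(mask));  /* 0x200000 */

    /* A 32-bit "1u << bit" would overflow for any page size of 4GB or more
     * (bit index >= 32), which is why the shift is widened to 1ULL. */
    return 0;
}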
static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t *push)
@@ -53,8 +53,8 @@ static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t

    for (i = 0; i < batch->count; ++i) {
        uvm_tlb_batch_range_t *entry = &batch->ranges[i];
        NvU32 min_page_size = smallest_page_size(entry->page_sizes);
        NvU32 max_page_size = biggest_page_size(entry->page_sizes);
        NvU64 min_page_size = smallest_page_size(entry->page_sizes);
        NvU64 max_page_size = biggest_page_size(entry->page_sizes);

        // Use the depth of the max page size as it's the broadest
        NvU32 depth = tree->hal->page_table_depth(max_page_size);
@@ -113,7 +113,7 @@ void uvm_tlb_batch_end(uvm_tlb_batch_t *batch, uvm_push_t *push, uvm_membar_t tl
        tlb_batch_flush_invalidate_per_va(batch, push);
}

void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar)
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU64 page_sizes, uvm_membar_t tlb_membar)
{
    uvm_tlb_batch_range_t *new_entry;


@@ -41,7 +41,7 @@ typedef struct
    NvU64 size;

    // Min and max page size ored together
    NvU32 page_sizes;
    NvU64 page_sizes;
} uvm_tlb_batch_range_t;

struct uvm_tlb_batch_struct
@@ -63,7 +63,7 @@ struct uvm_tlb_batch_struct
    NvU32 count;

    // Biggest page size across all queued up invalidates
    NvU32 biggest_page_size;
    NvU64 biggest_page_size;

    // Max membar across all queued up invalidates
    uvm_membar_t membar;
@@ -81,7 +81,7 @@ void uvm_tlb_batch_begin(uvm_page_tree_t *tree, uvm_tlb_batch_t *batch);
// If the membar parameter is not UVM_MEMBAR_NONE, the specified membar will
// be performed logically after the TLB invalidate such that all physical memory
// accesses using the old translations are ordered to the scope of the membar.
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar);
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU64 page_sizes, uvm_membar_t tlb_membar);

// End a TLB invalidate batch
//
@@ -97,8 +97,12 @@ void uvm_tlb_batch_end(uvm_tlb_batch_t *batch, uvm_push_t *push, uvm_membar_t tl
// Helper for invalidating a single range immediately.
//
// Internally begins and ends a TLB batch.
static void uvm_tlb_batch_single_invalidate(uvm_page_tree_t *tree, uvm_push_t *push,
                                            NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar)
static void uvm_tlb_batch_single_invalidate(uvm_page_tree_t *tree,
                                            uvm_push_t *push,
                                            NvU64 start,
                                            NvU64 size,
                                            NvU64 page_sizes,
                                            uvm_membar_t tlb_membar)
{
    uvm_tlb_batch_t batch;

@@ -1,5 +1,5 @@
/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation
    Copyright (c) 2016-2024 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -57,20 +57,12 @@ typedef struct
    struct list_head queue_nodes[UvmEventNumTypesAll];

    struct page **queue_buffer_pages;
    union
    {
        UvmEventEntry_V1 *queue_v1;
        UvmEventEntry_V2 *queue_v2;
    };
    void *queue_buffer;
    NvU32 queue_buffer_count;
    NvU32 notification_threshold;

    struct page **control_buffer_pages;
    union
    {
        UvmToolsEventControlData_V1 *control_v1;
        UvmToolsEventControlData_V2 *control_v2;
    };
    UvmToolsEventControlData *control;

    wait_queue_head_t wait_queue;
    bool is_wakeup_get_valid;
@@ -398,16 +390,12 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)

    if (event_tracker->is_queue) {
        uvm_tools_queue_t *queue = &event_tracker->queue;
        NvU64 buffer_size, control_size;
        NvU64 buffer_size;

        if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
        if (event_tracker->version == UvmToolsEventQueueVersion_V1)
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V1);
            control_size = sizeof(UvmToolsEventControlData_V1);
        }
        else {
        else
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V2);
            control_size = sizeof(UvmToolsEventControlData_V2);
        }

        remove_event_tracker(va_space,
                             queue->queue_nodes,
@@ -415,16 +403,16 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
                             queue->subscribed_queues,
                             &queue->subscribed_queues);

        if (queue->queue_v2 != NULL) {
        if (queue->queue_buffer != NULL) {
            unmap_user_pages(queue->queue_buffer_pages,
                             queue->queue_v2,
                             queue->queue_buffer,
                             buffer_size);
        }

        if (queue->control_v2 != NULL) {
        if (queue->control != NULL) {
            unmap_user_pages(queue->control_buffer_pages,
                             queue->control_v2,
                             control_size);
                             queue->control,
                             sizeof(UvmToolsEventControlData));
        }
    }
    else {
@@ -456,9 +444,9 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
    kmem_cache_free(g_tools_event_tracker_cache, event_tracker);
}

static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *queue)
static void enqueue_event(const void *entry, size_t entry_size, NvU8 eventType, uvm_tools_queue_t *queue)
{
    UvmToolsEventControlData_V1 *ctrl = queue->control_v1;
    UvmToolsEventControlData *ctrl = queue->control;
    uvm_tools_queue_snapshot_t sn;
    NvU32 queue_size = queue->queue_buffer_count;
    NvU32 queue_mask = queue_size - 1;
@@ -481,11 +469,11 @@ static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *q

    // one free element means that the queue is full
    if (((queue_size + sn.get_behind - sn.put_behind) & queue_mask) == 1) {
        atomic64_inc((atomic64_t *)&ctrl->dropped + entry->eventData.eventType);
        atomic64_inc((atomic64_t *)&ctrl->dropped + eventType);
        goto unlock;
    }

    memcpy(queue->queue_v1 + sn.put_behind, entry, sizeof(*entry));
    memcpy((char *)queue->queue_buffer + sn.put_behind * entry_size, entry, entry_size);

    sn.put_behind = sn.put_ahead;

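enqueue_event() above treats the queue as a power-of-two ring and deliberately leaves one slot unused so that a full queue can be told apart from an empty one. A tiny standalone illustration of that index arithmetic, not tied to any UVM type:

#include <stdint.h>
#include <stdio.h>

#define QUEUE_SIZE 8u                      /* must be a power of two */
#define QUEUE_MASK (QUEUE_SIZE - 1u)

static uint32_t used_slots(uint32_t get, uint32_t put)
{
    return (put - get) & QUEUE_MASK;
}

static int queue_is_full(uint32_t get, uint32_t put)
{
    /* one free element means that the queue is full */
    return ((QUEUE_SIZE + get - put) & QUEUE_MASK) == 1u;
}

int main(void)
{
    uint32_t get = 3, put = 3;

    printf("empty: used=%u full=%d\n", used_slots(get, put), queue_is_full(get, put));

    put = (get + QUEUE_SIZE - 1u) & QUEUE_MASK;   /* 7 entries written, 1 slot left */
    printf("full:  used=%u full=%d\n", used_slots(get, put), queue_is_full(get, put));
    return 0;
}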
@@ -509,79 +497,45 @@ unlock:
    uvm_spin_unlock(&queue->lock);
}

static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *queue)
{
    enqueue_event(entry, sizeof(*entry), entry->eventData.eventType, queue);
}

static void enqueue_event_v2(const UvmEventEntry_V2 *entry, uvm_tools_queue_t *queue)
{
    UvmToolsEventControlData_V2 *ctrl = queue->control_v2;
    uvm_tools_queue_snapshot_t sn;
    NvU32 queue_size = queue->queue_buffer_count;
    NvU32 queue_mask = queue_size - 1;
    enqueue_event(entry, sizeof(*entry), entry->eventData.eventType, queue);
}

    // Prevent processor speculation prior to accessing user-mapped memory to
    // avoid leaking information from side-channel attacks. There are many
    // possible paths leading to this point and it would be difficult and error-
    // prone to audit all of them to determine whether user mode could guide
    // this access to kernel memory under speculative execution, so to be on the
    // safe side we'll just always block speculation.
    nv_speculation_barrier();
static void uvm_tools_record_event(struct list_head *head,
                                   const void *entry,
                                   size_t entry_size,
                                   NvU8 eventType)
{
    uvm_tools_queue_t *queue;

    uvm_spin_lock(&queue->lock);
    UVM_ASSERT(eventType < UvmEventNumTypesAll);

    // ctrl is mapped into user space with read and write permissions,
    // so its values cannot be trusted.
    sn.get_behind = atomic_read((atomic_t *)&ctrl->get_behind) & queue_mask;
    sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind) & queue_mask;
    sn.put_ahead = (sn.put_behind + 1) & queue_mask;

    // one free element means that the queue is full
    if (((queue_size + sn.get_behind - sn.put_behind) & queue_mask) == 1) {
        atomic64_inc((atomic64_t *)&ctrl->dropped + entry->eventData.eventType);
        goto unlock;
    }

    memcpy(queue->queue_v2 + sn.put_behind, entry, sizeof(*entry));

    sn.put_behind = sn.put_ahead;
    // put_ahead and put_behind will always be the same outside of queue->lock
    // this allows the user-space consumer to choose either a 2 or 4 pointer synchronization approach
    atomic_set((atomic_t *)&ctrl->put_ahead, sn.put_behind);
    atomic_set((atomic_t *)&ctrl->put_behind, sn.put_behind);

    sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    // if the queue needs to be woken up, only signal if we haven't signaled before for this value of get_ahead
    if (queue_needs_wakeup(queue, &sn) && !(queue->is_wakeup_get_valid && queue->wakeup_get == sn.get_ahead)) {
        queue->is_wakeup_get_valid = true;
        queue->wakeup_get = sn.get_ahead;
        wake_up_all(&queue->wait_queue);
    }

unlock:
    uvm_spin_unlock(&queue->lock);
    list_for_each_entry(queue, head + eventType, queue_nodes[eventType])
        enqueue_event(entry, entry_size, eventType, queue);
}

static void uvm_tools_record_event_v1(uvm_va_space_t *va_space, const UvmEventEntry_V1 *entry)
{
    NvU8 eventType = entry->eventData.eventType;
    uvm_tools_queue_t *queue;

    UVM_ASSERT(eventType < UvmEventNumTypesAll);

    uvm_assert_rwsem_locked(&va_space->tools.lock);

    list_for_each_entry(queue, va_space->tools.queues_v1 + eventType, queue_nodes[eventType])
        enqueue_event_v1(entry, queue);
    uvm_tools_record_event(va_space->tools.queues_v1, entry, sizeof(*entry), eventType);
}

static void uvm_tools_record_event_v2(uvm_va_space_t *va_space, const UvmEventEntry_V2 *entry)
{
    NvU8 eventType = entry->eventData.eventType;
    uvm_tools_queue_t *queue;

    UVM_ASSERT(eventType < UvmEventNumTypesAll);

    uvm_assert_rwsem_locked(&va_space->tools.lock);

    list_for_each_entry(queue, va_space->tools.queues_v2 + eventType, queue_nodes[eventType])
        enqueue_event_v2(entry, queue);
    uvm_tools_record_event(va_space->tools.queues_v2, entry, sizeof(*entry), eventType);
}

static bool counter_matches_processor(UvmCounterName counter, const NvProcessorUuid *processor)
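The consolidation above works because the queue stores raw bytes and the caller passes the entry size, so the V1 and V2 entry layouts can share one enqueue path. A compact sketch of that size-generic pattern follows; the names are illustrative, not the UVM ones.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    uint8_t *buffer;        /* queue_count * entry_size bytes of storage */
    uint32_t entry_size;    /* size of one record; differs per queue version */
    uint32_t queue_count;   /* power of two */
    uint32_t put;           /* free-running write cursor */
} byte_queue_t;

/* Copy one fixed-size record into the next slot, regardless of its layout. */
static void byte_queue_put(byte_queue_t *q, const void *entry)
{
    memcpy(q->buffer + (size_t)(q->put & (q->queue_count - 1)) * q->entry_size,
           entry,
           q->entry_size);
    q->put++;
}

typedef struct { uint8_t type; uint64_t payload; uint64_t timestamp; } record_t;

int main(void)
{
    static uint8_t storage[4 * sizeof(record_t)];
    byte_queue_t q = { storage, sizeof(record_t), 4, 0 };
    record_t rec = { 1, 42, 0 };

    byte_queue_put(&q, &rec);
    printf("first byte of slot 0: %u\n", (unsigned)storage[0]);
    return 0;
}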
@@ -751,7 +705,7 @@ static unsigned uvm_tools_poll(struct file *filp, poll_table *wait)
    int flags = 0;
    uvm_tools_queue_snapshot_t sn;
    uvm_tools_event_tracker_t *event_tracker;
    UvmToolsEventControlData_V2 *ctrl;
    UvmToolsEventControlData *ctrl;

    if (uvm_global_get_status() != NV_OK)
        return POLLERR;
@@ -763,7 +717,7 @@ static unsigned uvm_tools_poll(struct file *filp, poll_table *wait)
    uvm_spin_lock(&event_tracker->queue.lock);

    event_tracker->queue.is_wakeup_get_valid = false;
    ctrl = event_tracker->queue.control_v2;
    ctrl = event_tracker->queue.control;
    sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);

@@ -878,6 +832,24 @@ static void record_gpu_fault_instance(uvm_gpu_t *gpu,
    }
}

static void record_cpu_fault(UvmEventCpuFaultInfo *info, uvm_perf_event_data_t *event_data)
{
    info->eventType = UvmEventTypeCpuFault;
    if (event_data->fault.cpu.is_write)
        info->accessType = UvmEventMemoryAccessTypeWrite;
    else
        info->accessType = UvmEventMemoryAccessTypeRead;

    info->address = event_data->fault.cpu.fault_va;
    info->timeStamp = NV_GETTIME();
    // assume that current owns va_space
    info->pid = uvm_get_stale_process_id();
    info->threadId = uvm_get_stale_thread_id();
    info->pc = event_data->fault.cpu.pc;
    // TODO: Bug 4515381: set info->nid when we decide if it's NUMA node ID or
    // CPU ID.
}

static void uvm_tools_record_fault(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
{
    uvm_va_space_t *va_space = event_data->fault.space;
@@ -895,41 +867,17 @@ static void uvm_tools_record_fault(uvm_perf_event_t event_id, uvm_perf_event_dat
    if (UVM_ID_IS_CPU(event_data->fault.proc_id)) {
        if (tools_is_event_enabled_version(va_space, UvmEventTypeCpuFault, UvmToolsEventQueueVersion_V1)) {
            UvmEventEntry_V1 entry;
            UvmEventCpuFaultInfo_V1 *info = &entry.eventData.cpuFault;
            memset(&entry, 0, sizeof(entry));

            info->eventType = UvmEventTypeCpuFault;
            if (event_data->fault.cpu.is_write)
                info->accessType = UvmEventMemoryAccessTypeWrite;
            else
                info->accessType = UvmEventMemoryAccessTypeRead;

            info->address = event_data->fault.cpu.fault_va;
            info->timeStamp = NV_GETTIME();
            // assume that current owns va_space
            info->pid = uvm_get_stale_process_id();
            info->threadId = uvm_get_stale_thread_id();
            info->pc = event_data->fault.cpu.pc;
            record_cpu_fault(&entry.eventData.cpuFault, event_data);

            uvm_tools_record_event_v1(va_space, &entry);
        }
        if (tools_is_event_enabled_version(va_space, UvmEventTypeCpuFault, UvmToolsEventQueueVersion_V2)) {
            UvmEventEntry_V2 entry;
            UvmEventCpuFaultInfo_V2 *info = &entry.eventData.cpuFault;
            memset(&entry, 0, sizeof(entry));

            info->eventType = UvmEventTypeCpuFault;
            if (event_data->fault.cpu.is_write)
                info->accessType = UvmEventMemoryAccessTypeWrite;
            else
                info->accessType = UvmEventMemoryAccessTypeRead;

            info->address = event_data->fault.cpu.fault_va;
            info->timeStamp = NV_GETTIME();
            // assume that current owns va_space
            info->pid = uvm_get_stale_process_id();
            info->threadId = uvm_get_stale_thread_id();
            info->pc = event_data->fault.cpu.pc;
            record_cpu_fault(&entry.eventData.cpuFault, event_data);

            uvm_tools_record_event_v2(va_space, &entry);
        }
@@ -1834,7 +1782,7 @@ void uvm_tools_record_thrashing(uvm_va_space_t *va_space,
    info->size = region_size;
    info->timeStamp = NV_GETTIME();

    BUILD_BUG_ON(UVM_MAX_PROCESSORS_V2 < UVM_ID_MAX_PROCESSORS);
    BUILD_BUG_ON(UVM_MAX_PROCESSORS < UVM_ID_MAX_PROCESSORS);
    bitmap_copy((long unsigned *)&info->processors, processors->bitmap, UVM_ID_MAX_PROCESSORS);

    uvm_tools_record_event_v2(va_space, &entry);
@@ -2151,7 +2099,7 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
    event_tracker->is_queue = params->queueBufferSize != 0;
    if (event_tracker->is_queue) {
        uvm_tools_queue_t *queue = &event_tracker->queue;
        NvU64 buffer_size, control_size;
        NvU64 buffer_size;

        uvm_spin_lock_init(&queue->lock, UVM_LOCK_ORDER_LEAF);
        init_waitqueue_head(&queue->wait_queue);
@@ -2170,25 +2118,21 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
            goto fail;
        }

        if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
        if (event_tracker->version == UvmToolsEventQueueVersion_V1)
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V1);
            control_size = sizeof(UvmToolsEventControlData_V1);
        }
        else {
        else
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V2);
            control_size = sizeof(UvmToolsEventControlData_V2);
        }

        status = map_user_pages(params->queueBuffer,
                                buffer_size,
                                (void **)&queue->queue_v2,
                                &queue->queue_buffer,
                                &queue->queue_buffer_pages);
        if (status != NV_OK)
            goto fail;

        status = map_user_pages(params->controlBuffer,
                                control_size,
                                (void **)&queue->control_v2,
                                sizeof(UvmToolsEventControlData),
                                (void **)&queue->control,
                                &queue->control_buffer_pages);

        if (status != NV_OK)
@@ -2224,6 +2168,7 @@ NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_TH
{
    uvm_tools_queue_snapshot_t sn;
    uvm_tools_event_tracker_t *event_tracker = tools_event_tracker(filp);
    UvmToolsEventControlData *ctrl;

    if (!tracker_is_queue(event_tracker))
        return NV_ERR_INVALID_ARGUMENT;
@@ -2232,18 +2177,9 @@ NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_TH

    event_tracker->queue.notification_threshold = params->notificationThreshold;

    if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
        UvmToolsEventControlData_V1 *ctrl = event_tracker->queue.control_v1;

    ctrl = event_tracker->queue.control;
    sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
    sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    }
    else {
        UvmToolsEventControlData_V2 *ctrl = event_tracker->queue.control_v2;

        sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
        sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    }

    if (queue_needs_wakeup(&event_tracker->queue, &sn))
        wake_up_all(&event_tracker->queue.wait_queue);
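The threshold path above now reads the shared cursors through the single UvmToolsEventControlData type and wakes waiters when enough entries are pending. One plausible reading of that threshold check, written as a hedged sketch (the real queue_needs_wakeup() may differ in detail, and the names below are not the UVM ones):

#include <stdint.h>

typedef struct {
    uint32_t queue_count;             /* power of two */
    uint32_t notification_threshold;  /* wake readers at this many pending entries */
} queue_cfg_t;

/* Returns nonzero when the number of unread entries between the reader's
 * get cursor and the writer's put cursor has reached the threshold. */
static int needs_wakeup(const queue_cfg_t *cfg, uint32_t get_ahead, uint32_t put_behind)
{
    uint32_t mask = cfg->queue_count - 1;
    uint32_t pending = (put_behind - get_ahead) & mask;

    return pending >= cfg->notification_threshold;
}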
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2017-2021 NVIDIA Corporation
|
||||
Copyright (c) 2017-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -104,3 +104,248 @@ void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry,
|
||||
*fifo_entry = fifo_entry_value;
|
||||
}
|
||||
|
||||
void uvm_hal_turing_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
NvU32 page_table_level;
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
|
||||
else
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
|
||||
pdb.address >>= 12;
|
||||
|
||||
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
|
||||
// PDE3 is the highest level on Pascal-Turing, see the comment in
|
||||
// uvm_pascal_mmu.c for details.
|
||||
UVM_ASSERT_MSG(depth < NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
|
||||
page_table_level = NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
|
||||
|
||||
if (membar != UVM_MEMBAR_NONE) {
|
||||
// If a GPU or SYS membar is needed, ACK_TYPE needs to be set to
|
||||
// GLOBALLY to make sure all the pending accesses can be picked up by
|
||||
// the membar.
|
||||
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value,
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_turing_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
NvU32 page_table_level;
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 va_lo;
|
||||
NvU32 va_hi;
|
||||
NvU64 end;
|
||||
NvU64 actual_base;
|
||||
NvU64 actual_size;
|
||||
NvU64 actual_end;
|
||||
NvU32 log2_invalidation_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
|
||||
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
|
||||
|
||||
// The invalidation size must be a power-of-two number of pages containing
|
||||
// the passed interval
|
||||
end = base + size - 1;
|
||||
log2_invalidation_size = __fls((unsigned long)(end ^ base)) + 1;
|
||||
|
||||
if (log2_invalidation_size == 64) {
|
||||
// Invalidate everything
|
||||
gpu->parent->host_hal->tlb_invalidate_all(push, pdb, depth, membar);
|
||||
return;
|
||||
}
|
||||
|
||||
// The hardware aligns the target address down to the invalidation size.
|
||||
actual_size = 1ULL << log2_invalidation_size;
|
||||
actual_base = UVM_ALIGN_DOWN(base, actual_size);
|
||||
actual_end = actual_base + actual_size - 1;
|
||||
UVM_ASSERT(actual_end >= end);
|
||||
|
||||
// The invalidation size field expects log2(invalidation size in 4K), not
|
||||
// log2(invalidation size in bytes)
|
||||
log2_invalidation_size -= 12;
|
||||
|
||||
// Address to invalidate, as a multiple of 4K.
|
||||
base >>= 12;
|
||||
va_lo = base & HWMASK(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
va_hi = base >> HWSIZE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
|
||||
else
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
|
||||
pdb.address >>= 12;
|
||||
|
||||
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
|
||||
// PDE3 is the highest level on Pascal-Turing, see the comment in
|
||||
// uvm_pascal_mmu.c for details.
|
||||
UVM_ASSERT_MSG(depth < NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
|
||||
page_table_level = NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
|
||||
|
||||
if (membar != UVM_MEMBAR_NONE) {
|
||||
// If a GPU or SYS membar is needed, ACK_TYPE needs to be set to
|
||||
// GLOBALLY to make sure all the pending accesses can be picked up by
|
||||
// the membar.
|
||||
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
|
||||
sysmembar_value |
|
||||
HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C46F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
gpu->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_turing_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
|
||||
{
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 invalidate_gpc_value = 0;
|
||||
NvU32 aperture_value = 0;
|
||||
NvU32 pdb_lo = 0;
|
||||
NvU32 pdb_hi = 0;
|
||||
NvU32 page_table_level = 0;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
|
||||
else
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
|
||||
pdb.address >>= 12;
|
||||
|
||||
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
|
||||
if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
|
||||
// PDE3 is the highest level on Pascal-Turing, see the comment in
|
||||
// uvm_pascal_mmu.c for details.
|
||||
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde3, params->page_table_level) - 1;
|
||||
}
|
||||
|
||||
if (params->membar != UvmInvalidateTlbMemBarNone) {
|
||||
// If a GPU or SYS membar is needed, ack_value needs to be set to
|
||||
// GLOBALLY to make sure all the pending accesses can be picked up by
|
||||
// the membar.
|
||||
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (params->membar == UvmInvalidateTlbMemBarSys)
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
if (params->disable_gpc_invalidate)
|
||||
invalidate_gpc_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
|
||||
else
|
||||
invalidate_gpc_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE);
|
||||
|
||||
if (params->target_va_mode == UvmTargetVaModeTargeted) {
|
||||
NvU64 va = params->va >> 12;
|
||||
|
||||
NvU32 va_lo = va & HWMASK(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
NvU32 va_hi = va >> HWSIZE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value |
|
||||
HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C46F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
invalidate_gpc_value |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
else {
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value,
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
invalidate_gpc_value |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (params->membar == UvmInvalidateTlbMemBarLocal)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
@@ -138,7 +138,7 @@ static NvU64 poisoned_pte_turing(void)

static uvm_mmu_mode_hal_t turing_mmu_mode_hal;

uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size)
{
    static bool initialized = false;

@@ -1,5 +1,5 @@
/*******************************************************************************
    Copyright (c) 2013-2023 NVidia Corporation
    Copyright (c) 2013-2024 NVidia Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -52,19 +52,18 @@ typedef enum

typedef unsigned long long UvmStream;

// The maximum number of sub-processors per parent GPU.
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8

// The maximum number of GPUs changed when multiple MIG instances per
// uvm_parent_gpu_t were added. See UvmEventQueueCreate().
// uvm_parent_gpu_t were added. The old version is kept as a convenience
// for code that needs to maintain forward compatibility.
#define UVM_MAX_GPUS_V1       NV_MAX_DEVICES
#define UVM_MAX_PROCESSORS_V1 (UVM_MAX_GPUS_V1 + 1)
#define UVM_MAX_GPUS_V2       (NV_MAX_DEVICES * NV_MAX_SUBDEVICES)
#define UVM_MAX_PROCESSORS_V2 (UVM_MAX_GPUS_V2 + 1)
#define UVM_MAX_GPUS          (NV_MAX_DEVICES * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_MAX_PROCESSORS    (UVM_MAX_GPUS + 1)

// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
#define UVM_MAX_GPUS       UVM_MAX_GPUS_V1
#define UVM_MAX_PROCESSORS UVM_MAX_PROCESSORS_V1

#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS_V2 + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))
#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))

#define UVM_INIT_FLAGS_DISABLE_HMM                  ((NvU64)0x1)
#define UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE   ((NvU64)0x2)
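UVM_PROCESSOR_MASK_SIZE above is a ceiling division of the processor count into 64-bit words. The arithmetic below assumes, purely for illustration, that NV_MAX_DEVICES is 32; combined with the 8 sub-processors defined in this hunk that gives 257 processors and a 5-word mask. Check the headers for the authoritative values.

#include <stdio.h>

#define ILLUSTRATIVE_MAX_DEVICES     32   /* assumed value, for the worked example only */
#define ILLUSTRATIVE_SUB_PROCESSORS  8
#define ILLUSTRATIVE_MAX_GPUS        (ILLUSTRATIVE_MAX_DEVICES * ILLUSTRATIVE_SUB_PROCESSORS)
#define ILLUSTRATIVE_MAX_PROCESSORS  (ILLUSTRATIVE_MAX_GPUS + 1)   /* all GPUs plus the CPU */

/* Ceiling division into 64-bit bitmap words, mirroring the macro above. */
#define ILLUSTRATIVE_MASK_SIZE \
    ((ILLUSTRATIVE_MAX_PROCESSORS + (sizeof(unsigned long long) * 8) - 1) / \
     (sizeof(unsigned long long) * 8))

int main(void)
{
    /* 256 GPUs + 1 CPU = 257 processors -> ceil(257 / 64) = 5 words. */
    printf("processors=%d words=%zu\n", ILLUSTRATIVE_MAX_PROCESSORS, ILLUSTRATIVE_MASK_SIZE);
    return 0;
}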
@@ -423,29 +422,7 @@ typedef struct
    NvU32 pid;               // process id causing the fault
    NvU32 threadId;          // thread id causing the fault
    NvU64 pc;                // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V1;

typedef struct
{
    //
    // eventType has to be 1st argument of this structure. Setting eventType to
    // UvmEventTypeMemoryViolation helps to identify event data in a queue.
    //
    NvU8 eventType;
    NvU8 accessType;         // read/write violation (UvmEventMemoryAccessType)
    //
    // This structure is shared between UVM kernel and tools.
    // Manually padding the structure so that compiler options like pragma pack
    // or malign-double will have no effect on the field offsets.
    //
    NvU16 padding16Bits;
    NvS32 nid;               // NUMA node ID of faulting CPU
    NvU64 address;           // faulting address
    NvU64 timeStamp;         // cpu time when the fault occurred
    NvU32 pid;               // process id causing the fault
    NvU32 threadId;          // thread id causing the fault
    NvU64 pc;                // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V2;
} UvmEventCpuFaultInfo;

typedef enum
{
@@ -721,13 +698,7 @@ typedef struct
    //
    NvU8 eventType;
    NvU8 faultType;          // type of gpu fault, refer UvmEventFaultType
    NvU8 accessType;         // memory access type, refer UvmEventMemoryAccessType
    //
    // This structure is shared between UVM kernel and tools.
    // Manually padding the structure so that compiler options like pragma pack
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8Bits_1;
    NvU16 gpuIndex;          // GPU that experienced the fault
    union
    {
        NvU16 gpcId;         // If this is a replayable fault, this field contains
@@ -759,14 +730,13 @@ typedef struct
                             // UvmEventFaultClientTypeGpc indicates replayable
                             // fault, while UvmEventFaultClientTypeHub indicates
                             // non-replayable fault.

    NvU8 accessType;         // memory access type, refer UvmEventMemoryAccessType
    //
    // This structure is shared between UVM kernel and tools.
    // Manually padding the structure so that compiler options like pragma pack
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8Bits_2;
    NvU16 gpuIndex;          // GPU that experienced the fault
    NvU16 padding16bits;
} UvmEventGpuFaultInfo_V2;

//------------------------------------------------------------------------------
@@ -1108,8 +1078,8 @@ typedef struct
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8bits;
    NvU16 padding16bits[2];
    NvU16 processorIndex;    // index of the cpu/gpu that was throttled
    NvU32 padding32bits;
    NvU64 address;           // address of the page whose servicing is being
                             // throttled
    NvU64 timeStamp;         // cpu start time stamp for the throttling operation
@@ -1150,8 +1120,8 @@ typedef struct
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8bits;
    NvU16 padding16bits[2];
    NvU16 processorIndex;    // index of the cpu/gpu that was throttled
    NvU32 padding32bits;
    NvU64 address;           // address of the page whose servicing is being
                             // throttled
    NvU64 timeStamp;         // cpu end time stamp for the throttling operation
@@ -1409,7 +1379,7 @@ typedef struct
    NvU8 eventType;
    UvmEventMigrationInfo_Lite migration_Lite;

    UvmEventCpuFaultInfo_V1 cpuFault;
    UvmEventCpuFaultInfo cpuFault;
    UvmEventMigrationInfo_V1 migration;
    UvmEventGpuFaultInfo_V1 gpuFault;
    UvmEventGpuFaultReplayInfo_V1 gpuFaultReplay;
@@ -1443,7 +1413,7 @@ typedef struct
    NvU8 eventType;
    UvmEventMigrationInfo_Lite migration_Lite;

    UvmEventCpuFaultInfo_V2 cpuFault;
    UvmEventCpuFaultInfo cpuFault;
    UvmEventMigrationInfo_V2 migration;
    UvmEventGpuFaultInfo_V2 gpuFault;
    UvmEventGpuFaultReplayInfo_V2 gpuFaultReplay;
@@ -1510,19 +1480,7 @@ typedef enum {
    UvmToolsEventQueueVersion_V2 = 2,
} UvmToolsEventQueueVersion;

typedef struct UvmEventControlData_V1_tag {
    // entries between get_ahead and get_behind are currently being read
    volatile NvU32 get_ahead;
    volatile NvU32 get_behind;
    // entries between put_ahead and put_behind are currently being written
    volatile NvU32 put_ahead;
    volatile NvU32 put_behind;

    // counter of dropped events
    NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V1;

typedef struct UvmEventControlData_V2_tag {
typedef struct UvmEventControlData_tag {
    // entries between get_ahead and get_behind are currently being read
    volatile NvU32 get_ahead;
    volatile NvU32 get_behind;
@@ -1531,19 +1489,12 @@ typedef struct UvmEventControlData_V2_tag {
    volatile NvU32 put_ahead;
    volatile NvU32 put_behind;

    // The version values are limited to UvmToolsEventQueueVersion and
    // initialized by UvmToolsCreateEventQueue().
    NvU32 version;
    NvU32 padding32Bits;

    // counter of dropped events
    NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V2;
} UvmToolsEventControlData;

// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
typedef UvmToolsEventControlData_V1 UvmToolsEventControlData;
typedef UvmEventEntry_V1 UvmEventEntry;
// TODO: Bug 4465348: remove this after replacing old references.
typedef UvmToolsEventControlData UvmToolsEventControlData_V1;

//------------------------------------------------------------------------------
// UVM Tools forward types (handles) definitions
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -725,9 +725,8 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u
|
||||
}
|
||||
|
||||
// Return the preferred NUMA node ID for the block's policy.
|
||||
// If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
|
||||
// with memory is returned. In most cases, this should be the current
|
||||
// NUMA node.
|
||||
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
|
||||
// is returned.
|
||||
static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
|
||||
{
|
||||
if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
|
||||
@ -1329,12 +1328,12 @@ error_block_free:
|
||||
|
||||
static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
if (gpu_mapping_addr == 0)
|
||||
return;
|
||||
|
||||
uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
}
|
||||
|
||||
static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
|
||||
@ -1357,17 +1356,14 @@ static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
|
||||
|
||||
chunk_size = uvm_cpu_chunk_get_size(chunk);
|
||||
|
||||
// TODO: Bug 3744779: Handle benign assertion in
|
||||
// pmm_sysmem_mappings_remove_gpu_mapping() in case of a
|
||||
// failure.
|
||||
status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent),
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
|
||||
uvm_va_block_cpu_page_address(block, page_index),
|
||||
chunk_size,
|
||||
block,
|
||||
UVM_ID_CPU);
|
||||
if (status != NV_OK)
|
||||
cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
@ -1396,10 +1392,10 @@ static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu
|
||||
|
||||
for_each_possible_uvm_node(nid) {
|
||||
for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
|
||||
UVM_ASSERT_MSG(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0,
|
||||
UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0,
|
||||
"GPU%u DMA address 0x%llx\n",
|
||||
uvm_id_value(gpu->id),
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent));
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu));
|
||||
|
||||
status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
|
||||
if (status != NV_OK)
|
||||
@ -1562,8 +1558,7 @@ NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
|
||||
}
|
||||
|
||||
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
|
||||
uvm_cpu_chunk_t *chunk,
|
||||
uvm_page_index_t page_index)
|
||||
uvm_cpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_gpu_id_t id;
|
||||
|
||||
@ -1602,7 +1597,7 @@ NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk);
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -1621,7 +1616,7 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
|
||||
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
|
||||
uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
|
||||
uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
}
|
||||
}
|
||||
@ -2071,7 +2066,6 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
|
||||
uvm_page_mask_t *allocated_mask;
|
||||
uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
|
||||
const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
|
||||
uvm_page_index_t page_index;
|
||||
uvm_gpu_id_t id;
|
||||
int preferred_nid = block_context->make_resident.dest_nid;
|
||||
@ -2079,10 +2073,6 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
|
||||
if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
|
||||
preferred_nid = block_test->cpu_chunk_allocation_target_id;
|
||||
|
||||
// If the VA range has a preferred NUMA node, use it.
|
||||
if (preferred_nid == NUMA_NO_NODE)
|
||||
preferred_nid = policy->preferred_nid;
|
||||
|
||||
// TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
|
||||
if (preferred_nid != NUMA_NO_NODE) {
|
||||
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
|
||||
@ -2133,12 +2123,13 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
|
||||
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
|
||||
uvm_chunk_sizes_mask_t allocation_sizes;
|
||||
|
||||
if (uvm_page_mask_test(allocated_mask, page_index) ||
|
||||
uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
|
||||
if (uvm_page_mask_test(allocated_mask, page_index)) {
|
||||
page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
UVM_ASSERT(!uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index));
|
||||
|
||||
allocation_sizes = block_calculate_largest_alloc_size(block,
|
||||
page_index,
|
||||
allocated_mask,
|
||||
@ -2313,7 +2304,7 @@ static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
|
||||
return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
|
||||
}
|
||||
|
||||
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||||
NvU64 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_gpu_va_space_t *gpu_va_space;
|
||||
|
||||
@ -2321,7 +2312,7 @@ NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||||
return gpu_va_space->page_tables.big_page_size;
|
||||
}
|
||||
|
||||
static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
|
||||
static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU64 big_page_size)
|
||||
{
|
||||
NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
|
||||
NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
|
||||
@ -2335,20 +2326,20 @@ static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, N
|
||||
return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
|
||||
}
|
||||
|
||||
static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
|
||||
static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU64 big_page_size)
|
||||
{
|
||||
uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
|
||||
return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU64 big_page_size)
|
||||
{
|
||||
return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
|
||||
uvm_va_block_region_t region,
|
||||
NvU32 big_page_size)
|
||||
NvU64 big_page_size)
|
||||
{
|
||||
NvU64 start = uvm_va_block_region_start(va_block, region);
|
||||
NvU64 end = uvm_va_block_region_end(va_block, region);
|
||||
@ -2366,12 +2357,12 @@ uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_blo
|
||||
return big_region;
|
||||
}
|
||||
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU64 big_page_size)
|
||||
{
|
||||
return range_num_big_pages(va_block->start, va_block->end, big_page_size);
|
||||
}
|
||||
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size)
|
||||
{
|
||||
NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
|
||||
UVM_ASSERT(addr >= va_block->start);
|
||||
@ -2379,7 +2370,7 @@ NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index
|
||||
return addr;
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size)
|
||||
{
|
||||
NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
|
||||
|
||||
@ -2395,7 +2386,7 @@ uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, siz
|
||||
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
|
||||
// page_index cannot be covered by a big PTE due to alignment or block size,
|
||||
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU64 big_page_size)
|
||||
{
|
||||
uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
|
||||
size_t big_index;
|
||||
@ -2420,7 +2411,7 @@ static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_region_t big_region;
|
||||
size_t big_page_index;
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
|
||||
uvm_page_mask_zero(mask_out);
|
||||
|
||||
@ -2430,7 +2421,7 @@ static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
|
||||
}
|
||||
}
|
||||
|
||||
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
|
||||
NvU64 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
|
||||
{
|
||||
if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
|
||||
return 0;
|
||||
@ -2444,7 +2435,7 @@ NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page
|
||||
return PAGE_SIZE;
|
||||
}
|
||||
|
||||
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
|
||||
NvU64 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
|
||||
size_t big_page_size, big_page_index;
|
||||
@ -2472,7 +2463,7 @@ NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id,
|
||||
// resident. Note that this is different from uvm_va_block_page_size_* because
|
||||
// those return the size of the PTE which maps the page index, which may be
|
||||
// smaller than the physical allocation.
|
||||
static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
static NvU64 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state;
|
||||
uvm_chunk_size_t chunk_size;
|
||||
@ -2485,7 +2476,7 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
return 0;
|
||||
|
||||
UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
|
||||
return (NvU32)uvm_cpu_chunk_get_size(chunk);
|
||||
return uvm_cpu_chunk_get_size(chunk);
|
||||
}
|
||||
|
||||
gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
|
||||
@ -2494,10 +2485,10 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
|
||||
UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
|
||||
block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
|
||||
return (NvU32)chunk_size;
|
||||
return chunk_size;
|
||||
}
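To make the distinction above concrete, a hypothetical caller could compare the two queries for the same page. The sizes in the comment are invented; the only expectation is that, when the page is mapped, the PTE size does not exceed the size of the backing allocation:

    /* Hypothetical check (sizes invented): a page backed by a 2 MiB GPU chunk
     * may still be mapped with only a 64 KiB big PTE, so the two values can
     * legitimately differ. */
    NvU64 phys_size = uvm_va_block_get_physical_size(block, gpu->id, page_index);
    NvU64 pte_size  = uvm_va_block_page_size_gpu(block, gpu->id, page_index);
    UVM_ASSERT(pte_size == 0 || pte_size <= phys_size);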
|
||||
|
||||
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
NvU64 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
uvm_processor_id_t processor,
|
||||
uvm_page_index_t page_index)
|
||||
{
|
||||
@ -3349,7 +3340,7 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
|
||||
|
||||
if (UVM_ID_IS_CPU(block_page.processor)) {
|
||||
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.nid, block_page.page_index);
|
||||
NvU64 dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
|
||||
uvm_cpu_chunk_get_size(chunk),
|
||||
block_page.page_index);
|
||||
@ -3848,7 +3839,6 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
|
||||
uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
uvm_gpu_address_t src_address = block_copy_get_address(block, ©_state->src, page_index, gpu);
|
||||
NvU32 key_version = uvm_channel_pool_key_version(push->channel->pool);
|
||||
|
||||
UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
|
||||
UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
|
||||
@ -3866,8 +3856,7 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
|
||||
// crypto-operations and it only guarantees PAGE_SIZE contiguity, all
|
||||
// encryptions and decryptions must happen on a PAGE_SIZE basis.
|
||||
for_each_va_block_page_in_region(page_index, region) {
|
||||
uvm_conf_computing_log_gpu_encryption(push->channel, PAGE_SIZE, &dma_buffer->decrypt_iv[page_index]);
|
||||
dma_buffer->key_version[page_index] = key_version;
|
||||
uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
|
||||
|
||||
// All but the first encryption can be pipelined. The first encryption
|
||||
// uses the caller's pipelining settings.
|
||||
@ -3926,8 +3915,7 @@ static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
|
||||
status = uvm_conf_computing_cpu_decrypt(push->channel,
|
||||
cpu_page_address,
|
||||
staging_buffer,
|
||||
dma_buffer->decrypt_iv + page_index,
|
||||
dma_buffer->key_version[page_index],
|
||||
&dma_buffer->decrypt_iv[page_index],
|
||||
PAGE_SIZE,
|
||||
auth_tag_buffer);
|
||||
kunmap(dst_page);
|
||||
@ -4045,7 +4033,7 @@ static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,
|
||||
|
||||
UVM_ASSERT(dst_chunk);
|
||||
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
|
||||
UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));
|
||||
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) <= uvm_cpu_chunk_get_size(dst_chunk));
|
||||
|
||||
// CPU-to-CPU copies using memcpy() don't have any inherent ordering with
|
||||
// copies using GPU CEs. So, we have to make sure that all previously
|
||||
@ -5140,7 +5128,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
uvm_page_mask_t *dst_resident_mask;
|
||||
uvm_page_mask_t *migrated_pages;
|
||||
uvm_page_mask_t *staged_pages;
|
||||
uvm_page_mask_t *scratch_residency_mask;
|
||||
uvm_page_mask_t *first_touch_mask;
|
||||
|
||||
// TODO: Bug 3660922: need to implement HMM read duplication support.
|
||||
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
|
||||
@ -5159,10 +5147,6 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
UVM_ASSERT(!uvm_va_block_is_dead(va_block));
|
||||
|
||||
scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
|
||||
if (!scratch_residency_mask)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
// For pages that are entering read-duplication we need to unmap remote
|
||||
// mappings and revoke RW and higher access permissions.
|
||||
//
|
||||
@ -5189,12 +5173,12 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
|
||||
status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
return status;
|
||||
}
|
||||
|
||||
status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
return status;
|
||||
|
||||
status = block_copy_resident_pages(va_block,
|
||||
va_block_context,
|
||||
@ -5204,17 +5188,22 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
prefetch_page_mask,
|
||||
UVM_VA_BLOCK_TRANSFER_MODE_COPY);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
return status;
|
||||
|
||||
// Pages that weren't resident anywhere else were populated at the
|
||||
// destination directly. Mark them as resident now, since there were no
|
||||
// errors from block_copy_resident_pages() above.
|
||||
// Note that va_block_context->scratch_page_mask is passed to
|
||||
// block_copy_set_first_touch_residency() which is generally unsafe but in
|
||||
// this case, block_copy_set_first_touch_residency() copies page_mask
|
||||
// before scratch_page_mask could be clobbered.
|
||||
migrated_pages = &va_block_context->make_resident.pages_migrated;
|
||||
uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
|
||||
uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);
|
||||
first_touch_mask = &va_block_context->scratch_page_mask;
|
||||
uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
|
||||
uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
|
||||
|
||||
if (!uvm_page_mask_empty(scratch_residency_mask))
|
||||
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);
|
||||
if (!uvm_page_mask_empty(first_touch_mask))
|
||||
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
|
||||
|
||||
staged_pages = &va_block_context->make_resident.pages_staged;
|
||||
if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
|
||||
@ -5226,18 +5215,6 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
|
||||
if (!uvm_page_mask_empty(migrated_pages)) {
|
||||
if (UVM_ID_IS_CPU(dest_id)) {
|
||||
// Check if the CPU is already in the resident set of processors.
|
||||
// We need to do this since we can't have multiple NUMA nodes with
|
||||
// resident pages.
|
||||
// If any of the migrated pages were already resident on the CPU, the
|
||||
// residency has to be switched to the destination NUMA node.
|
||||
if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
|
||||
uvm_page_mask_and(scratch_residency_mask,
|
||||
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
|
||||
migrated_pages)) {
|
||||
uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
|
||||
}
|
||||
|
||||
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
|
||||
}
|
||||
else {
|
||||
@ -5266,9 +5243,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
// Check state of all chunks after residency change.
|
||||
// TODO: Bug 4207783: Check both CPU and GPU chunks.
|
||||
UVM_ASSERT(block_check_cpu_chunks(va_block));
|
||||
out:
|
||||
kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
|
||||
return status;
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Looks up the current CPU mapping state of page from the
|
||||
@ -5408,7 +5383,7 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
|
||||
|
||||
if (chunk) {
|
||||
if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
|
||||
UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
|
||||
UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %lu\n",
|
||||
chunk_size,
|
||||
uvm_gpu_chunk_get_size(chunk),
|
||||
block->start,
|
||||
@ -5420,7 +5395,7 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
|
||||
}
|
||||
|
||||
if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
|
||||
UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
|
||||
UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %lu chunk_size: llu\n",
|
||||
uvm_pmm_gpu_chunk_state_string(chunk->state),
|
||||
block->start,
|
||||
block->end + 1,
|
||||
@ -5553,15 +5528,13 @@ static bool block_check_mappings_page(uvm_va_block_t *block,
|
||||
*block->read_duplicated_pages.bitmap);
|
||||
|
||||
// Test read_duplicated_pages mask
|
||||
UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
|
||||
uvm_processor_mask_get_count(resident_processors) <= 1) ||
|
||||
(uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
|
||||
uvm_processor_mask_get_count(resident_processors) >= 1),
|
||||
UVM_ASSERT_MSG((uvm_processor_mask_get_count(resident_processors) <= 1 &&
|
||||
!uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
|
||||
(uvm_processor_mask_get_count(resident_processors) > 1 &&
|
||||
uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
|
||||
"Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
|
||||
*resident_processors->bitmap,
|
||||
*read_mappings->bitmap,
|
||||
*write_mappings->bitmap,
|
||||
*atomic_mappings->bitmap,
|
||||
*read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
|
||||
*va_space->system_wide_atomics_enabled_processors.bitmap,
|
||||
*block->read_duplicated_pages.bitmap);
|
||||
|
||||
@ -5741,7 +5714,7 @@ static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_va_block_contex
|
||||
uvm_pte_bits_gpu_t pte_bit;
|
||||
uvm_processor_id_t resident_id;
|
||||
uvm_prot_t prot;
|
||||
NvU32 big_page_size;
|
||||
NvU64 big_page_size;
|
||||
size_t num_big_pages, big_page_index;
|
||||
uvm_va_block_region_t big_region, chunk_region;
|
||||
uvm_gpu_chunk_t *chunk;
|
||||
@ -6045,7 +6018,7 @@ static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
|
||||
if (uvm_page_mask_empty(mapped_pages))
|
||||
return false;
|
||||
|
||||
return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
|
||||
return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
|
||||
}
|
||||
|
||||
// Remote pages are pages which are mapped but not resident locally
|
||||
@ -6193,7 +6166,7 @@ static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
|
||||
size_t big_page_index;
|
||||
uvm_processor_id_t curr_resident_id;
|
||||
uvm_prot_t curr_prot;
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
|
||||
if (UVM_ID_IS_INVALID(resident_id))
|
||||
UVM_ASSERT(new_prot == UVM_PROT_NONE);
|
||||
@ -6275,7 +6248,7 @@ static void block_gpu_pte_clear_big(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
|
||||
NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
|
||||
NvU64 big_page_size = gpu_va_space->page_tables.big_page_size;
|
||||
uvm_gpu_phys_address_t pte_addr;
|
||||
NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
|
||||
size_t big_page_index;
|
||||
@ -6321,7 +6294,7 @@ static void block_gpu_pte_write_big(uvm_va_block_t *block,
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
|
||||
uvm_page_tree_t *tree = &gpu_va_space->page_tables;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
|
||||
size_t big_page_index;
|
||||
uvm_va_block_region_t contig_region = {0};
|
||||
@ -6399,7 +6372,7 @@ static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
|
||||
size_t big_page_index;
|
||||
DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
@ -6960,7 +6933,7 @@ static void block_gpu_split_big(uvm_va_block_t *block,
|
||||
uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
|
||||
uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
|
||||
uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
uvm_va_block_region_t big_region;
|
||||
uvm_processor_id_t resident_id;
|
||||
size_t big_page_index;
|
||||
@ -7062,7 +7035,7 @@ static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
|
||||
DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
uvm_va_block_region_t big_region;
|
||||
size_t big_page_index;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
|
||||
|
||||
UVM_ASSERT(!gpu_state->pte_is_2m);
|
||||
@ -7364,7 +7337,7 @@ static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
|
||||
DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
|
||||
|
||||
UVM_ASSERT(!gpu_state->pte_is_2m);
|
||||
@ -7510,7 +7483,7 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_va_block_region_t big_region_all, big_page_region, region;
|
||||
NvU32 big_page_size;
|
||||
NvU64 big_page_size;
|
||||
uvm_page_index_t page_index;
|
||||
size_t big_page_index;
|
||||
DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
@ -7663,7 +7636,7 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
|
||||
// happens, the pending tracker is added to the block's tracker.
|
||||
static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
|
||||
uvm_gpu_t *gpu,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_page_table_range_t *page_table_range,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
@ -7786,13 +7759,13 @@ allocated:
|
||||
// sizes. See block_alloc_pt_range_with_retry.
|
||||
static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
|
||||
uvm_gpu_t *gpu,
|
||||
NvU32 page_sizes,
|
||||
NvU64 page_sizes,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
|
||||
uvm_page_table_range_t *range;
|
||||
NvU32 page_size;
|
||||
NvU64 page_size;
|
||||
NV_STATUS status, final_status = NV_OK;
|
||||
|
||||
UVM_ASSERT(gpu_state);
|
||||
@ -7844,7 +7817,7 @@ static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
|
||||
uvm_va_block_new_pte_state_t *new_pte_state,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
NvU32 page_sizes = 0;
|
||||
NvU64 page_sizes = 0;
|
||||
|
||||
if (new_pte_state->pte_is_2m) {
|
||||
page_sizes |= UVM_PAGE_SIZE_2M;
|
||||
@ -7876,8 +7849,8 @@ static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
|
||||
uvm_gpu_va_space_t *gpu_va_space,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
NvU32 page_sizes;
|
||||
NvU32 big_page_size;
|
||||
NvU64 page_sizes;
|
||||
NvU64 big_page_size;
|
||||
uvm_gpu_t *gpu;
|
||||
uvm_va_block_gpu_state_t *gpu_state;
|
||||
|
||||
@ -8388,7 +8361,6 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *block_context,
|
||||
uvm_gpu_t *gpu,
|
||||
uvm_processor_id_t resident_id,
|
||||
int resident_nid,
|
||||
uvm_page_mask_t *map_page_mask,
|
||||
uvm_prot_t new_prot,
|
||||
uvm_tracker_t *out_tracker)
|
||||
@ -8398,7 +8370,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
uvm_push_t push;
|
||||
NV_STATUS status;
|
||||
uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
|
||||
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
|
||||
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
|
||||
uvm_pte_bits_gpu_t pte_bit;
|
||||
uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
|
||||
uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
|
||||
@ -8407,10 +8379,8 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
UVM_ASSERT(map_page_mask);
|
||||
UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
|
||||
|
||||
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
|
||||
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
|
||||
UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
|
||||
}
|
||||
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
|
||||
UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
|
||||
|
||||
UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
|
||||
map_page_mask,
|
||||
@ -8512,27 +8482,18 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
return uvm_tracker_add_push_safe(out_tracker, &push);
|
||||
}
|
||||
|
||||
// allowed_nid_mask is only valid if the CPU is set in allowed_mask.
|
||||
static void map_get_allowed_destinations(uvm_va_block_t *block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
const uvm_va_policy_t *policy,
|
||||
uvm_processor_id_t id,
|
||||
uvm_processor_mask_t *allowed_mask,
|
||||
nodemask_t *allowed_nid_mask)
|
||||
uvm_processor_mask_t *allowed_mask)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
|
||||
|
||||
*allowed_nid_mask = node_possible_map;
|
||||
|
||||
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
|
||||
// UVM-Lite can only map resident pages on the preferred location
|
||||
uvm_processor_mask_zero(allowed_mask);
|
||||
uvm_processor_mask_set(allowed_mask, policy->preferred_location);
|
||||
if (UVM_ID_IS_CPU(policy->preferred_location) &&
|
||||
!uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
|
||||
nodes_clear(*allowed_nid_mask);
|
||||
node_set(policy->preferred_nid, *allowed_nid_mask);
|
||||
}
|
||||
}
|
||||
else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
|
||||
(uvm_id_equal(policy->preferred_location, id) &&
|
||||
@ -8575,7 +8536,6 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
|
||||
NV_STATUS status = NV_OK;
|
||||
const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
|
||||
nodemask_t *allowed_nid_destinations;
|
||||
|
||||
va_block_context->mapping.cause = cause;
|
||||
|
||||
@ -8625,20 +8585,10 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
if (!allowed_destinations)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
|
||||
if (!allowed_nid_destinations) {
|
||||
uvm_processor_mask_cache_free(allowed_destinations);
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
// Map per resident location so we can more easily detect physically-
|
||||
// contiguous mappings.
|
||||
map_get_allowed_destinations(va_block,
|
||||
va_block_context,
|
||||
policy,
|
||||
id,
|
||||
allowed_destinations,
|
||||
allowed_nid_destinations);
|
||||
map_get_allowed_destinations(va_block, va_block_context, policy, id, allowed_destinations);
|
||||
|
||||
for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
|
||||
if (UVM_ID_IS_CPU(id)) {
|
||||
status = block_map_cpu_to(va_block,
|
||||
@ -8649,30 +8599,11 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
new_prot,
|
||||
out_tracker);
|
||||
}
|
||||
else if (UVM_ID_IS_CPU(resident_id)) {
|
||||
int nid;
|
||||
|
||||
// map_get_allowed_destinations() will set the mask of CPU NUMA
|
||||
// nodes that should be mapped.
|
||||
for_each_node_mask(nid, *allowed_nid_destinations) {
|
||||
status = block_map_gpu_to(va_block,
|
||||
va_block_context,
|
||||
gpu,
|
||||
resident_id,
|
||||
nid,
|
||||
running_page_mask,
|
||||
new_prot,
|
||||
out_tracker);
|
||||
if (status != NV_OK)
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
status = block_map_gpu_to(va_block,
|
||||
va_block_context,
|
||||
gpu,
|
||||
resident_id,
|
||||
NUMA_NO_NODE,
|
||||
running_page_mask,
|
||||
new_prot,
|
||||
out_tracker);
|
||||
@ -8687,7 +8618,6 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
}
|
||||
|
||||
uvm_processor_mask_cache_free(allowed_destinations);
|
||||
uvm_kvfree(allowed_nid_destinations);
|
||||
|
||||
return status;
|
||||
}
|
||||
@ -9575,7 +9505,6 @@ static void block_kill(uvm_va_block_t *block)
|
||||
// Free CPU pages
|
||||
for_each_possible_uvm_node(nid) {
|
||||
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
|
||||
size_t index = node_to_index(nid);
|
||||
|
||||
for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block, nid) {
|
||||
// be conservative.
|
||||
@ -9590,9 +9519,20 @@ static void block_kill(uvm_va_block_t *block)
|
||||
|
||||
UVM_ASSERT(uvm_page_mask_empty(&node_state->allocated));
|
||||
UVM_ASSERT(node_state->chunks == 0);
|
||||
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
|
||||
}
|
||||
|
||||
// While a per-NUMA node_state array is in use, all of its elements are
|
||||
// expected to be valid. Therefore the teardown of these elements must occur
|
||||
// as a single "transaction". This teardown must take place after freeing
|
||||
// the CPU pages (see the "Free CPU pages" loop above). This is because as
|
||||
// part of removing chunks from VA blocks, the per-page allocated bitmap is
|
||||
// recomputed using the per-NUMA node_state array elements.
|
||||
for_each_possible_uvm_node(nid) {
|
||||
uvm_va_block_cpu_node_state_t *node_state;
|
||||
|
||||
node_state = block_node_state_get(block, nid);
|
||||
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, node_state);
|
||||
}
|
||||
uvm_kvfree((void *)block->cpu.node_state);
|
||||
block->cpu.node_state = NULL;
|
||||
|
||||
@ -9708,8 +9648,8 @@ static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_
|
||||
uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
|
||||
uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
NvU32 alloc_sizes;
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
NvU64 alloc_sizes;
|
||||
DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
|
||||
size_t big_page_index;
|
||||
@ -10052,7 +9992,7 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
|
||||
gpu = block_get_gpu(block, id);
|
||||
|
||||
// If the parent chunk has not been mapped, there is nothing to split.
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
if (gpu_mapping_addr == 0)
|
||||
continue;
|
||||
|
||||
@ -10074,7 +10014,7 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
|
||||
merge:
|
||||
for_each_gpu_id_in_mask(id, gpu_split_mask) {
|
||||
gpu = block_get_gpu(block, id);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
|
||||
gpu_mapping_addr,
|
||||
chunk_size);
|
||||
@ -10260,7 +10200,7 @@ static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t p
|
||||
continue;
|
||||
|
||||
gpu = block_get_gpu(block, id);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
if (gpu_mapping_addr == 0)
|
||||
continue;
|
||||
|
||||
@ -10712,8 +10652,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
|
||||
for_each_possible_uvm_node(nid) {
|
||||
for_each_cpu_chunk_in_block(cpu_chunk, page_index, new, nid) {
|
||||
uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(cpu_chunk,
|
||||
gpu->parent),
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu),
|
||||
new);
|
||||
}
|
||||
}
|
||||
@ -10751,7 +10690,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
|
||||
gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
|
||||
if (gpu_va_space) {
|
||||
if (existing_gpu_state->page_table_range_big.table) {
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
|
||||
// existing's end has not been adjusted yet
|
||||
existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
|
||||
@ -11241,8 +11180,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
|
||||
// so uvm_va_block_map will be a no-op.
|
||||
uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
|
||||
if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
|
||||
uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
|
||||
for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
|
||||
uvm_id_equal(new_residency, preferred_location)) {
|
||||
for_each_id_in_mask(map_processor_id, map_uvm_lite_gpus) {
|
||||
status = uvm_va_block_map(va_block,
|
||||
va_block_context,
|
||||
map_processor_id,
|
||||
@ -11703,10 +11642,6 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
|
||||
// For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
|
||||
// Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
|
||||
// ID of the faulting processor.
|
||||
// Note that numa_mem_id() returns the nearest node with memory. In most
|
||||
// cases, this will be the current NUMA node. However, in the case that the
|
||||
// current node does not have any memory, we probably want the nearest node
|
||||
// with memory, anyway.
|
||||
int current_nid = numa_mem_id();
|
||||
bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
|
||||
|
||||
@ -11730,12 +11665,7 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
|
||||
// If read duplication is enabled and the page is also resident on the CPU,
|
||||
// keep its current NUMA node residency.
|
||||
if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
// The new_residency processor is the CPU and the preferred location is not
|
||||
// the CPU. If the page is resident on the CPU, keep its current residency.
|
||||
if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
|
||||
return NUMA_NO_NODE;
|
||||
return block_get_page_node_residency(va_block, page_index);
|
||||
|
||||
return current_nid;
|
||||
}
|
||||
@ -12639,6 +12569,125 @@ NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
|
||||
return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
|
||||
}
|
||||
|
||||
// Launch a synchronous, encrypted copy between GPU and CPU.
|
||||
//
|
||||
// The copy entails a GPU-side encryption (relying on the Copy Engine), and a
|
||||
// CPU-side decryption step, such that the destination CPU buffer pointed to by
|
||||
// dst_plain will contain the unencrypted (plain text) contents. The destination
|
||||
// buffer can be in protected or unprotected sysmem, while the source buffer
|
||||
// must be in protected vidmem.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
UvmCslIv decrypt_iv;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
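As a usage illustration only (the wrapper name and the push description string are invented, and the source address is assumed to point at protected vidmem), a caller might pull a single page back into an unprotected CPU buffer like this:

static NV_STATUS debug_read_protected_page(uvm_gpu_t *gpu,
                                           uvm_gpu_address_t src_gpu_address,
                                           void *dst_plain)
{
    /* PAGE_SIZE is well below UVM_CONF_COMPUTING_DMA_BUFFER_SIZE, and a NULL
     * tracker means the push has no prior work to acquire. */
    return encrypted_memcopy_gpu_to_cpu(gpu,
                                        dst_plain,
                                        src_gpu_address,
                                        PAGE_SIZE,
                                        NULL,
                                        "Debug read of protected vidmem page");
}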
|
||||
|
||||
// Launch a synchronous, encrypted copy between CPU and GPU.
|
||||
//
|
||||
// The source CPU buffer pointed to by src_plain contains the unencrypted (plain
|
||||
// text) contents; the function internally performs a CPU-side encryption step
|
||||
// before launching the GPU-side CE decryption. The source buffer can be in
|
||||
// protected or unprotected sysmem, while the destination buffer must be in
|
||||
// protected vidmem.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
|
||||
void *dst_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
|
||||
|
||||
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
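The CPU-to-GPU direction is used symmetrically; a hedged sketch (wrapper name invented) that stages a small plain-text sysmem buffer into protected vidmem:

static NV_STATUS debug_write_protected_vidmem(uvm_gpu_t *gpu,
                                              uvm_gpu_address_t dst_gpu_address,
                                              void *src_plain,
                                              size_t size)
{
    /* The helper performs the CPU-side encryption and the GPU-side CE
     * decryption internally; the caller only bounds the copy size. */
    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
    return encrypted_memcopy_cpu_to_gpu(gpu,
                                        dst_gpu_address,
                                        src_plain,
                                        size,
                                        NULL,
                                        "Debug write to protected vidmem");
}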
|
||||
|
||||
static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
|
||||
uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
@ -12651,7 +12700,7 @@ static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
|
||||
uvm_gpu_address_t src_gpu_address;
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
return uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
return encrypted_memcopy_cpu_to_gpu(gpu,
|
||||
dst_gpu_address,
|
||||
uvm_mem_get_cpu_addr_kernel(src_mem),
|
||||
size,
|
||||
@ -12755,7 +12804,7 @@ static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
|
||||
uvm_gpu_address_t dst_gpu_address;
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
return uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
|
||||
return encrypted_memcopy_gpu_to_cpu(gpu,
|
||||
uvm_mem_get_cpu_addr_kernel(dst_mem),
|
||||
src_gpu_address,
|
||||
size,
|
||||
@ -13570,7 +13619,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
|
||||
for_each_id_in_mask(id, &block->mapped) {
|
||||
uvm_processor_id_t processor_to_map;
|
||||
block_phys_page_t block_page;
|
||||
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
int nid = NUMA_NO_NODE;
|
||||
|
||||
if (page_size == 0)
|
||||
@ -13606,7 +13655,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
|
||||
if (uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
|
||||
if (uvm_pmm_sysmem_mappings_indirect_supported()) {
|
||||
for_each_gpu_id(id) {
|
||||
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
uvm_reverse_map_t sysmem_page;
|
||||
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
|
||||
size_t num_pages;
|
||||
@ -13621,8 +13670,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
|
||||
continue;
|
||||
|
||||
num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk,
|
||||
gpu->parent),
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
|
||||
uvm_cpu_chunk_get_size(chunk),
|
||||
&sysmem_page,
|
||||
1);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -111,8 +111,6 @@ typedef struct
|
||||
// Pages that have been evicted to sysmem
|
||||
uvm_page_mask_t evicted;
|
||||
|
||||
NvU64 *cpu_chunks_dma_addrs;
|
||||
|
||||
// Array of naturally-aligned chunks. Each chunk has the largest possible
|
||||
// size which can fit within the block, so they are not uniform size.
|
||||
//
|
||||
@ -2155,8 +2153,7 @@ NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
|
||||
// Physically unmap a CPU chunk from all registered GPUs.
|
||||
// Locking: The va_block lock must be held.
|
||||
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
|
||||
uvm_cpu_chunk_t *chunk,
|
||||
uvm_page_index_t page_index);
|
||||
uvm_cpu_chunk_t *chunk);
|
||||
|
||||
// Remove any CPU chunks in the given region.
|
||||
// Locking: The va_block lock must be held.
|
||||
@ -2166,19 +2163,19 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
|
||||
// specified processor in the block. Returns 0 if the address is not resident on
|
||||
// the specified processor.
|
||||
// Locking: The va_block lock must be held.
|
||||
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
NvU64 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
uvm_processor_id_t processor,
|
||||
uvm_page_index_t page_index);
|
||||
|
||||
// Get CPU page size or 0 if it is not mapped
|
||||
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
|
||||
NvU64 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
|
||||
uvm_page_index_t page_index);
|
||||
|
||||
// Get GPU page size or 0 if it is not mapped on the given GPU
|
||||
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
|
||||
NvU64 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
|
||||
|
||||
// Get page size or 0 if it is not mapped on the given processor
|
||||
static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||||
static NvU64 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||||
uvm_processor_id_t processor_id,
|
||||
uvm_page_index_t page_index)
|
||||
{
|
||||
@ -2189,10 +2186,10 @@ static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||||
}
|
||||
|
||||
// Returns the big page size for the GPU VA space of the block
|
||||
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
|
||||
NvU64 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
|
||||
|
||||
// Returns the number of big pages in the VA block for the given size
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU64 big_page_size);
|
||||
|
||||
// Returns the number of big pages in the VA block for the big page size on the
|
||||
// given GPU
|
||||
@ -2202,29 +2199,29 @@ static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t
|
||||
}
|
||||
|
||||
// Returns the start address of the given big page index and big page size
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size);
|
||||
|
||||
// Returns the region [start, end] of the given big page index and big page size
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
|
||||
size_t big_page_index,
|
||||
NvU32 big_page_size);
|
||||
NvU64 big_page_size);
|
||||
|
||||
// Returns the largest sub-region of [start, end] which can fit big
|
||||
// pages. If the region cannot fit any big pages, an invalid region (0, 0) is
|
||||
// returned.
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU64 big_page_size);
|
||||
|
||||
// Returns the largest sub-region of 'region' which can fit big pages.
|
||||
// If the region cannot fit any big pages, an invalid region (0, 0) is returned.
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
|
||||
uvm_va_block_region_t region,
|
||||
NvU32 big_page_size);
|
||||
NvU64 big_page_size);
|
||||
|
||||
// Returns the big page index (the bit index within
|
||||
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
|
||||
// page_index cannot be covered by a big PTE due to alignment or block size,
|
||||
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU64 big_page_size);
|
||||
|
||||
// Returns the new residency for a page that faulted or triggered access counter
|
||||
// notifications. The read_duplicate output parameter indicates if the page
|
||||
|
@ -105,12 +105,6 @@ bool uvm_va_policy_preferred_location_equal(const uvm_va_policy_t *policy, uvm_p
|
||||
{
|
||||
bool equal = uvm_id_equal(policy->preferred_location, proc);
|
||||
|
||||
if (!UVM_ID_IS_CPU(policy->preferred_location))
|
||||
UVM_ASSERT(policy->preferred_nid == NUMA_NO_NODE);
|
||||
|
||||
if (!UVM_ID_IS_CPU(proc))
|
||||
UVM_ASSERT(cpu_numa_id == NUMA_NO_NODE);
|
||||
|
||||
if (equal && UVM_ID_IS_CPU(policy->preferred_location))
|
||||
equal = uvm_numa_id_eq(policy->preferred_nid, cpu_numa_id);
|
||||
|
||||
@ -662,7 +656,7 @@ const uvm_va_policy_t *uvm_va_policy_set_preferred_location(uvm_va_block_t *va_b
|
||||
// and that the policy is changing.
|
||||
UVM_ASSERT(node->node.start >= start);
|
||||
UVM_ASSERT(node->node.end <= end);
|
||||
UVM_ASSERT(!uvm_va_policy_preferred_location_equal(&node->policy, processor_id, cpu_node_id));
|
||||
UVM_ASSERT(!uvm_id_equal(node->policy.preferred_location, processor_id));
|
||||
}
|
||||
|
||||
node->policy.preferred_location = processor_id;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -868,9 +868,9 @@ static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_
|
||||
// preferred location. If peer mappings are being disabled to the
|
||||
// preferred location, then unmap the other GPU.
|
||||
// Nothing to do otherwise.
|
||||
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu0->id, NUMA_NO_NODE))
|
||||
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
|
||||
uvm_lite_gpu_to_unmap = gpu1;
|
||||
else if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu1->id, NUMA_NO_NODE))
|
||||
else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
|
||||
uvm_lite_gpu_to_unmap = gpu0;
|
||||
else
|
||||
return;
|
||||
@ -951,7 +951,7 @@ static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t
|
||||
// Reset preferred location and accessed-by of VA ranges if needed
|
||||
// Note: ignoring the return code of uvm_va_range_set_preferred_location since this
|
||||
// will only return on error when setting a preferred location, not on a reset
|
||||
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu->id, NUMA_NO_NODE))
|
||||
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
|
||||
(void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);
|
||||
|
||||
uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);
|
||||
@ -1683,7 +1683,7 @@ void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
|
||||
// If a UVM-Lite GPU is being removed from the accessed_by mask, it will
|
||||
// also stop being a UVM-Lite GPU unless it's also the preferred location.
|
||||
if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
|
||||
!uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), processor_id, NUMA_NO_NODE)) {
|
||||
!uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
|
||||
range_unmap(va_range, processor_id, out_tracker);
|
||||
}
|
||||
|
||||
@ -1853,7 +1853,7 @@ NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params,
|
||||
|
||||
if (uvm_api_range_invalid(params->base, params->length))
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
if (params->gpuAttributesCount > UVM_MAX_GPUS_V2)
|
||||
if (params->gpuAttributesCount > UVM_MAX_GPUS)
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -188,8 +188,7 @@ typedef struct
|
||||
// GPU which owns the allocation. For sysmem, this is the GPU that the
|
||||
// sysmem was originally allocated under. For the allocation to remain valid
|
||||
// we need to prevent the GPU from going away, similarly to P2P mapped
|
||||
// memory.
|
||||
// Similarly for EGM memory.
|
||||
// memory and to EGM memory.
|
||||
//
|
||||
// This field is not used for sparse mappings as they don't have an
|
||||
// allocation and, hence, owning GPU.
|
||||
@ -212,6 +211,7 @@ typedef struct
|
||||
// EGM memory. If true is_sysmem also has to be true and owning_gpu
|
||||
// has to be valid.
|
||||
bool is_egm;
|
||||
|
||||
// GPU page tables mapping the allocation
|
||||
uvm_page_table_range_vec_t pt_range_vec;
|
||||
|
||||
|
@ -199,7 +199,7 @@ void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
@ -216,9 +216,9 @@ void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 log2_invalidation_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
|
||||
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
|
||||
|
||||
// The invalidation size must be a power-of-two number of pages containing
|
||||
|
@ -42,7 +42,7 @@ static NvU32 entries_per_index_volta(NvU32 depth)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvLength entry_offset_volta(NvU32 depth, NvU32 page_size)
|
||||
static NvLength entry_offset_volta(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 5);
|
||||
if (page_size == UVM_PAGE_SIZE_4K && depth == 3)
|
||||
@ -252,7 +252,7 @@ static NvU64 make_pte_volta(uvm_aperture_t aperture, NvU64 address, uvm_prot_t p
|
||||
|
||||
static uvm_mmu_mode_hal_t volta_mmu_mode_hal;
|
||||
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU32 big_page_size)
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size)
|
||||
{
|
||||
static bool initialized = false;
|
||||
|
||||
|
@ -159,14 +159,7 @@ static int lkca_aead_internal(struct crypto_aead *aead,
|
||||
}
|
||||
|
||||
if (rc != 0) {
|
||||
if (enc) {
|
||||
pr_info("aead.c: Encryption failed with error %i\n", rc);
|
||||
} else {
|
||||
pr_info("aead.c: Decryption failed with error %i\n", rc);
|
||||
if (rc == -EBADMSG) {
|
||||
pr_info("aead.c: Authentication tag mismatch!\n");
|
||||
}
|
||||
}
|
||||
pr_info("Encryption FAILED\n");
|
||||
}
|
||||
|
||||
*data_out_size = data_in_size;
|
||||
|
@ -1,42 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* Comments, prototypes and checks taken from DMTF: Copyright 2021-2022 DMTF. All rights reserved.
|
||||
* License: BSD 3-Clause License. For full text see link: https://github.com/DMTF/libspdm/blob/main/LICENSE.md
|
||||
*/
|
||||
|
||||
#include "os-interface.h"
|
||||
#include "internal_crypt_lib.h"
|
||||
#include "library/cryptlib.h"
|
||||
|
||||
bool libspdm_check_crypto_backend(void)
|
||||
{
|
||||
#ifdef USE_LKCA
|
||||
nv_printf(NV_DBG_INFO, "libspdm_check_crypto_backend: LKCA wrappers found.\n");
|
||||
nv_printf(NV_DBG_INFO, "libspdm_check_crypto_backend: LKCA calls may still fail if modules have not been loaded!\n");
|
||||
return true;
|
||||
#else
|
||||
nv_printf(NV_DBG_ERRORS, "libspdm_check_crypto_backend: Error - libspdm expects LKCA but found stubs!\n");
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
@ -39,7 +39,9 @@
|
||||
#define RSA_PSS_PADDING_ZEROS_SIZE_BYTE (8)
|
||||
#define RSA_PSS_TRAILER_FIELD (0xbc)
|
||||
#define SHIFT_RIGHT_AND_GET_BYTE(val, x) ((val >> x) & 0xFF)
|
||||
#ifndef BITS_TO_BYTES
|
||||
#define BITS_TO_BYTES(b) (b >> 3)
|
||||
#endif
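The two helpers above are plain bit manipulations; a couple of sanity checks (values picked arbitrarily, assert.h assumed available in a host-side test harness) show the intent:

    /* Unit-test style checks, not driver code. */
    assert(BITS_TO_BYTES(2048) == 256);                       /* RSA-2048 modulus size */
    assert(SHIFT_RIGHT_AND_GET_BYTE(0x12345678u, 8) == 0x56); /* second-lowest byte    */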
|
||||
|
||||
static const unsigned char zeroes[RSA_PSS_PADDING_ZEROS_SIZE_BYTE] = { 0 };
|
||||
|
||||
|
@ -66,6 +66,9 @@ static NvBool battery_present = NV_FALSE;
|
||||
#define ACPI_VIDEO_CLASS "video"
|
||||
#endif
|
||||
|
||||
/* Maximum size of ACPI _DSM method's 4th argument */
|
||||
#define NV_MAX_ACPI_DSM_PARAM_SIZE 1024
|
||||
|
||||
// Used for NVPCF event handling
|
||||
static acpi_handle nvpcf_handle = NULL;
|
||||
static acpi_handle nvpcf_device_handle = NULL;
|
||||
@ -73,21 +76,6 @@ static nv_acpi_t *nvpcf_nv_acpi_object = NULL;
|
||||
|
||||
#define ACPI_NVPCF_EVENT_CHANGE 0xC0
|
||||
|
||||
static int nv_acpi_get_device_handle(nv_state_t *nv, acpi_handle *dev_handle)
|
||||
{
|
||||
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
|
||||
|
||||
#if defined(DEVICE_ACPI_HANDLE)
|
||||
*dev_handle = DEVICE_ACPI_HANDLE(nvl->dev);
|
||||
return NV_TRUE;
|
||||
#elif defined (ACPI_HANDLE)
|
||||
*dev_handle = ACPI_HANDLE(nvl->dev);
|
||||
return NV_TRUE;
|
||||
#else
|
||||
return NV_FALSE;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* This callback will be invoked by the acpi_notifier_call_chain()
|
||||
*/
|
||||
@ -174,7 +162,7 @@ static void nv_acpi_nvpcf_event(acpi_handle handle, u32 event_type, void *data)
|
||||
}
|
||||
else
|
||||
{
|
||||
nv_printf(NV_DBG_INFO,"NVRM: %s: NVPCF event 0x%x is not supported\n", event_type, __FUNCTION__);
|
||||
nv_printf(NV_DBG_INFO,"NVRM: %s: NVPCF event 0x%x is not supported\n", __FUNCTION__, event_type);
|
||||
}
|
||||
}
|
||||
|
||||
@ -267,11 +255,10 @@ static void nv_acpi_notify_event(acpi_handle handle, u32 event_type, void *data)

void nv_acpi_register_notifier(nv_linux_state_t *nvl)
{
    acpi_handle dev_handle = NULL;
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);

    /* Install the ACPI notifier corresponding to dGPU ACPI device. */
    if ((nvl->nv_acpi_object == NULL) &&
        nv_acpi_get_device_handle(NV_STATE_PTR(nvl), &dev_handle) &&
        (dev_handle != NULL))
    {
        nvl->nv_acpi_object = nv_install_notifier(dev_handle, nv_acpi_notify_event, nvl);
@ -657,7 +644,100 @@ static NV_STATUS nv_acpi_nvif_method(
    return NV_OK;
}

#define MAX_INPUT_PARAM_SIZE 1024
static NV_STATUS nv_acpi_evaluate_dsm_method(
    acpi_handle dev_handle,
    NvU8 *pathname,
    NvU8 *pAcpiDsmGuid,
    NvU32 acpiDsmRev,
    NvU32 acpiDsmSubFunction,
    void *arg3,
    NvU16 arg3Size,
    NvBool bArg3Integer,
    NvU32 *outStatus,
    void *pOutData,
    NvU16 *pSize
)
{
    NV_STATUS rmStatus = NV_OK;
    acpi_status status;
    struct acpi_object_list input;
    union acpi_object *dsm = NULL;
    struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object dsm_params[4];
    NvU32 data_size;

    if (!NV_MAY_SLEEP())
    {
#if defined(DEBUG)
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid context!\n", __FUNCTION__);
#endif
        return NV_ERR_NOT_SUPPORTED;
    }

    dsm_params[0].buffer.type = ACPI_TYPE_BUFFER;
    dsm_params[0].buffer.length = 0x10;
    dsm_params[0].buffer.pointer = pAcpiDsmGuid;

    dsm_params[1].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[1].integer.value = acpiDsmRev;

    dsm_params[2].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[2].integer.value = acpiDsmSubFunction;

    if (bArg3Integer)
    {
        dsm_params[3].integer.type = ACPI_TYPE_INTEGER;
        dsm_params[3].integer.value = *((NvU32 *)arg3);
    }
    else
    {
        dsm_params[3].buffer.type = ACPI_TYPE_BUFFER;
        dsm_params[3].buffer.length = arg3Size;
        dsm_params[3].buffer.pointer = arg3;
    }

    // parameters for dsm calls (GUID, rev, subfunction, data)
    input.count = 4;
    input.pointer = dsm_params;

    status = acpi_evaluate_object(dev_handle, pathname, &input, &output);
    if (ACPI_FAILURE(status))
    {
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: failed to evaluate _DSM method!\n", __FUNCTION__);
        return NV_ERR_OPERATING_SYSTEM;
    }

    dsm = output.pointer;
    if (dsm != NULL)
    {
        if (outStatus)
        {
            *outStatus = dsm->buffer.pointer[3] << 24 |
                         dsm->buffer.pointer[2] << 16 |
                         dsm->buffer.pointer[1] << 8 |
                         dsm->buffer.pointer[0];
        }

        rmStatus = nv_acpi_extract_object(dsm, pOutData, *pSize, &data_size);
        *pSize = data_size;

        kfree(output.pointer);
    }
    else
    {
        *pSize = 0;
    }

    if (rmStatus != NV_OK)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: %s: DSM data invalid!\n", __FUNCTION__);
    }

    return rmStatus;
}

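To make the new helper's contract easier to follow, here is a minimal hedged usage sketch. The caller name, buffer names, and the revision/subfunction values are placeholders and not taken from the driver; the real call sites appear later in this diff.

/* Hypothetical caller of the shared helper above. */
static NV_STATUS example_call_dsm(acpi_handle handle, NvU8 *guid,
                                  NvU8 *inBuf, NvU16 inSize)
{
    NvU8  outBuf[NV_MAX_ACPI_DSM_PARAM_SIZE];
    NvU16 outSize = sizeof(outBuf);
    NvU32 dsmStatus = 0;

    /*
     * arg3 is passed as a buffer here (bArg3Integer == NV_FALSE), so arg3Size
     * must describe inBuf. outSize is in/out: buffer capacity on entry,
     * extracted byte count on return.
     */
    return nv_acpi_evaluate_dsm_method(handle, (NvU8 *)"_DSM", guid,
                                       0x2 /* rev, placeholder */,
                                       0x0 /* subfunction, placeholder */,
                                       inBuf, inSize, NV_FALSE,
                                       &dsmStatus, outBuf, &outSize);
}
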
/*
 * This function executes a _DSM ACPI method.
 */
@ -674,65 +754,27 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
    NvU16 *pSize
)
{
    NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
    acpi_status acpi_status;
    struct acpi_object_list input;
    union acpi_object *dsm = NULL;
    struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object dsm_params[4];
    NV_STATUS rmStatus = NV_ERR_OPERATING_SYSTEM;
    NvU8 *argument3 = NULL;
    NvU32 data_size;
    acpi_handle dev_handle = NULL;

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    NvU8 *pathname = "_DSM";

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

    if ((!pInParams) || (inParamSize > MAX_INPUT_PARAM_SIZE) || (!pOutData) || (!pSize))
    if ((!pInParams) || (inParamSize > NV_MAX_ACPI_DSM_PARAM_SIZE) || (!pOutData) || (!pSize))
    {
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (!NV_MAY_SLEEP())
    {
#if defined(DEBUG)
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
#endif
        return NV_ERR_NOT_SUPPORTED;
    }
    rmStatus = os_alloc_mem((void **)&argument3, inParamSize);
    if (rmStatus != NV_OK)
        return rmStatus;

    status = os_alloc_mem((void **)&argument3, inParamSize);
    if (status != NV_OK)
        return status;

    //
    // dsm_params[0].buffer.pointer and dsm_params[1].integer.value set in
    // switch below based on acpiDsmFunction
    //

    dsm_params[0].buffer.type = ACPI_TYPE_BUFFER;
    dsm_params[0].buffer.length = 0x10;
    dsm_params[0].buffer.pointer = pAcpiDsmGuid;

    dsm_params[1].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[1].integer.value = acpiDsmRev;

    dsm_params[2].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[2].integer.value = acpiDsmSubFunction;

    dsm_params[3].buffer.type = ACPI_TYPE_BUFFER;
    dsm_params[3].buffer.length = inParamSize;
    memcpy(argument3, pInParams, dsm_params[3].buffer.length);
    dsm_params[3].buffer.pointer = argument3;

    // parameters for dsm calls (GUID, rev, subfunction, data)
    input.count = 4;
    input.pointer = dsm_params;
    memcpy(argument3, pInParams, inParamSize);

    if (acpiNvpcfDsmFunction)
    {
@ -742,45 +784,15 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
        // not have device handle for NVPCF device
        //
        dev_handle = NULL;
        acpi_status = acpi_evaluate_object(dev_handle, "\\_SB.NPCF._DSM", &input, &output);
    }
    else
    {
        acpi_status = acpi_evaluate_object(dev_handle, "_DSM", &input, &output);
        pathname = "\\_SB.NPCF._DSM";
    }

    if (ACPI_FAILURE(acpi_status))
    {
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: failed to evaluate _DSM method!\n", __FUNCTION__);
        goto exit;
    }
    rmStatus = nv_acpi_evaluate_dsm_method(dev_handle, pathname, pAcpiDsmGuid, acpiDsmRev,
                                           acpiDsmSubFunction, argument3, inParamSize,
                                           NV_FALSE, NULL, pOutData, pSize);

    dsm = output.pointer;
    if (dsm != NULL)
    {
        if (outStatus)
        {
            *outStatus = dsm->buffer.pointer[3] << 24 |
                         dsm->buffer.pointer[2] << 16 |
                         dsm->buffer.pointer[1] << 8 |
                         dsm->buffer.pointer[0];
        }

        status = nv_acpi_extract_object(dsm, pOutData, *pSize, &data_size);
        *pSize = data_size;

        kfree(output.pointer);
    }
    if (status != NV_OK)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: %s: DSM data invalid!\n", __FUNCTION__);
    }

exit:
    os_free_mem(argument3);
    return status;
    return rmStatus;
}

/*
@ -796,13 +808,11 @@ NV_STATUS NV_API_CALL nv_acpi_ddc_method(
    acpi_status status;
    union acpi_object *ddc = NULL;
    NvU32 i, largestEdidSize;
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    acpi_handle lcd_dev_handle = NULL;
    acpi_handle handle = NULL;

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -836,7 +846,7 @@ NV_STATUS NV_API_CALL nv_acpi_ddc_method(
            case 0x0400:
            case 0xA420:
                lcd_dev_handle = handle;
                nv_printf(NV_DBG_INFO, "NVRM: %s Found LCD: %x\n",
                nv_printf(NV_DBG_INFO, "NVRM: %s Found LCD: %llx\n",
                          __FUNCTION__, device_id);
                break;
            default:
@ -915,12 +925,10 @@ NV_STATUS NV_API_CALL nv_acpi_rom_method(
    union acpi_object *rom;
    union acpi_object rom_arg[2];
    struct acpi_object_list input = { 2, rom_arg };
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    uint32_t offset, length;

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -982,12 +990,10 @@ NV_STATUS NV_API_CALL nv_acpi_dod_method(
    acpi_status status;
    struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object *dod;
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    NvU32 i, count = (*pSize / sizeof(NvU32));

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -1129,17 +1135,11 @@ NvBool nv_acpi_power_resource_method_present(
    struct pci_dev *pdev
)
{
    acpi_handle handle = NULL;
    acpi_handle handle = ACPI_HANDLE(&pdev->dev);
    struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object *object_package, *object_reference;
    acpi_status status;

#if defined(DEVICE_ACPI_HANDLE)
    handle = DEVICE_ACPI_HANDLE(&pdev->dev);
#elif defined (ACPI_HANDLE)
    handle = ACPI_HANDLE(&pdev->dev);
#endif

    if (!handle)
        return NV_FALSE;

@ -1198,7 +1198,8 @@ NV_STATUS NV_API_CALL nv_acpi_mux_method(
    union acpi_object *mux = NULL;
    union acpi_object mux_arg = { ACPI_TYPE_INTEGER };
    struct acpi_object_list input = { 1, &mux_arg };
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    acpi_handle mux_dev_handle = NULL;
    acpi_handle handle = NULL;
    unsigned long long device_id = 0;
@ -1216,9 +1217,6 @@ NV_STATUS NV_API_CALL nv_acpi_mux_method(
                  __FUNCTION__, pMethodName);
    }

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -1384,6 +1382,34 @@ NvBool NV_API_CALL nv_acpi_is_battery_present(void)
    return NV_FALSE;
}

NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port(
    nv_state_t *nv,
    NvU8 *pAcpiDsmGuid,
    NvU32 acpiDsmRev,
    NvU32 acpiDsmSubFunction,
    NvU32 *data
)
{
    NV_STATUS rmStatus = NV_ERR_OPERATING_SYSTEM;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev->parent);
    NvU32 outData = 0;
    NvU16 outDatasize = sizeof(NvU32);
    NvU16 inParamSize = sizeof(NvU32);

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

    rmStatus = nv_acpi_evaluate_dsm_method(dev_handle, "_DSM", pAcpiDsmGuid, acpiDsmRev,
                                           acpiDsmSubFunction, data, inParamSize, NV_TRUE,
                                           NULL, &outData, &outDatasize);

    if (rmStatus == NV_OK)
        *data = outData;

    return rmStatus;
}

#else // NV_LINUX_ACPI_EVENTS_SUPPORTED

void NV_API_CALL nv_acpi_methods_init(NvU32 *handlePresent)
@ -1426,6 +1452,17 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
    return NV_ERR_NOT_SUPPORTED;
}

NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port(
    nv_state_t *nv,
    NvU8 *pAcpiDsmGuid,
    NvU32 acpiDsmRev,
    NvU32 acpiDsmSubFunction,
    NvU32 *data
)
{
    return NV_ERR_NOT_SUPPORTED;
}

NV_STATUS NV_API_CALL nv_acpi_ddc_method(
    nv_state_t *nv,
    void *pEdidBuffer,

@ -1,5 +1,5 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -24,6 +24,7 @@
#include "nv-linux.h"

extern int NVreg_ImexChannelCount;
extern int NVreg_CreateImexChannel0;

static int nv_caps_imex_open(struct inode *inode, struct file *file)
{
@ -104,6 +105,10 @@ int NV_API_CALL nv_caps_imex_init(void)
    if (NVreg_ImexChannelCount == 0)
    {
        nv_printf(NV_DBG_INFO, "nv-caps-imex is disabled.\n");

        // Disable channel creation as well
        NVreg_CreateImexChannel0 = 0;

        return 0;
    }

@ -1,5 +1,5 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -26,6 +26,8 @@
#include "nv-procfs.h"
#include "nv-hash.h"

#include "nvmisc.h"

extern int NVreg_ModifyDeviceFiles;

/* sys_close() or __close_fd() */
@ -49,7 +51,7 @@ typedef struct nv_cap_table_entry
    struct hlist_node hlist;
} nv_cap_table_entry_t;

#define NV_CAP_NUM_ENTRIES(_table) (sizeof(_table) / sizeof(_table[0]))
#define NV_CAP_NUM_ENTRIES(_table) (NV_ARRAY_ELEMENTS(_table))

static nv_cap_table_entry_t g_nv_cap_nvlink_table[] =
{
@ -361,18 +363,28 @@ static ssize_t nv_cap_procfs_write(struct file *file,
    nv_cap_file_private_t *private = NULL;
    unsigned long bytes_left;
    char *proc_buffer;
    int status;

    status = nv_down_read_interruptible(&nv_system_pm_lock);
    if (status < 0)
    {
        nv_printf(NV_DBG_ERRORS, "nv-caps: failed to lock the nv_system_pm_lock!\n");
        return status;
    }

    private = ((struct seq_file *)file->private_data)->private;
    bytes_left = (sizeof(private->buffer) - private->offset - 1);

    if (count == 0)
    {
        return -EINVAL;
        count = -EINVAL;
        goto done;
    }

    if ((bytes_left == 0) || (count > bytes_left))
    {
        return -ENOSPC;
        count = -ENOSPC;
        goto done;
    }

    proc_buffer = &private->buffer[private->offset];
@ -380,7 +392,8 @@ static ssize_t nv_cap_procfs_write(struct file *file,
    if (copy_from_user(proc_buffer, buffer, count))
    {
        nv_printf(NV_DBG_ERRORS, "nv-caps: failed to copy in proc data!\n");
        return -EFAULT;
        count = -EFAULT;
        goto done;
    }

    private->offset += count;
@ -388,17 +401,28 @@ static ssize_t nv_cap_procfs_write(struct file *file,

    *pos = private->offset;

done:
    up_read(&nv_system_pm_lock);

    return count;
}

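The rework above replaces early returns with goto done so that the nv_system_pm_lock taken at the top of the write handler is always released on every exit path. A stripped-down sketch of that shape follows, illustrative only: it uses the stock kernel rwsem API rather than the driver's nv_down_read_interruptible() wrapper, and assumes a kernel that provides down_read_interruptible().

#include <linux/errno.h>
#include <linux/rwsem.h>

/* Illustrative only: one unlock point for every post-lock exit path. */
static ssize_t example_locked_write(struct rw_semaphore *lock, size_t count)
{
    ssize_t ret = count;
    int status = down_read_interruptible(lock);

    if (status < 0)
        return status;       /* lock not taken, nothing to undo */

    if (count == 0)
    {
        ret = -EINVAL;       /* record the error ...           */
        goto done;           /* ... but still release the lock */
    }

    /* ... copy and store the payload while the lock is held ... */

done:
    up_read(lock);
    return ret;
}
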
static int nv_cap_procfs_read(struct seq_file *s, void *v)
{
    int status;
    nv_cap_file_private_t *private = s->private;

    status = nv_down_read_interruptible(&nv_system_pm_lock);
    if (status < 0)
    {
        return status;
    }

    seq_printf(s, "%s: %d\n", "DeviceFileMinor", private->minor);
    seq_printf(s, "%s: %d\n", "DeviceFileMode", private->permissions);
    seq_printf(s, "%s: %d\n", "DeviceFileModify", private->modify);

    up_read(&nv_system_pm_lock);
    return 0;
}

@ -423,14 +447,6 @@ static int nv_cap_procfs_open(struct inode *inode, struct file *file)
    if (rc < 0)
    {
        NV_KFREE(private, sizeof(nv_cap_file_private_t));
        return rc;
    }

    rc = nv_down_read_interruptible(&nv_system_pm_lock);
    if (rc < 0)
    {
        single_release(inode, file);
        NV_KFREE(private, sizeof(nv_cap_file_private_t));
    }

    return rc;
@ -449,8 +465,6 @@ static int nv_cap_procfs_release(struct inode *inode, struct file *file)
        private = s->private;
    }

    up_read(&nv_system_pm_lock);

    single_release(inode, file);

    if (private != NULL)