555.42.02

This commit is contained in:
Bernhard Stoeckner 2024-05-21 15:11:46 +02:00
parent 083cd9cf17
commit 5a1c474040
No known key found for this signature in database
GPG Key ID: 7D23DC2750FAC2E1
955 changed files with 171849 additions and 144768 deletions

View File

@ -1,5 +1,9 @@
# Changelog
## Release 555 Entries
### [555.42.02] 2024-05-21
## Release 550 Entries
### [550.78] 2024-04-25

View File

@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source
This is the source release of the NVIDIA Linux open GPU kernel modules,
version 550.78.
version 555.42.02.
## How to Build
@ -17,7 +17,7 @@ as root:
Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
550.78 driver release. This can be achieved by installing
555.42.02 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,
@ -74,7 +74,7 @@ kernel.
The NVIDIA open kernel modules support the same range of Linux kernel
versions that are supported with the proprietary NVIDIA kernel modules.
This is currently Linux kernel 3.10 or newer.
This is currently Linux kernel 4.15 or newer.
## How to Contribute
@ -188,7 +188,7 @@ encountered specific to them.
For details on feature support and limitations, see the NVIDIA GPU driver
end user README here:
https://us.download.nvidia.com/XFree86/Linux-x86_64/550.78/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/555.42.02/README/kernel_open.html
For vGPU support, please refer to the README.vgpu packaged in the vGPU Host
Package for more details.
@ -856,6 +856,7 @@ Subsystem Device ID.
| NVIDIA RTX A500 Embedded GPU | 25FB |
| NVIDIA GeForce RTX 4090 | 2684 |
| NVIDIA GeForce RTX 4090 D | 2685 |
| NVIDIA GeForce RTX 4070 Ti SUPER | 2689 |
| NVIDIA RTX 6000 Ada Generation | 26B1 1028 16A1 |
| NVIDIA RTX 6000 Ada Generation | 26B1 103C 16A1 |
| NVIDIA RTX 6000 Ada Generation | 26B1 10DE 16A1 |

View File

@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.78\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"555.42.02\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@ -118,7 +118,7 @@ ifeq ($(ARCH),x86_64)
endif
ifeq ($(ARCH),powerpc)
EXTRA_CFLAGS += -mlittle-endian -mno-strict-align -mno-altivec
EXTRA_CFLAGS += -mlittle-endian -mno-strict-align
endif
EXTRA_CFLAGS += -DNV_UVM_ENABLE
@ -172,6 +172,7 @@ NV_CFLAGS_FROM_CONFTEST := $(shell $(NV_CONFTEST_CMD) build_cflags)
NV_CONFTEST_CFLAGS = $(NV_CFLAGS_FROM_CONFTEST) $(EXTRA_CFLAGS) -fno-pie
NV_CONFTEST_CFLAGS += $(call cc-disable-warning,pointer-sign)
NV_CONFTEST_CFLAGS += $(call cc-option,-fshort-wchar,)
NV_CONFTEST_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types,)
NV_CONFTEST_CFLAGS += -Wno-error
NV_CONFTEST_COMPILE_TEST_HEADERS := $(obj)/conftest/macros.h

View File

@ -28,7 +28,7 @@ else
else
KERNEL_UNAME ?= $(shell uname -r)
KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME)
KERNEL_SOURCES := $(shell test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source || echo $(KERNEL_MODLIB)/build)
KERNEL_SOURCES := $(shell ((test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source) || (test -d $(KERNEL_MODLIB)/build/source && echo $(KERNEL_MODLIB)/build/source)) || echo $(KERNEL_MODLIB)/build)
endif
KERNEL_OUTPUT := $(KERNEL_SOURCES)
@ -42,7 +42,11 @@ else
else
KERNEL_UNAME ?= $(shell uname -r)
KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME)
ifeq ($(KERNEL_SOURCES), $(KERNEL_MODLIB)/source)
# $(filter pattern...,text) - Returns all whitespace-separated words in text that
# do match any of the pattern words, removing any words that do not match.
# Set the KERNEL_OUTPUT only if either $(KERNEL_MODLIB)/source or
# $(KERNEL_MODLIB)/build/source path matches the KERNEL_SOURCES.
ifneq ($(filter $(KERNEL_SOURCES),$(KERNEL_MODLIB)/source $(KERNEL_MODLIB)/build/source),)
KERNEL_OUTPUT := $(KERNEL_MODLIB)/build
KBUILD_PARAMS := KBUILD_OUTPUT=$(KERNEL_OUTPUT)
endif

View File

@ -37,13 +37,11 @@ typedef enum _HYPERVISOR_TYPE
OS_HYPERVISOR_UNKNOWN
} HYPERVISOR_TYPE;
#define CMD_VGPU_VFIO_WAKE_WAIT_QUEUE 0
#define CMD_VGPU_VFIO_INJECT_INTERRUPT 1
#define CMD_VGPU_VFIO_REGISTER_MDEV 2
#define CMD_VGPU_VFIO_PRESENT 3
#define CMD_VFIO_PCI_CORE_PRESENT 4
#define CMD_VFIO_WAKE_REMOVE_GPU 1
#define CMD_VGPU_VFIO_PRESENT 2
#define CMD_VFIO_PCI_CORE_PRESENT 3
#define MAX_VF_COUNT_PER_GPU 64
#define MAX_VF_COUNT_PER_GPU 64
typedef enum _VGPU_TYPE_INFO
{
@ -54,17 +52,11 @@ typedef enum _VGPU_TYPE_INFO
typedef struct
{
void *vgpuVfioRef;
void *waitQueue;
void *nv;
NvU32 *vgpuTypeIds;
NvU8 **vgpuNames;
NvU32 numVgpuTypes;
NvU32 domain;
NvU8 bus;
NvU8 slot;
NvU8 function;
NvBool is_virtfn;
NvU32 domain;
NvU32 bus;
NvU32 device;
NvU32 return_status;
} vgpu_vfio_info;
typedef struct

View File

@ -58,14 +58,10 @@
#include <linux/version.h>
#include <linux/utsname.h>
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
#error "This driver does not support kernels older than 2.6.32!"
#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 7, 0)
# define KERNEL_2_6
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
# define KERNEL_3
#else
#error "This driver does not support development kernels!"
#if LINUX_VERSION_CODE == KERNEL_VERSION(4, 4, 0)
// Version 4.4 is allowed, temporarily, although not officially supported.
#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)
#error "This driver does not support kernels older than Linux 4.15!"
#endif
#if defined (CONFIG_SMP) && !defined (__SMP__)
@ -836,16 +832,16 @@ static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
#define NV_PRINT_AT(nv_debug_level,at) \
{ \
nv_printf(nv_debug_level, \
"NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, flags = 0x%08x, " \
"NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, " \
"page_table = 0x%p\n", __FUNCTION__, __LINE__, at, \
at->num_pages, NV_ATOMIC_READ(at->usage_count), \
at->flags, at->page_table); \
at->page_table); \
}
#define NV_PRINT_VMA(nv_debug_level,vma) \
{ \
nv_printf(nv_debug_level, \
"NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08x bytes @ 0x%016llx, 0x%p, 0x%p\n", \
"NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08lx bytes @ 0x%016llx, 0x%p, 0x%p\n", \
__FUNCTION__, __LINE__, vma->vm_start, vma->vm_end, NV_VMA_SIZE(vma), \
NV_VMA_OFFSET(vma), NV_VMA_PRIVATE(vma), NV_VMA_FILE(vma)); \
}
@ -1078,6 +1074,8 @@ static inline void nv_kmem_ctor_dummy(void *arg)
kmem_cache_destroy(kmem_cache); \
}
#define NV_KMEM_CACHE_ALLOC_ATOMIC(kmem_cache) \
kmem_cache_alloc(kmem_cache, GFP_ATOMIC)
#define NV_KMEM_CACHE_ALLOC(kmem_cache) \
kmem_cache_alloc(kmem_cache, GFP_KERNEL)
#define NV_KMEM_CACHE_FREE(ptr, kmem_cache) \
@ -1104,6 +1102,23 @@ static inline void *nv_kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
#endif
}
static inline int nv_kmem_cache_alloc_stack_atomic(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
sp = NV_KMEM_CACHE_ALLOC_ATOMIC(nvidia_stack_t_cache);
if (sp == NULL)
return -ENOMEM;
sp->size = sizeof(sp->stack);
sp->top = sp->stack + sp->size;
}
#endif
*stack = sp;
return 0;
}
static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
@ -1614,6 +1629,10 @@ typedef struct nv_linux_state_s {
nv_kthread_q_t open_q;
NvBool is_accepting_opens;
struct semaphore open_q_lock;
#if defined(NV_VGPU_KVM_BUILD)
wait_queue_head_t wait;
NvS32 return_status;
#endif
} nv_linux_state_t;
extern nv_linux_state_t *nv_linux_devices;

View File

@ -29,17 +29,17 @@
typedef int vm_fault_t;
#endif
/* pin_user_pages
/*
* pin_user_pages()
*
* Presence of pin_user_pages() also implies the presence of unpin_user_page().
* Both were added in the v5.6-rc1
* Both were added in v5.6.
*
* pin_user_pages() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6-rc1 (2020-01-30)
*
* Removed vmas parameter from pin_user_pages() by commit 40896a02751
* ("mm/gup: remove vmas parameter from pin_user_pages()")
* in linux-next, expected in v6.5-rc1 (2023-05-17)
* pin_user_pages() was added by commit eddb1c228f79
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6.
*
* Removed vmas parameter from pin_user_pages() by commit 4c630f307455
* ("mm/gup: remove vmas parameter from pin_user_pages()") in v6.5.
*/
#include <linux/mm.h>
@ -63,25 +63,28 @@ typedef int vm_fault_t;
#define NV_UNPIN_USER_PAGE put_page
#endif // NV_PIN_USER_PAGES_PRESENT
/* get_user_pages
/*
* get_user_pages()
*
* The 8-argument version of get_user_pages was deprecated by commit
* (2016 Feb 12: cde70140fed8429acf7a14e2e2cbd3e329036653)for the non-remote case
* The 8-argument version of get_user_pages() was deprecated by commit
* cde70140fed8 ("mm/gup: Overload get_user_pages() functions") in v4.6-rc1.
* (calling get_user_pages with current and current->mm).
*
* Completely moved to the 6 argument version of get_user_pages -
* 2016 Apr 4: c12d2da56d0e07d230968ee2305aaa86b93a6832
* Completely moved to the 6 argument version of get_user_pages() by
* commit c12d2da56d0e ("mm/gup: Remove the macro overload API migration
* helpers from the get_user*() APIs") in v4.6-rc4.
*
* write and force parameters were replaced with gup_flags by -
* 2016 Oct 12: 768ae309a96103ed02eb1e111e838c87854d8b51
* write and force parameters were replaced with gup_flags by
* commit 768ae309a961 ("mm: replace get_user_pages() write/force parameters
* with gup_flags") in v4.9.
*
* A 7-argument version of get_user_pages was introduced into linux-4.4.y by
* commit 8e50b8b07f462ab4b91bc1491b1c91bd75e4ad40 which cherry-picked the
* replacement of the write and force parameters with gup_flags
* commit 8e50b8b07f462 ("mm: replace get_user_pages() write/force parameters
* with gup_flags") which cherry-picked the replacement of the write and
* force parameters with gup_flags.
*
* Removed vmas parameter from get_user_pages() by commit 7bbf9c8c99
* ("mm/gup: remove unused vmas parameter from get_user_pages()")
* in linux-next, expected in v6.5-rc1 (2023-05-17)
* Removed vmas parameter from get_user_pages() by commit 54d020692b34
* ("mm/gup: remove unused vmas parameter from get_user_pages()") in v6.5.
*
*/
@ -112,18 +115,19 @@ typedef int vm_fault_t;
}
#endif // NV_GET_USER_PAGES_HAS_ARGS_FLAGS
/* pin_user_pages_remote
/*
* pin_user_pages_remote()
*
* pin_user_pages_remote() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6 (2020-01-30)
* pin_user_pages_remote() was added by commit eddb1c228f79
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6.
*
* pin_user_pages_remote() removed 'tsk' parameter by commit
* 64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
* in v5.9-rc1 (2020-08-11). *
* 64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
* in v5.9.
*
* Removed unused vmas parameter from pin_user_pages_remote() by commit
* 83bcc2e132("mm/gup: remove unused vmas parameter from pin_user_pages_remote()")
* in linux-next, expected in v6.5-rc1 (2023-05-14)
* 0b295316b3a9 ("mm/gup: remove unused vmas parameter from
* pin_user_pages_remote()") in v6.5.
*
*/
@ -143,7 +147,7 @@ typedef int vm_fault_t;
/*
* get_user_pages_remote() was added by commit 1e9877902dc7
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6 (2016-02-12).
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6.
*
* Note that get_user_pages_remote() requires the caller to hold a reference on
* the task_struct (if non-NULL and if this API has tsk argument) and the mm_struct.
@ -153,19 +157,17 @@ typedef int vm_fault_t;
*
* get_user_pages_remote() write/force parameters were replaced
* with gup_flags by commit 9beae1ea8930 ("mm: replace get_user_pages_remote()
* write/force parameters with gup_flags") in v4.9 (2016-10-13).
* write/force parameters with gup_flags") in v4.9.
*
* get_user_pages_remote() added 'locked' parameter by commit 5b56d49fc31d
* ("mm: add locked parameter to get_user_pages_remote()") in
* v4.10 (2016-12-14).
* ("mm: add locked parameter to get_user_pages_remote()") in v4.10.
*
* get_user_pages_remote() removed 'tsk' parameter by
* commit 64019a2e467a ("mm/gup: remove task_struct pointer for
* all gup code") in v5.9-rc1 (2020-08-11).
* all gup code") in v5.9.
*
* Removed vmas parameter from get_user_pages_remote() by commit a4bde14d549
* ("mm/gup: remove vmas parameter from get_user_pages_remote()")
* in linux-next, expected in v6.5-rc1 (2023-05-14)
* Removed vmas parameter from get_user_pages_remote() by commit ca5e863233e8
* ("mm/gup: remove vmas parameter from get_user_pages_remote()") in v6.5.
*
*/

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -609,6 +609,15 @@ typedef enum
NV_POWER_STATE_RUNNING
} nv_power_state_t;
typedef struct
{
const char *vidmem_power_status;
const char *dynamic_power_status;
const char *gc6_support;
const char *gcoff_support;
const char *s0ix_status;
} nv_power_info_t;
#define NV_PRIMARY_VGA(nv) ((nv)->primary_vga)
#define NV_IS_CTL_DEVICE(nv) ((nv)->flags & NV_FLAG_CONTROL)
@ -778,7 +787,7 @@ nv_state_t* NV_API_CALL nv_get_ctl_state (void);
void NV_API_CALL nv_set_dma_address_size (nv_state_t *, NvU32 );
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU64, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvU64, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_free_pages (nv_state_t *, NvU32, NvBool, NvU32, void *);
@ -822,6 +831,7 @@ void NV_API_CALL nv_acpi_methods_init (NvU32 *);
void NV_API_CALL nv_acpi_methods_uninit (void);
NV_STATUS NV_API_CALL nv_acpi_method (NvU32, NvU32, NvU32, void *, NvU16, NvU32 *, void *, NvU16 *);
NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port (nv_state_t *, NvU8 *, NvU32, NvU32, NvU32 *);
NV_STATUS NV_API_CALL nv_acpi_dsm_method (nv_state_t *, NvU8 *, NvU32, NvBool, NvU32, void *, NvU16, NvU32 *, void *, NvU16 *);
NV_STATUS NV_API_CALL nv_acpi_ddc_method (nv_state_t *, void *, NvU32 *, NvBool);
NV_STATUS NV_API_CALL nv_acpi_dod_method (nv_state_t *, NvU32 *, NvU32 *);
@ -990,10 +1000,10 @@ NV_STATUS NV_API_CALL rm_p2p_init_mapping (nvidia_stack_t *, NvU64, NvU6
NV_STATUS NV_API_CALL rm_p2p_destroy_mapping (nvidia_stack_t *, NvU64);
NV_STATUS NV_API_CALL rm_p2p_get_pages (nvidia_stack_t *, NvU64, NvU32, NvU64, NvU64, NvU64 *, NvU32 *, NvU32 *, NvU32 *, NvU8 **, void *);
NV_STATUS NV_API_CALL rm_p2p_get_gpu_info (nvidia_stack_t *, NvU64, NvU64, NvU8 **, void **);
NV_STATUS NV_API_CALL rm_p2p_get_pages_persistent (nvidia_stack_t *, NvU64, NvU64, void **, NvU64 *, NvU32 *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_get_pages_persistent (nvidia_stack_t *, NvU64, NvU64, void **, NvU64 *, NvU32 *, void *, void *, void **);
NV_STATUS NV_API_CALL rm_p2p_register_callback (nvidia_stack_t *, NvU64, NvU64, NvU64, void *, void (*)(void *), void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages (nvidia_stack_t *, NvU64, NvU32, NvU64, void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_dma_map_pages (nvidia_stack_t *, nv_dma_device_t *, NvU8 *, NvU64, NvU32, NvU64 *, void **);
NV_STATUS NV_API_CALL rm_dma_buf_dup_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle, NvHandle, void *, NvHandle, NvU64, NvU64, NvHandle *, void **);
void NV_API_CALL rm_dma_buf_undup_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle);
@ -1027,9 +1037,7 @@ void NV_API_CALL rm_enable_dynamic_power_management(nvidia_stack_t *, nv_s
NV_STATUS NV_API_CALL rm_ref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
void NV_API_CALL rm_unref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
NV_STATUS NV_API_CALL rm_transition_dynamic_power(nvidia_stack_t *, nv_state_t *, NvBool, NvBool *);
const char* NV_API_CALL rm_get_vidmem_power_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_dynamic_power_management_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_gpu_gcx_support(nvidia_stack_t *, nv_state_t *, NvBool);
void NV_API_CALL rm_get_power_info(nvidia_stack_t *, nv_state_t *, nv_power_info_t *);
void NV_API_CALL rm_acpi_notify(nvidia_stack_t *, nv_state_t *, NvU32);
void NV_API_CALL rm_acpi_nvpcf_notify(nvidia_stack_t *);
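The three per-string getters above are folded into a single rm_get_power_info() call that fills the nv_power_info_t added earlier in this header. A minimal caller sketch, assuming a valid nvidia_stack_t/nv_state_t pair and that the returned strings are owned by RM, as they were for the old getters; the helper name is illustrative, not part of this commit:
/* Sketch: fetch every power status string with one RM call. */
static const char *sketch_vidmem_power_status(nvidia_stack_t *sp, nv_state_t *nv)
{
    nv_power_info_t power_info = { 0 };
    rm_get_power_info(sp, nv, &power_info);
    /* Each field points to a status string provided by RM, as the old */
    /* per-field getters did; the caller only reads it.                 */
    return power_info.vidmem_power_status;
}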
@ -1041,13 +1049,12 @@ NV_STATUS NV_API_CALL nv_vgpu_create_request(nvidia_stack_t *, nv_state_t *, c
NV_STATUS NV_API_CALL nv_vgpu_delete(nvidia_stack_t *, const NvU8 *, NvU16);
NV_STATUS NV_API_CALL nv_vgpu_get_type_ids(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 *, NvBool, NvU8, NvBool);
NV_STATUS NV_API_CALL nv_vgpu_get_type_info(nvidia_stack_t *, nv_state_t *, NvU32, char *, int, NvU8);
NV_STATUS NV_API_CALL nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU32, void *, NvBool *);
NV_STATUS NV_API_CALL nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *,
NvU64 *, NvU64 *, NvU32 *, NvBool *, NvU8 *);
NV_STATUS NV_API_CALL nv_vgpu_get_hbm_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU64 *);
NV_STATUS NV_API_CALL nv_vgpu_start(nvidia_stack_t *, const NvU8 *, void *, NvS32 *, NvU8 *, NvU32);
NV_STATUS NV_API_CALL nv_vgpu_get_sparse_mmap(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 **, NvU64 **, NvU32 *);
NV_STATUS NV_API_CALL nv_vgpu_process_vf_info(nvidia_stack_t *, nv_state_t *, NvU8, NvU32, NvU8, NvU8, NvU8, NvBool, void *);
NV_STATUS NV_API_CALL nv_vgpu_update_request(nvidia_stack_t *, const NvU8 *, NvU32, NvU64 *, NvU64 *, const char *);
NV_STATUS NV_API_CALL nv_gpu_bind_event(nvidia_stack_t *);
NV_STATUS NV_API_CALL nv_gpu_unbind_event(nvidia_stack_t *, NvU32, NvBool *);
NV_STATUS NV_API_CALL nv_get_usermap_access_params(nv_state_t*, nv_usermap_access_params_t*);
nv_soc_irq_type_t NV_API_CALL nv_get_current_irq_type(nv_state_t*);

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2013-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2013-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -1462,6 +1462,29 @@ NV_STATUS nvUvmInterfacePagingChannelPushStream(UvmGpuPagingChannelHandle channe
char *methodStream,
NvU32 methodStreamSize);
/*******************************************************************************
nvUvmInterfaceKeyRotationChannelDisable
This function notifies RM that the given channels are idle.
This function is called after RM has notified UVM that keys need to be rotated.
When called, RM will disable the channels, rotate their keys, and then re-enable
the channels.
Locking: This function acquires an API and GPU lock.
Memory : This function dynamically allocates memory.
Arguments:
channelList[IN] - An array of channel handles whose channels are idle.
channelListCount[IN] - Number of channels in channelList. Its value must be
greater than 0.
Error codes:
NV_ERR_INVALID_ARGUMENT - channelList is NULL or channelListCount is 0.
*/
NV_STATUS nvUvmInterfaceKeyRotationChannelDisable(uvmGpuChannelHandle channelList[],
NvU32 channelListCount);
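A minimal sketch of the call described above, assuming chanA and chanB are handles of channels the client has already drained; the helper name is hypothetical:
/* Sketch: report two idle channels so RM can rotate their keys and re-enable them. */
static NV_STATUS sketch_disable_for_key_rotation(uvmGpuChannelHandle chanA,
                                                 uvmGpuChannelHandle chanB)
{
    uvmGpuChannelHandle idleChannels[] = { chanA, chanB };

    /* NV_ERR_INVALID_ARGUMENT is returned if the list is NULL or the count is 0. */
    return nvUvmInterfaceKeyRotationChannelDisable(idleChannels, 2);
}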
/*******************************************************************************
Cryptography Services Library (CSL) Interface
*/
@ -1507,7 +1530,7 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);
/*******************************************************************************
nvUvmInterfaceCslUpdateContext
Updates a context after a key rotation event and can only be called once per
Updates contexts after a key rotation event and can only be called once per
key rotation event. Following a key rotation event, and before
nvUvmInterfaceCslUpdateContext is called, data encrypted by the GPU with the
previous key can be decrypted with nvUvmInterfaceCslDecrypt.
@ -1516,12 +1539,14 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);
Memory : This function does not dynamically allocate memory.
Arguments:
uvmCslContext[IN] - The CSL context associated with a channel.
contextList[IN/OUT] - An array of pointers to CSL contexts.
contextListCount[IN] - Number of CSL contexts in contextList. Its value
must be greater than 0.
Error codes:
NV_ERR_INVALID_ARGUMENT - The CSL context is not associated with a channel.
NV_ERR_INVALID_ARGUMENT - contextList is NULL or contextListCount is 0.
*/
NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *uvmCslContext);
NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *contextList[],
NvU32 contextListCount);
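A minimal sketch of the new list-based form, assuming ctxA and ctxB are CSL contexts that were previously initialized for their channels; names are illustrative:
/* Sketch: update two CSL contexts after a single key rotation event. */
static NV_STATUS sketch_update_csl_contexts(UvmCslContext *ctxA, UvmCslContext *ctxB)
{
    UvmCslContext *contexts[] = { ctxA, ctxB };

    /* NV_ERR_INVALID_ARGUMENT is returned if contextList is NULL or the count is 0. */
    return nvUvmInterfaceCslUpdateContext(contexts, 2);
}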
/*******************************************************************************
nvUvmInterfaceCslRotateIv
@ -1739,7 +1764,14 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
Checks and logs information about non-CSL encryptions, such as those that
originate from the GPU.
This function does not modify elements of the UvmCslContext.
For contexts associated with channels, this function does not modify elements of
the UvmCslContext and must be called for each external encryption invocation.
For the context associated with fault buffers, bufferSize can encompass multiple
encryption invocations, and the UvmCslContext will be updated following a key
rotation event.
In either case the IV remains unmodified after this function is called.
Locking: This function does not acquire an API or GPU lock.
Memory : This function does not dynamically allocate memory.
@ -1748,7 +1780,7 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
Arguments:
uvmCslContext[IN/OUT] - The CSL context.
bufferSize[OUT] - The size of the buffer encrypted by the
bufferSize[OUT] - The size of the buffer(s) encrypted by the
external entity in units of bytes.
Error codes:

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -39,12 +39,12 @@
// are multiple BIG page sizes in RM. These defines are used as flags to "0"
// should be OK when user is not sure which pagesize allocation it wants
//
#define UVM_PAGE_SIZE_DEFAULT 0x0
#define UVM_PAGE_SIZE_4K 0x1000
#define UVM_PAGE_SIZE_64K 0x10000
#define UVM_PAGE_SIZE_128K 0x20000
#define UVM_PAGE_SIZE_2M 0x200000
#define UVM_PAGE_SIZE_512M 0x20000000
#define UVM_PAGE_SIZE_DEFAULT 0x0ULL
#define UVM_PAGE_SIZE_4K 0x1000ULL
#define UVM_PAGE_SIZE_64K 0x10000ULL
#define UVM_PAGE_SIZE_128K 0x20000ULL
#define UVM_PAGE_SIZE_2M 0x200000ULL
#define UVM_PAGE_SIZE_512M 0x20000000ULL
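The ULL suffixes make these flags 64-bit, matching the NvU64 page_size parameters introduced elsewhere in this change. A small alignment helper sketched under that assumption; the helper itself is not part of this commit:
/* Sketch: check that a value is aligned to one of the UVM_PAGE_SIZE_* flags, */
/* e.g. sketch_uvm_is_aligned(base, UVM_PAGE_SIZE_2M).                         */
static NvBool sketch_uvm_is_aligned(NvU64 value, NvU64 page_size)
{
    return (value & (page_size - 1)) == 0;
}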
//
// When modifying flags, make sure they are compatible with the mirrored
@ -267,6 +267,7 @@ typedef struct UvmGpuChannelInfo_tag
// The errorNotifier is filled out when the channel hits an RC error.
NvNotification *errorNotifier;
NvNotification *keyRotationNotifier;
NvU32 hwRunlistId;
NvU32 hwChannelId;
@ -292,13 +293,13 @@ typedef struct UvmGpuChannelInfo_tag
// GPU VAs of both GPFIFO and GPPUT are needed in Confidential Computing
// so a channel can be controlled via another channel (SEC2 or WLC/LCIC)
NvU64 gpFifoGpuVa;
NvU64 gpPutGpuVa;
NvU64 gpGetGpuVa;
NvU64 gpFifoGpuVa;
NvU64 gpPutGpuVa;
NvU64 gpGetGpuVa;
// GPU VA of work submission offset is needed in Confidential Computing
// so CE channels can ring doorbell of other channels as required for
// WLC/LCIC work submission
NvU64 workSubmissionOffsetGpuVa;
NvU64 workSubmissionOffsetGpuVa;
} UvmGpuChannelInfo;
typedef enum
@ -1086,4 +1087,21 @@ typedef enum UvmCslOperation
UVM_CSL_OPERATION_DECRYPT
} UvmCslOperation;
typedef enum UVM_KEY_ROTATION_STATUS {
// Key rotation complete/not in progress
UVM_KEY_ROTATION_STATUS_IDLE = 0,
// RM is waiting for clients to report their channels are idle for key rotation
UVM_KEY_ROTATION_STATUS_PENDING = 1,
// Key rotation is in progress
UVM_KEY_ROTATION_STATUS_IN_PROGRESS = 2,
// Key rotation timeout failure, RM will RC non-idle channels.
// UVM should never see this status value.
UVM_KEY_ROTATION_STATUS_FAILED_TIMEOUT = 3,
// Key rotation failed because upper threshold was crossed, RM will RC non-idle channels
UVM_KEY_ROTATION_STATUS_FAILED_THRESHOLD = 4,
// Internal RM failure while rotating keys for a certain channel, RM will RC the channel.
UVM_KEY_ROTATION_STATUS_FAILED_ROTATION = 5,
UVM_KEY_ROTATION_STATUS_MAX_COUNT = 6,
} UVM_KEY_ROTATION_STATUS;
#endif // _NV_UVM_TYPES_H_
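The comments above state which of these values a UVM client should ever observe. A hedged sketch of how a caller might branch on them; the helper name is illustrative, not from this commit:
/* Sketch: react to a key rotation status reported by RM. */
static void sketch_handle_key_rotation(UVM_KEY_ROTATION_STATUS status)
{
    switch (status) {
        case UVM_KEY_ROTATION_STATUS_IDLE:
            break;                      /* nothing to do */
        case UVM_KEY_ROTATION_STATUS_PENDING:
            /* RM is waiting for this client to report idle channels, */
            /* e.g. via nvUvmInterfaceKeyRotationChannelDisable().    */
            break;
        case UVM_KEY_ROTATION_STATUS_FAILED_TIMEOUT:
            /* Per the comment above, UVM should never see this value. */
            break;
        default:
            break;
    }
}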

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -494,6 +494,23 @@ do \
//
#define NV_TWO_N_MINUS_ONE(n) (((1ULL<<(n/2))<<((n+1)/2))-1)
//
// Create a 64b bitmask with n bits set
// This is the same as ((1ULL<<n) - 1), but it doesn't overflow for n=64
//
// ...
// n=-1, 0x0000000000000000
// n=0, 0x0000000000000000
// n=1, 0x0000000000000001
// ...
// n=63, 0x7FFFFFFFFFFFFFFF
// n=64, 0xFFFFFFFFFFFFFFFF
// n=65, 0xFFFFFFFFFFFFFFFF
// n=66, 0xFFFFFFFFFFFFFFFF
// ...
//
#define NV_BITMASK64(n) ((n<1) ? 0ULL : (NV_U64_MAX>>((n>64) ? 0 : (64-n))))
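A few spot checks of the table above, written as a small self-checking helper; the helper is illustrative only and not part of this commit:
/* Sketch: NV_BITMASK64(n) clamps at both ends instead of overflowing. */
static NvBool nvBitmask64SketchChecks(void)
{
    return (NV_BITMASK64(0)  == 0x0000000000000000ULL) &&
           (NV_BITMASK64(1)  == 0x0000000000000001ULL) &&
           (NV_BITMASK64(63) == 0x7FFFFFFFFFFFFFFFULL) &&
           (NV_BITMASK64(64) == 0xFFFFFFFFFFFFFFFFULL) &&
           (NV_BITMASK64(65) == 0xFFFFFFFFFFFFFFFFULL);
}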
#define DRF_READ_1WORD_BS(d,r,f,v) \
((DRF_EXTENT_MW(NV##d##r##f)<8)?DRF_READ_1BYTE_BS(NV##d##r##f,(v)): \
((DRF_EXTENT_MW(NV##d##r##f)<16)?DRF_READ_2BYTE_BS(NV##d##r##f,(v)): \
@ -574,6 +591,13 @@ nvMaskPos32(const NvU32 mask, const NvU32 bitIdx)
n32 = BIT_IDX_32(LOWESTBIT(n32));\
}
// Destructive operation on n64
#define LOWESTBITIDX_64(n64) \
{ \
n64 = BIT_IDX_64(LOWESTBIT(n64));\
}
// Destructive operation on n32
#define HIGHESTBITIDX_32(n32) \
{ \
@ -918,6 +942,11 @@ static NV_FORCEINLINE void *NV_NVUPTR_TO_PTR(NvUPtr address)
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_SET_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) |= NVBIT64(b); else (hi) |= NVBIT64( b & 0x3F ); }
//
// Clear the bit at pos (b) for U64 which is < 128.
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_CLEAR_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) &= ~NVBIT64(b); else (hi) &= ~NVBIT64( b & 0x3F ); }
// Get the number of elements in the specified fixed-size array
#define NV_ARRAY_ELEMENTS(x) ((sizeof(x)/sizeof((x)[0])))
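A short sketch, as it might appear in a caller of this header, of the (lo, hi) split convention shared by NV_BIT_SET_128 and the new NV_BIT_CLEAR_128 above; the function is illustrative only:
/* Sketch: toggle bit 70 of a 128-bit value kept as two NvU64 halves. */
static void nvBit128SketchExample(void)
{
    NvU64 lo = 0, hi = 0;

    NV_BIT_SET_128(70, lo, hi);   /* 70 >= 64, so bit (70 & 0x3F) == 6 of hi is set */
    NV_BIT_CLEAR_128(70, lo, hi); /* clears that same bit; lo and hi are zero again */
}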

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -152,6 +152,7 @@ NV_STATUS_CODE(NV_ERR_FABRIC_MANAGER_NOT_PRESENT, 0x0000007A, "Fabric Manag
NV_STATUS_CODE(NV_ERR_ALREADY_SIGNALLED, 0x0000007B, "Semaphore Surface value already >= requested wait value")
NV_STATUS_CODE(NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE, 0x0000007C, "PMU RPC error due to no queue slot available for this event")
NV_STATUS_CODE(NV_ERR_KEY_ROTATION_IN_PROGRESS, 0x0000007D, "Operation not allowed as key rotation is in progress")
NV_STATUS_CODE(NV_ERR_TEST_ONLY_CODE_NOT_ENABLED, 0x0000007E, "Test-only code path not enabled")
// Warnings:
NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch")

View File

@ -152,6 +152,12 @@ typedef signed short NvS16; /* -32768 to 32767 */
(((NvU32)(c) & 0xff) << 8) | \
(((NvU32)(d) & 0xff))))
// Macro to build an NvU64 from two DWORDS, listed from msb to lsb
#define NvU64_BUILD(a, b) \
((NvU64)( \
(((NvU64)(a) & ~0U) << 32) | \
(((NvU64)(b) & ~0U))))
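For example, a one-line check of the msb-to-lsb ordering documented above; the helper name is illustrative:
/* Sketch: the first argument lands in the upper 32 bits, the second in the lower. */
static inline NvU64 nvU64BuildSketch(void)
{
    return NvU64_BUILD(0x12345678U, 0x9ABCDEF0U); /* == 0x123456789ABCDEF0ULL */
}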
#if NVTYPES_USE_STDINT
typedef uint32_t NvV32; /* "void": enumerated or multiple fields */
typedef uint32_t NvU32; /* 0 to 4294967295 */

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -101,9 +101,10 @@ NV_STATUS NV_API_CALL rm_gpu_ops_paging_channels_map(nvidia_stack_t *, nvgpuAdd
void NV_API_CALL rm_gpu_ops_paging_channels_unmap(nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvU64, nvgpuDeviceHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_paging_channel_push_stream(nvidia_stack_t *, nvgpuPagingChannelHandle_t, char *, NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_key_rotation_channel_disable(nvidia_stack_t *, nvgpuChannelHandle_t [], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_init(nvidia_stack_t *, struct ccslContext_t **, nvgpuChannelHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_clear(nvidia_stack_t *, struct ccslContext_t *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_update(nvidia_stack_t *, struct ccslContext_t *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_update(nvidia_stack_t *, UvmCslContext *[], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_rotate_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_encrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_encrypt_with_iv(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8*, NvU8 *, NvU8 *);

View File

@ -1416,6 +1416,42 @@ compile_test() {
compile_check_conftest "$CODE" "NV_VFIO_REGISTER_EMULATED_IOMMU_DEV_PRESENT" "" "functions"
;;
bus_type_has_iommu_ops)
#
# Determine if 'bus_type' structure has a 'iommu_ops' field.
#
# This field was removed by commit 17de3f5fdd35 (iommu: Retire bus ops)
# in v6.8
#
CODE="
#include <linux/device.h>
int conftest_bus_type_has_iommu_ops(void) {
return offsetof(struct bus_type, iommu_ops);
}"
compile_check_conftest "$CODE" "NV_BUS_TYPE_HAS_IOMMU_OPS" "" "types"
;;
eventfd_signal_has_counter_arg)
#
# Determine if eventfd_signal() function has an additional 'counter' argument.
#
# This argument was removed by commit 3652117f8548 (eventfd: simplify
# eventfd_signal()) in v6.8
#
CODE="
#include <linux/eventfd.h>
void conftest_eventfd_signal_has_counter_arg(void) {
struct eventfd_ctx *ctx;
eventfd_signal(ctx, 1);
}"
compile_check_conftest "$CODE" "NV_EVENTFD_SIGNAL_HAS_COUNTER_ARG" "" "types"
;;
drm_available)
# Determine if the DRM subsystem is usable
CODE="
@ -5520,7 +5556,8 @@ compile_test() {
of_dma_configure)
#
# Determine if of_dma_configure() function is present
# Determine if of_dma_configure() function is present, and how
# many arguments it takes.
#
# Added by commit 591c1ee465ce ("of: configure the platform
# device dma parameters") in v3.16. However, it was a static,
@ -5530,17 +5567,69 @@ compile_test() {
# commit 1f5c69aa51f9 ("of: Move of_dma_configure() to device.c
# to help re-use") in v4.1.
#
CODE="
# It subsequently began taking a third parameter with commit
# 3d6ce86ee794 ("drivers: remove force dma flag from buses")
# in v4.18.
#
echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif
void conftest_of_dma_configure(void)
{
of_dma_configure();
}
"
" > conftest$$.c
compile_check_conftest "$CODE" "NV_OF_DMA_CONFIGURE_PRESENT" "" "functions"
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c
if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#undef NV_OF_DMA_CONFIGURE_PRESENT" | append_conftest "functions"
echo "#undef NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT" | append_conftest "functions"
else
echo "#define NV_OF_DMA_CONFIGURE_PRESENT" | append_conftest "functions"
echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif
void conftest_of_dma_configure(void) {
of_dma_configure(NULL, NULL, false);
}" > conftest$$.c
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c
if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#define NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT 3" | append_conftest "functions"
return
fi
echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif
void conftest_of_dma_configure(void) {
of_dma_configure(NULL, NULL);
}" > conftest$$.c
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c
if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#define NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT 2" | append_conftest "functions"
return
fi
fi
;;
icc_get)
@ -6761,12 +6850,45 @@ compile_test() {
compile_check_conftest "$CODE" "NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG" "" "types"
;;
drm_syncobj_features_present)
# Determine if DRIVER_SYNCOBJ and DRIVER_SYNCOBJ_TIMELINE DRM
# driver features are present. Timeline DRM synchronization objects
# may only be used if both of these are supported by the driver.
#
# DRIVER_SYNCOBJ_TIMELINE Added by commit 060cebb20cdb ("drm:
# introduce a capability flag for syncobj timeline support") in
# v5.2
#
# DRIVER_SYNCOBJ Added by commit e9083420bbac ("drm: introduce
# sync objects (v4)") in v4.12
CODE="
#if defined(NV_DRM_DRM_DRV_H_PRESENT)
#include <drm/drm_drv.h>
#endif
int features = DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE;"
compile_check_conftest "$CODE" "NV_DRM_SYNCOBJ_FEATURES_PRESENT" "" "types"
;;
stack_trace)
# Determine if functions stack_trace_{save,print} are present.
# Added by commit e9b98e162 ("stacktrace: Provide helpers for
# common stack trace operations") in v5.2.
CODE="
#include <linux/stacktrace.h>
void conftest_stack_trace(void) {
stack_trace_save();
stack_trace_print();
}"
compile_check_conftest "$CODE" "NV_STACK_TRACE_PRESENT" "" "functions"
;;
drm_unlocked_ioctl_flag_present)
# Determine if DRM_UNLOCKED IOCTL flag is present.
#
# DRM_UNLOCKED was removed by commit 2798ffcc1d6a ("drm: Remove
# locking for legacy ioctls and DRM_UNLOCKED") in Linux
# next-20231208.
# locking for legacy ioctls and DRM_UNLOCKED") in v6.8.
#
# DRM_UNLOCKED definition was moved from drmP.h to drm_ioctl.h by
# commit 2640981f3600 ("drm: document drm_ioctl.[hc]") in v4.12.

View File

@ -52,6 +52,7 @@ NV_HEADER_PRESENCE_TESTS = \
linux/dma-resv.h \
soc/tegra/chip-id.h \
soc/tegra/fuse.h \
soc/tegra/fuse-helper.h \
soc/tegra/tegra_bpmp.h \
video/nv_internal.h \
linux/platform/tegra/dce/dce-client-ipc.h \

View File

@ -176,12 +176,10 @@ cursor_plane_req_config_update(struct drm_plane *plane,
return;
}
*req_config = (struct NvKmsKapiCursorRequestedConfig) {
.surface = to_nv_framebuffer(plane_state->fb)->pSurface,
.dstX = plane_state->crtc_x,
.dstY = plane_state->crtc_y,
};
memset(req_config, 0, sizeof(*req_config));
req_config->surface = to_nv_framebuffer(plane_state->fb)->pSurface;
req_config->dstX = plane_state->crtc_x;
req_config->dstY = plane_state->crtc_y;
#if defined(NV_DRM_ALPHA_BLENDING_AVAILABLE)
if (plane->blend_mode_property != NULL && plane->alpha_property != NULL) {
@ -275,24 +273,22 @@ plane_req_config_update(struct drm_plane *plane,
return 0;
}
*req_config = (struct NvKmsKapiLayerRequestedConfig) {
.config = {
.surface = to_nv_framebuffer(plane_state->fb)->pSurface,
memset(req_config, 0, sizeof(*req_config));
/* Source values are 16.16 fixed point */
.srcX = plane_state->src_x >> 16,
.srcY = plane_state->src_y >> 16,
.srcWidth = plane_state->src_w >> 16,
.srcHeight = plane_state->src_h >> 16,
req_config->config.surface = to_nv_framebuffer(plane_state->fb)->pSurface;
.dstX = plane_state->crtc_x,
.dstY = plane_state->crtc_y,
.dstWidth = plane_state->crtc_w,
.dstHeight = plane_state->crtc_h,
/* Source values are 16.16 fixed point */
req_config->config.srcX = plane_state->src_x >> 16;
req_config->config.srcY = plane_state->src_y >> 16;
req_config->config.srcWidth = plane_state->src_w >> 16;
req_config->config.srcHeight = plane_state->src_h >> 16;
.csc = old_config.csc
},
};
req_config->config.dstX = plane_state->crtc_x;
req_config->config.dstY = plane_state->crtc_y;
req_config->config.dstWidth = plane_state->crtc_w;
req_config->config.dstHeight = plane_state->crtc_h;
req_config->config.csc = old_config.csc;
#if defined(NV_DRM_ROTATION_AVAILABLE)
/*
@ -688,9 +684,7 @@ static int nv_drm_plane_atomic_set_property(
to_nv_drm_plane_state(state);
if (property == nv_dev->nv_out_fence_property) {
#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
nv_drm_plane_state->fd_user_ptr = u64_to_user_ptr(val);
#endif
nv_drm_plane_state->fd_user_ptr = (void __user *)(uintptr_t)(val);
return 0;
} else if (property == nv_dev->nv_input_colorspace_property) {
nv_drm_plane_state->input_colorspace = val;
@ -875,14 +869,12 @@ static inline void nv_drm_crtc_duplicate_req_head_modeset_config(
* there is no change in new configuration yet with respect
* to older one!
*/
*new = (struct NvKmsKapiHeadRequestedConfig) {
.modeSetConfig = old->modeSetConfig,
};
memset(new, 0, sizeof(*new));
new->modeSetConfig = old->modeSetConfig;
for (i = 0; i < ARRAY_SIZE(old->layerRequestedConfig); i++) {
new->layerRequestedConfig[i] = (struct NvKmsKapiLayerRequestedConfig) {
.config = old->layerRequestedConfig[i].config,
};
new->layerRequestedConfig[i].config =
old->layerRequestedConfig[i].config;
}
}

View File

@ -373,19 +373,15 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
len++;
}
#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
if (!nv_dev->supportsSyncpts) {
return 0;
if (nv_dev->supportsSyncpts) {
nv_dev->nv_out_fence_property =
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_ATOMIC,
"NV_DRM_OUT_FENCE_PTR", 0, U64_MAX);
if (nv_dev->nv_out_fence_property == NULL) {
return -ENOMEM;
}
}
nv_dev->nv_out_fence_property =
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_ATOMIC,
"NV_DRM_OUT_FENCE_PTR", 0, U64_MAX);
if (nv_dev->nv_out_fence_property == NULL) {
return -ENOMEM;
}
#endif
nv_dev->nv_input_colorspace_property =
drm_property_create_enum(nv_dev->dev, 0, "NV_INPUT_COLORSPACE",
enum_list, len);
@ -480,6 +476,22 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags)
return -ENODEV;
}
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
/*
* If fbdev is enabled, take modeset ownership now before other DRM clients
* can take master (and thus NVKMS ownership).
*/
if (nv_drm_fbdev_module_param) {
if (!nvKms->grabOwnership(pDevice)) {
nvKms->freeDevice(pDevice);
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
return -EBUSY;
}
nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif
mutex_lock(&nv_dev->lock);
/* Set NvKmsKapiDevice */
@ -590,6 +602,15 @@ static void __nv_drm_unload(struct drm_device *dev)
return;
}
/* Release modeset ownership if fbdev is enabled */
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (nv_dev->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(nv_dev->pDevice);
}
#endif
cancel_delayed_work_sync(&nv_dev->hotplug_event_work);
mutex_lock(&nv_dev->lock);
@ -781,6 +802,14 @@ static int nv_drm_get_dev_info_ioctl(struct drm_device *dev,
return 0;
}
static int nv_drm_get_drm_file_unique_id_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
struct drm_nvidia_get_drm_file_unique_id_params *params = data;
params->id = (u64)(filep->driver_priv);
return 0;
}
static int nv_drm_dmabuf_supported_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
@ -1279,6 +1308,17 @@ static void nv_drm_postclose(struct drm_device *dev, struct drm_file *filep)
}
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */
static int nv_drm_open(struct drm_device *dev, struct drm_file *filep)
{
_Static_assert(sizeof(filep->driver_priv) >= sizeof(u64),
"filep->driver_priv can not hold an u64");
static atomic64_t id = ATOMIC_INIT(0);
filep->driver_priv = (void *)atomic64_inc_return(&id);
return 0;
}
#if defined(NV_DRM_MASTER_HAS_LEASES)
static struct drm_master *nv_drm_find_lessee(struct drm_master *master,
int lessee_id)
@ -1522,6 +1562,9 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DEV_INFO,
nv_drm_get_dev_info_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DRM_FILE_UNIQUE_ID,
nv_drm_get_drm_file_unique_id_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
#if defined(NV_DRM_FENCE_AVAILABLE)
DRM_IOCTL_DEF_DRV(NVIDIA_FENCE_SUPPORTED,
@ -1604,6 +1647,9 @@ static struct drm_driver nv_drm_driver = {
.driver_features =
#if defined(NV_DRM_DRIVER_PRIME_FLAG_PRESENT)
DRIVER_PRIME |
#endif
#if defined(NV_DRM_SYNCOBJ_FEATURES_PRESENT)
DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE |
#endif
DRIVER_GEM | DRIVER_RENDER,
@ -1615,14 +1661,14 @@ static struct drm_driver nv_drm_driver = {
.num_ioctls = ARRAY_SIZE(nv_drm_ioctls),
/*
* linux-next commit 71a7974ac701 ("drm/prime: Unexport helpers for fd/handle
* conversion") unexports drm_gem_prime_handle_to_fd() and
* Linux kernel v6.6 commit 71a7974ac701 ("drm/prime: Unexport helpers
* for fd/handle conversion") unexports drm_gem_prime_handle_to_fd() and
* drm_gem_prime_fd_to_handle().
*
* Prior linux-next commit 6b85aa68d9d5 ("drm: Enable PRIME import/export for
* all drivers") made these helpers the default when .prime_handle_to_fd /
* .prime_fd_to_handle are unspecified, so it's fine to just skip specifying
* them if the helpers aren't present.
* Prior Linux kernel v6.6 commit 6b85aa68d9d5 ("drm: Enable PRIME
* import/export for all drivers") made these helpers the default when
* .prime_handle_to_fd / .prime_fd_to_handle are unspecified, so it's fine
* to just skip specifying them if the helpers aren't present.
*/
#if NV_IS_EXPORT_SYMBOL_PRESENT_drm_gem_prime_handle_to_fd
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
@ -1656,6 +1702,7 @@ static struct drm_driver nv_drm_driver = {
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
.postclose = nv_drm_postclose,
#endif
.open = nv_drm_open,
.fops = &nv_drm_fops,
@ -1714,6 +1761,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
struct nv_drm_device *nv_dev = NULL;
struct drm_device *dev = NULL;
struct device *device = gpu_info->os_device_ptr;
bool bus_is_pci;
DRM_DEBUG(
"Registering device for NVIDIA GPU ID 0x08%x",
@ -1747,7 +1795,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
dev->dev_private = nv_dev;
nv_dev->dev = dev;
bool bus_is_pci =
bus_is_pci =
#if defined(NV_LINUX)
device->bus == &pci_bus_type;
#elif defined(NV_BSD)
@ -1771,11 +1819,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
if (nv_drm_fbdev_module_param &&
drm_core_check_feature(dev, DRIVER_MODESET)) {
if (!nvKms->grabOwnership(nv_dev->pDevice)) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
goto failed_grab_ownership;
}
if (bus_is_pci) {
struct pci_dev *pdev = to_pci_dev(device);
@ -1786,8 +1829,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
#endif
}
drm_fbdev_generic_setup(dev, 32);
nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */
@ -1798,12 +1839,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
return; /* Success */
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
failed_grab_ownership:
drm_dev_unregister(dev);
#endif
failed_drm_register:
nv_drm_dev_free(dev);
@ -1870,12 +1905,6 @@ void nv_drm_remove_devices(void)
struct nv_drm_device *next = dev_list->next;
struct drm_device *dev = dev_list->dev;
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (dev_list->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(dev_list->pDevice);
}
#endif
drm_dev_unregister(dev);
nv_drm_dev_free(dev);

View File

@ -293,14 +293,12 @@ __nv_drm_prime_fence_context_new(
* to check a return value.
*/
*nv_prime_fence_context = (struct nv_drm_prime_fence_context) {
.base.ops = &nv_drm_prime_fence_context_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pLinearAddress = pLinearAddress,
};
nv_prime_fence_context->base.ops = &nv_drm_prime_fence_context_ops;
nv_prime_fence_context->base.nv_dev = nv_dev;
nv_prime_fence_context->base.context = nv_dma_fence_context_alloc(1);
nv_prime_fence_context->base.fenceSemIndex = p->index;
nv_prime_fence_context->pSemSurface = pSemSurface;
nv_prime_fence_context->pLinearAddress = pLinearAddress;
INIT_LIST_HEAD(&nv_prime_fence_context->pending);
@ -1261,18 +1259,16 @@ __nv_drm_semsurf_fence_ctx_new(
* to check a return value.
*/
*ctx = (struct nv_drm_semsurf_fence_ctx) {
.base.ops = &nv_drm_semsurf_fence_ctx_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pSemMapping.pVoid = semMapping,
.pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping,
.callback.local = NULL,
.callback.nvKms = NULL,
.current_wait_value = 0,
};
ctx->base.ops = &nv_drm_semsurf_fence_ctx_ops;
ctx->base.nv_dev = nv_dev;
ctx->base.context = nv_dma_fence_context_alloc(1);
ctx->base.fenceSemIndex = p->index;
ctx->pSemSurface = pSemSurface;
ctx->pSemMapping.pVoid = semMapping;
ctx->pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping;
ctx->callback.local = NULL;
ctx->callback.nvKms = NULL;
ctx->current_wait_value = 0;
spin_lock_init(&ctx->lock);
INIT_LIST_HEAD(&ctx->pending_fences);

View File

@ -551,14 +551,12 @@ static struct drm_gem_object *__nv_drm_gem_nvkms_prime_dup(
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
const struct nv_drm_device *nv_dev_src;
const struct nv_drm_gem_nvkms_memory *nv_nvkms_memory_src;
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory;
struct NvKmsKapiMemory *pMemory;
BUG_ON(nv_gem_src == NULL || nv_gem_src->ops != &nv_gem_nvkms_memory_ops);
nv_dev_src = to_nv_device(nv_gem_src->base.dev);
nv_nvkms_memory_src = to_nv_nvkms_memory_const(nv_gem_src);
if ((nv_nvkms_memory =
nv_drm_calloc(1, sizeof(*nv_nvkms_memory))) == NULL) {

View File

@ -45,8 +45,7 @@
/*
* The inclusion of drm_framebuffer.h was removed from drm_crtc.h by commit
* 720cf96d8fecde29b72e1101f8a567a0ce99594f ("drm: Drop drm_framebuffer.h from
* drm_crtc.h") in linux-next, expected in v5.19-rc7.
* 720cf96d8fec ("drm: Drop drm_framebuffer.h from drm_crtc.h") in v6.0.
*
* We only need drm_framebuffer.h for drm_framebuffer_put(), and it is always
* present (v4.9+) when drm_framebuffer_{put,get}() is present (v4.12+), so it

View File

@ -613,8 +613,8 @@ static inline int nv_drm_format_num_planes(uint32_t format)
#endif /* defined(NV_DRM_FORMAT_MODIFIERS_PRESENT) */
/*
* DRM_UNLOCKED was removed with linux-next commit 2798ffcc1d6a ("drm: Remove
* locking for legacy ioctls and DRM_UNLOCKED"), but it was previously made
* DRM_UNLOCKED was removed with commit 2798ffcc1d6a ("drm: Remove locking for
* legacy ioctls and DRM_UNLOCKED") in v6.8, but it was previously made
* implicit for all non-legacy DRM driver IOCTLs since Linux v4.10 commit
* fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions" (Linux v4.4
* commit ea487835e887 "drm: Enforce unlocked ioctl operation for kms driver

View File

@ -52,6 +52,7 @@
#define DRM_NVIDIA_SEMSURF_FENCE_CREATE 0x15
#define DRM_NVIDIA_SEMSURF_FENCE_WAIT 0x16
#define DRM_NVIDIA_SEMSURF_FENCE_ATTACH 0x17
#define DRM_NVIDIA_GET_DRM_FILE_UNIQUE_ID 0x18
#define DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_IMPORT_NVKMS_MEMORY), \
@ -157,6 +158,11 @@
DRM_NVIDIA_SEMSURF_FENCE_ATTACH), \
struct drm_nvidia_semsurf_fence_attach_params)
#define DRM_IOCTL_NVIDIA_GET_DRM_FILE_UNIQUE_ID \
DRM_IOWR((DRM_COMMAND_BASE + \
DRM_NVIDIA_GET_DRM_FILE_UNIQUE_ID), \
struct drm_nvidia_get_drm_file_unique_id_params)
struct drm_nvidia_gem_import_nvkms_memory_params {
uint64_t mem_size; /* IN */
@ -385,4 +391,8 @@ struct drm_nvidia_semsurf_fence_attach_params {
uint64_t wait_value; /* IN Semaphore value to reach before signal */
};
struct drm_nvidia_get_drm_file_unique_id_params {
uint64_t id; /* OUT Unique ID of the DRM file */
};
#endif /* _UAPI_NVIDIA_DRM_IOCTL_H_ */
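A hedged user-space sketch of the new ioctl, assuming a file descriptor opened on the nvidia-drm node and libdrm's drmIoctl(); the included header name and helper are illustrative:
/* Sketch: ask nvidia-drm for the unique ID of this DRM file. */
#include <stdint.h>
#include <xf86drm.h>
#include "nvidia-drm-ioctl.h"

static uint64_t sketch_get_drm_file_unique_id(int drm_fd)
{
    struct drm_nvidia_get_drm_file_unique_id_params params = { 0 };

    if (drmIoctl(drm_fd, DRM_IOCTL_NVIDIA_GET_DRM_FILE_UNIQUE_ID, &params) != 0) {
        return 0; /* ioctl failed or is not supported by this driver version */
    }
    return params.id; /* OUT: per-file ID assigned when the DRM file was opened */
}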

View File

@ -587,6 +587,9 @@ int nv_drm_atomic_commit(struct drm_device *dev,
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Flip event timeout on head %u", nv_crtc->head);
while (!list_empty(&nv_crtc->flip_list)) {
__nv_drm_handle_flip_event(nv_crtc);
}
}
}
}

View File

@ -128,4 +128,5 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
NV_CONFTEST_TYPE_COMPILE_TESTS += fence_ops_use_64bit_seqno
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers_has_driver_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_syncobj_features_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present

View File

@ -77,10 +77,10 @@ module_param_named(disable_hdmi_frl, disable_hdmi_frl, bool, 0400);
static bool disable_vrr_memclk_switch = false;
module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);
static bool hdmi_deepcolor = false;
static bool hdmi_deepcolor = true;
module_param_named(hdmi_deepcolor, hdmi_deepcolor, bool, 0400);
static bool vblank_sem_control = false;
static bool vblank_sem_control = true;
module_param_named(vblank_sem_control, vblank_sem_control, bool, 0400);
static bool opportunistic_display_sync = true;
@ -139,6 +139,20 @@ NvBool nvkms_opportunistic_display_sync(void)
return opportunistic_display_sync;
}
NvBool nvkms_kernel_supports_syncpts(void)
{
/*
* Note this only checks that the kernel has the prerequisite
* support for syncpts; callers must also check that the hardware
* supports syncpts.
*/
#if (defined(CONFIG_TEGRA_GRHOST) || defined(NV_LINUX_HOST1X_NEXT_H_PRESENT))
return NV_TRUE;
#else
return NV_FALSE;
#endif
}
#define NVKMS_SYNCPT_STUBS_NEEDED
/*************************************************************************
@ -1234,6 +1248,26 @@ void nvkms_close_from_kapi(struct nvkms_per_open *popen)
nvkms_close_pm_unlocked(popen);
}
NvBool nvkms_ioctl_from_kapi_try_pmlock
(
struct nvkms_per_open *popen,
NvU32 cmd, void *params_address, const size_t param_size
)
{
NvBool ret;
if (nvkms_read_trylock_pm_lock()) {
return NV_FALSE;
}
ret = nvkms_ioctl_common(popen,
cmd,
(NvU64)(NvUPtr)params_address, param_size) == 0;
nvkms_read_unlock_pm_lock();
return ret;
}
NvBool nvkms_ioctl_from_kapi
(
struct nvkms_per_open *popen,

View File

@ -304,6 +304,11 @@ NvU32 nvkms_enumerate_gpus(nv_gpu_info_t *gpu_info);
NvBool nvkms_allow_write_combining(void);
/*!
* Check if OS supports syncpoints.
*/
NvBool nvkms_kernel_supports_syncpts(void);
/*!
* Checks whether the fd is associated with an nvidia character device.
*/
@ -328,6 +333,16 @@ NvBool nvkms_ioctl_from_kapi
NvU32 cmd, void *params_address, const size_t params_size
);
/*!
* Like nvkms_ioctl_from_kapi, but return NV_FALSE instead of waiting if the
* power management read lock cannot be acquired.
*/
NvBool nvkms_ioctl_from_kapi_try_pmlock
(
struct nvkms_per_open *popen,
NvU32 cmd, void *params_address, const size_t params_size
);
/*!
* APIs for locking.
*/
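A minimal sketch of the non-blocking variant documented above, from a KAPI caller's point of view; the wrapper name is illustrative:
/* Sketch: issue a modeset call without sleeping on the PM lock. */
static NvBool sketch_try_modeset_call(struct nvkms_per_open *popen,
                                      NvU32 cmd, void *params, size_t params_size)
{
    if (!nvkms_ioctl_from_kapi_try_pmlock(popen, cmd, params, params_size)) {
        /* Either the ioctl failed or the PM read lock was held (e.g. during */
        /* suspend); a caller that must not block simply reports failure.    */
        return NV_FALSE;
    }
    return NV_TRUE;
}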

View File

@ -105,3 +105,4 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_real_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += acpi_video_backlight_use_native
NV_CONFTEST_FUNCTION_COMPILE_TESTS += kernel_read_has_pointer_pos_arg

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2023 NVIDIA Corporation
Copyright (c) 2013-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -3463,8 +3463,7 @@ NV_STATUS UvmToolsDestroySession(UvmToolsSessionHandle session);
//
#if UVM_API_REV_IS_AT_MOST(10)
// This is deprecated and replaced by sizeof(UvmToolsEventControlData_V1) or
// sizeof(UvmToolsEventControlData_V2).
// This is deprecated and replaced by sizeof(UvmToolsEventControlData).
NvLength UvmToolsGetEventControlSize(void);
// This is deprecated and replaced by sizeof(UvmEventEntry_V1) or
@ -3488,8 +3487,6 @@ NvLength UvmToolsGetNumberOfCounters(void);
// version: (INPUT)
// Requested version for events or counters.
// See UvmEventEntry_V1 and UvmEventEntry_V2.
// UvmToolsEventControlData_V2::version records the entry version that
// will be generated.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
@ -3502,8 +3499,7 @@ NvLength UvmToolsGetNumberOfCounters(void);
//
// event_control (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold UvmToolsEventControlData_V1 if version is UvmEventEntry_V1 or
// UvmToolsEventControlData_V2 (although single page-size allocation
// hold UvmToolsEventControlData (although single page-size allocation
// should be more than enough). Gets pinned until queue is destroyed.
//
// queue: (OUTPUT)

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2023 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -205,17 +205,18 @@ void uvm_hal_ampere_host_clear_faulted_channel_sw_method(uvm_push_t *push,
CLEAR_FAULTED_B, HWVALUE(C076, CLEAR_FAULTED_B, INST_HI, instance_ptr_hi));
}
// Copy from Pascal, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar)
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
@ -230,8 +231,8 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
pdb_lo = pdb.address & HWMASK(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
// PDE3 is the highest level on Pascal, see the comment in uvm_pascal_mmu.c
// for details.
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
@ -242,7 +243,12 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
@ -255,16 +261,18 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
MEM_OP_D, HWCONST(C56F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
// Copy from Volta, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
@ -272,6 +280,7 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 va_lo;
NvU32 va_hi;
NvU64 end;
@ -281,9 +290,9 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
// The invalidation size must be a power-of-two number of pages containing
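The comment above refers to the requirement that the invalidation cover a power-of-two number of pages containing the target range. One common way to satisfy such a constraint is to note that a self-aligned block of size 2^k contains [base, base + size) exactly when base and base + size - 1 agree in all bits at or above k, so the smallest usable k is the bit length of base ^ (base + size - 1), clamped to at least log2(page_size). The standalone sketch below illustrates that arithmetic only; it is not the driver's exact computation, and it uses GCC/Clang builtins.

#include <stdint.h>
#include <stdio.h>

/* Smallest k such that one 2^k-aligned, 2^k-sized block covers
 * [base, base + size), with 2^k no smaller than page_size.
 * Assumes size > 0 and page_size is a power of two. */
static unsigned covering_block_log2(uint64_t base, uint64_t size, uint64_t page_size)
{
    uint64_t last = base + size - 1;
    uint64_t diff = base ^ last;
    unsigned k = diff ? 64 - __builtin_clzll(diff) : 0;
    unsigned page_shift = __builtin_ctzll(page_size);

    return k > page_shift ? k : page_shift;
}

int main(void)
{
    /* 8KiB starting at 0x7000 with 4KiB pages needs a 2^16 (64KiB) covering
     * block: the range straddles the 0x8000 boundary, so no smaller
     * self-aligned block contains both ends. */
    printf("log2 = %u\n", covering_block_log2(0x7000, 0x2000, 0x1000));
    return 0;
}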
@ -325,7 +334,7 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
pdb_lo = pdb.address & HWMASK(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
// PDE3 is the highest level on Pascal-Ampere , see the comment in
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
@ -337,10 +346,15 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C56F, MEM_OP_A, HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
sysmembar_value |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C56F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
@ -352,21 +366,23 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
MEM_OP_D, HWCONST(C56F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
gpu->parent->host_hal->membar_gpu(push);
}
// Copy from Pascal, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
{
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 invalidate_gpc_value = 0;
NvU32 aperture_value = 0;
NvU32 pdb_lo = 0;
NvU32 pdb_hi = 0;
NvU32 page_table_level = 0;
uvm_membar_t membar;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
@ -381,7 +397,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
// PDE3 is the highest level on Pascal, see the comment in
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde3, params->page_table_level) - 1;
}
@ -393,6 +409,11 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (params->membar == UvmInvalidateTlbMemBarSys)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
if (params->disable_gpc_invalidate)
invalidate_gpc_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
else
@ -403,9 +424,9 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
NvU32 va_lo = va & HWMASK(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NvU32 va_hi = va >> HWSIZE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C56F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C56F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
@ -418,7 +439,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
else {
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
@ -432,12 +453,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
if (params->membar == UvmInvalidateTlbMemBarSys)
membar = UVM_MEMBAR_SYS;
else if (params->membar == UvmInvalidateTlbMemBarLocal)
membar = UVM_MEMBAR_GPU;
else
membar = UVM_MEMBAR_NONE;
uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (params->membar == UvmInvalidateTlbMemBarLocal)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
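Across the Ampere host hunks above the pattern is consistent: the system-scope membar is folded into the invalidate method itself by selecting TLB_INVALIDATE_SYSMEMBAR EN or DIS up front, and the trailing uvm_hal_tlb_invalidate_membar() call is replaced by an explicit membar_gpu() method issued only when a GPU-local membar is requested. A minimal standalone C sketch of that selection logic follows; the enum and helper names are illustrative, not the driver's API.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative membar scopes, mirroring the three cases handled above. */
typedef enum { MEMBAR_NONE, MEMBAR_GPU, MEMBAR_SYS } membar_t;

/* True when the invalidate method itself should request a sysmembar. */
static bool invalidate_wants_sysmembar(membar_t membar)
{
    return membar == MEMBAR_SYS;
}

/* True when an explicit GPU-local membar method must follow the invalidate. */
static bool needs_explicit_gpu_membar(membar_t membar)
{
    return membar == MEMBAR_GPU;
}

int main(void)
{
    const membar_t cases[3] = { MEMBAR_NONE, MEMBAR_GPU, MEMBAR_SYS };
    int i;

    for (i = 0; i < 3; i++) {
        printf("membar %d: sysmembar in method: %d, explicit GPU membar after: %d\n",
               (int)cases[i],
               invalidate_wants_sysmembar(cases[i]),
               needs_explicit_gpu_membar(cases[i]));
    }
    return 0;
}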

View File

@ -51,7 +51,7 @@ uvm_mmu_engine_type_t uvm_hal_ampere_mmu_engine_id_to_type(NvU16 mmu_engine_id)
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
static NvU32 page_table_depth_ampere(NvU32 page_size)
static NvU32 page_table_depth_ampere(NvU64 page_size)
{
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check
if (page_size == UVM_PAGE_SIZE_2M)
@ -62,14 +62,14 @@ static NvU32 page_table_depth_ampere(NvU32 page_size)
return 4;
}
static NvU32 page_sizes_ampere(void)
static NvU64 page_sizes_ampere(void)
{
return UVM_PAGE_SIZE_512M | UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}
static uvm_mmu_mode_hal_t ampere_mmu_mode_hal;
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size)
{
static bool initialized = false;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -29,10 +29,9 @@
#include "uvm_ats_ibm.h"
#include "nv_uvm_types.h"
#include "uvm_lock.h"
#include "uvm_ats_sva.h"
#include "uvm_ats_sva.h"
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())
typedef struct
{

View File

@ -1541,14 +1541,14 @@ static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *ch
NV_STATUS uvm_channel_get_status(uvm_channel_t *channel)
{
uvm_gpu_t *gpu;
NvNotification *errorNotifier;
NvNotification *error_notifier;
if (uvm_channel_is_proxy(channel))
errorNotifier = channel->proxy.channel_info.shadowErrorNotifier;
error_notifier = channel->proxy.channel_info.shadowErrorNotifier;
else
errorNotifier = channel->channel_info.errorNotifier;
error_notifier = channel->channel_info.errorNotifier;
if (errorNotifier->status == 0)
if (error_notifier->status == 0)
return NV_OK;
// In case we hit a channel error, check the ECC error notifier as well so
@ -2584,16 +2584,18 @@ out:
// Return the pool corresponding to the given CE index
//
// This function cannot be used to access the proxy pool in SR-IOV heavy.
// Used to retrieve pools of type UVM_CHANNEL_POOL_TYPE_CE only.
static uvm_channel_pool_t *channel_manager_ce_pool(uvm_channel_manager_t *manager, NvU32 ce)
{
uvm_channel_pool_t *pool;
uvm_channel_pool_t *pool = uvm_channel_pool_first(manager, UVM_CHANNEL_POOL_TYPE_CE);
UVM_ASSERT(pool != NULL);
UVM_ASSERT(test_bit(ce, manager->ce_mask));
// The index of the pool associated with 'ce' is the number of usable CEs
// in [0, ce)
pool = manager->channel_pools + bitmap_weight(manager->ce_mask, ce);
// Pools of type UVM_CHANNEL_POOL_TYPE_CE are stored contiguously. The
// offset of the pool associated with 'ce' is the number of usable CEs in
// [0, ce).
pool += bitmap_weight(manager->ce_mask, ce);
UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE);
UVM_ASSERT(pool->engine_index == ce);
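The rewritten channel_manager_ce_pool() relies on the two invariants spelled out in the new comment: CE pools are stored contiguously starting at the first UVM_CHANNEL_POOL_TYPE_CE pool, and the offset of the pool for copy engine 'ce' equals the number of usable CEs with a smaller index, which is what bitmap_weight(manager->ce_mask, ce) counts. Below is a minimal userspace sketch of that index arithmetic with illustrative names rather than the driver's types.

#include <stdint.h>
#include <stdio.h>

/* Count the usable CEs with index < ce, i.e. the contiguous-pool offset. */
static unsigned ce_pool_offset(uint64_t ce_mask, unsigned ce)
{
    uint64_t below = ce ? ce_mask & ((UINT64_C(1) << ce) - 1) : 0;

    return (unsigned)__builtin_popcountll(below);
}

int main(void)
{
    /* Usable CEs 0, 2, 3 and 5: their pools occupy offsets 0, 1, 2 and 3. */
    uint64_t ce_mask = (1u << 0) | (1u << 2) | (1u << 3) | (1u << 5);
    unsigned ces[] = { 0, 2, 3, 5 };
    unsigned i;

    for (i = 0; i < 4; i++)
        printf("CE %u -> pool offset %u\n", ces[i], ce_pool_offset(ce_mask, ces[i]));
    return 0;
}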
@ -2811,6 +2813,7 @@ static unsigned channel_manager_get_max_pools(uvm_channel_manager_t *manager)
static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce)
{
unsigned ce;
unsigned type;
// A pool is created for each usable CE, even if it has not been selected as
// the preferred CE for any type, because as more information is discovered
@ -2818,18 +2821,20 @@ static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager,
// previously idle pools.
for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) {
NV_STATUS status;
unsigned type;
uvm_channel_pool_t *pool = NULL;
status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool);
if (status != NV_OK)
return status;
}
for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) {
// Set pool type if it hasn't been set before.
if (preferred_ce[type] == ce && manager->pool_to_use.default_for_type[type] == NULL)
manager->pool_to_use.default_for_type[type] = pool;
}
for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) {
// Avoid overwriting previously set defaults.
if (manager->pool_to_use.default_for_type[type] != NULL)
continue;
ce = preferred_ce[type];
manager->pool_to_use.default_for_type[type] = channel_manager_ce_pool(manager, ce);
}
return NV_OK;
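The restructured channel_manager_create_ce_pools() now splits the work into two passes: first a pool is created for every usable CE, then each channel type's default pool is resolved from its preferred CE via channel_manager_ce_pool(), skipping any type whose default was assigned earlier. A compact sketch of that control flow with toy data; the arrays and constants below are illustrative only, and the pool lookup is replaced by a stand-in.

#include <stdio.h>

#define TYPE_COUNT 3

int main(void)
{
    /* Preferred CE per channel type, and defaults that may be pre-assigned. */
    unsigned preferred_ce[TYPE_COUNT] = { 2, 0, 2 };
    int default_pool[TYPE_COUNT] = { -1, 7, -1 };     /* type 1 already resolved */
    unsigned type;

    /* Pass 1 (not shown): create one pool per usable CE. */

    /* Pass 2: resolve the remaining defaults from the preferred CE. */
    for (type = 0; type < TYPE_COUNT; type++) {
        if (default_pool[type] != -1)
            continue;                                  /* avoid overwriting */
        default_pool[type] = (int)preferred_ce[type];  /* stand-in for the pool lookup */
    }

    for (type = 0; type < TYPE_COUNT; type++)
        printf("type %u -> pool %d\n", type, default_pool[type]);
    return 0;
}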

View File

@ -218,8 +218,9 @@ static NV_STATUS alloc_and_init_address_space(uvm_gpu_t *gpu)
if (status != NV_OK)
return status;
gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
UVM_ASSERT(gpu_address_space_info.bigPageSize <= NV_U32_MAX);
gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
gpu->time.time0_register = gpu_address_space_info.time0Offset;
gpu->time.time1_register = gpu_address_space_info.time1Offset;
@ -458,6 +459,7 @@ static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
{
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7);
switch (link_type) {
@ -1082,9 +1084,6 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu)
gpu->parent->rm_va_size,
va_per_entry);
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->big_page.internal_size));
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->mem_info.max_vidmem_page_size));
tree_alloc = uvm_page_tree_pdb(&gpu->address_space_tree);
status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu->rm_address_space,
tree_alloc->addr.address,
@ -2364,9 +2363,7 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
// check for peer-to-peer compatibility (PCI-E or NvLink).
peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
if (peer_caps->link_type == UVM_GPU_LINK_INVALID
|| peer_caps->link_type == UVM_GPU_LINK_C2C
)
if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_C2C)
return NV_ERR_NOT_SUPPORTED;
peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps;
@ -3296,7 +3293,10 @@ void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64
atomic64_sub(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
}
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out)
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu,
struct page *page,
size_t size,
NvU64 *dma_address_out)
{
NvU64 dma_addr;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -251,6 +251,9 @@ static uvm_hal_class_ops_t host_table[] =
.semaphore_release = uvm_hal_turing_host_semaphore_release,
.clear_faulted_channel_method = uvm_hal_turing_host_clear_faulted_channel_method,
.set_gpfifo_entry = uvm_hal_turing_host_set_gpfifo_entry,
.tlb_invalidate_all = uvm_hal_turing_host_tlb_invalidate_all,
.tlb_invalidate_va = uvm_hal_turing_host_tlb_invalidate_va,
.tlb_invalidate_test = uvm_hal_turing_host_tlb_invalidate_test,
}
},
{
@ -632,13 +635,19 @@ NV_STATUS uvm_hal_init_table(void)
return status;
}
status = ops_init_from_parent(host_table, ARRAY_SIZE(host_table), HOST_OP_COUNT, offsetof(uvm_hal_class_ops_t, u.host_ops));
status = ops_init_from_parent(host_table,
ARRAY_SIZE(host_table),
HOST_OP_COUNT,
offsetof(uvm_hal_class_ops_t, u.host_ops));
if (status != NV_OK) {
UVM_ERR_PRINT("ops_init_from_parent(host_table) failed: %s\n", nvstatusToString(status));
return status;
}
status = ops_init_from_parent(arch_table, ARRAY_SIZE(arch_table), ARCH_OP_COUNT, offsetof(uvm_hal_class_ops_t, u.arch_ops));
status = ops_init_from_parent(arch_table,
ARRAY_SIZE(arch_table),
ARCH_OP_COUNT,
offsetof(uvm_hal_class_ops_t, u.arch_ops));
if (status != NV_OK) {
UVM_ERR_PRINT("ops_init_from_parent(arch_table) failed: %s\n", nvstatusToString(status));
return status;
@ -932,14 +941,16 @@ const char *uvm_mmu_engine_type_string(uvm_mmu_engine_type_t mmu_engine_type)
void uvm_hal_print_fault_entry(const uvm_fault_buffer_entry_t *entry)
{
UVM_DBG_PRINT("fault_address: 0x%llx\n", entry->fault_address);
UVM_DBG_PRINT(" fault_instance_ptr: {0x%llx:%s}\n", entry->instance_ptr.address,
uvm_aperture_string(entry->instance_ptr.aperture));
UVM_DBG_PRINT(" fault_instance_ptr: {0x%llx:%s}\n",
entry->instance_ptr.address,
uvm_aperture_string(entry->instance_ptr.aperture));
UVM_DBG_PRINT(" fault_type: %s\n", uvm_fault_type_string(entry->fault_type));
UVM_DBG_PRINT(" fault_access_type: %s\n", uvm_fault_access_type_string(entry->fault_access_type));
UVM_DBG_PRINT(" is_replayable: %s\n", entry->is_replayable? "true": "false");
UVM_DBG_PRINT(" is_virtual: %s\n", entry->is_virtual? "true": "false");
UVM_DBG_PRINT(" in_protected_mode: %s\n", entry->in_protected_mode? "true": "false");
UVM_DBG_PRINT(" fault_source.client_type: %s\n", uvm_fault_client_type_string(entry->fault_source.client_type));
UVM_DBG_PRINT(" fault_source.client_type: %s\n",
uvm_fault_client_type_string(entry->fault_source.client_type));
UVM_DBG_PRINT(" fault_source.client_id: %d\n", entry->fault_source.client_id);
UVM_DBG_PRINT(" fault_source.gpc_id: %d\n", entry->fault_source.gpc_id);
UVM_DBG_PRINT(" fault_source.mmu_engine_id: %d\n", entry->fault_source.mmu_engine_id);
@ -962,13 +973,15 @@ const char *uvm_access_counter_type_string(uvm_access_counter_type_t access_coun
void uvm_hal_print_access_counter_buffer_entry(const uvm_access_counter_buffer_entry_t *entry)
{
if (!entry->address.is_virtual) {
UVM_DBG_PRINT("physical address: {0x%llx:%s}\n", entry->address.address,
uvm_aperture_string(entry->address.aperture));
UVM_DBG_PRINT("physical address: {0x%llx:%s}\n",
entry->address.address,
uvm_aperture_string(entry->address.aperture));
}
else {
UVM_DBG_PRINT("virtual address: 0x%llx\n", entry->address.address);
UVM_DBG_PRINT(" instance_ptr {0x%llx:%s}\n", entry->virtual_info.instance_ptr.address,
uvm_aperture_string(entry->virtual_info.instance_ptr.aperture));
UVM_DBG_PRINT(" instance_ptr {0x%llx:%s}\n",
entry->virtual_info.instance_ptr.address,
uvm_aperture_string(entry->virtual_info.instance_ptr.aperture));
UVM_DBG_PRINT(" mmu_engine_type %s\n", uvm_mmu_engine_type_string(entry->virtual_info.mmu_engine_type));
UVM_DBG_PRINT(" mmu_engine_id %u\n", entry->virtual_info.mmu_engine_id);
UVM_DBG_PRINT(" ve_id %u\n", entry->virtual_info.ve_id);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -112,6 +112,10 @@ void uvm_hal_pascal_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar);
void uvm_hal_turing_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar);
void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
@ -149,42 +153,49 @@ typedef void (*uvm_hal_host_tlb_invalidate_va_t)(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_turing_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar);
void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar);
typedef void (*uvm_hal_host_tlb_invalidate_test_t)(uvm_push_t *push,
@ -196,6 +207,9 @@ void uvm_hal_maxwell_host_tlb_invalidate_test(uvm_push_t *push,
void uvm_hal_pascal_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
void uvm_hal_turing_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
@ -445,15 +459,15 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
// Retrieve the page-tree HAL for a given big page size
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU32 big_page_size);
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU64 big_page_size);
typedef void (*uvm_hal_mmu_enable_prefetch_faults_t)(uvm_parent_gpu_t *parent_gpu);
typedef void (*uvm_hal_mmu_disable_prefetch_faults_t)(uvm_parent_gpu_t *parent_gpu);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU32 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU32 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU32 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU32 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU32 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU32 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size);
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size);
void uvm_hal_maxwell_mmu_enable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_maxwell_mmu_disable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
void uvm_hal_pascal_mmu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);

View File

@ -1599,7 +1599,7 @@ static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
uvm_cpu_chunk_free(chunk);
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2022 NVIDIA Corporation
Copyright (c) 2020-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -157,6 +157,7 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
@ -183,7 +184,12 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
@ -196,7 +202,9 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
MEM_OP_D, HWCONST(C86F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
@ -204,7 +212,7 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
@ -212,6 +220,7 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 va_lo;
NvU32 va_hi;
NvU64 end;
@ -221,9 +230,9 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
// The invalidation size must be a power-of-two number of pages containing
@ -277,8 +286,13 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C86F, MEM_OP_A, HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
sysmembar_value |
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C86F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
@ -292,7 +306,9 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
MEM_OP_D, HWCONST(C86F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
gpu->parent->host_hal->membar_gpu(push);
}
void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
@ -300,12 +316,12 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
{
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 invalidate_gpc_value = 0;
NvU32 aperture_value = 0;
NvU32 pdb_lo = 0;
NvU32 pdb_hi = 0;
NvU32 page_table_level = 0;
uvm_membar_t membar;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
@ -332,6 +348,11 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (params->membar == UvmInvalidateTlbMemBarSys)
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
if (params->disable_gpc_invalidate)
invalidate_gpc_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
else
@ -343,7 +364,7 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
NvU32 va_lo = va & HWMASK(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NvU32 va_hi = va >> HWSIZE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C86F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
@ -358,7 +379,7 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
else {
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
@ -372,14 +393,9 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
if (params->membar == UvmInvalidateTlbMemBarSys)
membar = UVM_MEMBAR_SYS;
else if (params->membar == UvmInvalidateTlbMemBarLocal)
membar = UVM_MEMBAR_GPU;
else
membar = UVM_MEMBAR_NONE;
uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (params->membar == UvmInvalidateTlbMemBarLocal)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
void uvm_hal_hopper_host_set_gpfifo_pushbuffer_segment_base(NvU64 *fifo_entry, NvU64 pushbuffer_va)

View File

@ -61,7 +61,7 @@ uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}
static NvU32 page_table_depth_hopper(NvU32 page_size)
static NvU32 page_table_depth_hopper(NvU64 page_size)
{
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check
if (page_size == UVM_PAGE_SIZE_2M)
@ -79,7 +79,7 @@ static NvU32 entries_per_index_hopper(NvU32 depth)
return 1;
}
static NvLength entry_offset_hopper(NvU32 depth, NvU32 page_size)
static NvLength entry_offset_hopper(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 6);
if ((page_size == UVM_PAGE_SIZE_4K) && (depth == 4))
@ -92,7 +92,7 @@ static NvLength entry_size_hopper(NvU32 depth)
return entries_per_index_hopper(depth) * 8;
}
static NvU32 index_bits_hopper(NvU32 depth, NvU32 page_size)
static NvU32 index_bits_hopper(NvU32 depth, NvU64 page_size)
{
static const NvU32 bit_widths[] = {1, 9, 9, 9, 8};
@ -120,7 +120,7 @@ static NvU32 num_va_bits_hopper(void)
return 57;
}
static NvLength allocation_size_hopper(NvU32 depth, NvU32 page_size)
static NvLength allocation_size_hopper(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 6);
if (depth == 5 && page_size == UVM_PAGE_SIZE_64K)
@ -233,7 +233,7 @@ static NvU64 make_sparse_pte_hopper(void)
HWCONST64(_MMU_VER3, PTE, PCF, SPARSE);
}
static NvU64 unmapped_pte_hopper(NvU32 page_size)
static NvU64 unmapped_pte_hopper(NvU64 page_size)
{
// Setting PCF to NO_VALID_4KB_PAGE on an otherwise-zeroed big PTE causes
// the corresponding 4k PTEs to be ignored. This allows the invalidation of
@ -490,7 +490,7 @@ static void make_pde_hopper(void *entry,
static uvm_mmu_mode_hal_t hopper_mmu_mode_hal;
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size)
{
static bool initialized = false;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2023 NVidia Corporation
Copyright (c) 2013-2024 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -494,7 +494,7 @@ typedef struct
NvU64 base NV_ALIGN_BYTES(8); // IN
NvU64 length NV_ALIGN_BYTES(8); // IN
NvU64 offset NV_ALIGN_BYTES(8); // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
NvS32 rmCtrlFd; // IN
NvU32 hClient; // IN
@ -952,7 +952,6 @@ typedef struct
NvU32 version; // OUT
} UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS;
//
// UvmMapDynamicParallelismRegion
//
@ -995,7 +994,7 @@ typedef struct
{
NvU64 base NV_ALIGN_BYTES(8); // IN
NvU64 length NV_ALIGN_BYTES(8); // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
NV_STATUS rmStatus; // OUT
} UVM_ALLOC_SEMAPHORE_POOL_PARAMS;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -39,6 +39,7 @@
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "nv_uvm_interface.h"
#include "nv_uvm_types.h"
#include "uvm_pushbuffer.h"
@ -60,7 +61,7 @@ typedef struct
size_t buffer_size;
// Page size in bytes
NvU32 page_size;
NvU64 page_size;
// Size of a single PTE in bytes
NvU32 pte_size;
@ -90,7 +91,7 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
uvm_gpu_t *gpu,
const uvm_map_rm_params_t *map_rm_params,
NvU64 length,
NvU32 page_size,
NvU64 page_size,
uvm_pte_buffer_t *pte_buffer)
{
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, gpu);
@ -101,11 +102,11 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
pte_buffer->va_range = va_range;
pte_buffer->gpu = gpu;
pte_buffer->mapping_info.cachingType = map_rm_params->caching_type;
pte_buffer->mapping_info.mappingType = map_rm_params->mapping_type;
pte_buffer->mapping_info.formatType = map_rm_params->format_type;
pte_buffer->mapping_info.elementBits = map_rm_params->element_bits;
pte_buffer->mapping_info.compressionType = map_rm_params->compression_type;
pte_buffer->mapping_info.cachingType = (UvmRmGpuCachingType) map_rm_params->caching_type;
pte_buffer->mapping_info.mappingType = (UvmRmGpuMappingType) map_rm_params->mapping_type;
pte_buffer->mapping_info.formatType = (UvmRmGpuFormatType) map_rm_params->format_type;
pte_buffer->mapping_info.elementBits = (UvmRmGpuFormatElementBits) map_rm_params->element_bits;
pte_buffer->mapping_info.compressionType = (UvmRmGpuCompressionType) map_rm_params->compression_type;
if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL)
pte_buffer->mapping_info.mappingPageSize = page_size;
@ -649,9 +650,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
return NV_OK;
}
// This is a local or peer allocation, so the owning GPU must have been
// registered.
// This also checks for if EGM owning GPU is registered.
// registered. This also checks whether the EGM owning GPU is registered.
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
if (!owning_gpu)
return NV_ERR_INVALID_DEVICE;
@ -664,7 +663,6 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
// semantics of sysmem allocations.
// Check if peer access for peer memory is enabled.
// This path also handles EGM allocations.
if (owning_gpu != mapping_gpu && (!mem_info->sysmem || mem_info->egm)) {
// TODO: Bug 1757136: In SLI, the returned UUID may be different but a
// local mapping must be used. We need to query SLI groups to know
@ -855,9 +853,10 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
UvmGpuMemoryInfo mem_info;
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
NvU32 mapping_page_size;
NvU64 mapping_page_size;
NvU64 biggest_mapping_page_size;
NvU64 alignments;
NvU32 smallest_alignment;
NvU64 smallest_alignment;
NV_STATUS status;
uvm_assert_rwsem_locked_read(&va_space->lock);
@ -946,9 +945,11 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
// Check the maximum page size for the mapping of vidmem allocations;
// the vMMU segment size may limit the range of page sizes.
biggest_mapping_page_size = uvm_mmu_biggest_page_size_up_to(&gpu_va_space->page_tables,
mapping_gpu->mem_info.max_vidmem_page_size);
if (!ext_gpu_map->is_sysmem && (ext_gpu_map->gpu == ext_gpu_map->owning_gpu) &&
(mapping_page_size > mapping_gpu->mem_info.max_vidmem_page_size))
mapping_page_size = mapping_gpu->mem_info.max_vidmem_page_size;
(mapping_page_size > biggest_mapping_page_size))
mapping_page_size = biggest_mapping_page_size;
mem_info.pageSize = mapping_page_size;
@ -985,7 +986,7 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E
if (uvm_api_range_invalid_4k(params->base, params->length))
return NV_ERR_INVALID_ADDRESS;
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS_V2)
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS)
return NV_ERR_INVALID_ARGUMENT;
mapped_gpus = uvm_processor_mask_cache_alloc();

View File

@ -108,7 +108,7 @@ void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
// No per VA invalidate on Maxwell, redirect to invalidate all.

View File

@ -52,7 +52,7 @@ static NvU32 entries_per_index_maxwell(NvU32 depth)
return 1;
}
static NvLength entry_offset_maxwell(NvU32 depth, NvU32 page_size)
static NvLength entry_offset_maxwell(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 2);
if (page_size == UVM_PAGE_SIZE_4K && depth == 0)
@ -128,7 +128,7 @@ static NvLength entry_size_maxwell(NvU32 depth)
return 8;
}
static NvU32 index_bits_maxwell_64(NvU32 depth, NvU32 page_size)
static NvU32 index_bits_maxwell_64(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 2);
UVM_ASSERT(page_size == UVM_PAGE_SIZE_4K ||
@ -146,7 +146,7 @@ static NvU32 index_bits_maxwell_64(NvU32 depth, NvU32 page_size)
}
}
static NvU32 index_bits_maxwell_128(NvU32 depth, NvU32 page_size)
static NvU32 index_bits_maxwell_128(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 2);
UVM_ASSERT(page_size == UVM_PAGE_SIZE_4K ||
@ -169,32 +169,32 @@ static NvU32 num_va_bits_maxwell(void)
return 40;
}
static NvLength allocation_size_maxwell_64(NvU32 depth, NvU32 page_size)
static NvLength allocation_size_maxwell_64(NvU32 depth, NvU64 page_size)
{
return entry_size_maxwell(depth) << index_bits_maxwell_64(depth, page_size);
}
static NvLength allocation_size_maxwell_128(NvU32 depth, NvU32 page_size)
static NvLength allocation_size_maxwell_128(NvU32 depth, NvU64 page_size)
{
return entry_size_maxwell(depth) << index_bits_maxwell_128(depth, page_size);
}
static NvU32 page_table_depth_maxwell(NvU32 page_size)
static NvU32 page_table_depth_maxwell(NvU64 page_size)
{
return 1;
}
static NvU32 page_sizes_maxwell_128(void)
static NvU64 page_sizes_maxwell_128(void)
{
return UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_4K;
}
static NvU32 page_sizes_maxwell_64(void)
static NvU64 page_sizes_maxwell_64(void)
{
return UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}
static NvU64 unmapped_pte_maxwell(NvU32 page_size)
static NvU64 unmapped_pte_maxwell(NvU64 page_size)
{
// Setting the privilege bit on an otherwise-zeroed big PTE causes the
// corresponding 4k PTEs to be ignored. This allows the invalidation of a
@ -356,7 +356,7 @@ static uvm_mmu_mode_hal_t maxwell_128_mmu_mode_hal =
.page_sizes = page_sizes_maxwell_128
};
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU64 big_page_size)
{
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);
if (big_page_size == UVM_PAGE_SIZE_64K)

View File

@ -290,15 +290,15 @@ uvm_chunk_sizes_mask_t uvm_mem_kernel_chunk_sizes(uvm_gpu_t *gpu)
// Get the mmu mode hal directly as the internal address space tree has not
// been created yet.
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(gpu->big_page.internal_size);
NvU32 page_sizes = hal->page_sizes();
NvU64 page_sizes = hal->page_sizes();
return (uvm_chunk_sizes_mask_t)(page_sizes & UVM_CHUNK_SIZES_MASK);
}
static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
static NvU64 mem_pick_chunk_size(uvm_mem_t *mem)
{
NvU32 biggest_page_size;
NvU32 chunk_size;
NvU64 biggest_page_size;
NvU64 chunk_size;
if (uvm_mem_is_sysmem(mem))
return PAGE_SIZE;
@ -315,12 +315,12 @@ static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
// When UVM_PAGE_SIZE_DEFAULT is used on NUMA-enabled GPUs, we force
// chunk_size to be PAGE_SIZE at least, to allow CPU mappings.
if (mem->backing_gpu->mem_info.numa.enabled)
chunk_size = max(chunk_size, (NvU32)PAGE_SIZE);
chunk_size = max(chunk_size, (NvU64)PAGE_SIZE);
return chunk_size;
}
static NvU32 mem_pick_gpu_page_size(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_tree_t *gpu_page_tree)
static NvU64 mem_pick_gpu_page_size(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_tree_t *gpu_page_tree)
{
if (uvm_mem_is_vidmem(mem)) {
// For vidmem allocations the chunk size is picked out of the supported
@ -467,7 +467,7 @@ static NV_STATUS mem_alloc_sysmem_dma_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
NvU64 *dma_addrs;
UVM_ASSERT_MSG(mem->chunk_size == PAGE_SIZE,
"mem->chunk_size is 0x%x. PAGE_SIZE is only supported.",
"mem->chunk_size is 0x%llx. PAGE_SIZE is only supported.",
mem->chunk_size);
UVM_ASSERT(uvm_mem_is_sysmem_dma(mem));
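Several hunks in this commit widen page_size and chunk_size fields from NvU32 to NvU64, and the format strings change from "0x%x" to "0x%llx" to match the wider type, as in the assert above. In portable userspace C the equivalent is PRIx64 from <inttypes.h>; a tiny example:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t chunk_size = UINT64_C(1) << 21;   /* 2MB */

    /* Portable 64-bit format macro; the driver code above uses %llx instead. */
    printf("chunk_size is 0x%" PRIx64 "\n", chunk_size);
    return 0;
}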
@ -528,10 +528,9 @@ static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
// In case of failure, the caller is required to handle cleanup by calling
// uvm_mem_free
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unprotected)
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero)
{
NV_STATUS status;
uvm_pmm_gpu_memory_type_t mem_type;
UVM_ASSERT(uvm_mem_is_vidmem(mem));
@ -548,23 +547,15 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unpr
if (!mem->vidmem.chunks)
return NV_ERR_NO_MEMORY;
// When CC is disabled the behavior is identical to that of PMM, and the
// protection flag is ignored (squashed by PMM internally).
if (is_unprotected)
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED;
else
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED;
status = uvm_pmm_gpu_alloc(&mem->backing_gpu->pmm,
mem->chunks_count,
mem->chunk_size,
mem_type,
UVM_PMM_ALLOC_FLAGS_NONE,
mem->vidmem.chunks,
NULL);
status = uvm_pmm_gpu_alloc_kernel(&mem->backing_gpu->pmm,
mem->chunks_count,
mem->chunk_size,
UVM_PMM_ALLOC_FLAGS_NONE,
mem->vidmem.chunks,
NULL);
if (status != NV_OK) {
UVM_ERR_PRINT("uvm_pmm_gpu_alloc (count=%zd, size=0x%x) failed: %s\n",
UVM_ERR_PRINT("uvm_pmm_gpu_alloc_kernel (count=%zd, size=0x%llx) failed: %s\n",
mem->chunks_count,
mem->chunk_size,
nvstatusToString(status));
@ -574,7 +565,7 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unpr
return NV_OK;
}
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero, bool is_unprotected)
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero)
{
if (uvm_mem_is_sysmem(mem)) {
gfp_t gfp_flags;
@ -596,7 +587,7 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
return status;
}
return mem_alloc_vidmem_chunks(mem, zero, is_unprotected);
return mem_alloc_vidmem_chunks(mem, zero);
}
NV_STATUS uvm_mem_map_kernel(uvm_mem_t *mem, const uvm_processor_mask_t *mask)
@ -626,7 +617,6 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
NV_STATUS status;
NvU64 physical_size;
uvm_mem_t *mem = NULL;
bool is_unprotected = false;
UVM_ASSERT(params->size > 0);
@ -648,12 +638,7 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
physical_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
mem->chunks_count = physical_size / mem->chunk_size;
if (params->is_unprotected)
UVM_ASSERT(uvm_mem_is_vidmem(mem));
is_unprotected = params->is_unprotected;
status = mem_alloc_chunks(mem, params->mm, params->zero, is_unprotected);
status = mem_alloc_chunks(mem, params->mm, params->zero);
if (status != NV_OK)
goto error;
@ -1050,7 +1035,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
uvm_page_table_range_vec_t **range_vec)
{
NV_STATUS status;
NvU32 page_size;
NvU64 page_size;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
uvm_mem_pte_maker_data_t pte_maker_data = {
@ -1059,7 +1044,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
};
page_size = mem_pick_gpu_page_size(mem, gpu, tree);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%llx\n", page_size);
// When the Confidential Computing feature is enabled, DMA allocations are
// majoritarily allocated and managed by a per-GPU DMA buffer pool

View File

@ -126,12 +126,7 @@ typedef struct
//
// CPU mappings will always use PAGE_SIZE, so the physical allocation chunk
// has to be aligned to PAGE_SIZE.
NvU32 page_size;
// The protection flag is only observed for vidmem allocations when CC is
// enabled. If set to true, the allocation returns unprotected vidmem;
// otherwise, the allocation returns protected vidmem.
bool is_unprotected;
NvU64 page_size;
// If true, the allocation is zeroed (scrubbed).
bool zero;
@ -199,7 +194,7 @@ struct uvm_mem_struct
size_t chunks_count;
// Size of each physical chunk (vidmem) or CPU page (sysmem)
NvU32 chunk_size;
NvU64 chunk_size;
// Size of the allocation
NvU64 size;

View File

@ -153,7 +153,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
for (i = 0; i < verif_size / sizeof(*sys_verif); ++i) {
if (sys_verif[i] != mem->size + i) {
UVM_TEST_PRINT("Verif failed for %zd = 0x%llx instead of 0x%llx, verif_size=0x%llx mem(size=0x%llx, page_size=%u, processor=%u)\n",
UVM_TEST_PRINT("Verif failed for %zd = 0x%llx instead of 0x%llx, verif_size=0x%llx mem(size=0x%llx, page_size=%llu, processor=%u)\n",
i,
sys_verif[i],
(NvU64)(verif_size + i),
@ -241,7 +241,7 @@ static NV_STATUS test_map_cpu(uvm_mem_t *mem)
return NV_OK;
}
static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU32 page_size, size_t size, uvm_mem_t **mem_out)
static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU64 page_size, size_t size, uvm_mem_t **mem_out)
{
NV_STATUS status;
uvm_mem_t *mem;
@ -299,7 +299,7 @@ error:
return status;
}
static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU32 page_size, size_t size, uvm_mem_t **mem_out)
static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU64 page_size, size_t size, uvm_mem_t **mem_out)
{
NV_STATUS status;
uvm_mem_t *mem;
@ -334,7 +334,7 @@ error:
return status;
}
static bool should_test_page_size(size_t alloc_size, NvU32 page_size)
static bool should_test_page_size(size_t alloc_size, NvU64 page_size)
{
if (g_uvm_global.num_simulated_devices == 0)
return true;
@ -359,7 +359,7 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
// size on pre-Pascal GPUs with 128K big page size.
// Ampere+ also supports 512M PTEs, but since UVM's maximum chunk size is
// 2M, we don't test for this page size.
static const NvU32 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;
static const NvU64 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;
// All supported page sizes will be tested, CPU has the most with 4 and +1
// for the default.
@ -494,41 +494,6 @@ done:
return status;
}
static NV_STATUS test_basic_vidmem_unprotected(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_mem_t *mem = NULL;
uvm_mem_alloc_params_t params = { 0 };
params.size = UVM_PAGE_SIZE_4K;
params.backing_gpu = gpu;
params.page_size = UVM_PAGE_SIZE_4K;
// If CC is enabled, the protection flag is observed. Because currently all
// vidmem is in the protected region, the allocation should succeed.
//
// If CC is disabled, the protection flag is ignored.
params.is_unprotected = false;
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));
uvm_mem_free(mem);
mem = NULL;
// If CC is enabled, the allocation should fail because currently the
// unprotected region is empty.
//
// If CC is disabled, the behavior should be identical to that of a
// protected allocation.
params.is_unprotected = true;
if (g_uvm_global.conf_computing_enabled)
TEST_CHECK_RET(uvm_mem_alloc(&params, &mem) == NV_ERR_NO_MEMORY);
else
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));
uvm_mem_free(mem);
return status;
}
static NV_STATUS test_basic_sysmem(void)
{
NV_STATUS status = NV_OK;
@ -613,7 +578,6 @@ static NV_STATUS test_basic(uvm_va_space_t *va_space)
for_each_va_space_gpu(gpu, va_space) {
TEST_NV_CHECK_RET(test_basic_vidmem(gpu));
TEST_NV_CHECK_RET(test_basic_sysmem_dma(gpu));
TEST_NV_CHECK_RET(test_basic_vidmem_unprotected(gpu));
TEST_NV_CHECK_RET(test_basic_dma_pool(gpu));
}

View File

@ -153,20 +153,17 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
// - UVM_APERTURE_VID biggest page size on vidmem mappings
// - UVM_APERTURE_SYS biggest page size on sysmem mappings
// - UVM_APERTURE_PEER_0-7 biggest page size on peer mappings
static NvU32 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
static NvU64 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
{
UVM_ASSERT(aperture < UVM_APERTURE_DEFAULT);
// There may be scenarios where the GMMU must use a subset of the supported
// page sizes, e.g., to comply with the vMMU supported page sizes due to
// segmentation sizes.
if (aperture == UVM_APERTURE_VID) {
UVM_ASSERT(tree->gpu->mem_info.max_vidmem_page_size <= NV_U32_MAX);
return (NvU32) tree->gpu->mem_info.max_vidmem_page_size;
}
else {
return 1 << __fls(tree->hal->page_sizes());
}
if (aperture == UVM_APERTURE_VID)
return uvm_mmu_biggest_page_size_up_to(tree, tree->gpu->mem_info.max_vidmem_page_size);
return 1ULL << __fls(tree->hal->page_sizes());
}
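mmu_biggest_page_size() now treats page sizes uniformly as bits in an NvU64 mask: the biggest supported size is 1ULL shifted by the index of the highest set bit, and for vidmem the result is further capped at max_vidmem_page_size through uvm_mmu_biggest_page_size_up_to(). The helper below sketches that "highest set bit no larger than a limit" computation in standalone C; it is an illustration under those assumptions, not the driver's implementation.

#include <stdint.h>
#include <stdio.h>

/* Highest power-of-two page size present in 'mask' that is <= 'limit',
 * or 0 if none qualifies. 'limit' is assumed to be a power of two. */
static uint64_t biggest_page_size_up_to(uint64_t mask, uint64_t limit)
{
    /* limit | (limit - 1) keeps every bit up to and including the limit bit. */
    uint64_t capped = mask & (limit | (limit - 1));

    if (capped == 0)
        return 0;
    return UINT64_C(1) << (63 - __builtin_clzll(capped));
}

int main(void)
{
    /* 4K | 64K | 2M | 512M supported, capped at 2M. */
    uint64_t mask = 0x1000 | 0x10000 | 0x200000 | 0x20000000;

    printf("0x%llx\n", (unsigned long long)biggest_page_size_up_to(mask, 0x200000));
    return 0;
}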
static NV_STATUS phys_mem_allocate_vidmem(uvm_page_tree_t *tree,
@ -254,7 +251,7 @@ static void phys_mem_deallocate(uvm_page_tree_t *tree, uvm_mmu_page_table_alloc_
}
static void page_table_range_init(uvm_page_table_range_t *range,
NvU32 page_size,
NvU64 page_size,
uvm_page_directory_t *dir,
NvU32 start_index,
NvU32 end_index)
@ -444,9 +441,9 @@ static void pde_fill(uvm_page_tree_t *tree,
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
static void phys_mem_init(uvm_page_tree_t *tree, NvU64 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU64 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;
// Passing in NULL for the phys_allocs will mark the child entries as
@ -497,7 +494,7 @@ static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_direc
}
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU32 depth,
uvm_pmm_alloc_flags_t pmm_flags)
{
@ -546,7 +543,7 @@ static inline NvU32 entry_index_from_vaddr(NvU64 vaddr, NvU32 addr_bit_shift, Nv
return (NvU32)((vaddr >> addr_bit_shift) & mask);
}
static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU32 page_size)
static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU64 page_size)
{
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
@ -583,7 +580,7 @@ static void pde_write(uvm_page_tree_t *tree,
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
}
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU64 page_size)
{
UVM_ASSERT(dir->ref_count > 0);
@ -594,35 +591,38 @@ static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU
static void pde_clear(uvm_page_tree_t *tree,
uvm_page_directory_t *dir,
NvU32 entry_index,
NvU32 page_size,
NvU64 page_size,
uvm_push_t *push)
{
host_pde_clear(tree, dir, entry_index, page_size);
pde_write(tree, dir, entry_index, false, push);
}
static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU64 big_page_size)
{
uvm_chunk_sizes_mask_t alloc_sizes = 0;
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);
unsigned long page_sizes, page_size_log2;
uvm_chunk_sizes_mask_t alloc_sizes;
if (hal != NULL) {
unsigned long page_size_log2;
unsigned long page_sizes = hal->page_sizes();
BUILD_BUG_ON(sizeof(hal->page_sizes()) > sizeof(page_sizes));
if (hal == NULL)
return 0;
for_each_set_bit(page_size_log2, &page_sizes, BITS_PER_LONG) {
NvU32 i;
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
for (i = 0; i <= hal->page_table_depth(page_size); i++)
alloc_sizes |= hal->allocation_size(i, page_size);
}
page_sizes = hal->page_sizes();
alloc_sizes = 0;
BUILD_BUG_ON(sizeof(hal->page_sizes()) > sizeof(page_sizes));
for_each_set_bit(page_size_log2, &page_sizes, BITS_PER_LONG) {
NvU32 i;
NvU64 page_size = 1ULL << page_size_log2;
for (i = 0; i <= hal->page_table_depth(page_size); i++)
alloc_sizes |= hal->allocation_size(i, page_size);
}
return alloc_sizes;
}
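allocation_sizes_for_big_page_size() walks every supported page size by iterating over the set bits of hal->page_sizes() (the kernel's for_each_set_bit) and ORs in the allocation size of each page-table depth for that page size. The standalone loop below mimics that bit iteration in plain C; the per-size contribution is a toy stand-in, not the real allocation_size() callback.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_sizes = 0x1000 | 0x10000 | 0x200000;  /* 4K, 64K, 2M */
    uint64_t alloc_sizes = 0;
    uint64_t remaining = page_sizes;

    /* Visit each set bit from lowest to highest, like for_each_set_bit(). */
    while (remaining) {
        unsigned bit = (unsigned)__builtin_ctzll(remaining);
        uint64_t page_size = UINT64_C(1) << bit;

        /* Toy stand-in for OR-ing allocation_size(depth, page_size)
         * over every depth of the tree for this page size. */
        alloc_sizes |= page_size;

        remaining &= remaining - 1;                     /* clear lowest set bit */
        printf("visited page size 0x%llx\n", (unsigned long long)page_size);
    }

    printf("accumulated mask 0x%llx\n", (unsigned long long)alloc_sizes);
    return 0;
}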
static NvU32 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
static NvU64 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU64 big_page_size)
{
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);
@ -662,7 +662,7 @@ static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
}
static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -713,7 +713,7 @@ static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
}
static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -805,7 +805,7 @@ static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -842,7 +842,7 @@ static void free_unused_directories(uvm_page_tree_t *tree,
}
}
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU64 page_size, uvm_mmu_page_table_alloc_t *out)
{
NvU32 depth = tree->hal->page_table_depth(page_size);
NvLength alloc_size = tree->hal->allocation_size(depth, page_size);
@ -871,7 +871,7 @@ static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
{
NV_STATUS status;
NvU64 min_va_upper, max_va_lower;
NvU32 page_size;
NvU64 page_size;
if (!page_tree_ats_init_required(tree))
return NV_OK;
@ -1090,7 +1090,7 @@ static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t locatio
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
uvm_gpu_va_space_t *gpu_va_space,
uvm_page_tree_type_t type,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_aperture_t location,
uvm_page_tree_t *tree)
{
@ -1110,7 +1110,7 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
tree->gpu_va_space = gpu_va_space;
tree->big_page_size = big_page_size;
UVM_ASSERT(gpu->mem_info.max_vidmem_page_size & tree->hal->page_sizes());
UVM_ASSERT(uvm_mmu_page_size_supported(tree, big_page_size));
page_tree_set_location(tree, location);
@ -1347,7 +1347,7 @@ NV_STATUS uvm_page_tree_wait(uvm_page_tree_t *tree)
}
static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range,
@ -1379,7 +1379,7 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
// This algorithm will work with unaligned ranges, but the caller's intent
// is unclear
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0,
"start 0x%llx size 0x%zx page_size 0x%x\n",
"start 0x%llx size 0x%zx page_size 0x%llx\n",
start,
(size_t)size,
page_size);
@ -1448,7 +1448,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
{
NV_STATUS status;
uvm_push_t push;
NvU32 page_sizes;
NvU64 page_sizes;
uvm_mmu_page_table_alloc_t *phys_alloc[1];
// TODO: Bug 2734399
@ -1460,7 +1460,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
status = page_tree_begin_acquire(tree,
&tree->tracker,
&push,
"map remap: [0x%llx, 0x%llx), page_size: %d",
"map remap: [0x%llx, 0x%llx), page_size: %lld",
start,
start + size,
range->page_size);
@ -1500,7 +1500,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
}
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -1545,7 +1545,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
}
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -1596,7 +1596,7 @@ void uvm_page_table_range_shrink(uvm_page_tree_t *tree, uvm_page_table_range_t *
}
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single)
@ -1621,7 +1621,7 @@ void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *sing
static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
uvm_page_directory_t *pte_dir,
uvm_page_directory_t *parent,
NvU32 page_size)
NvU64 page_size)
{
NV_STATUS status;
uvm_push_t push;
@ -1633,7 +1633,7 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
// The flat mappings should always be set up when executing this path
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %llu", page_size);
if (status != NV_OK)
return status;
@ -1660,7 +1660,7 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
}
NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
@ -1768,7 +1768,7 @@ static size_t range_vec_calc_range_index(uvm_page_table_range_vec_t *range_vec,
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t *range_vec)
{
@ -1776,8 +1776,8 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
size_t i;
UVM_ASSERT(size != 0);
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%x\n", start, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%llx\n", start, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
range_vec->tree = tree;
range_vec->page_size = page_size;
@ -1826,7 +1826,7 @@ out:
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec_out)
{
@ -1952,7 +1952,7 @@ static NV_STATUS uvm_page_table_range_vec_clear_ptes_gpu(uvm_page_table_range_ve
size_t i;
uvm_page_tree_t *tree = range_vec->tree;
uvm_gpu_t *gpu = tree->gpu;
NvU32 page_size = range_vec->page_size;
NvU64 page_size = range_vec->page_size;
NvU32 entry_size = uvm_mmu_pte_size(tree, page_size);
NvU64 invalid_pte = 0;
uvm_push_t push;
@ -2237,7 +2237,7 @@ static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
NvU64 size,
uvm_aperture_t aperture,
NvU64 phys_offset,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags)
{
NV_STATUS status;
@ -2312,7 +2312,7 @@ bool uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(uvm_parent_gpu_t *parent_gp
NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
{
NvU32 page_size;
NvU64 page_size;
NvU64 size;
uvm_aperture_t aperture = UVM_APERTURE_VID;
NvU64 phys_offset = 0;
@ -2351,7 +2351,7 @@ static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)
NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
{
NvU32 page_size;
NvU64 page_size;
NvU64 size;
uvm_aperture_t aperture;
NvU64 phys_offset;
@ -2535,7 +2535,7 @@ static void root_chunk_mapping_destroy(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mappin
uvm_push_t push;
NvU32 entry_size;
uvm_pte_batch_t pte_batch;
NvU32 page_size;
NvU64 page_size;
NvU64 size;
NvU64 invalid_pte;
uvm_page_table_range_t *range = root_chunk_mapping->range;
@ -2585,7 +2585,7 @@ static NV_STATUS root_chunk_mapping_create(uvm_gpu_t *gpu, uvm_gpu_root_chunk_ma
uvm_push_t push;
NvU64 pte_bits;
NvU32 entry_size;
NvU32 page_size = UVM_CHUNK_SIZE_MAX;
NvU64 page_size = UVM_CHUNK_SIZE_MAX;
NvU64 size = UVM_CHUNK_SIZE_MAX;
range = uvm_kvmalloc_zero(sizeof(*range));
@ -2852,7 +2852,7 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
if (sysmem_mapping->range_vec == NULL) {
uvm_gpu_address_t virtual_address = uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, curr_pa);
NvU64 phys_offset = curr_pa;
NvU32 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
NvU64 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
uvm_pmm_alloc_flags_t pmm_flags;
// No eviction is requested when allocating the page tree storage,

View File

@ -208,7 +208,7 @@ struct uvm_mmu_mode_hal_struct
// This is an optimization which reduces TLB pressure, reduces the number of
// TLB invalidates we must issue, and means we don't have to initialize the
// 4k PTEs which are covered by big PTEs since the MMU will never read them.
NvU64 (*unmapped_pte)(NvU32 page_size);
NvU64 (*unmapped_pte)(NvU64 page_size);
// Bit pattern used for debug purposes to clobber PTEs which ought to be
// unused. In practice this will generate a PRIV violation or a physical
@ -234,23 +234,23 @@ struct uvm_mmu_mode_hal_struct
// For dual PDEs, this is either 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
NvLength (*entry_offset)(NvU32 depth, NvU64 page_size);
// number of virtual address bits used to index the directory/table at a
// given depth
NvU32 (*index_bits)(NvU32 depth, NvU32 page_size);
NvU32 (*index_bits)(NvU32 depth, NvU64 page_size);
// total number of bits that represent the virtual address space
NvU32 (*num_va_bits)(void);
// the size, in bytes, of a directory/table at a given depth.
NvLength (*allocation_size)(NvU32 depth, NvU32 page_size);
NvLength (*allocation_size)(NvU32 depth, NvU64 page_size);
// the depth which corresponds to the page tables
NvU32 (*page_table_depth)(NvU32 page_size);
NvU32 (*page_table_depth)(NvU64 page_size);
// bitwise-or of supported page sizes
NvU32 (*page_sizes)(void);
NvU64 (*page_sizes)(void);
};
struct uvm_page_table_range_struct
@ -258,7 +258,7 @@ struct uvm_page_table_range_struct
uvm_page_directory_t *table;
NvU32 start_index;
NvU32 entry_count;
NvU32 page_size;
NvU64 page_size;
};
typedef enum
@ -275,7 +275,7 @@ struct uvm_page_tree_struct
uvm_page_directory_t *root;
uvm_mmu_mode_hal_t *hal;
uvm_page_tree_type_t type;
NvU32 big_page_size;
NvU64 big_page_size;
// Pointer to the GPU VA space containing the page tree.
// This pointer is set only for page trees of type
@ -325,7 +325,7 @@ struct uvm_page_table_range_vec_struct
NvU64 size;
// Page size used for all the page table ranges
NvU32 page_size;
NvU64 page_size;
// Page table ranges covering the VA
uvm_page_table_range_t *ranges;
@ -352,7 +352,7 @@ void uvm_mmu_init_gpu_peer_addresses(uvm_gpu_t *gpu);
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
uvm_gpu_va_space_t *gpu_va_space,
uvm_page_tree_type_t type,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_aperture_t location,
uvm_page_tree_t *tree_out);
@ -374,7 +374,7 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -384,7 +384,7 @@ NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -395,7 +395,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
@ -426,7 +426,7 @@ void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *sing
// It is the caller's responsibility to initialize the returned table before
// calling uvm_page_tree_write_pde.
NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children);
@ -480,7 +480,7 @@ static uvm_mmu_page_table_alloc_t *uvm_page_tree_pdb(uvm_page_tree_t *tree)
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t *range_vec);
@ -489,7 +489,7 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec_out);
@ -601,12 +601,12 @@ void uvm_mmu_chunk_unmap(uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker);
// uvm_parent_gpu_map_cpu_pages for the given GPU.
NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size);
static NvU64 uvm_mmu_page_tree_entries(uvm_page_tree_t *tree, NvU32 depth, NvU32 page_size)
static NvU64 uvm_mmu_page_tree_entries(uvm_page_tree_t *tree, NvU32 depth, NvU64 page_size)
{
return 1ull << tree->hal->index_bits(depth, page_size);
}
static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU64 page_size)
{
NvU32 depth = tree->hal->page_table_depth(page_size);
return uvm_mmu_page_tree_entries(tree, depth, page_size) * page_size;
@ -615,21 +615,21 @@ static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
// Page sizes supported by the GPU. Use uvm_mmu_biggest_page_size() to retrieve
// the largest page size supported in a given system, which considers the GMMU
// and vMMU page sizes and segment sizes.
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU32 page_size)
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU64 page_size)
{
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%x\n", page_size);
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%llx\n", page_size);
return (tree->hal->page_sizes() & page_size) != 0;
}
static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_page_size)
static NvU64 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU64 max_page_size)
{
NvU32 gpu_page_sizes = tree->hal->page_sizes();
NvU32 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU32 page_sizes;
NvU32 page_size;
NvU64 gpu_page_sizes = tree->hal->page_sizes();
NvU64 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU64 page_sizes;
NvU64 page_size;
UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%x\n", max_page_size);
UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%llx\n", max_page_size);
if (max_page_size < smallest_gpu_page_size)
return 0;
@ -638,14 +638,14 @@ static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_pa
page_sizes = gpu_page_sizes & (max_page_size | (max_page_size - 1));
// And pick the biggest one of them
page_size = 1 << __fls(page_sizes);
page_size = 1ULL << __fls(page_sizes);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x", page_size);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%llx", page_size);
return page_size;
}
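The helper above keeps only the supported sizes at or below max_page_size and then takes the highest remaining bit; the 1ULL shift is what keeps that correct now that the page-size mask is 64 bits wide. A standalone sketch of the same selection logic, using plain uint64_t and __builtin_clzll as stand-ins for the driver's NvU64 and __fls (illustration only, not driver code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Pick the largest power-of-two page size in 'supported' (a bitmask of
 * sizes) that does not exceed 'max_page_size'. Standalone sketch of the
 * selection logic in the hunk above. */
static uint64_t biggest_page_size_up_to(uint64_t supported, uint64_t max_page_size)
{
    uint64_t smallest = supported & ~(supported - 1);   /* lowest set bit */
    uint64_t candidates;

    assert(max_page_size && (max_page_size & (max_page_size - 1)) == 0);

    if (max_page_size < smallest)
        return 0;

    /* Keep only sizes <= max_page_size, then take the highest remaining bit.
     * The 1ULL keeps the shift 64-bit wide. */
    candidates = supported & (max_page_size | (max_page_size - 1));
    return 1ULL << (63 - __builtin_clzll(candidates));
}

int main(void)
{
    uint64_t sizes = (4ULL << 10) | (64ULL << 10) | (2ULL << 20) | (512ULL << 20);

    printf("0x%llx\n", (unsigned long long)biggest_page_size_up_to(sizes, 2ULL << 20));
    return 0;
}

Running this prints 0x200000: with a 2MB cap, the 2MB size is chosen even though 512MB is also in the supported mask.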
static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU32 page_size)
static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU64 page_size)
{
return tree->hal->entry_size(tree->hal->page_table_depth(page_size));
}

View File

@ -96,7 +96,7 @@ typedef struct
{
NvU64 base;
NvU64 size;
NvU32 page_size;
NvU64 page_size;
NvU32 depth;
uvm_membar_t membar;
} fake_tlb_invalidate_t;
@ -153,7 +153,7 @@ static void fake_tlb_invalidate_va(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
@ -249,7 +249,11 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}
static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
NvU64 base, NvU64 size, NvU32 page_size, NvU32 expected_depth, bool expected_membar)
NvU64 base,
NvU64 size,
NvU64 page_size,
NvU32 expected_depth,
bool expected_membar)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
@ -271,7 +275,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return false;
}
if (inval->page_size != page_size && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected page size %u, got %u instead\n", page_size, inval->page_size);
UVM_TEST_PRINT("Expected page size %llu, got %llu instead\n", page_size, inval->page_size);
return false;
}
@ -280,7 +284,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
@ -325,7 +329,7 @@ static NV_STATUS test_page_tree_init_kernel(uvm_gpu_t *gpu, NvU32 big_page_size,
}
static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range)
@ -341,7 +345,7 @@ static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
}
static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_page_table_range_t *single)
{
@ -355,14 +359,14 @@ static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
}
static NV_STATUS test_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
{
return uvm_page_tree_alloc_table(tree, page_size, UVM_PMM_ALLOC_FLAGS_NONE, single, children);
}
static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start)
static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start)
{
uvm_page_table_range_t entry;
bool result = true;
@ -378,7 +382,7 @@ static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU32 page_size, N
return assert_no_invalidate() && result;
}
static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvU32 depth, bool membar)
static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start, NvU32 depth, bool membar)
{
uvm_page_table_range_t entry;
bool result = true;
@ -932,8 +936,8 @@ static NV_STATUS split_and_free(uvm_gpu_t *gpu)
static NV_STATUS check_sizes(uvm_gpu_t *gpu)
{
NvU32 user_sizes = UVM_PAGE_SIZE_2M;
NvU32 kernel_sizes = UVM_PAGE_SIZE_4K | 256;
NvU64 user_sizes = UVM_PAGE_SIZE_2M;
NvU64 kernel_sizes = UVM_PAGE_SIZE_4K | 256;
if (UVM_PAGE_SIZE_64K >= PAGE_SIZE)
user_sizes |= UVM_PAGE_SIZE_64K;
@ -1161,7 +1165,7 @@ static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
return status;
}
static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU32 *page_sizes, const NvU32 page_sizes_count)
static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU64 *page_sizes, const NvU32 page_sizes_count)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@ -1177,8 +1181,8 @@ static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU32 *page_si
for (min_index = 0; min_index < page_sizes_count; ++min_index) {
for (max_index = min_index; max_index < page_sizes_count; ++max_index) {
for (size_index = 0; size_index < ARRAY_SIZE(sizes_in_max_pages); ++size_index) {
NvU32 min_page_size = page_sizes[min_index];
NvU32 max_page_size = page_sizes[max_index];
NvU64 min_page_size = page_sizes[min_index];
NvU64 max_page_size = page_sizes[max_index];
NvU64 size = (NvU64)sizes_in_max_pages[size_index] * max_page_size;
TEST_CHECK_GOTO(test_tlb_batch_invalidates_case(&tree,
@ -1282,7 +1286,7 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
@ -1303,7 +1307,7 @@ static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
// Test page table range vector APIs.
// Notably the test leaks the page_tree and range_vec on error as it's hard to
// clean up on failure and the destructors would likely assert.
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU64 page_size)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@ -1511,7 +1515,7 @@ static uvm_mmu_page_table_alloc_t fake_table_alloc(uvm_aperture_t aperture, NvU6
// Queries the supported page sizes of the GPU (uvm_gpu_t) and fills the
// page_sizes array up to MAX_NUM_PAGE_SIZES. Returns the number of elements
// in page_sizes.
size_t get_page_sizes(uvm_gpu_t *gpu, NvU32 *page_sizes)
size_t get_page_sizes(uvm_gpu_t *gpu, NvU64 *page_sizes)
{
unsigned long page_size_log2;
unsigned long page_sizes_bitvec;
@ -1524,7 +1528,7 @@ size_t get_page_sizes(uvm_gpu_t *gpu, NvU32 *page_sizes)
page_sizes_bitvec = hal->page_sizes();
for_each_set_bit(page_size_log2, &page_sizes_bitvec, BITS_PER_LONG) {
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
NvU64 page_size = 1ULL << page_size_log2;
UVM_ASSERT(count < MAX_NUM_PAGE_SIZES);
page_sizes[count++] = page_size;
}
@ -1572,7 +1576,7 @@ typedef NV_STATUS (*entry_test_page_size_func)(uvm_gpu_t *gpu, size_t page_size)
static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
{
static const NvU32 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 pde_bits;
uvm_mmu_page_table_alloc_t *phys_allocs[2];
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
@ -1663,7 +1667,7 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
@ -1759,7 +1763,7 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
@ -1833,7 +1837,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU32 i, num_page_sizes;
num_page_sizes = get_page_sizes(gpu, page_sizes);
@ -1847,7 +1851,7 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
@ -2290,8 +2294,8 @@ static NV_STATUS fake_gpu_init_hopper(uvm_gpu_t *fake_gpu)
static NV_STATUS maxwell_test_page_tree(uvm_gpu_t *maxwell)
{
// create a fake Maxwell GPU for this test.
static const NvU32 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU32 i, j, big_page_size, page_size;
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 i, j, big_page_size, page_size;
TEST_CHECK_RET(fake_gpu_init_maxwell(maxwell) == NV_OK);
@ -2320,7 +2324,7 @@ static NV_STATUS pascal_test_page_tree(uvm_gpu_t *pascal)
// create a fake Pascal GPU for this test.
NvU32 tlb_batch_saved_max_pages;
NvU32 i;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;
TEST_CHECK_RET(fake_gpu_init_pascal(pascal) == NV_OK);
@ -2381,7 +2385,7 @@ static NV_STATUS volta_test_page_tree(uvm_gpu_t *volta)
static NV_STATUS ampere_test_page_tree(uvm_gpu_t *ampere)
{
NvU32 i, tlb_batch_saved_max_pages;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;
TEST_CHECK_RET(fake_gpu_init_ampere(ampere) == NV_OK);

View File

@ -92,7 +92,13 @@ void uvm_hal_pascal_host_tlb_invalidate_all(uvm_push_t *push, uvm_gpu_phys_addre
uvm_hal_tlb_invalidate_membar(push, membar);
}
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb, NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
@ -127,9 +133,9 @@ void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_addres
ack_value = HWCONST(C06F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
base >>= 12;
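The assert-format changes in this hunk follow from page_size widening to NvU64: once the argument is 64-bit, the "0x%x" conversions no longer match the argument type (a format/argument mismatch is undefined behavior), so they become "0x%llx". A minimal standalone illustration in userspace C (not driver code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_size = 2ULL << 20;   /* 2MB, now carried in a 64-bit type */

    /* printf("0x%x\n", page_size);                      mismatched conversion */
    printf("0x%llx\n", (unsigned long long)page_size);   /* what the asserts switch to */
    printf("0x%" PRIx64 "\n", page_size);                /* portable alternative */
    return 0;
}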

View File

@ -54,7 +54,7 @@ static NvU32 entries_per_index_pascal(NvU32 depth)
return 1;
}
static NvLength entry_offset_pascal(NvU32 depth, NvU32 page_size)
static NvLength entry_offset_pascal(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (page_size == UVM_PAGE_SIZE_4K && depth == 3)
@ -178,7 +178,7 @@ static NvLength entry_size_pascal(NvU32 depth)
return 8;
}
static NvU32 index_bits_pascal(NvU32 depth, NvU32 page_size)
static NvU32 index_bits_pascal(NvU32 depth, NvU64 page_size)
{
static const NvU32 bit_widths[] = {2, 9, 9, 8};
// some code paths keep on querying this until they get a 0, meaning only the page offset remains.
@ -204,7 +204,7 @@ static NvU32 num_va_bits_pascal(void)
return 49;
}
static NvLength allocation_size_pascal(NvU32 depth, NvU32 page_size)
static NvLength allocation_size_pascal(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (depth == 4 && page_size == UVM_PAGE_SIZE_64K)
@ -213,7 +213,7 @@ static NvLength allocation_size_pascal(NvU32 depth, NvU32 page_size)
return 4096;
}
static NvU32 page_table_depth_pascal(NvU32 page_size)
static NvU32 page_table_depth_pascal(NvU64 page_size)
{
if (page_size == UVM_PAGE_SIZE_2M)
return 3;
@ -221,12 +221,12 @@ static NvU32 page_table_depth_pascal(NvU32 page_size)
return 4;
}
static NvU32 page_sizes_pascal(void)
static NvU64 page_sizes_pascal(void)
{
return UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}
static NvU64 unmapped_pte_pascal(NvU32 page_size)
static NvU64 unmapped_pte_pascal(NvU64 page_size)
{
// Setting the privilege bit on an otherwise-zeroed big PTE causes the
// corresponding 4k PTEs to be ignored. This allows the invalidation of a
@ -362,7 +362,7 @@ static uvm_mmu_mode_hal_t pascal_mmu_mode_hal =
.page_sizes = page_sizes_pascal
};
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU64 big_page_size)
{
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);

View File

@ -162,7 +162,7 @@ static void grow_fault_granularity_if_no_thrashing(uvm_perf_prefetch_bitmap_tree
}
static void grow_fault_granularity(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_va_block_region_t big_pages_region,
uvm_va_block_region_t max_prefetch_region,
const uvm_page_mask_t *faulted_pages,
@ -245,7 +245,7 @@ static void update_bitmap_tree_from_va_block(uvm_perf_prefetch_bitmap_tree_t *bi
uvm_va_block_region_t max_prefetch_region)
{
NvU32 big_page_size;
NvU64 big_page_size;
uvm_va_block_region_t big_pages_region;
uvm_va_space_t *va_space;
const uvm_page_mask_t *thrashing_pages;

View File

@ -1987,21 +1987,12 @@ NV_STATUS uvm_perf_thrashing_init(void)
UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT,
UVM_PERF_THRASHING_PIN_THRESHOLD_MAX);
// In Confidential Computing, the DMA path is slower due to cryptographic
// operations & other associated overhead. Enforce a larger window to allow
// the thrashing mitigation mechanisms to work properly.
if (g_uvm_global.conf_computing_enabled)
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 10);
else
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT);
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT);
INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_nap,
UVM_PERF_THRASHING_NAP_DEFAULT,
UVM_PERF_THRASHING_NAP_MAX);
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_epoch, UVM_PERF_THRASHING_EPOCH_DEFAULT);
INIT_THRASHING_PARAMETER(uvm_perf_thrashing_pin, UVM_PERF_THRASHING_PIN_DEFAULT);

View File

@ -1890,8 +1890,11 @@ static uvm_gpu_chunk_t *claim_free_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_
if (!chunk)
goto out;
UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size, "chunk size %u expected %u\n",
uvm_gpu_chunk_get_size(chunk), chunk_size);
UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size,
"chunk size %u expected %u\n",
uvm_gpu_chunk_get_size(chunk),
chunk_size);
UVM_ASSERT(chunk->type == type);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));
@ -2756,7 +2759,7 @@ static bool uvm_pmm_should_inject_pma_eviction_error(uvm_pmm_gpu_t *pmm)
// See the documentation of pmaEvictPagesCb_t in pma.h for details of the
// expected semantics.
static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
NvU32 page_size,
NvU64 page_size,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,
@ -2861,7 +2864,7 @@ error:
}
static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
NvU32 page_size,
NvU64 page_size,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -65,30 +65,30 @@
typedef enum
{
UVM_CHUNK_SIZE_1 = 1ULL,
UVM_CHUNK_SIZE_2 = 2ULL,
UVM_CHUNK_SIZE_4 = 4ULL,
UVM_CHUNK_SIZE_8 = 8ULL,
UVM_CHUNK_SIZE_16 = 16ULL,
UVM_CHUNK_SIZE_32 = 32ULL,
UVM_CHUNK_SIZE_64 = 64ULL,
UVM_CHUNK_SIZE_128 = 128ULL,
UVM_CHUNK_SIZE_256 = 256ULL,
UVM_CHUNK_SIZE_512 = 512ULL,
UVM_CHUNK_SIZE_1K = 1024ULL,
UVM_CHUNK_SIZE_2K = 2*1024ULL,
UVM_CHUNK_SIZE_4K = 4*1024ULL,
UVM_CHUNK_SIZE_8K = 8*1024ULL,
UVM_CHUNK_SIZE_16K = 16*1024ULL,
UVM_CHUNK_SIZE_32K = 32*1024ULL,
UVM_CHUNK_SIZE_64K = 64*1024ULL,
UVM_CHUNK_SIZE_128K = 128*1024ULL,
UVM_CHUNK_SIZE_256K = 256*1024ULL,
UVM_CHUNK_SIZE_512K = 512*1024ULL,
UVM_CHUNK_SIZE_1M = 1024*1024ULL,
UVM_CHUNK_SIZE_2M = 2*1024*1024ULL,
UVM_CHUNK_SIZE_1 = 1,
UVM_CHUNK_SIZE_2 = 2,
UVM_CHUNK_SIZE_4 = 4,
UVM_CHUNK_SIZE_8 = 8,
UVM_CHUNK_SIZE_16 = 16,
UVM_CHUNK_SIZE_32 = 32,
UVM_CHUNK_SIZE_64 = 64,
UVM_CHUNK_SIZE_128 = 128,
UVM_CHUNK_SIZE_256 = 256,
UVM_CHUNK_SIZE_512 = 512,
UVM_CHUNK_SIZE_1K = 1024,
UVM_CHUNK_SIZE_2K = 2*1024,
UVM_CHUNK_SIZE_4K = 4*1024,
UVM_CHUNK_SIZE_8K = 8*1024,
UVM_CHUNK_SIZE_16K = 16*1024,
UVM_CHUNK_SIZE_32K = 32*1024,
UVM_CHUNK_SIZE_64K = 64*1024,
UVM_CHUNK_SIZE_128K = 128*1024,
UVM_CHUNK_SIZE_256K = 256*1024,
UVM_CHUNK_SIZE_512K = 512*1024,
UVM_CHUNK_SIZE_1M = 1024*1024,
UVM_CHUNK_SIZE_2M = 2*1024*1024,
UVM_CHUNK_SIZE_MAX = UVM_CHUNK_SIZE_2M,
UVM_CHUNK_SIZE_INVALID = UVM_CHUNK_SIZE_MAX * 2ULL
UVM_CHUNK_SIZE_INVALID = UVM_CHUNK_SIZE_MAX * 2
} uvm_chunk_size_t;
#define UVM_CHUNK_SIZES_MASK (uvm_chunk_sizes_mask_t)(UVM_CHUNK_SIZE_MAX | (UVM_CHUNK_SIZE_MAX-1))

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -43,7 +43,7 @@ NV_STATUS uvm_pmm_sysmem_init(void)
// Ensure that only supported CPU chunk sizes are enabled.
uvm_cpu_chunk_allocation_sizes &= UVM_CPU_CHUNK_SIZES;
if (!uvm_cpu_chunk_allocation_sizes || !(uvm_cpu_chunk_allocation_sizes & PAGE_SIZE)) {
pr_info("Invalid value for uvm_cpu_chunk_allocation_sizes = 0x%x, using 0x%lx instead\n",
pr_info("Invalid value for uvm_cpu_chunk_allocation_sizes = 0x%x, using 0x%llx instead\n",
uvm_cpu_chunk_allocation_sizes,
UVM_CPU_CHUNK_SIZES);
uvm_cpu_chunk_allocation_sizes = UVM_CPU_CHUNK_SIZES;
@ -461,69 +461,12 @@ static NvU32 compute_gpu_mappings_entry_index(uvm_parent_processor_mask_t *dma_a
return uvm_parent_processor_mask_get_gpu_count(&subset_mask);
}
static void cpu_chunk_release(nv_kref_t *kref)
{
uvm_cpu_chunk_t *chunk = container_of(kref, uvm_cpu_chunk_t, refcount);
uvm_parent_processor_mask_t *mapping_mask;
uvm_parent_processor_id_t id;
uvm_cpu_physical_chunk_t *phys_chunk = NULL;
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
if (uvm_cpu_chunk_is_physical(chunk)) {
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
uvm_assert_mutex_unlocked(&phys_chunk->lock);
mapping_mask = &phys_chunk->gpu_mappings.dma_addrs_mask;
}
else {
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
mapping_mask = &logical_chunk->mapped_gpus;
}
for_each_parent_id_in_mask(id, mapping_mask) {
uvm_parent_gpu_t *parent_gpu = uvm_parent_gpu_get(id);
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, parent_gpu);
}
if (uvm_cpu_chunk_is_physical(chunk)) {
if (phys_chunk->gpu_mappings.max_entries > 1)
uvm_kvfree(phys_chunk->gpu_mappings.dynamic_entries);
if (uvm_cpu_chunk_get_size(chunk) > PAGE_SIZE &&
!bitmap_empty(phys_chunk->dirty_bitmap, uvm_cpu_chunk_num_pages(chunk)))
SetPageDirty(phys_chunk->common.page);
uvm_kvfree(phys_chunk->dirty_bitmap);
if (chunk->type != UVM_CPU_CHUNK_TYPE_HMM)
put_page(phys_chunk->common.page);
}
else {
uvm_cpu_chunk_free(logical_chunk->parent);
}
uvm_kvfree(chunk);
}
static void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
nv_kref_get(&chunk->refcount);
}
void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk)
{
if (!chunk)
return;
nv_kref_put(&chunk->refcount, cpu_chunk_release);
}
static uvm_cpu_physical_chunk_t *get_physical_parent(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);
while (!uvm_cpu_chunk_is_physical(chunk))
while (uvm_cpu_chunk_is_logical(chunk))
chunk = uvm_cpu_chunk_to_logical(chunk)->parent;
return uvm_cpu_chunk_to_physical(chunk);
@ -581,6 +524,7 @@ static uvm_cpu_phys_mapping_t *chunk_phys_mapping_alloc(uvm_cpu_physical_chunk_t
static uvm_cpu_phys_mapping_t *chunk_phys_mapping_get(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gpu_id_t id)
{
uvm_assert_mutex_locked(&chunk->lock);
if (uvm_parent_processor_mask_test(&chunk->gpu_mappings.dma_addrs_mask, id)) {
if (chunk->gpu_mappings.max_entries == 1) {
return &chunk->gpu_mappings.static_entry;
@ -598,7 +542,6 @@ static void chunk_inc_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
{
uvm_cpu_phys_mapping_t *mapping;
uvm_assert_mutex_locked(&chunk->lock);
mapping = chunk_phys_mapping_get(chunk, id);
UVM_ASSERT(mapping);
mapping->map_count++;
@ -608,7 +551,6 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
{
uvm_cpu_phys_mapping_t *mapping;
uvm_assert_mutex_locked(&chunk->lock);
mapping = chunk_phys_mapping_get(chunk, id);
UVM_ASSERT(mapping);
UVM_ASSERT(mapping->dma_addr && mapping->map_count);
@ -616,6 +558,8 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
if (mapping->map_count == 0) {
uvm_parent_gpu_t *parent_gpu = uvm_parent_gpu_get(id);
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
uvm_parent_gpu_unmap_cpu_pages(parent_gpu, mapping->dma_addr, uvm_cpu_chunk_get_size(&chunk->common));
mapping->dma_addr = 0;
if (chunk->gpu_mappings.max_entries > 1) {
@ -631,7 +575,7 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
}
}
NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
uvm_cpu_phys_mapping_t *mapping;
@ -641,36 +585,41 @@ NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_
if (uvm_cpu_chunk_is_logical(chunk)) {
uvm_cpu_logical_chunk_t *logical_chunk = uvm_cpu_chunk_to_logical(chunk);
if (!uvm_parent_processor_mask_test(&logical_chunk->mapped_gpus, parent_gpu->id))
if (!uvm_processor_mask_test(&logical_chunk->mapped_gpus, gpu->id))
return 0;
parent_offset = cpu_chunk_get_phys_index(logical_chunk);
}
uvm_mutex_lock(&phys_chunk->lock);
mapping = chunk_phys_mapping_get(phys_chunk, parent_gpu->id);
if (mapping)
mapping = chunk_phys_mapping_get(phys_chunk, gpu->parent->id);
if (mapping &&
(uvm_cpu_chunk_is_logical(chunk) ||
uvm_sub_processor_mask_test(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id))))
dma_addr = mapping->dma_addr + (parent_offset * PAGE_SIZE);
uvm_mutex_unlock(&phys_chunk->lock);
return dma_addr;
}
// Create a DMA mapping for the chunk on the given parent GPU. This will map the
// entire parent physical chunk on the GPU.
// Create a DMA mapping for the chunk on the given GPU. This will map the
// entire physical chunk on the parent GPU and record that a given MIG
// partition is using the mapping.
//
// Returns NV_OK on success. On error, any of the errors returned by
// uvm_parent_gpu_map_cpu_pages() can be returned. In the case that the DMA
// mapping structure could not be allocated, NV_ERR_NO_MEMORY is returned.
static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
static NV_STATUS cpu_chunk_map_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
uvm_parent_gpu_t *parent_gpu = gpu->parent;
uvm_cpu_physical_chunk_t *phys_chunk;
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
uvm_cpu_phys_mapping_t *mapping;
NV_STATUS status = NV_OK;
if (uvm_cpu_chunk_is_logical(chunk)) {
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
if (uvm_parent_processor_mask_test(&logical_chunk->mapped_gpus, parent_gpu->id))
if (uvm_processor_mask_test(&logical_chunk->mapped_gpus, gpu->id))
return status;
}
@ -679,7 +628,6 @@ static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_paren
if (!uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id)) {
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(&phys_chunk->common);
uvm_cpu_phys_mapping_t *mapping;
NvU64 dma_addr;
status = uvm_parent_gpu_map_cpu_pages(parent_gpu, phys_chunk->common.page, chunk_size, &dma_addr);
@ -695,39 +643,59 @@ static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_paren
mapping->dma_addr = dma_addr;
mapping->map_count = 1;
uvm_sub_processor_mask_zero(&mapping->sub_processors);
if (!logical_chunk)
uvm_sub_processor_mask_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id));
uvm_parent_processor_mask_set(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id);
}
else {
// The mapping count on the physical chunk is only increased when
// mapping logical chunks.
if (uvm_cpu_chunk_is_logical(chunk))
chunk_inc_gpu_mapping(phys_chunk, parent_gpu->id);
mapping = chunk_phys_mapping_get(phys_chunk, parent_gpu->id);
UVM_ASSERT(mapping);
// Increment the map_count for logical chunks or the first time a
// MIG partition is sharing a physical chunk.
if (logical_chunk ||
!uvm_sub_processor_mask_test_and_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id)))
mapping->map_count++;
}
if (logical_chunk) {
uvm_processor_mask_set(&logical_chunk->mapped_gpus, gpu->id);
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
}
else {
UVM_ASSERT(!uvm_sub_processor_mask_empty(&mapping->sub_processors));
UVM_ASSERT(uvm_sub_processor_mask_get_count(&mapping->sub_processors) == mapping->map_count);
}
done:
uvm_mutex_unlock(&phys_chunk->lock);
if (status == NV_OK && uvm_cpu_chunk_is_logical(chunk))
uvm_parent_processor_mask_set(&logical_chunk->mapped_gpus, parent_gpu->id);
return status;
}
void uvm_cpu_chunk_unmap_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
static void cpu_chunk_unmap_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_gpu_id_t gpu_id)
{
uvm_cpu_physical_chunk_t *phys_chunk;
uvm_cpu_logical_chunk_t *logical_chunk;
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
uvm_parent_gpu_id_t id = uvm_parent_gpu_id_from_gpu_id(gpu_id);
uvm_mutex_lock(&phys_chunk->lock);
if (uvm_cpu_chunk_is_logical(chunk)) {
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
if (!uvm_parent_processor_mask_test_and_clear(&logical_chunk->mapped_gpus, parent_gpu->id))
return;
}
uvm_processor_mask_t *mapping_mask = &uvm_cpu_chunk_to_logical(chunk)->mapped_gpus;
phys_chunk = get_physical_parent(chunk);
uvm_mutex_lock(&phys_chunk->lock);
if (uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id))
chunk_dec_gpu_mapping(phys_chunk, parent_gpu->id);
if (uvm_processor_mask_test_and_clear(mapping_mask, gpu_id))
chunk_dec_gpu_mapping(phys_chunk, id);
}
else {
if (uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, id)) {
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, id);
if (uvm_sub_processor_mask_test_and_clear(&mapping->sub_processors, uvm_id_sub_processor_index(gpu_id)))
chunk_dec_gpu_mapping(phys_chunk, id);
}
}
uvm_mutex_unlock(&phys_chunk->lock);
}
@ -737,17 +705,112 @@ NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
NV_STATUS status;
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
status = cpu_chunk_map_parent_gpu_phys(chunk, gpu->parent);
status = cpu_chunk_map_gpu_phys(chunk, gpu);
if (status != NV_OK)
return status;
status = uvm_mmu_sysmem_map(gpu, uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent), chunk_size);
status = uvm_mmu_sysmem_map(gpu, uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu), chunk_size);
if (status != NV_OK)
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
cpu_chunk_unmap_gpu_phys(chunk, gpu->id);
return status;
}
void uvm_cpu_chunk_unmap_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
cpu_chunk_unmap_gpu_phys(chunk, gpu->id);
// Note: there is no corresponding uvm_mmu_sysmem_unmap() for
// uvm_mmu_sysmem_map().
}
static void cpu_logical_chunk_release(uvm_cpu_logical_chunk_t *logical_chunk)
{
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(logical_chunk->parent);
uvm_processor_id_t gpu_id;
uvm_mutex_lock(&phys_chunk->lock);
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus)
chunk_dec_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
uvm_mutex_unlock(&phys_chunk->lock);
uvm_cpu_chunk_free(logical_chunk->parent);
}
static void cpu_physical_chunk_release(uvm_cpu_chunk_t *chunk)
{
uvm_cpu_physical_chunk_t *phys_chunk = uvm_cpu_chunk_to_physical(chunk);
uvm_parent_processor_id_t id;
uvm_assert_mutex_unlocked(&phys_chunk->lock);
// There should be no other threads using this chunk but we lock it because
// of assertions in chunk_phys_mapping_get() and chunk_dec_gpu_mapping().
uvm_mutex_lock(&phys_chunk->lock);
for_each_parent_id_in_mask(id, &phys_chunk->gpu_mappings.dma_addrs_mask) {
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, id);
NvU32 count;
UVM_ASSERT(mapping);
UVM_ASSERT(!uvm_sub_processor_mask_empty(&mapping->sub_processors));
// Get a count of set bits in the sub_processors mask then clear it so
// that chunk_dec_gpu_mapping() sees an empty mask when map_count == 0.
// Using for_each_sub_processor_in_mask could try to dereference
// mapping after map_count == 0 in the loop below.
count = uvm_sub_processor_mask_get_count(&mapping->sub_processors);
uvm_sub_processor_mask_zero(&mapping->sub_processors);
for (; count; count--)
chunk_dec_gpu_mapping(phys_chunk, id);
}
uvm_mutex_unlock(&phys_chunk->lock);
UVM_ASSERT(uvm_parent_processor_mask_empty(&phys_chunk->gpu_mappings.dma_addrs_mask));
if (phys_chunk->gpu_mappings.max_entries > 1)
uvm_kvfree(phys_chunk->gpu_mappings.dynamic_entries);
if (uvm_cpu_chunk_get_size(chunk) > PAGE_SIZE &&
!bitmap_empty(phys_chunk->dirty_bitmap, uvm_cpu_chunk_num_pages(chunk)))
SetPageDirty(chunk->page);
uvm_kvfree(phys_chunk->dirty_bitmap);
if (chunk->type != UVM_CPU_CHUNK_TYPE_HMM)
put_page(chunk->page);
}
static void cpu_chunk_release(nv_kref_t *kref)
{
uvm_cpu_chunk_t *chunk = container_of(kref, uvm_cpu_chunk_t, refcount);
if (uvm_cpu_chunk_is_logical(chunk))
cpu_logical_chunk_release(uvm_cpu_chunk_to_logical(chunk));
else
cpu_physical_chunk_release(chunk);
uvm_kvfree(chunk);
}
static void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
nv_kref_get(&chunk->refcount);
}
void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk)
{
if (!chunk)
return;
nv_kref_put(&chunk->refcount, cpu_chunk_release);
}
static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
int nid,
uvm_cpu_chunk_alloc_flags_t alloc_flags)
@ -876,14 +939,37 @@ int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk)
return page_to_nid(chunk->page);
}
// Convert the mask of DMA mapped parent GPUs and the sub-processor mask into
// one uvm_processor_mask_t in 'dma_map_mask'.
static void get_dma_map_mask(uvm_cpu_physical_chunk_t *chunk, uvm_processor_mask_t *dma_map_mask)
{
uvm_parent_processor_id_t id;
NvU32 sub_index;
uvm_assert_mutex_locked(&chunk->lock);
for_each_parent_id_in_mask(id, &chunk->gpu_mappings.dma_addrs_mask) {
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(chunk, id);
for_each_sub_processor_index_in_mask(sub_index, &mapping->sub_processors) {
uvm_processor_id_t gpu_id = uvm_gpu_id_from_sub_processor(id, sub_index);
uvm_sub_processor_mask_clear(&mapping->sub_processors, sub_index);
uvm_processor_mask_set(dma_map_mask, gpu_id);
}
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
}
}
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks)
{
NV_STATUS status = NV_OK;
uvm_cpu_logical_chunk_t *new_chunk;
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
uvm_parent_processor_id_t id;
uvm_parent_processor_mask_t *dma_map_mask;
uvm_processor_id_t gpu_id;
uvm_processor_mask_t *dma_map_mask = NULL;
uvm_chunk_size_t new_size;
size_t num_new_chunks;
size_t num_subchunk_pages;
@ -902,21 +988,20 @@ NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chun
// Get the largest size below the size of the input chunk.
new_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), uvm_cpu_chunk_get_size(chunk));
UVM_ASSERT(new_size);
UVM_ASSERT(new_size != UVM_CHUNK_SIZE_INVALID);
num_new_chunks = uvm_cpu_chunk_get_size(chunk) / new_size;
num_subchunk_pages = new_size / PAGE_SIZE;
if (uvm_cpu_chunk_is_physical(chunk)) {
dma_map_mask = &phys_chunk->gpu_mappings.dma_addrs_mask;
}
else {
if (uvm_cpu_chunk_is_logical(chunk)) {
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
dma_map_mask = &logical_chunk->mapped_gpus;
}
uvm_mutex_lock(&phys_chunk->lock);
for (i = 0; i < num_new_chunks; i++) {
new_chunk = uvm_kvmalloc_zero(sizeof(*logical_chunk));
new_chunk = uvm_kvmalloc_zero(sizeof(*new_chunk));
if (!new_chunk) {
uvm_mutex_unlock(&phys_chunk->lock);
status = NV_ERR_NO_MEMORY;
@ -929,19 +1014,25 @@ NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chun
nv_kref_init(&new_chunk->common.refcount);
new_chunk->parent = chunk;
uvm_cpu_chunk_get(new_chunk->parent);
for_each_parent_id_in_mask(id, dma_map_mask)
chunk_inc_gpu_mapping(phys_chunk, id);
uvm_parent_processor_mask_copy(&new_chunk->mapped_gpus, dma_map_mask);
if (i == 0 && !logical_chunk) {
dma_map_mask = &new_chunk->mapped_gpus;
get_dma_map_mask(phys_chunk, dma_map_mask);
}
else {
uvm_processor_mask_copy(&new_chunk->mapped_gpus, dma_map_mask);
}
for_each_id_in_mask(gpu_id, dma_map_mask)
chunk_inc_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
new_chunks[i] = &new_chunk->common;
}
// Release the references that are held by the chunk being split.
for_each_parent_id_in_mask(id, dma_map_mask)
chunk_dec_gpu_mapping(phys_chunk, id);
for_each_id_in_mask(gpu_id, dma_map_mask)
chunk_dec_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
// If the chunk being split is a logical chunk, clear its mapped_gpus mask.
if (uvm_cpu_chunk_is_logical(chunk))
uvm_parent_processor_mask_zero(&logical_chunk->mapped_gpus);
if (logical_chunk)
uvm_processor_mask_zero(&logical_chunk->mapped_gpus);
uvm_mutex_unlock(&phys_chunk->lock);
@ -963,7 +1054,7 @@ static bool verify_merging_chunks(uvm_cpu_chunk_t **chunks, size_t num_chunks)
{
uvm_cpu_logical_chunk_t *logical_chunk;
uvm_cpu_chunk_t *first_chunk_parent;
uvm_parent_processor_mask_t *first_chunk_mapped_gpus;
uvm_processor_mask_t *first_chunk_mapped_gpus;
uvm_chunk_size_t first_chunk_size;
size_t i;
@ -994,7 +1085,7 @@ static bool verify_merging_chunks(uvm_cpu_chunk_t **chunks, size_t num_chunks)
// 2.1 All mappings to GPUs in each of child chunks' masks that are
// not also present in the parent chunk's mask are destroyed.
// 2.2 mapped_gpus mask of the parent chunk remains unmodified.
UVM_ASSERT(uvm_parent_processor_mask_equal(&logical_chunk->mapped_gpus, first_chunk_mapped_gpus));
UVM_ASSERT(uvm_processor_mask_equal(&logical_chunk->mapped_gpus, first_chunk_mapped_gpus));
}
return true;
@ -1005,14 +1096,14 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks)
uvm_cpu_chunk_t *parent;
uvm_cpu_logical_chunk_t *logical_chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
uvm_parent_processor_id_t id;
uvm_processor_id_t gpu_id;
uvm_chunk_size_t chunk_size;
uvm_chunk_size_t parent_chunk_size;
size_t num_merge_chunks;
size_t i;
UVM_ASSERT(chunks);
UVM_ASSERT(!uvm_cpu_chunk_is_physical(chunks[0]));
UVM_ASSERT(uvm_cpu_chunk_is_logical(chunks[0]));
logical_chunk = uvm_cpu_chunk_to_logical(chunks[0]);
parent = logical_chunk->parent;
@ -1033,11 +1124,22 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks)
phys_chunk = get_physical_parent(chunks[0]);
uvm_mutex_lock(&phys_chunk->lock);
for_each_parent_id_in_mask(id, &logical_chunk->mapped_gpus)
chunk_inc_gpu_mapping(phys_chunk, id);
if (!uvm_cpu_chunk_is_physical(parent))
uvm_parent_processor_mask_copy(&uvm_cpu_chunk_to_logical(parent)->mapped_gpus, &logical_chunk->mapped_gpus);
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus)
chunk_inc_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
if (uvm_cpu_chunk_is_logical(parent)) {
uvm_processor_mask_copy(&uvm_cpu_chunk_to_logical(parent)->mapped_gpus, &logical_chunk->mapped_gpus);
}
else {
// Restore the mapping->sub_processors mask for each mapped GPU.
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus) {
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
UVM_ASSERT(mapping);
uvm_sub_processor_mask_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu_id));
}
}
uvm_mutex_unlock(&phys_chunk->lock);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -246,8 +246,19 @@ struct uvm_cpu_chunk_struct
typedef struct
{
// Physical GPU DMA address of the CPU chunk.
NvU64 dma_addr;
// Reference count of all sub_processors using this mapping across logical
// and physical chunks.
NvU32 map_count;
// Mask of MIG instances or physical GPU.
// This is only valid for physical CPU chunks that have not been split into
// logical chunks. When the chunk is split, all the
// uvm_cpu_logical_chunk_t::mapped_gpus masks have a bit set for each
// count in map_count and sub_processors is set to zero.
uvm_sub_processor_mask_t sub_processors;
} uvm_cpu_phys_mapping_t;
typedef struct
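The comments on uvm_cpu_phys_mapping_t above describe the invariant for an unsplit physical chunk: map_count holds one reference per MIG instance, so it matches the number of bits set in sub_processors (the same relationship the UVM_ASSERT in cpu_chunk_map_gpu_phys checks). A toy standalone model of just that rule, with simplified types and names that are illustrative only, not the driver's:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model (not driver code) of the rule described above for a physical
 * chunk that has not been split: one reference per MIG instance, so
 * map_count always equals the number of bits set in sub_processors. */
typedef struct {
    uint32_t map_count;
    uint32_t sub_processors;   /* stand-in for uvm_sub_processor_mask_t */
} toy_phys_mapping_t;

static void toy_map(toy_phys_mapping_t *m, unsigned sub_index)
{
    uint32_t bit = 1u << sub_index;

    /* Only the first map from a given MIG instance takes a reference. */
    if (!(m->sub_processors & bit)) {
        m->sub_processors |= bit;
        m->map_count++;
    }
    assert(m->map_count == (uint32_t)__builtin_popcount(m->sub_processors));
}

static void toy_unmap(toy_phys_mapping_t *m, unsigned sub_index)
{
    uint32_t bit = 1u << sub_index;

    /* Dropping an instance's reference clears its bit. */
    if (m->sub_processors & bit) {
        m->sub_processors &= ~bit;
        m->map_count--;
    }
    assert(m->map_count == (uint32_t)__builtin_popcount(m->sub_processors));
}

int main(void)
{
    toy_phys_mapping_t m = {0};

    toy_map(&m, 0);     /* first MIG instance maps the chunk */
    toy_map(&m, 1);     /* a second instance shares the same DMA mapping */
    toy_map(&m, 1);     /* repeated map from the same instance is a no-op */
    toy_unmap(&m, 0);
    printf("map_count=%u\n", m.map_count);   /* prints 1 */
    return 0;
}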
@ -304,7 +315,9 @@ typedef struct
// Pointer to the parent chunk (which could also be a logical chunk).
uvm_cpu_chunk_t *parent;
uvm_parent_processor_mask_t mapped_gpus;
// Each set bit holds a reference that is also counted in mapping->map_count.
uvm_processor_mask_t mapped_gpus;
} uvm_cpu_logical_chunk_t;
// Return the set of allowed CPU chunk allocation sizes.
@ -417,15 +430,15 @@ void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk);
// For more details see uvm_mmu_sysmem_map().
NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
// Destroy a CPU chunk's DMA mapping for the parent GPU.
// Destroy a CPU chunk's DMA mapping for the given GPU.
// If chunk is a logical chunk, this call may not necessarily destroy the DMA
// mapping of the parent physical chunk since all logical chunks share the
// parent's DMA mapping.
void uvm_cpu_chunk_unmap_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
// mapping of the parent physical chunk since all logical chunks and MIG
// partitions share the parent's DMA mapping.
void uvm_cpu_chunk_unmap_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
// Get the CPU chunk's DMA mapping address for the specified GPU ID.
// If there is no mapping for the GPU, 0 is returned.
NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
// Split a CPU chunk into a set of CPU chunks of the next size down from the set
// of enabled CPU chunk sizes.

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -626,7 +626,7 @@ static NV_STATUS test_cpu_chunk_mapping_access(uvm_cpu_chunk_t *chunk, uvm_gpu_t
TEST_NV_CHECK_RET(cpu_chunk_map_on_cpu(chunk, (void **)&cpu_addr));
memset(cpu_addr, 0, chunk_size);
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
gpu_addr = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr));
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
@ -733,21 +733,21 @@ static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
// - no GPU mapping address.
TEST_CHECK_GOTO(phys_chunk->gpu_mappings.max_entries == 1, done);
TEST_CHECK_GOTO(uvm_parent_processor_mask_get_gpu_count(&phys_chunk->gpu_mappings.dma_addrs_mask) == 0, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0, done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
// Test basic access.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
// Test double map is harmless.
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == dma_addr, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == dma_addr, done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
// Test unmap, remap.
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0, done);
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0, done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
@ -768,6 +768,39 @@ static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_allo
return NV_OK;
}
// TODO: Bug 4351121: This won't actually test anything until uvm_test
// enumerates multiple MIG instances.
static NV_STATUS test_cpu_chunk_mig(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
NvU64 dma_addr_gpu0;
UVM_ASSERT(gpu0->parent == gpu1->parent);
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
// MIG instances in the same physical GPU share the same DMA addresses.
dma_addr_gpu0 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu0);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1) == dma_addr_gpu0, done);
// Unmapping one GPU shouldn't affect the other.
uvm_cpu_chunk_unmap_gpu(chunk, gpu0);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu0) == 0, done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
done:
uvm_cpu_chunk_free(chunk);
return status;
}
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
{
NV_STATUS status = NV_OK;
@ -783,8 +816,8 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1,
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
dma_addr_gpu1 = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu1->parent);
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu2->parent);
dma_addr_gpu1 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1);
uvm_cpu_chunk_unmap_gpu(chunk, gpu2);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
@ -798,7 +831,9 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1,
// GPU1. It's true that we may get a false negative if both addresses
// happened to alias and we had a bug in how the addresses are shifted in
// the dense array, but that's better than intermittent failure.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu1->parent) == dma_addr_gpu1, done);
// Also note that multiple MIG instances in the same physical GPU share the
// parent's physical DMA mapping.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1) == dma_addr_gpu1, done);
done:
uvm_cpu_chunk_free(chunk);
@ -828,7 +863,7 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done_free);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done_free);
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
@ -845,13 +880,14 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == 1, done_free);
// Since all logical chunks were mapped, the entire merged chunk should
// be accessible without needing to map it.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu), done_free);
// Test that GPU mappings are transferred after a split
phys_dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
phys_dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
@ -859,9 +895,9 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
NvU64 dma_addr;
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu), done);
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[i], gpu->parent);
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu);
TEST_CHECK_GOTO(dma_addr == phys_dma_addr + (i * split_size), done);
uvm_cpu_chunk_unmap_parent_gpu_phys(split_chunks[i], gpu->parent);
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu);
}
// Test that mapping one logical chunk does not affect others.
@ -871,7 +907,7 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
for (i = 0; i < num_split_chunks; i++) {
if (i != map_chunk)
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[i], gpu->parent) == 0, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu) == 0, done);
}
if (split_size > PAGE_SIZE) {
@ -927,6 +963,118 @@ static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS do_test_cpu_chunk_split_and_merge_2(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
NV_STATUS status = NV_OK;
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t num_split_chunks;
uvm_cpu_chunk_t **split_chunks;
uvm_cpu_chunk_t *merged_chunk;
uvm_chunk_size_t split_size;
size_t i;
split_size = uvm_chunk_find_prev_size(alloc_sizes, size);
UVM_ASSERT(split_size != UVM_CHUNK_SIZE_INVALID);
num_split_chunks = size / split_size;
split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
if (!split_chunks)
return NV_ERR_NO_MEMORY;
// Map both GPUs.
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done_free);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done_free);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done_free);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done_free);
// Then split.
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
// Unmap gpu0 from all split chunks.
for (i = 0; i < num_split_chunks; i++) {
TEST_CHECK_GOTO(split_chunks[i], done);
TEST_CHECK_GOTO(uvm_cpu_chunk_is_logical(split_chunks[i]), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[i]) == split_size, done);
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu0);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu0) == 0, done);
// Test that gpu1 still has access.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu1), done);
}
// Test CPU chunk merging.
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == 1, done_free);
    // Since the logical chunks were unmapped on gpu0 but left mapped on gpu1,
    // the merged chunk should have no gpu0 mapping and should remain
    // accessible on gpu1 without needing to map it again.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(merged_chunk, gpu0) == 0, done_free);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu1), done_free);
// Unmap gpu1 so we start with a fully unmapped physical chunk.
uvm_cpu_chunk_unmap_gpu(chunk, gpu1);
// Split the physical chunk.
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
// Now map everything.
for (i = 0; i < num_split_chunks; i++) {
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu0), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu1), done);
}
// Test CPU chunk merging with everything mapped.
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
// At this point, all split chunks have been merged.
num_split_chunks = 0;
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
// Since all logical chunks were mapped, the entire merged chunk should
// be accessible without needing to map it.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu0), done_free);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu1), done_free);
done:
for (i = 0; i < num_split_chunks; i++)
uvm_cpu_chunk_free(split_chunks[i]);
done_free:
uvm_kvfree(split_chunks);
return status;
}
static NV_STATUS test_cpu_chunk_split_and_merge_2(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t size;
size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
for_each_chunk_size_from(size, alloc_sizes) {
uvm_cpu_chunk_t *chunk;
NV_STATUS status;
// It is possible that the allocation fails due to lack of large pages
// rather than an API issue, which will result in a false negative.
// However, that should be very rare.
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
status = do_test_cpu_chunk_split_and_merge_2(chunk, gpu0, gpu1);
uvm_cpu_chunk_free(chunk);
if (status != NV_OK)
return status;
}
return NV_OK;
}
static NV_STATUS test_cpu_chunk_dirty_split(uvm_cpu_chunk_t *chunk)
{
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
@ -1072,7 +1220,9 @@ done:
return status;
}
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk,
uvm_va_space_t *va_space,
const uvm_processor_mask_t *test_gpus)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t **split_chunks;
@ -1099,7 +1249,7 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
chunk = NULL;
// Map every other chunk.
// The call to uvm_cpu_chunk_unmap_parent_gpu_phys() is here in case this
// The call to uvm_cpu_chunk_unmap_gpu() is here in case this
// is part of a double split (see below). In that case, the parent chunk
// would be either mapped or unmapped.
//
@ -1111,7 +1261,7 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
if (i & (1 << uvm_id_gpu_index(gpu->id)))
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu), done);
else
uvm_cpu_chunk_unmap_parent_gpu_phys(split_chunks[i], gpu->parent);
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu);
}
}
@ -1147,9 +1297,9 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[j]) == split_size, done);
for_each_va_space_gpu_in_mask(gpu, va_space, test_gpus) {
if (j & (1 << uvm_id_gpu_index(gpu->id)))
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[j], gpu->parent), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu), done);
else
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[j], gpu->parent), done);
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu), done);
}
}
}
@ -1168,7 +1318,8 @@ done_free:
return status;
}
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space,
const uvm_processor_mask_t *test_gpus)
{
uvm_cpu_chunk_t *chunk;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
@ -1204,6 +1355,50 @@ static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space)
return NV_OK;
}
static uvm_gpu_t *find_first_parent_gpu(const uvm_processor_mask_t *test_gpus,
uvm_va_space_t *va_space)
{
return uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
}
static uvm_gpu_t *find_next_parent_gpu(const uvm_processor_mask_t *test_gpus,
uvm_va_space_t *va_space,
uvm_gpu_t *gpu)
{
uvm_gpu_t *next_gpu = gpu;
while (next_gpu) {
next_gpu = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, next_gpu);
if (!next_gpu || next_gpu->parent != gpu->parent)
break;
}
return next_gpu;
}
static void find_shared_gpu_pair(const uvm_processor_mask_t *test_gpus,
uvm_va_space_t *va_space,
uvm_gpu_t **out_gpu0,
uvm_gpu_t **out_gpu1)
{
uvm_gpu_t *gpu0 = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
uvm_gpu_t *gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
while (gpu1) {
if (gpu0->parent == gpu1->parent) {
*out_gpu0 = gpu0;
*out_gpu1 = gpu1;
return;
}
gpu0 = gpu1;
gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
}
*out_gpu0 = NULL;
*out_gpu1 = NULL;
}
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
@ -1228,13 +1423,29 @@ NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct f
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, test_gpus), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done);
if (uvm_processor_mask_get_gpu_count(test_gpus) >= 3) {
uvm_gpu_t *gpu2, *gpu3;
if (uvm_processor_mask_get_gpu_count(test_gpus) >= 2) {
uvm_gpu_t *gpu2, *gpu3 = NULL;
gpu = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
gpu2 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu);
gpu3 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu2);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_array(gpu, gpu2, gpu3), done);
// Look for a pair of GPUs that don't share a common parent.
gpu = find_first_parent_gpu(test_gpus, va_space);
gpu2 = find_next_parent_gpu(test_gpus, va_space, gpu);
if (gpu2) {
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge_2(gpu, gpu2), done);
// Look for a third physical GPU.
gpu3 = find_next_parent_gpu(test_gpus, va_space, gpu2);
if (gpu3)
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_array(gpu, gpu2, gpu3), done);
}
// Look for a pair of GPUs that share a common parent.
find_shared_gpu_pair(test_gpus, va_space, &gpu, &gpu2);
if (gpu) {
// Test MIG instances within the same parent GPU.
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge_2(gpu, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mig(gpu, gpu2), done);
}
}
done:

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2023 NVIDIA Corporation
Copyright (c) 2023-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -30,6 +30,8 @@ const uvm_processor_mask_t g_uvm_processor_mask_empty = { };
NV_STATUS uvm_processor_mask_cache_init(void)
{
BUILD_BUG_ON((8 * sizeof(((uvm_sub_processor_mask_t *)0)->bitmap)) < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
g_uvm_processor_mask_cache = NV_KMEM_CACHE_CREATE("uvm_processor_mask_t", uvm_processor_mask_t);
if (!g_uvm_processor_mask_cache)
return NV_ERR_NO_MEMORY;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -277,8 +277,6 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
#define UVM_PARENT_ID_MAX_GPUS NV_MAX_DEVICES
#define UVM_PARENT_ID_MAX_PROCESSORS (UVM_PARENT_ID_MAX_GPUS + 1)
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
#define UVM_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_ID_MAX_PROCESSORS (UVM_ID_MAX_GPUS + 1)
#define UVM_MAX_UNIQUE_GPU_PAIRS SUM_FROM_0_TO_N(UVM_ID_MAX_GPUS - 1)
@ -292,6 +290,9 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
#define UVM_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index) \
UVM_ASSERT_MSG((sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS, "sub_index %u\n", (sub_index))
static int uvm_parent_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_PARENT_ID_CHECK_BOUNDS(id1);
@ -493,11 +494,16 @@ static uvm_gpu_id_t uvm_gpu_id_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
static uvm_gpu_id_t uvm_gpu_id_from_sub_processor_index(NvU32 index, NvU32 sub_index)
{
UVM_ASSERT(index < UVM_PARENT_ID_MAX_GPUS);
UVM_ASSERT(sub_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
return uvm_gpu_id_from_index(index * UVM_PARENT_ID_MAX_SUB_PROCESSORS + sub_index);
}
static uvm_gpu_id_t uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_t id, NvU32 sub_index)
{
return uvm_gpu_id_from_sub_processor_index(uvm_parent_id_gpu_index(id), sub_index);
}
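As a quick illustration of the dense GPU ID layout (hypothetical indexes, not taken from this change): with UVM_PARENT_ID_MAX_SUB_PROCESSORS == 8, the parent GPU at index 2 and MIG sub_index 3 land on dense index 2 * 8 + 3 == 19, which uvm_gpu_id_from_index() then converts into the corresponding uvm_gpu_id_t.
// Sketch only: parent index 2, sub-processor 3.
// uvm_gpu_id_from_sub_processor_index(2, 3) is equivalent to
// uvm_gpu_id_from_index(2 * UVM_PARENT_ID_MAX_SUB_PROCESSORS + 3), i.e. index 19.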
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_gpu_id(const uvm_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
@ -525,6 +531,71 @@ UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
extern const uvm_processor_mask_t g_uvm_processor_mask_cpu;
extern const uvm_processor_mask_t g_uvm_processor_mask_empty;
// This is similar to uvm_parent_processor_mask_t and uvm_processor_mask_t
// but defined as an NvU8 in order to save memory since DECLARE_BITMAP() uses
// unsigned long. It also means we need to define our own bitops.
// Note that these are not atomic operations.
typedef struct
{
NvU8 bitmap;
} uvm_sub_processor_mask_t;
static bool uvm_sub_processor_mask_test(const uvm_sub_processor_mask_t *mask, NvU32 sub_index)
{
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
return mask->bitmap & (1 << sub_index);
}
static void uvm_sub_processor_mask_set(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
{
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
mask->bitmap |= 1 << sub_index;
}
static void uvm_sub_processor_mask_clear(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
{
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
mask->bitmap &= ~(1 << sub_index);
}
static bool uvm_sub_processor_mask_test_and_set(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
{
bool result = uvm_sub_processor_mask_test(mask, sub_index);
if (!result)
uvm_sub_processor_mask_set(mask, sub_index);
return result;
}
static bool uvm_sub_processor_mask_test_and_clear(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
{
bool result = uvm_sub_processor_mask_test(mask, sub_index);
if (result)
uvm_sub_processor_mask_clear(mask, sub_index);
return result;
}
static void uvm_sub_processor_mask_zero(uvm_sub_processor_mask_t *mask)
{
mask->bitmap = 0;
}
static bool uvm_sub_processor_mask_empty(const uvm_sub_processor_mask_t *mask)
{
return mask->bitmap == 0;
}
static NvU32 uvm_sub_processor_mask_get_count(const uvm_sub_processor_mask_t *mask)
{
return hweight8(mask->bitmap);
}
// Like uvm_processor_mask_subset() but ignores the CPU in the subset mask.
// Returns whether the GPUs in subset are a subset of the GPUs in mask.
bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset,
@ -571,8 +642,28 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
i = uvm_gpu_id_next(i))
// Helper to iterate over all sub processor indexes.
#define for_each_sub_processor_index(i) \
for (i = 0; i < UVM_PARENT_ID_MAX_SUB_PROCESSORS; i++)
#define for_each_sub_processor_index(sub_index) \
for ((sub_index) = 0; (sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS; (sub_index)++)
static NvU32 uvm_sub_processor_mask_find_first_index(const uvm_sub_processor_mask_t *mask)
{
unsigned long bitmap = mask->bitmap;
return find_first_bit(&bitmap, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
}
static NvU32 uvm_sub_processor_mask_find_next_index(const uvm_sub_processor_mask_t *mask, NvU32 min_index)
{
unsigned long bitmap = mask->bitmap;
return find_next_bit(&bitmap, UVM_PARENT_ID_MAX_SUB_PROCESSORS, min_index);
}
// Helper to iterate over all sub processor indexes in a given mask.
#define for_each_sub_processor_index_in_mask(sub_index, sub_mask) \
for ((sub_index) = uvm_sub_processor_mask_find_first_index((sub_mask)); \
(sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS; \
(sub_index) = uvm_sub_processor_mask_find_next_index((sub_mask), (sub_index) + 1))
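To show how the NvU8-backed mask and its iterator fit together, here is a minimal usage sketch built only from the helpers defined above; the function name is illustrative and not part of the driver.
static void example_sub_processor_mask_usage(void)
{
    uvm_sub_processor_mask_t mask;
    NvU32 sub_index;

    uvm_sub_processor_mask_zero(&mask);
    uvm_sub_processor_mask_set(&mask, 1);
    uvm_sub_processor_mask_set(&mask, 5);

    // Two bits are set, so the population count is 2.
    UVM_ASSERT(uvm_sub_processor_mask_get_count(&mask) == 2);

    // Visits sub_index 1, then 5; cleared indexes are skipped.
    for_each_sub_processor_index_in_mask(sub_index, &mask)
        UVM_ASSERT(uvm_sub_processor_mask_test(&mask, sub_index));

    // These helpers are not atomic, so concurrent updates to a shared mask
    // need external serialization.
}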
// Helper to iterate over all valid processor ids.
#define for_each_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVidia Corporation
Copyright (c) 2015-2024 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -191,7 +191,7 @@ typedef struct
NvU32 read_duplication; // Out (UVM_TEST_READ_DUPLICATION_POLICY)
NvProcessorUuid preferred_location; // Out
NvS32 preferred_cpu_nid; // Out
NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS_V2]; // Out
NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS]; // Out
NvU32 accessed_by_count; // Out
NvU32 type; // Out (UVM_TEST_VA_RANGE_TYPE)
union
@ -624,7 +624,7 @@ typedef struct
// Array of processors which have a resident copy of the page containing
// lookup_address.
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS_V2]; // Out
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS]; // Out
NvU32 resident_on_count; // Out
// If the memory is resident on the CPU, the NUMA node on which the page
@ -635,24 +635,24 @@ typedef struct
// system-page-sized portion of this allocation which contains
// lookup_address is guaranteed to be resident on the corresponding
// processor.
NvU32 resident_physical_size[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 resident_physical_size[UVM_MAX_PROCESSORS]; // Out
// The physical address of the physical allocation backing lookup_address.
NvU64 resident_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8); // Out
NvU64 resident_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8); // Out
// Array of processors which have a virtual mapping covering lookup_address.
NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 mapping_type[UVM_MAX_PROCESSORS_V2]; // Out
NvU64 mapping_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8); // Out
NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS]; // Out
NvU32 mapping_type[UVM_MAX_PROCESSORS]; // Out
NvU64 mapping_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8); // Out
NvU32 mapped_on_count; // Out
// The size of the virtual mapping covering lookup_address on each
// mapped_on processor.
NvU32 page_size[UVM_MAX_PROCESSORS_V2]; // Out
NvU32 page_size[UVM_MAX_PROCESSORS]; // Out
// Array of processors which have physical memory populated that would back
// lookup_address if it was resident.
NvProcessorUuid populated_on[UVM_MAX_PROCESSORS_V2]; // Out
NvProcessorUuid populated_on[UVM_MAX_PROCESSORS]; // Out
NvU32 populated_on_count; // Out
NV_STATUS rmStatus; // Out

View File

@ -30,18 +30,18 @@ void uvm_tlb_batch_begin(uvm_page_tree_t *tree, uvm_tlb_batch_t *batch)
batch->tree = tree;
}
static NvU32 smallest_page_size(NvU32 page_sizes)
static NvU64 smallest_page_size(NvU64 page_sizes)
{
UVM_ASSERT(page_sizes != 0);
return 1u << __ffs(page_sizes);
return 1ULL << __ffs(page_sizes);
}
static NvU32 biggest_page_size(NvU32 page_sizes)
static NvU64 biggest_page_size(NvU64 page_sizes)
{
UVM_ASSERT(page_sizes != 0);
return 1u << __fls(page_sizes);
return 1ULL << __fls(page_sizes);
}
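A short worked example of the widened helpers; the page-size constants below are the usual UVM values and are assumed here rather than taken from this hunk.
// NvU64 sizes = UVM_PAGE_SIZE_4K | UVM_PAGE_SIZE_2M;   // 0x1000 | 0x200000 == 0x201000
// smallest_page_size(sizes) == 1ULL << __ffs(sizes) == 0x1000    (4K, bit 12)
// biggest_page_size(sizes)  == 1ULL << __fls(sizes) == 0x200000  (2M, bit 21)
// With NvU64 operands and 1ULL shifts, page-size masks wider than 32 bits can
// be represented without truncation.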
static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t *push)
@ -53,8 +53,8 @@ static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t
for (i = 0; i < batch->count; ++i) {
uvm_tlb_batch_range_t *entry = &batch->ranges[i];
NvU32 min_page_size = smallest_page_size(entry->page_sizes);
NvU32 max_page_size = biggest_page_size(entry->page_sizes);
NvU64 min_page_size = smallest_page_size(entry->page_sizes);
NvU64 max_page_size = biggest_page_size(entry->page_sizes);
// Use the depth of the max page size as it's the broadest
NvU32 depth = tree->hal->page_table_depth(max_page_size);
@ -113,7 +113,7 @@ void uvm_tlb_batch_end(uvm_tlb_batch_t *batch, uvm_push_t *push, uvm_membar_t tl
tlb_batch_flush_invalidate_per_va(batch, push);
}
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar)
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU64 page_sizes, uvm_membar_t tlb_membar)
{
uvm_tlb_batch_range_t *new_entry;

View File

@ -41,7 +41,7 @@ typedef struct
NvU64 size;
    // Min and max page sizes ORed together
NvU32 page_sizes;
NvU64 page_sizes;
} uvm_tlb_batch_range_t;
struct uvm_tlb_batch_struct
@ -63,7 +63,7 @@ struct uvm_tlb_batch_struct
NvU32 count;
// Biggest page size across all queued up invalidates
NvU32 biggest_page_size;
NvU64 biggest_page_size;
// Max membar across all queued up invalidates
uvm_membar_t membar;
@ -81,7 +81,7 @@ void uvm_tlb_batch_begin(uvm_page_tree_t *tree, uvm_tlb_batch_t *batch);
// If the membar parameter is not UVM_MEMBAR_NONE, the specified membar will
// be performed logically after the TLB invalidate such that all physical memory
// accesses using the old translations are ordered to the scope of the membar.
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar);
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU64 page_sizes, uvm_membar_t tlb_membar);
// End a TLB invalidate batch
//
@ -97,8 +97,12 @@ void uvm_tlb_batch_end(uvm_tlb_batch_t *batch, uvm_push_t *push, uvm_membar_t tl
// Helper for invalidating a single range immediately.
//
// Internally begins and ends a TLB batch.
static void uvm_tlb_batch_single_invalidate(uvm_page_tree_t *tree, uvm_push_t *push,
NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar)
static void uvm_tlb_batch_single_invalidate(uvm_page_tree_t *tree,
uvm_push_t *push,
NvU64 start,
NvU64 size,
NvU64 page_sizes,
uvm_membar_t tlb_membar)
{
uvm_tlb_batch_t batch;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -57,20 +57,12 @@ typedef struct
struct list_head queue_nodes[UvmEventNumTypesAll];
struct page **queue_buffer_pages;
union
{
UvmEventEntry_V1 *queue_v1;
UvmEventEntry_V2 *queue_v2;
};
void *queue_buffer;
NvU32 queue_buffer_count;
NvU32 notification_threshold;
struct page **control_buffer_pages;
union
{
UvmToolsEventControlData_V1 *control_v1;
UvmToolsEventControlData_V2 *control_v2;
};
UvmToolsEventControlData *control;
wait_queue_head_t wait_queue;
bool is_wakeup_get_valid;
@ -398,16 +390,12 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
if (event_tracker->is_queue) {
uvm_tools_queue_t *queue = &event_tracker->queue;
NvU64 buffer_size, control_size;
NvU64 buffer_size;
if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
if (event_tracker->version == UvmToolsEventQueueVersion_V1)
buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V1);
control_size = sizeof(UvmToolsEventControlData_V1);
}
else {
else
buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V2);
control_size = sizeof(UvmToolsEventControlData_V2);
}
remove_event_tracker(va_space,
queue->queue_nodes,
@ -415,16 +403,16 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
queue->subscribed_queues,
&queue->subscribed_queues);
if (queue->queue_v2 != NULL) {
if (queue->queue_buffer != NULL) {
unmap_user_pages(queue->queue_buffer_pages,
queue->queue_v2,
queue->queue_buffer,
buffer_size);
}
if (queue->control_v2 != NULL) {
if (queue->control != NULL) {
unmap_user_pages(queue->control_buffer_pages,
queue->control_v2,
control_size);
queue->control,
sizeof(UvmToolsEventControlData));
}
}
else {
@ -456,9 +444,9 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
kmem_cache_free(g_tools_event_tracker_cache, event_tracker);
}
static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *queue)
static void enqueue_event(const void *entry, size_t entry_size, NvU8 eventType, uvm_tools_queue_t *queue)
{
UvmToolsEventControlData_V1 *ctrl = queue->control_v1;
UvmToolsEventControlData *ctrl = queue->control;
uvm_tools_queue_snapshot_t sn;
NvU32 queue_size = queue->queue_buffer_count;
NvU32 queue_mask = queue_size - 1;
@ -481,11 +469,11 @@ static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *q
// one free element means that the queue is full
if (((queue_size + sn.get_behind - sn.put_behind) & queue_mask) == 1) {
atomic64_inc((atomic64_t *)&ctrl->dropped + entry->eventData.eventType);
atomic64_inc((atomic64_t *)&ctrl->dropped + eventType);
goto unlock;
}
memcpy(queue->queue_v1 + sn.put_behind, entry, sizeof(*entry));
memcpy((char *)queue->queue_buffer + sn.put_behind * entry_size, entry, entry_size);
sn.put_behind = sn.put_ahead;
@ -509,79 +497,45 @@ unlock:
uvm_spin_unlock(&queue->lock);
}
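To make the shared ring-buffer bookkeeping above easier to follow, here is the full-queue check with illustrative numbers (all values hypothetical).
// queue_size == 8, so queue_mask == 7.
// Suppose get_behind == 3 and put_behind == 2:
//   (queue_size + get_behind - put_behind) & queue_mask == (8 + 3 - 2) & 7 == 1
// One slot is always left free to distinguish a full queue from an empty one,
// so the entry is not copied; only ctrl->dropped[eventType] is incremented.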
static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *queue)
{
enqueue_event(entry, sizeof(*entry), entry->eventData.eventType, queue);
}
static void enqueue_event_v2(const UvmEventEntry_V2 *entry, uvm_tools_queue_t *queue)
{
UvmToolsEventControlData_V2 *ctrl = queue->control_v2;
uvm_tools_queue_snapshot_t sn;
NvU32 queue_size = queue->queue_buffer_count;
NvU32 queue_mask = queue_size - 1;
enqueue_event(entry, sizeof(*entry), entry->eventData.eventType, queue);
}
// Prevent processor speculation prior to accessing user-mapped memory to
// avoid leaking information from side-channel attacks. There are many
// possible paths leading to this point and it would be difficult and error-
// prone to audit all of them to determine whether user mode could guide
// this access to kernel memory under speculative execution, so to be on the
// safe side we'll just always block speculation.
nv_speculation_barrier();
static void uvm_tools_record_event(struct list_head *head,
const void *entry,
size_t entry_size,
NvU8 eventType)
{
uvm_tools_queue_t *queue;
uvm_spin_lock(&queue->lock);
UVM_ASSERT(eventType < UvmEventNumTypesAll);
// ctrl is mapped into user space with read and write permissions,
// so its values cannot be trusted.
sn.get_behind = atomic_read((atomic_t *)&ctrl->get_behind) & queue_mask;
sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind) & queue_mask;
sn.put_ahead = (sn.put_behind + 1) & queue_mask;
// one free element means that the queue is full
if (((queue_size + sn.get_behind - sn.put_behind) & queue_mask) == 1) {
atomic64_inc((atomic64_t *)&ctrl->dropped + entry->eventData.eventType);
goto unlock;
}
memcpy(queue->queue_v2 + sn.put_behind, entry, sizeof(*entry));
sn.put_behind = sn.put_ahead;
// put_ahead and put_behind will always be the same outside of queue->lock
// this allows the user-space consumer to choose either a 2 or 4 pointer synchronization approach
atomic_set((atomic_t *)&ctrl->put_ahead, sn.put_behind);
atomic_set((atomic_t *)&ctrl->put_behind, sn.put_behind);
sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
// if the queue needs to be woken up, only signal if we haven't signaled before for this value of get_ahead
if (queue_needs_wakeup(queue, &sn) && !(queue->is_wakeup_get_valid && queue->wakeup_get == sn.get_ahead)) {
queue->is_wakeup_get_valid = true;
queue->wakeup_get = sn.get_ahead;
wake_up_all(&queue->wait_queue);
}
unlock:
uvm_spin_unlock(&queue->lock);
list_for_each_entry(queue, head + eventType, queue_nodes[eventType])
enqueue_event(entry, entry_size, eventType, queue);
}
static void uvm_tools_record_event_v1(uvm_va_space_t *va_space, const UvmEventEntry_V1 *entry)
{
NvU8 eventType = entry->eventData.eventType;
uvm_tools_queue_t *queue;
UVM_ASSERT(eventType < UvmEventNumTypesAll);
uvm_assert_rwsem_locked(&va_space->tools.lock);
list_for_each_entry(queue, va_space->tools.queues_v1 + eventType, queue_nodes[eventType])
enqueue_event_v1(entry, queue);
uvm_tools_record_event(va_space->tools.queues_v1, entry, sizeof(*entry), eventType);
}
static void uvm_tools_record_event_v2(uvm_va_space_t *va_space, const UvmEventEntry_V2 *entry)
{
NvU8 eventType = entry->eventData.eventType;
uvm_tools_queue_t *queue;
UVM_ASSERT(eventType < UvmEventNumTypesAll);
uvm_assert_rwsem_locked(&va_space->tools.lock);
list_for_each_entry(queue, va_space->tools.queues_v2 + eventType, queue_nodes[eventType])
enqueue_event_v2(entry, queue);
uvm_tools_record_event(va_space->tools.queues_v2, entry, sizeof(*entry), eventType);
}
static bool counter_matches_processor(UvmCounterName counter, const NvProcessorUuid *processor)
@ -751,7 +705,7 @@ static unsigned uvm_tools_poll(struct file *filp, poll_table *wait)
int flags = 0;
uvm_tools_queue_snapshot_t sn;
uvm_tools_event_tracker_t *event_tracker;
UvmToolsEventControlData_V2 *ctrl;
UvmToolsEventControlData *ctrl;
if (uvm_global_get_status() != NV_OK)
return POLLERR;
@ -763,7 +717,7 @@ static unsigned uvm_tools_poll(struct file *filp, poll_table *wait)
uvm_spin_lock(&event_tracker->queue.lock);
event_tracker->queue.is_wakeup_get_valid = false;
ctrl = event_tracker->queue.control_v2;
ctrl = event_tracker->queue.control;
sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
@ -878,6 +832,24 @@ static void record_gpu_fault_instance(uvm_gpu_t *gpu,
}
}
static void record_cpu_fault(UvmEventCpuFaultInfo *info, uvm_perf_event_data_t *event_data)
{
info->eventType = UvmEventTypeCpuFault;
if (event_data->fault.cpu.is_write)
info->accessType = UvmEventMemoryAccessTypeWrite;
else
info->accessType = UvmEventMemoryAccessTypeRead;
info->address = event_data->fault.cpu.fault_va;
info->timeStamp = NV_GETTIME();
// assume that current owns va_space
info->pid = uvm_get_stale_process_id();
info->threadId = uvm_get_stale_thread_id();
info->pc = event_data->fault.cpu.pc;
// TODO: Bug 4515381: set info->nid when we decide if it's NUMA node ID or
// CPU ID.
}
static void uvm_tools_record_fault(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
{
uvm_va_space_t *va_space = event_data->fault.space;
@ -895,41 +867,17 @@ static void uvm_tools_record_fault(uvm_perf_event_t event_id, uvm_perf_event_dat
if (UVM_ID_IS_CPU(event_data->fault.proc_id)) {
if (tools_is_event_enabled_version(va_space, UvmEventTypeCpuFault, UvmToolsEventQueueVersion_V1)) {
UvmEventEntry_V1 entry;
UvmEventCpuFaultInfo_V1 *info = &entry.eventData.cpuFault;
memset(&entry, 0, sizeof(entry));
info->eventType = UvmEventTypeCpuFault;
if (event_data->fault.cpu.is_write)
info->accessType = UvmEventMemoryAccessTypeWrite;
else
info->accessType = UvmEventMemoryAccessTypeRead;
info->address = event_data->fault.cpu.fault_va;
info->timeStamp = NV_GETTIME();
// assume that current owns va_space
info->pid = uvm_get_stale_process_id();
info->threadId = uvm_get_stale_thread_id();
info->pc = event_data->fault.cpu.pc;
record_cpu_fault(&entry.eventData.cpuFault, event_data);
uvm_tools_record_event_v1(va_space, &entry);
}
if (tools_is_event_enabled_version(va_space, UvmEventTypeCpuFault, UvmToolsEventQueueVersion_V2)) {
UvmEventEntry_V2 entry;
UvmEventCpuFaultInfo_V2 *info = &entry.eventData.cpuFault;
memset(&entry, 0, sizeof(entry));
info->eventType = UvmEventTypeCpuFault;
if (event_data->fault.cpu.is_write)
info->accessType = UvmEventMemoryAccessTypeWrite;
else
info->accessType = UvmEventMemoryAccessTypeRead;
info->address = event_data->fault.cpu.fault_va;
info->timeStamp = NV_GETTIME();
// assume that current owns va_space
info->pid = uvm_get_stale_process_id();
info->threadId = uvm_get_stale_thread_id();
info->pc = event_data->fault.cpu.pc;
record_cpu_fault(&entry.eventData.cpuFault, event_data);
uvm_tools_record_event_v2(va_space, &entry);
}
@ -1834,7 +1782,7 @@ void uvm_tools_record_thrashing(uvm_va_space_t *va_space,
info->size = region_size;
info->timeStamp = NV_GETTIME();
BUILD_BUG_ON(UVM_MAX_PROCESSORS_V2 < UVM_ID_MAX_PROCESSORS);
BUILD_BUG_ON(UVM_MAX_PROCESSORS < UVM_ID_MAX_PROCESSORS);
bitmap_copy((long unsigned *)&info->processors, processors->bitmap, UVM_ID_MAX_PROCESSORS);
uvm_tools_record_event_v2(va_space, &entry);
@ -2151,7 +2099,7 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
event_tracker->is_queue = params->queueBufferSize != 0;
if (event_tracker->is_queue) {
uvm_tools_queue_t *queue = &event_tracker->queue;
NvU64 buffer_size, control_size;
NvU64 buffer_size;
uvm_spin_lock_init(&queue->lock, UVM_LOCK_ORDER_LEAF);
init_waitqueue_head(&queue->wait_queue);
@ -2170,25 +2118,21 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
goto fail;
}
if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
if (event_tracker->version == UvmToolsEventQueueVersion_V1)
buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V1);
control_size = sizeof(UvmToolsEventControlData_V1);
}
else {
else
buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V2);
control_size = sizeof(UvmToolsEventControlData_V2);
}
status = map_user_pages(params->queueBuffer,
buffer_size,
(void **)&queue->queue_v2,
&queue->queue_buffer,
&queue->queue_buffer_pages);
if (status != NV_OK)
goto fail;
status = map_user_pages(params->controlBuffer,
control_size,
(void **)&queue->control_v2,
sizeof(UvmToolsEventControlData),
(void **)&queue->control,
&queue->control_buffer_pages);
if (status != NV_OK)
@ -2224,6 +2168,7 @@ NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_TH
{
uvm_tools_queue_snapshot_t sn;
uvm_tools_event_tracker_t *event_tracker = tools_event_tracker(filp);
UvmToolsEventControlData *ctrl;
if (!tracker_is_queue(event_tracker))
return NV_ERR_INVALID_ARGUMENT;
@ -2232,18 +2177,9 @@ NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_TH
event_tracker->queue.notification_threshold = params->notificationThreshold;
if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
UvmToolsEventControlData_V1 *ctrl = event_tracker->queue.control_v1;
sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
}
else {
UvmToolsEventControlData_V2 *ctrl = event_tracker->queue.control_v2;
sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
}
ctrl = event_tracker->queue.control;
sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
if (queue_needs_wakeup(&event_tracker->queue, &sn))
wake_up_all(&event_tracker->queue.wait_queue);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -104,3 +104,248 @@ void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry,
*fifo_entry = fifo_entry_value;
}
void uvm_hal_turing_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
else
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
pdb.address >>= 12;
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
// PDE3 is the highest level on Pascal-Turing, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
if (membar != UVM_MEMBAR_NONE) {
// If a GPU or SYS membar is needed, ACK_TYPE needs to be set to
// GLOBALLY to make sure all the pending accesses can be picked up by
// the membar.
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value,
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
void uvm_hal_turing_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 va_lo;
NvU32 va_hi;
NvU64 end;
NvU64 actual_base;
NvU64 actual_size;
NvU64 actual_end;
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
// The invalidation size must be a power-of-two number of pages containing
// the passed interval
end = base + size - 1;
log2_invalidation_size = __fls((unsigned long)(end ^ base)) + 1;
if (log2_invalidation_size == 64) {
// Invalidate everything
gpu->parent->host_hal->tlb_invalidate_all(push, pdb, depth, membar);
return;
}
// The hardware aligns the target address down to the invalidation size.
actual_size = 1ULL << log2_invalidation_size;
actual_base = UVM_ALIGN_DOWN(base, actual_size);
actual_end = actual_base + actual_size - 1;
UVM_ASSERT(actual_end >= end);
// The invalidation size field expects log2(invalidation size in 4K), not
// log2(invalidation size in bytes)
log2_invalidation_size -= 12;
// Address to invalidate, as a multiple of 4K.
base >>= 12;
va_lo = base & HWMASK(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
va_hi = base >> HWSIZE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
else
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
pdb.address >>= 12;
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
// PDE3 is the highest level on Pascal-Turing, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
if (membar != UVM_MEMBAR_NONE) {
// If a GPU or SYS membar is needed, ACK_TYPE needs to be set to
// GLOBALLY to make sure all the pending accesses can be picked up by
// the membar.
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
NV_PUSH_4U(C46F, MEM_OP_A, HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
sysmembar_value |
HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C46F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
gpu->parent->host_hal->membar_gpu(push);
}
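A worked example of the power-of-two sizing logic above, using made-up addresses.
// base == 0x200000, size == 0x3000 (three 4K pages)  =>  end == 0x202fff
// end ^ base == 0x2fff, __fls(0x2fff) == 13           =>  log2_invalidation_size == 14
// actual_size == 1ULL << 14 == 16K, actual_base == 0x200000, actual_end == 0x203fff,
// which covers the requested range. MEM_OP_A's size field gets 14 - 12 == 2,
// i.e. log2 of the invalidation size measured in 4K pages.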
void uvm_hal_turing_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
{
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 invalidate_gpc_value = 0;
NvU32 aperture_value = 0;
NvU32 pdb_lo = 0;
NvU32 pdb_hi = 0;
NvU32 page_table_level = 0;
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
else
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
pdb.address >>= 12;
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
// PDE3 is the highest level on Pascal-Turing, see the comment in
// uvm_pascal_mmu.c for details.
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde3, params->page_table_level) - 1;
}
if (params->membar != UvmInvalidateTlbMemBarNone) {
// If a GPU or SYS membar is needed, ack_value needs to be set to
// GLOBALLY to make sure all the pending accesses can be picked up by
// the membar.
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}
if (params->membar == UvmInvalidateTlbMemBarSys)
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
if (params->disable_gpc_invalidate)
invalidate_gpc_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
else
invalidate_gpc_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE);
if (params->target_va_mode == UvmTargetVaModeTargeted) {
NvU64 va = params->va >> 12;
NvU32 va_lo = va & HWMASK(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NvU32 va_hi = va >> HWSIZE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value |
HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C46F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
invalidate_gpc_value |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
else {
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value,
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
invalidate_gpc_value |
aperture_value |
ack_value,
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
// GPU membar still requires an explicit membar method.
if (params->membar == UvmInvalidateTlbMemBarLocal)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}

View File

@ -138,7 +138,7 @@ static NvU64 poisoned_pte_turing(void)
static uvm_mmu_mode_hal_t turing_mmu_mode_hal;
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size)
{
static bool initialized = false;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2023 NVidia Corporation
Copyright (c) 2013-2024 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -52,19 +52,18 @@ typedef enum
typedef unsigned long long UvmStream;
// The maximum number of sub-processors per parent GPU.
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
// The maximum number of GPUs changed when multiple MIG instances per
// uvm_parent_gpu_t were added. See UvmEventQueueCreate().
// uvm_parent_gpu_t were added. The old version is kept as a convenience
// for code that needs to maintain forward compatibility.
#define UVM_MAX_GPUS_V1 NV_MAX_DEVICES
#define UVM_MAX_PROCESSORS_V1 (UVM_MAX_GPUS_V1 + 1)
#define UVM_MAX_GPUS_V2 (NV_MAX_DEVICES * NV_MAX_SUBDEVICES)
#define UVM_MAX_PROCESSORS_V2 (UVM_MAX_GPUS_V2 + 1)
#define UVM_MAX_GPUS (NV_MAX_DEVICES * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_MAX_PROCESSORS (UVM_MAX_GPUS + 1)
// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
#define UVM_MAX_GPUS UVM_MAX_GPUS_V1
#define UVM_MAX_PROCESSORS UVM_MAX_PROCESSORS_V1
#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS_V2 + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))
#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))
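For concreteness (assuming NV_MAX_DEVICES is 32, which is defined elsewhere in the driver and not shown here), the new limits work out as follows.
// UVM_MAX_GPUS            == 32 * 8  == 256
// UVM_MAX_PROCESSORS      == 256 + 1 == 257   (every possible GPU plus the CPU)
// UVM_PROCESSOR_MASK_SIZE == (257 + 63) / 64 == 5 NvU64 words (320 bits)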
#define UVM_INIT_FLAGS_DISABLE_HMM ((NvU64)0x1)
#define UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE ((NvU64)0x2)
@ -423,29 +422,7 @@ typedef struct
NvU32 pid; // process id causing the fault
NvU32 threadId; // thread id causing the fault
NvU64 pc; // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V1;
typedef struct
{
//
// eventType has to be 1st argument of this structure. Setting eventType to
// UvmEventTypeMemoryViolation helps to identify event data in a queue.
//
NvU8 eventType;
NvU8 accessType; // read/write violation (UvmEventMemoryAccessType)
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets.
//
NvU16 padding16Bits;
NvS32 nid; // NUMA node ID of faulting CPU
NvU64 address; // faulting address
NvU64 timeStamp; // cpu time when the fault occurred
NvU32 pid; // process id causing the fault
NvU32 threadId; // thread id causing the fault
NvU64 pc; // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V2;
} UvmEventCpuFaultInfo;
typedef enum
{
@ -721,13 +698,7 @@ typedef struct
//
NvU8 eventType;
NvU8 faultType; // type of gpu fault, refer UvmEventFaultType
NvU8 accessType; // memory access type, refer UvmEventMemoryAccessType
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8Bits_1;
NvU16 gpuIndex; // GPU that experienced the fault
union
{
NvU16 gpcId; // If this is a replayable fault, this field contains
@ -759,14 +730,13 @@ typedef struct
// UvmEventFaultClientTypeGpc indicates replayable
// fault, while UvmEventFaultClientTypeHub indicates
// non-replayable fault.
NvU8 accessType; // memory access type, refer UvmEventMemoryAccessType
//
// This structure is shared between UVM kernel and tools.
// Manually padding the structure so that compiler options like pragma pack
// or malign-double will have no effect on the field offsets
//
NvU8 padding8Bits_2;
NvU16 gpuIndex; // GPU that experienced the fault
NvU16 padding16bits;
} UvmEventGpuFaultInfo_V2;
//------------------------------------------------------------------------------
@ -1108,8 +1078,8 @@ typedef struct
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits[2];
NvU16 processorIndex; // index of the cpu/gpu that was throttled
NvU32 padding32bits;
NvU64 address; // address of the page whose servicing is being
// throttled
NvU64 timeStamp; // cpu start time stamp for the throttling operation
@ -1150,8 +1120,8 @@ typedef struct
// or malign-double will have no effect on the field offsets
//
NvU8 padding8bits;
NvU16 padding16bits[2];
NvU16 processorIndex; // index of the cpu/gpu that was throttled
NvU32 padding32bits;
NvU64 address; // address of the page whose servicing is being
// throttled
NvU64 timeStamp; // cpu end time stamp for the throttling operation
@ -1409,7 +1379,7 @@ typedef struct
NvU8 eventType;
UvmEventMigrationInfo_Lite migration_Lite;
UvmEventCpuFaultInfo_V1 cpuFault;
UvmEventCpuFaultInfo cpuFault;
UvmEventMigrationInfo_V1 migration;
UvmEventGpuFaultInfo_V1 gpuFault;
UvmEventGpuFaultReplayInfo_V1 gpuFaultReplay;
@ -1443,7 +1413,7 @@ typedef struct
NvU8 eventType;
UvmEventMigrationInfo_Lite migration_Lite;
UvmEventCpuFaultInfo_V2 cpuFault;
UvmEventCpuFaultInfo cpuFault;
UvmEventMigrationInfo_V2 migration;
UvmEventGpuFaultInfo_V2 gpuFault;
UvmEventGpuFaultReplayInfo_V2 gpuFaultReplay;
@ -1510,19 +1480,7 @@ typedef enum {
UvmToolsEventQueueVersion_V2 = 2,
} UvmToolsEventQueueVersion;
typedef struct UvmEventControlData_V1_tag {
// entries between get_ahead and get_behind are currently being read
volatile NvU32 get_ahead;
volatile NvU32 get_behind;
// entries between put_ahead and put_behind are currently being written
volatile NvU32 put_ahead;
volatile NvU32 put_behind;
// counter of dropped events
NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V1;
typedef struct UvmEventControlData_V2_tag {
typedef struct UvmEventControlData_tag {
// entries between get_ahead and get_behind are currently being read
volatile NvU32 get_ahead;
volatile NvU32 get_behind;
@ -1531,19 +1489,12 @@ typedef struct UvmEventControlData_V2_tag {
volatile NvU32 put_ahead;
volatile NvU32 put_behind;
// The version values are limited to UvmToolsEventQueueVersion and
// initialized by UvmToolsCreateEventQueue().
NvU32 version;
NvU32 padding32Bits;
// counter of dropped events
NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V2;
} UvmToolsEventControlData;
// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
typedef UvmToolsEventControlData_V1 UvmToolsEventControlData;
typedef UvmEventEntry_V1 UvmEventEntry;
// TODO: Bug 4465348: remove this after replacing old references.
typedef UvmToolsEventControlData UvmToolsEventControlData_V1;
//------------------------------------------------------------------------------
// UVM Tools forward types (handles) definitions

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -1328,12 +1328,12 @@ error_block_free:
static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
if (gpu_mapping_addr == 0)
return;
uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
}
static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
@ -1356,17 +1356,14 @@ static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
chunk_size = uvm_cpu_chunk_get_size(chunk);
// TODO: Bug 3744779: Handle benign assertion in
// pmm_sysmem_mappings_remove_gpu_mapping() in case of a
// failure.
status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent),
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
uvm_va_block_cpu_page_address(block, page_index),
chunk_size,
block,
UVM_ID_CPU);
if (status != NV_OK)
cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
return status;
}
@ -1395,10 +1392,10 @@ static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu
for_each_possible_uvm_node(nid) {
for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
UVM_ASSERT_MSG(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0,
UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0,
"GPU%u DMA address 0x%llx\n",
uvm_id_value(gpu->id),
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent));
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu));
status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
if (status != NV_OK)
@ -1561,8 +1558,7 @@ NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
}
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index)
uvm_cpu_chunk_t *chunk)
{
uvm_gpu_id_t id;
@ -1601,7 +1597,7 @@ NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
return NV_OK;
error:
uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk);
return status;
}
@ -1620,7 +1616,7 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
uvm_cpu_chunk_free(chunk);
}
}
@ -2308,7 +2304,7 @@ static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
}
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
NvU64 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
{
uvm_gpu_va_space_t *gpu_va_space;
@ -2316,7 +2312,7 @@ NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
return gpu_va_space->page_tables.big_page_size;
}
static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU64 big_page_size)
{
NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
@ -2330,20 +2326,20 @@ static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, N
return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
}
static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU64 big_page_size)
{
uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
}
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU64 big_page_size)
{
return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
}
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
NvU32 big_page_size)
NvU64 big_page_size)
{
NvU64 start = uvm_va_block_region_start(va_block, region);
NvU64 end = uvm_va_block_region_end(va_block, region);
@ -2361,12 +2357,12 @@ uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_blo
return big_region;
}
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU64 big_page_size)
{
return range_num_big_pages(va_block->start, va_block->end, big_page_size);
}
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size)
{
NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
UVM_ASSERT(addr >= va_block->start);
@ -2374,7 +2370,7 @@ NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index
return addr;
}
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size)
{
NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
@ -2390,7 +2386,7 @@ uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, siz
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
// page_index cannot be covered by a big PTE due to alignment or block size,
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU64 big_page_size)
{
uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
size_t big_index;
@ -2415,7 +2411,7 @@ static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
{
uvm_va_block_region_t big_region;
size_t big_page_index;
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
uvm_page_mask_zero(mask_out);
@ -2425,7 +2421,7 @@ static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
}
}
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
NvU64 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
return 0;
@ -2439,7 +2435,7 @@ NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page
return PAGE_SIZE;
}
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
NvU64 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
size_t big_page_size, big_page_index;
@ -2467,7 +2463,7 @@ NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id,
// resident. Note that this is different from uvm_va_block_page_size_* because
// those return the size of the PTE which maps the page index, which may be
// smaller than the physical allocation.
static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
static NvU64 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
{
uvm_va_block_gpu_state_t *gpu_state;
uvm_chunk_size_t chunk_size;
@ -2480,7 +2476,7 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
return 0;
UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
return (NvU32)uvm_cpu_chunk_get_size(chunk);
return uvm_cpu_chunk_get_size(chunk);
}
gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
@ -2489,10 +2485,10 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
return (NvU32)chunk_size;
return chunk_size;
}
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
NvU64 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index)
{
@ -3344,7 +3340,7 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
if (UVM_ID_IS_CPU(block_page.processor)) {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.nid, block_page.page_index);
NvU64 dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
uvm_cpu_chunk_get_size(chunk),
block_page.page_index);
@ -5387,7 +5383,7 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
if (chunk) {
if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %lu\n",
chunk_size,
uvm_gpu_chunk_get_size(chunk),
block->start,
@ -5399,7 +5395,7 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
}
if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %lu chunk_size: llu\n",
uvm_pmm_gpu_chunk_state_string(chunk->state),
block->start,
block->end + 1,
@ -5718,7 +5714,7 @@ static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_va_block_contex
uvm_pte_bits_gpu_t pte_bit;
uvm_processor_id_t resident_id;
uvm_prot_t prot;
NvU32 big_page_size;
NvU64 big_page_size;
size_t num_big_pages, big_page_index;
uvm_va_block_region_t big_region, chunk_region;
uvm_gpu_chunk_t *chunk;
@ -6170,7 +6166,7 @@ static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
size_t big_page_index;
uvm_processor_id_t curr_resident_id;
uvm_prot_t curr_prot;
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
if (UVM_ID_IS_INVALID(resident_id))
UVM_ASSERT(new_prot == UVM_PROT_NONE);
@ -6252,7 +6248,7 @@ static void block_gpu_pte_clear_big(uvm_va_block_t *block,
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
NvU64 big_page_size = gpu_va_space->page_tables.big_page_size;
uvm_gpu_phys_address_t pte_addr;
NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
size_t big_page_index;
@ -6298,7 +6294,7 @@ static void block_gpu_pte_write_big(uvm_va_block_t *block,
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
uvm_page_tree_t *tree = &gpu_va_space->page_tables;
NvU32 big_page_size = tree->big_page_size;
NvU64 big_page_size = tree->big_page_size;
NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
size_t big_page_index;
uvm_va_block_region_t contig_region = {0};
@ -6376,7 +6372,7 @@ static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
NvU32 big_page_size = tree->big_page_size;
NvU64 big_page_size = tree->big_page_size;
NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
size_t big_page_index;
DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
@ -6937,7 +6933,7 @@ static void block_gpu_split_big(uvm_va_block_t *block,
uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
NvU32 big_page_size = tree->big_page_size;
NvU64 big_page_size = tree->big_page_size;
uvm_va_block_region_t big_region;
uvm_processor_id_t resident_id;
size_t big_page_index;
@ -7039,7 +7035,7 @@ static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
uvm_va_block_region_t big_region;
size_t big_page_index;
NvU32 big_page_size = tree->big_page_size;
NvU64 big_page_size = tree->big_page_size;
uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
UVM_ASSERT(!gpu_state->pte_is_2m);
@ -7341,7 +7337,7 @@ static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
NvU32 big_page_size = tree->big_page_size;
NvU64 big_page_size = tree->big_page_size;
NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
UVM_ASSERT(!gpu_state->pte_is_2m);
@ -7487,7 +7483,7 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
uvm_va_block_region_t big_region_all, big_page_region, region;
NvU32 big_page_size;
NvU64 big_page_size;
uvm_page_index_t page_index;
size_t big_page_index;
DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
@ -7640,7 +7636,7 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
// happens, the pending tracker is added to the block's tracker.
static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_t *page_table_range,
uvm_tracker_t *pending_tracker)
{
@ -7763,13 +7759,13 @@ allocated:
// sizes. See block_alloc_pt_range_with_retry.
static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
NvU32 page_sizes,
NvU64 page_sizes,
uvm_tracker_t *pending_tracker)
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
uvm_page_table_range_t *range;
NvU32 page_size;
NvU64 page_size;
NV_STATUS status, final_status = NV_OK;
UVM_ASSERT(gpu_state);
@ -7821,7 +7817,7 @@ static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
uvm_va_block_new_pte_state_t *new_pte_state,
uvm_tracker_t *pending_tracker)
{
NvU32 page_sizes = 0;
NvU64 page_sizes = 0;
if (new_pte_state->pte_is_2m) {
page_sizes |= UVM_PAGE_SIZE_2M;
@ -7853,8 +7849,8 @@ static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
uvm_gpu_va_space_t *gpu_va_space,
uvm_tracker_t *pending_tracker)
{
NvU32 page_sizes;
NvU32 big_page_size;
NvU64 page_sizes;
NvU64 big_page_size;
uvm_gpu_t *gpu;
uvm_va_block_gpu_state_t *gpu_state;
@ -9509,7 +9505,6 @@ static void block_kill(uvm_va_block_t *block)
// Free CPU pages
for_each_possible_uvm_node(nid) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
size_t index = node_to_index(nid);
for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block, nid) {
// be conservative.
@ -9524,9 +9519,20 @@ static void block_kill(uvm_va_block_t *block)
UVM_ASSERT(uvm_page_mask_empty(&node_state->allocated));
UVM_ASSERT(node_state->chunks == 0);
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
}
// While a per-NUMA node_state array is in use, all of its elements are
// expected to be valid. Therefore the teardown of these elements must occur
// as a single "transaction". This teardown must take place after freeing
// the CPU pages (see the "Free CPU pages" loop above). This is because as
// part of removing chunks from VA blocks, the per-page allocated bitmap is
// recomputed using the per-NUMA node_state array elements.
for_each_possible_uvm_node(nid) {
uvm_va_block_cpu_node_state_t *node_state;
node_state = block_node_state_get(block, nid);
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, node_state);
}
uvm_kvfree((void *)block->cpu.node_state);
block->cpu.node_state = NULL;
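
A small user-space analogy of the ordering constraint described in the comment above (the names here are illustrative, not the driver's): the per-node bookkeeping must outlive the chunk-freeing pass, so the node structures are released in a second loop only after every chunk is gone.

#include <stdlib.h>

#define NUM_NODES 4

struct node_state { int allocated_pages; };

struct block {
    struct node_state *node_state[NUM_NODES];
    void *chunks[NUM_NODES];
};

static void free_chunk(struct block *b, int nid)
{
    free(b->chunks[nid]);
    b->chunks[nid] = NULL;
    /* Recomputing per-page bookkeeping may read *any* node's state,
     * so no node_state element may have been freed yet. */
    for (int i = 0; i < NUM_NODES; i++)
        b->node_state[i]->allocated_pages = 0;
}

static void block_kill(struct block *b)
{
    /* Pass 1: free the chunks while all node_state elements are still valid. */
    for (int nid = 0; nid < NUM_NODES; nid++)
        free_chunk(b, nid);

    /* Pass 2: tear down the node_state array as a single "transaction". */
    for (int nid = 0; nid < NUM_NODES; nid++)
        free(b->node_state[nid]);
}

int main(void)
{
    struct block b;
    for (int nid = 0; nid < NUM_NODES; nid++) {
        b.node_state[nid] = malloc(sizeof(struct node_state));
        b.chunks[nid] = malloc(64);
    }
    block_kill(&b);
    return 0;
}
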
@ -9642,8 +9648,8 @@ static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_
uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
NvU32 alloc_sizes;
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
NvU64 alloc_sizes;
DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
size_t big_page_index;
@ -9986,7 +9992,7 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
gpu = block_get_gpu(block, id);
// If the parent chunk has not been mapped, there is nothing to split.
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
if (gpu_mapping_addr == 0)
continue;
@ -10008,7 +10014,7 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
merge:
for_each_gpu_id_in_mask(id, gpu_split_mask) {
gpu = block_get_gpu(block, id);
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
gpu_mapping_addr,
chunk_size);
@ -10194,7 +10200,7 @@ static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t p
continue;
gpu = block_get_gpu(block, id);
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
if (gpu_mapping_addr == 0)
continue;
@ -10646,8 +10652,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
for_each_possible_uvm_node(nid) {
for_each_cpu_chunk_in_block(cpu_chunk, page_index, new, nid) {
uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
uvm_cpu_chunk_get_parent_gpu_phys_addr(cpu_chunk,
gpu->parent),
uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu),
new);
}
}
@ -10685,7 +10690,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
if (gpu_va_space) {
if (existing_gpu_state->page_table_range_big.table) {
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
// existing's end has not been adjusted yet
existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
@ -13614,7 +13619,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
for_each_id_in_mask(id, &block->mapped) {
uvm_processor_id_t processor_to_map;
block_phys_page_t block_page;
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
int nid = NUMA_NO_NODE;
if (page_size == 0)
@ -13650,7 +13655,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
if (uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
if (uvm_pmm_sysmem_mappings_indirect_supported()) {
for_each_gpu_id(id) {
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
uvm_reverse_map_t sysmem_page;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
size_t num_pages;
@ -13665,8 +13670,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
continue;
num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk,
gpu->parent),
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
uvm_cpu_chunk_get_size(chunk),
&sysmem_page,
1);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -111,8 +111,6 @@ typedef struct
// Pages that have been evicted to sysmem
uvm_page_mask_t evicted;
NvU64 *cpu_chunks_dma_addrs;
// Array of naturally-aligned chunks. Each chunk has the largest possible
// size which can fit within the block, so they are not uniform size.
//
@ -2155,8 +2153,7 @@ NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
// Physically unmap a CPU chunk from all registered GPUs.
// Locking: The va_block lock must be held.
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
uvm_cpu_chunk_t *chunk);
// Remove any CPU chunks in the given region.
// Locking: The va_block lock must be held.
@ -2166,19 +2163,19 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
// specified processor in the block. Returns 0 if the address is not resident on
// the specified processor.
// Locking: The va_block lock must be held.
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
NvU64 uvm_va_block_get_physical_size(uvm_va_block_t *block,
uvm_processor_id_t processor,
uvm_page_index_t page_index);
// Get CPU page size or 0 if it is not mapped
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
NvU64 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
// Get GPU page size or 0 if it is not mapped on the given GPU
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
NvU64 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
// Get page size or 0 if it is not mapped on the given processor
static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
static NvU64 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index)
{
@ -2189,10 +2186,10 @@ static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
}
// Returns the big page size for the GPU VA space of the block
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
NvU64 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
// Returns the number of big pages in the VA block for the given size
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU64 big_page_size);
// Returns the number of big pages in the VA block for the big page size on the
// given GPU
@ -2202,29 +2199,29 @@ static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t
}
// Returns the start address of the given big page index and big page size
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size);
// Returns the region [start, end] of the given big page index and big page size
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
size_t big_page_index,
NvU32 big_page_size);
NvU64 big_page_size);
// Returns the largest sub-region region of [start, end] which can fit big
// pages. If the region cannot fit any big pages, an invalid region (0, 0) is
// returned.
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU64 big_page_size);
// Returns the largest sub-region region of 'region' which can fit big pages.
// If the region cannot fit any big pages, an invalid region (0, 0) is returned.
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
NvU32 big_page_size);
NvU64 big_page_size);
// Returns the big page index (the bit index within
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
// page_index cannot be covered by a big PTE due to alignment or block size,
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU64 big_page_size);
// Returns the new residency for a page that faulted or triggered access counter
// notifications. The read_duplicate output parameter indicates if the page

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -1853,7 +1853,7 @@ NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params,
if (uvm_api_range_invalid(params->base, params->length))
return NV_ERR_INVALID_ADDRESS;
if (params->gpuAttributesCount > UVM_MAX_GPUS_V2)
if (params->gpuAttributesCount > UVM_MAX_GPUS)
return NV_ERR_INVALID_ARGUMENT;
if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -188,8 +188,7 @@ typedef struct
// GPU which owns the allocation. For sysmem, this is the GPU that the
// sysmem was originally allocated under. For the allocation to remain valid
// we need to prevent the GPU from going away, similarly to P2P mapped
// memory.
// Similarly for EGM memory.
// memory and to EGM memory.
//
// This field is not used for sparse mappings as they don't have an
// allocation and, hence, owning GPU.
@ -212,6 +211,7 @@ typedef struct
// EGM memory. If true is_sysmem also has to be true and owning_gpu
// has to be valid.
bool is_egm;
// GPU page tables mapping the allocation
uvm_page_table_range_vec_t pt_range_vec;

View File

@ -199,7 +199,7 @@ void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
@ -216,9 +216,9 @@ void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
// The invalidation size must be a power-of-two number of pages containing

View File

@ -42,7 +42,7 @@ static NvU32 entries_per_index_volta(NvU32 depth)
return 1;
}
static NvLength entry_offset_volta(NvU32 depth, NvU32 page_size)
static NvLength entry_offset_volta(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (page_size == UVM_PAGE_SIZE_4K && depth == 3)
@ -252,7 +252,7 @@ static NvU64 make_pte_volta(uvm_aperture_t aperture, NvU64 address, uvm_prot_t p
static uvm_mmu_mode_hal_t volta_mmu_mode_hal;
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size)
{
static bool initialized = false;

View File

@ -159,14 +159,7 @@ static int lkca_aead_internal(struct crypto_aead *aead,
}
if (rc != 0) {
if (enc) {
pr_info("aead.c: Encryption failed with error %i\n", rc);
} else {
pr_info("aead.c: Decryption failed with error %i\n", rc);
if (rc == -EBADMSG) {
pr_info("aead.c: Authentication tag mismatch!\n");
}
}
pr_info("Encryption FAILED\n");
}
*data_out_size = data_in_size;

View File

@ -39,7 +39,9 @@
#define RSA_PSS_PADDING_ZEROS_SIZE_BYTE (8)
#define RSA_PSS_TRAILER_FIELD (0xbc)
#define SHIFT_RIGHT_AND_GET_BYTE(val, x) ((val >> x) & 0xFF)
#ifndef BITS_TO_BYTES
#define BITS_TO_BYTES(b) (b >> 3)
#endif
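
The #ifndef guard presumably exists because some build environments already define a BITS_TO_BYTES macro; a trivial stand-alone check of the fallback definition used here:

#include <stdio.h>

#ifndef BITS_TO_BYTES
#define BITS_TO_BYTES(b) (b >> 3)   /* same fallback as above: whole bytes only */
#endif

int main(void)
{
    printf("%d\n", BITS_TO_BYTES(3072));   /* e.g. a 3072-bit modulus -> 384 bytes */
    return 0;
}
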
static const unsigned char zeroes[RSA_PSS_PADDING_ZEROS_SIZE_BYTE] = { 0 };

View File

@ -66,6 +66,9 @@ static NvBool battery_present = NV_FALSE;
#define ACPI_VIDEO_CLASS "video"
#endif
/* Maximum size of ACPI _DSM method's 4th argument */
#define NV_MAX_ACPI_DSM_PARAM_SIZE 1024
// Used for NVPCF event handling
static acpi_handle nvpcf_handle = NULL;
static acpi_handle nvpcf_device_handle = NULL;
@ -73,21 +76,6 @@ static nv_acpi_t *nvpcf_nv_acpi_object = NULL;
#define ACPI_NVPCF_EVENT_CHANGE 0xC0
static int nv_acpi_get_device_handle(nv_state_t *nv, acpi_handle *dev_handle)
{
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
#if defined(DEVICE_ACPI_HANDLE)
*dev_handle = DEVICE_ACPI_HANDLE(nvl->dev);
return NV_TRUE;
#elif defined (ACPI_HANDLE)
*dev_handle = ACPI_HANDLE(nvl->dev);
return NV_TRUE;
#else
return NV_FALSE;
#endif
}
/*
* This callback will be invoked by the acpi_notifier_call_chain()
*/
@ -174,7 +162,7 @@ static void nv_acpi_nvpcf_event(acpi_handle handle, u32 event_type, void *data)
}
else
{
nv_printf(NV_DBG_INFO,"NVRM: %s: NVPCF event 0x%x is not supported\n", event_type, __FUNCTION__);
nv_printf(NV_DBG_INFO,"NVRM: %s: NVPCF event 0x%x is not supported\n", __FUNCTION__, event_type);
}
}
@ -267,11 +255,10 @@ static void nv_acpi_notify_event(acpi_handle handle, u32 event_type, void *data)
void nv_acpi_register_notifier(nv_linux_state_t *nvl)
{
acpi_handle dev_handle = NULL;
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
/* Install the ACPI notifier corresponding to dGPU ACPI device. */
if ((nvl->nv_acpi_object == NULL) &&
nv_acpi_get_device_handle(NV_STATE_PTR(nvl), &dev_handle) &&
(dev_handle != NULL))
{
nvl->nv_acpi_object = nv_install_notifier(dev_handle, nv_acpi_notify_event, nvl);
@ -657,64 +644,36 @@ static NV_STATUS nv_acpi_nvif_method(
return NV_OK;
}
#define MAX_INPUT_PARAM_SIZE 1024
/*
* This function executes a _DSM ACPI method.
*/
NV_STATUS NV_API_CALL nv_acpi_dsm_method(
nv_state_t *nv,
NvU8 *pAcpiDsmGuid,
NvU32 acpiDsmRev,
NvBool acpiNvpcfDsmFunction,
NvU32 acpiDsmSubFunction,
void *pInParams,
NvU16 inParamSize,
NvU32 *outStatus,
void *pOutData,
NvU16 *pSize
static NV_STATUS nv_acpi_evaluate_dsm_method(
acpi_handle dev_handle,
NvU8 *pathname,
NvU8 *pAcpiDsmGuid,
NvU32 acpiDsmRev,
NvU32 acpiDsmSubFunction,
void *arg3,
NvU16 arg3Size,
NvBool bArg3Integer,
NvU32 *outStatus,
void *pOutData,
NvU16 *pSize
)
{
NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
acpi_status acpi_status;
NV_STATUS rmStatus = NV_OK;
acpi_status status;
struct acpi_object_list input;
union acpi_object *dsm = NULL;
struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object dsm_params[4];
NvU8 *argument3 = NULL;
NvU32 data_size;
acpi_handle dev_handle = NULL;
if (!nv_acpi_get_device_handle(nv, &dev_handle))
return NV_ERR_NOT_SUPPORTED;
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
if ((!pInParams) || (inParamSize > MAX_INPUT_PARAM_SIZE) || (!pOutData) || (!pSize))
{
nv_printf(NV_DBG_INFO,
"NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
return NV_ERR_INVALID_ARGUMENT;
}
if (!NV_MAY_SLEEP())
{
#if defined(DEBUG)
nv_printf(NV_DBG_INFO,
"NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid context!\n", __FUNCTION__);
#endif
return NV_ERR_NOT_SUPPORTED;
}
status = os_alloc_mem((void **)&argument3, inParamSize);
if (status != NV_OK)
return status;
//
// dsm_params[0].buffer.pointer and dsm_params[1].integer.value set in
// switch below based on acpiDsmFunction
//
dsm_params[0].buffer.type = ACPI_TYPE_BUFFER;
dsm_params[0].buffer.length = 0x10;
dsm_params[0].buffer.pointer = pAcpiDsmGuid;
@ -725,35 +684,28 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
dsm_params[2].integer.type = ACPI_TYPE_INTEGER;
dsm_params[2].integer.value = acpiDsmSubFunction;
dsm_params[3].buffer.type = ACPI_TYPE_BUFFER;
dsm_params[3].buffer.length = inParamSize;
memcpy(argument3, pInParams, dsm_params[3].buffer.length);
dsm_params[3].buffer.pointer = argument3;
if (bArg3Integer)
{
dsm_params[3].integer.type = ACPI_TYPE_INTEGER;
dsm_params[3].integer.value = *((NvU32 *)arg3);
}
else
{
dsm_params[3].buffer.type = ACPI_TYPE_BUFFER;
dsm_params[3].buffer.length = arg3Size;
dsm_params[3].buffer.pointer = arg3;
}
// parameters for dsm calls (GUID, rev, subfunction, data)
input.count = 4;
input.pointer = dsm_params;
if (acpiNvpcfDsmFunction)
{
//
// acpi_evaluate_object() can operate with either valid object pathname or
// valid object handle. For NVPCF DSM function, use valid pathname as we do
// not have device handle for NVPCF device
//
dev_handle = NULL;
acpi_status = acpi_evaluate_object(dev_handle, "\\_SB.NPCF._DSM", &input, &output);
}
else
{
acpi_status = acpi_evaluate_object(dev_handle, "_DSM", &input, &output);
}
if (ACPI_FAILURE(acpi_status))
status = acpi_evaluate_object(dev_handle, pathname, &input, &output);
if (ACPI_FAILURE(status))
{
nv_printf(NV_DBG_INFO,
"NVRM: %s: failed to evaluate _DSM method!\n", __FUNCTION__);
goto exit;
return NV_ERR_OPERATING_SYSTEM;
}
dsm = output.pointer;
@ -767,20 +719,80 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
dsm->buffer.pointer[0];
}
status = nv_acpi_extract_object(dsm, pOutData, *pSize, &data_size);
rmStatus = nv_acpi_extract_object(dsm, pOutData, *pSize, &data_size);
*pSize = data_size;
kfree(output.pointer);
}
if (status != NV_OK)
else
{
*pSize = 0;
}
if (rmStatus != NV_OK)
{
nv_printf(NV_DBG_ERRORS,
"NVRM: %s: DSM data invalid!\n", __FUNCTION__);
}
exit:
return rmStatus;
}
/*
* This function executes a _DSM ACPI method.
*/
NV_STATUS NV_API_CALL nv_acpi_dsm_method(
nv_state_t *nv,
NvU8 *pAcpiDsmGuid,
NvU32 acpiDsmRev,
NvBool acpiNvpcfDsmFunction,
NvU32 acpiDsmSubFunction,
void *pInParams,
NvU16 inParamSize,
NvU32 *outStatus,
void *pOutData,
NvU16 *pSize
)
{
NV_STATUS rmStatus = NV_ERR_OPERATING_SYSTEM;
NvU8 *argument3 = NULL;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
NvU8 *pathname = "_DSM";
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
if ((!pInParams) || (inParamSize > NV_MAX_ACPI_DSM_PARAM_SIZE) || (!pOutData) || (!pSize))
{
nv_printf(NV_DBG_INFO,
"NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
return NV_ERR_INVALID_ARGUMENT;
}
rmStatus = os_alloc_mem((void **)&argument3, inParamSize);
if (rmStatus != NV_OK)
return rmStatus;
memcpy(argument3, pInParams, inParamSize);
if (acpiNvpcfDsmFunction)
{
//
// acpi_evaluate_object() accepts either a valid object pathname or a valid
// object handle. For the NVPCF DSM function, use the pathname since we do
// not have a device handle for the NVPCF device.
//
dev_handle = NULL;
pathname = "\\_SB.NPCF._DSM";
}
rmStatus = nv_acpi_evaluate_dsm_method(dev_handle, pathname, pAcpiDsmGuid, acpiDsmRev,
acpiDsmSubFunction, argument3, inParamSize,
NV_FALSE, NULL, pOutData, pSize);
os_free_mem(argument3);
return status;
return rmStatus;
}
/*
@ -796,13 +808,11 @@ NV_STATUS NV_API_CALL nv_acpi_ddc_method(
acpi_status status;
union acpi_object *ddc = NULL;
NvU32 i, largestEdidSize;
acpi_handle dev_handle = NULL;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
acpi_handle lcd_dev_handle = NULL;
acpi_handle handle = NULL;
if (!nv_acpi_get_device_handle(nv, &dev_handle))
return NV_ERR_NOT_SUPPORTED;
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
@ -836,7 +846,7 @@ NV_STATUS NV_API_CALL nv_acpi_ddc_method(
case 0x0400:
case 0xA420:
lcd_dev_handle = handle;
nv_printf(NV_DBG_INFO, "NVRM: %s Found LCD: %x\n",
nv_printf(NV_DBG_INFO, "NVRM: %s Found LCD: %llx\n",
__FUNCTION__, device_id);
break;
default:
@ -915,12 +925,10 @@ NV_STATUS NV_API_CALL nv_acpi_rom_method(
union acpi_object *rom;
union acpi_object rom_arg[2];
struct acpi_object_list input = { 2, rom_arg };
acpi_handle dev_handle = NULL;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
uint32_t offset, length;
if (!nv_acpi_get_device_handle(nv, &dev_handle))
return NV_ERR_NOT_SUPPORTED;
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
@ -982,12 +990,10 @@ NV_STATUS NV_API_CALL nv_acpi_dod_method(
acpi_status status;
struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *dod;
acpi_handle dev_handle = NULL;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
NvU32 i, count = (*pSize / sizeof(NvU32));
if (!nv_acpi_get_device_handle(nv, &dev_handle))
return NV_ERR_NOT_SUPPORTED;
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
@ -1129,17 +1135,11 @@ NvBool nv_acpi_power_resource_method_present(
struct pci_dev *pdev
)
{
acpi_handle handle = NULL;
acpi_handle handle = ACPI_HANDLE(&pdev->dev);
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
union acpi_object *object_package, *object_reference;
acpi_status status;
#if defined(DEVICE_ACPI_HANDLE)
handle = DEVICE_ACPI_HANDLE(&pdev->dev);
#elif defined (ACPI_HANDLE)
handle = ACPI_HANDLE(&pdev->dev);
#endif
if (!handle)
return NV_FALSE;
@ -1198,7 +1198,8 @@ NV_STATUS NV_API_CALL nv_acpi_mux_method(
union acpi_object *mux = NULL;
union acpi_object mux_arg = { ACPI_TYPE_INTEGER };
struct acpi_object_list input = { 1, &mux_arg };
acpi_handle dev_handle = NULL;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
acpi_handle mux_dev_handle = NULL;
acpi_handle handle = NULL;
unsigned long long device_id = 0;
@ -1216,9 +1217,6 @@ NV_STATUS NV_API_CALL nv_acpi_mux_method(
__FUNCTION__, pMethodName);
}
if (!nv_acpi_get_device_handle(nv, &dev_handle))
return NV_ERR_NOT_SUPPORTED;
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
@ -1384,6 +1382,34 @@ NvBool NV_API_CALL nv_acpi_is_battery_present(void)
return NV_FALSE;
}
NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port(
nv_state_t *nv,
NvU8 *pAcpiDsmGuid,
NvU32 acpiDsmRev,
NvU32 acpiDsmSubFunction,
NvU32 *data
)
{
NV_STATUS rmStatus = NV_ERR_OPERATING_SYSTEM;
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
acpi_handle dev_handle = ACPI_HANDLE(nvl->dev->parent);
NvU32 outData = 0;
NvU16 outDatasize = sizeof(NvU32);
NvU16 inParamSize = sizeof(NvU32);
if (!dev_handle)
return NV_ERR_INVALID_ARGUMENT;
rmStatus = nv_acpi_evaluate_dsm_method(dev_handle, "_DSM", pAcpiDsmGuid, acpiDsmRev,
acpiDsmSubFunction, data, inParamSize, NV_TRUE,
NULL, &outData, &outDatasize);
if (rmStatus == NV_OK)
*data = outData;
return rmStatus;
}
#else // NV_LINUX_ACPI_EVENTS_SUPPORTED
void NV_API_CALL nv_acpi_methods_init(NvU32 *handlePresent)
@ -1426,6 +1452,17 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
return NV_ERR_NOT_SUPPORTED;
}
NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port(
nv_state_t *nv,
NvU8 *pAcpiDsmGuid,
NvU32 acpiDsmRev,
NvU32 acpiDsmSubFunction,
NvU32 *data
)
{
return NV_ERR_NOT_SUPPORTED;
}
NV_STATUS NV_API_CALL nv_acpi_ddc_method(
nv_state_t *nv,
void *pEdidBuffer,

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -24,6 +24,7 @@
#include "nv-linux.h"
extern int NVreg_ImexChannelCount;
extern int NVreg_CreateImexChannel0;
static int nv_caps_imex_open(struct inode *inode, struct file *file)
{
@ -104,6 +105,10 @@ int NV_API_CALL nv_caps_imex_init(void)
if (NVreg_ImexChannelCount == 0)
{
nv_printf(NV_DBG_INFO, "nv-caps-imex is disabled.\n");
// Disable channel creation as well
NVreg_CreateImexChannel0 = 0;
return 0;
}

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -26,6 +26,8 @@
#include "nv-procfs.h"
#include "nv-hash.h"
#include "nvmisc.h"
extern int NVreg_ModifyDeviceFiles;
/* sys_close() or __close_fd() */
@ -49,7 +51,7 @@ typedef struct nv_cap_table_entry
struct hlist_node hlist;
} nv_cap_table_entry_t;
#define NV_CAP_NUM_ENTRIES(_table) (sizeof(_table) / sizeof(_table[0]))
#define NV_CAP_NUM_ENTRIES(_table) (NV_ARRAY_ELEMENTS(_table))
static nv_cap_table_entry_t g_nv_cap_nvlink_table[] =
{
@ -361,18 +363,28 @@ static ssize_t nv_cap_procfs_write(struct file *file,
nv_cap_file_private_t *private = NULL;
unsigned long bytes_left;
char *proc_buffer;
int status;
status = nv_down_read_interruptible(&nv_system_pm_lock);
if (status < 0)
{
nv_printf(NV_DBG_ERRORS, "nv-caps: failed to lock the nv_system_pm_lock!\n");
return status;
}
private = ((struct seq_file *)file->private_data)->private;
bytes_left = (sizeof(private->buffer) - private->offset - 1);
if (count == 0)
{
return -EINVAL;
count = -EINVAL;
goto done;
}
if ((bytes_left == 0) || (count > bytes_left))
{
return -ENOSPC;
count = -ENOSPC;
goto done;
}
proc_buffer = &private->buffer[private->offset];
@ -380,7 +392,8 @@ static ssize_t nv_cap_procfs_write(struct file *file,
if (copy_from_user(proc_buffer, buffer, count))
{
nv_printf(NV_DBG_ERRORS, "nv-caps: failed to copy in proc data!\n");
return -EFAULT;
count = -EFAULT;
goto done;
}
private->offset += count;
@ -388,17 +401,28 @@ static ssize_t nv_cap_procfs_write(struct file *file,
*pos = private->offset;
done:
up_read(&nv_system_pm_lock);
return count;
}
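
The rewritten error paths above all funnel through the done: label so that nv_system_pm_lock is released on every exit. A generic user-space sketch of that single-exit pattern, using a POSIX rwlock purely as a stand-in for the PM lock:

#include <errno.h>
#include <pthread.h>
#include <string.h>

static pthread_rwlock_t pm_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Returns bytes consumed, or a negative errno-style value. */
static long guarded_write(char *dst, size_t dst_left, const char *src, size_t count)
{
    long ret;

    if (pthread_rwlock_rdlock(&pm_lock) != 0)
        return -EINTR;                 /* could not take the lock: bail out early */

    if (count == 0) {
        ret = -EINVAL;                 /* error: fall through to the unlock */
        goto done;
    }
    if (count > dst_left) {
        ret = -ENOSPC;
        goto done;
    }

    memcpy(dst, src, count);
    ret = (long)count;

done:
    pthread_rwlock_unlock(&pm_lock);   /* every path releases the lock exactly once */
    return ret;
}

int main(void)
{
    char buf[8];
    return guarded_write(buf, sizeof(buf), "hi", 2) == 2 ? 0 : 1;
}
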
static int nv_cap_procfs_read(struct seq_file *s, void *v)
{
int status;
nv_cap_file_private_t *private = s->private;
status = nv_down_read_interruptible(&nv_system_pm_lock);
if (status < 0)
{
return status;
}
seq_printf(s, "%s: %d\n", "DeviceFileMinor", private->minor);
seq_printf(s, "%s: %d\n", "DeviceFileMode", private->permissions);
seq_printf(s, "%s: %d\n", "DeviceFileModify", private->modify);
up_read(&nv_system_pm_lock);
return 0;
}
@ -423,14 +447,6 @@ static int nv_cap_procfs_open(struct inode *inode, struct file *file)
if (rc < 0)
{
NV_KFREE(private, sizeof(nv_cap_file_private_t));
return rc;
}
rc = nv_down_read_interruptible(&nv_system_pm_lock);
if (rc < 0)
{
single_release(inode, file);
NV_KFREE(private, sizeof(nv_cap_file_private_t));
}
return rc;
@ -449,8 +465,6 @@ static int nv_cap_procfs_release(struct inode *inode, struct file *file)
private = s->private;
}
up_read(&nv_system_pm_lock);
single_release(inode, file);
if (private != NULL)

View File

@ -28,12 +28,21 @@
* teardown.
*/
#define NV_MEM_LOGGER_STACK_TRACE 0
#if defined(NV_STACK_TRACE_PRESENT) && defined(NV_MEM_LOGGER) && defined(DEBUG)
#define NV_MEM_LOGGER_STACK_TRACE 1
#endif
typedef struct {
struct rb_node rb_node;
void *addr;
NvU64 size;
NvU32 line;
const char *file;
#if NV_MEM_LOGGER_STACK_TRACE == 1
unsigned long stack_trace[32];
#endif
} nv_memdbg_node_t;
struct
@ -117,6 +126,12 @@ void nv_memdbg_add(void *addr, NvU64 size, const char *file, int line)
node->size = size;
node->file = file;
node->line = line;
#if NV_MEM_LOGGER_STACK_TRACE == 1
memset(node->stack_trace, '\0', sizeof(node->stack_trace));
stack_trace_save(node->stack_trace, NV_ARRAY_ELEMENTS(node->stack_trace), 0);
#endif
}
NV_SPIN_LOCK_IRQSAVE(&g_nv_memdbg.lock, flags);
@ -209,6 +224,10 @@ void nv_memdbg_exit(void)
node->size, node->addr);
}
#if NV_MEM_LOGGER_STACK_TRACE == 1
stack_trace_print(node->stack_trace, NV_ARRAY_ELEMENTS(node->stack_trace), 1);
#endif
rb_erase(&node->rb_node, &g_nv_memdbg.rb_root);
kfree(node);
}

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -62,7 +62,7 @@ nvidia_nano_timer_callback(
nv_linux_state_t *nvl = nv_nstimer->nv_linux_state;
nvidia_stack_t *sp = NULL;
if (nv_kmem_cache_alloc_stack(&sp) != 0)
if (nv_kmem_cache_alloc_stack_atomic(&sp) != 0)
{
nv_printf(NV_DBG_ERRORS, "NVRM: no cache memory \n");
return;
@ -189,12 +189,6 @@ void NV_API_CALL nv_start_nano_timer(
NvU32 time_us;
time_us = (NvU32)(time_ns / 1000);
if (time_us == 0)
{
nv_printf(NV_DBG_WARNINGS, "NVRM: Timer value cannot be less than 1 usec.\n");
}
time_jiffies = usecs_to_jiffies(time_us);
mod_timer(&nv_nstimer->jiffy_timer, jiffies + time_jiffies);
#endif

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2011-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -31,6 +31,8 @@
#include "nv-p2p.h"
#include "rmp2pdefines.h"
#include "nvmisc.h"
typedef enum nv_p2p_page_table_type {
NV_P2P_PAGE_TABLE_TYPE_NON_PERSISTENT = 0,
NV_P2P_PAGE_TABLE_TYPE_PERSISTENT,
@ -50,6 +52,7 @@ typedef struct nv_p2p_mem_info {
struct semaphore lock;
} dma_mapping_list;
void *private;
void *mig_info;
} nv_p2p_mem_info_t;
// declared and created in nv.c
@ -73,7 +76,7 @@ static struct nvidia_status_mapping {
};
#define NVIDIA_STATUS_MAPPINGS \
(sizeof(nvidia_status_mappings) / sizeof(struct nvidia_status_mapping))
NV_ARRAY_ELEMENTS(nvidia_status_mappings)
static int nvidia_p2p_map_status(NV_STATUS status)
{
@ -314,7 +317,7 @@ static NV_STATUS nv_p2p_put_pages(
* callback which can free it unlike non-persistent page_table.
*/
mem_info = container_of(*page_table, nv_p2p_mem_info_t, page_table);
status = rm_p2p_put_pages_persistent(sp, mem_info->private, *page_table);
status = rm_p2p_put_pages_persistent(sp, mem_info->private, *page_table, mem_info->mig_info);
}
else
{
@ -412,6 +415,17 @@ static int nv_p2p_get_pages(
NvU8 uuid[NVIDIA_P2P_GPU_UUID_LEN] = {0};
int rc;
if (!NV_IS_ALIGNED64(virtual_address, NVRM_P2P_PAGESIZE_BIG_64K) ||
!NV_IS_ALIGNED64(length, NVRM_P2P_PAGESIZE_BIG_64K))
{
nv_printf(NV_DBG_ERRORS,
"NVRM: Invalid argument in nv_p2p_get_pages,"
"address or length are not aligned "
"address=0x%llx, length=0x%llx\n",
virtual_address, length);
return -EINVAL;
}
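
A small stand-alone illustration of the new alignment requirement; the IS_ALIGNED64 macro below is an assumed power-of-two re-creation of NV_IS_ALIGNED64, and the 64 KiB constant stands in for NVRM_P2P_PAGESIZE_BIG_64K used above:

#include <stdio.h>
#include <stdint.h>

/* Assumed semantics: value is a multiple of the (power-of-two) alignment. */
#define IS_ALIGNED64(v, a) ((((uint64_t)(v)) & ((uint64_t)(a) - 1)) == 0)

int main(void)
{
    uint64_t big_64k = 64 * 1024;
    uint64_t addr    = 0x7f1234560000ULL;   /* 64 KiB aligned: accepted */
    uint64_t length  = 0x21000ULL;          /* 132 KiB: not 64 KiB aligned, rejected */

    printf("address ok: %d, length ok: %d\n",
           IS_ALIGNED64(addr, big_64k), IS_ALIGNED64(length, big_64k));
    return 0;
}
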
rc = nv_kmem_cache_alloc_stack(&sp);
if (rc != 0)
{
@ -495,7 +509,7 @@ static int nv_p2p_get_pages(
status = rm_p2p_get_pages_persistent(sp, virtual_address, length,
&mem_info->private,
physical_addresses, &entries,
*page_table, gpu_info);
*page_table, gpu_info, &mem_info->mig_info);
if (status != NV_OK)
{
goto failed;

View File

@ -37,6 +37,10 @@
#include <linux/kernfs.h>
#endif
#if !defined(NV_BUS_TYPE_HAS_IOMMU_OPS)
#include <linux/iommu.h>
#endif
static void
nv_check_and_exclude_gpu(
nvidia_stack_t *sp,
@ -324,7 +328,7 @@ static NvU32 find_gpu_numa_nodes_in_srat(nv_linux_state_t *nvl)
gi = (struct acpi_srat_generic_affinity *) subtable_header;
gi_dbdf = *((NvU16 *)(&gi->device_handle[0])) << 16 |
*((NvU16 *)(&gi->device_handle[2]));
if (gi_dbdf == dev_dbdf) {
numa_node = pxm_to_node(gi->proximity_domain);
if (numa_node < MAX_NUMNODES) {
@ -349,7 +353,6 @@ exit:
acpi_put_table(table_header);
return pxm_count;
}
#endif
static void
@ -375,6 +378,7 @@ nv_init_coherent_link_info
return;
gi_found = find_gpu_numa_nodes_in_srat(nvl);
if (!gi_found &&
(device_property_read_u64(nvl->dev, "nvidia,gpu-mem-pxm-start", &pxm_start) != 0 ||
device_property_read_u64(nvl->dev, "nvidia,gpu-mem-pxm-count", &pxm_count) != 0))
@ -530,35 +534,20 @@ nv_pci_probe
if (pci_dev->is_virtfn)
{
#if defined(NV_VGPU_KVM_BUILD)
nvl = pci_get_drvdata(pci_dev->physfn);
if (!nvl)
#if defined(NV_BUS_TYPE_HAS_IOMMU_OPS)
if (pci_dev->dev.bus->iommu_ops == NULL)
#else
if ((pci_dev->dev.iommu != NULL) && (pci_dev->dev.iommu->iommu_dev != NULL) &&
(pci_dev->dev.iommu->iommu_dev->ops == NULL))
#endif
{
nv_printf(NV_DBG_ERRORS, "NVRM: Aborting probe for VF %04x:%02x:%02x.%x "
"since PF is not bound to nvidia driver.\n",
"since IOMMU is not present on the system.\n",
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
goto failed;
}
if (pci_dev->dev.bus->iommu_ops == NULL)
{
nv = NV_STATE_PTR(nvl);
if (rm_is_iommu_needed_for_sriov(sp, nv))
{
nv_printf(NV_DBG_ERRORS, "NVRM: Aborting probe for VF %04x:%02x:%02x.%x "
"since IOMMU is not present on the system.\n",
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
goto failed;
}
}
if (nvidia_vgpu_vfio_probe(pci_dev) != NV_OK)
{
nv_printf(NV_DBG_ERRORS, "NVRM: Failed to register device to vGPU VFIO module");
goto failed;
}
nv_kmem_cache_free_stack(sp);
return 0;
#else
@ -687,8 +676,8 @@ next_bar:
// Invalid 32 or 64-bit BAR.
nv_printf(NV_DBG_ERRORS,
"NVRM: This PCI I/O region assigned to your NVIDIA device is invalid:\n"
"NVRM: BAR%d is %dM @ 0x%llx (PCI:%04x:%02x:%02x.%x)\n", i,
(NV_PCI_RESOURCE_SIZE(pci_dev, i) >> 20),
"NVRM: BAR%d is %" NvU64_fmtu "M @ 0x%" NvU64_fmtx " (PCI:%04x:%02x:%02x.%x)\n", i,
(NvU64)(NV_PCI_RESOURCE_SIZE(pci_dev, i) >> 20),
(NvU64)NV_PCI_RESOURCE_START(pci_dev, i),
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
@ -708,10 +697,10 @@ next_bar:
nv_device_name))
{
nv_printf(NV_DBG_ERRORS,
"NVRM: request_mem_region failed for %dM @ 0x%llx. This can\n"
"NVRM: request_mem_region failed for %" NvU64_fmtu "M @ 0x%" NvU64_fmtx ". This can\n"
"NVRM: occur when a driver such as rivatv is loaded and claims\n"
"NVRM: ownership of the device's registers.\n",
(NV_PCI_RESOURCE_SIZE(pci_dev, regs_bar_index) >> 20),
(NvU64)(NV_PCI_RESOURCE_SIZE(pci_dev, regs_bar_index) >> 20),
(NvU64)NV_PCI_RESOURCE_START(pci_dev, regs_bar_index));
goto failed;
}

View File

@ -197,28 +197,25 @@ nv_procfs_read_power(
{
nv_state_t *nv = s->private;
nvidia_stack_t *sp = NULL;
const char *vidmem_power_status;
const char *dynamic_power_status;
const char *gc6_support;
const char *gcoff_support;
nv_power_info_t power_info;
if (nv_kmem_cache_alloc_stack(&sp) != 0)
{
return 0;
}
dynamic_power_status = rm_get_dynamic_power_management_status(sp, nv);
seq_printf(s, "Runtime D3 status: %s\n", dynamic_power_status);
vidmem_power_status = rm_get_vidmem_power_status(sp, nv);
seq_printf(s, "Video Memory: %s\n\n", vidmem_power_status);
rm_get_power_info(sp, nv, &power_info);
seq_printf(s, "Runtime D3 status: %s\n", power_info.dynamic_power_status);
seq_printf(s, "Video Memory: %s\n\n", power_info.vidmem_power_status);
seq_printf(s, "GPU Hardware Support:\n");
gc6_support = rm_get_gpu_gcx_support(sp, nv, NV_TRUE);
seq_printf(s, " Video Memory Self Refresh: %s\n", gc6_support);
seq_printf(s, " Video Memory Self Refresh: %s\n", power_info.gc6_support);
seq_printf(s, " Video Memory Off: %s\n\n", power_info.gcoff_support);
gcoff_support = rm_get_gpu_gcx_support(sp, nv, NV_FALSE);
seq_printf(s, " Video Memory Off: %s\n", gcoff_support);
seq_printf(s, "S0ix Power Management:\n");
seq_printf(s, " Platform Support: %s\n",
nv_platform_supports_s0ix() ? "Supported" : "Not Supported");
seq_printf(s, " Status: %s\n", power_info.s0ix_status);
nv_kmem_cache_free_stack(sp);
return 0;

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2006-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2006-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -869,6 +869,8 @@
* NVreg_ModifyDeviceFiles, NVreg_DeviceFileGID, NVreg_DeviceFileUID
* and NVreg_DeviceFileMode will be honored by nvidia-modprobe.
*
* Also, refer to the NVreg_CreateImexChannel0 option.
*
* Possible values:
* 0 - Disable IMEX using CUDA driver's fabric handles.
* N - N IMEX channels will be enabled in the driver to facilitate N
@ -878,6 +880,29 @@
#define __NV_IMEX_CHANNEL_COUNT ImexChannelCount
#define NV_REG_IMEX_CHANNEL_COUNT NV_REG_STRING(__NV_IMEX_CHANNEL_COUNT)
/*
* Option: NVreg_CreateImexChannel0
*
* Description:
*
* This option allows users to specify whether the NVIDIA driver should create
* IMEX channel 0 by default. When enabled, the channel is created automatically
* when an application (e.g. nvidia-smi, nvidia-persistenced) is run.
*
* Note that users are advised to enable this option only in trusted
* environments where it is acceptable for applications to share the same
* IMEX channel.
*
* For more details on IMEX channels, refer to the NVreg_ImexChannelCount
* option.
*
* Possible values:
* 0 - Do not create IMEX channel 0 (default).
* 1 - Create IMEX channel 0.
*/
#define __NV_CREATE_IMEX_CHANNEL_0 CreateImexChannel0
#define NV_REG_CREATE_IMEX_CHANNEL_0 NV_REG_STRING(__NV_CREATE_IMEX_CHANNEL_0)
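
As with the other NVreg_* options, this one can be set through the usual module-parameter mechanism; for example, a line such as

    options nvidia NVreg_CreateImexChannel0=1

in a file under /etc/modprobe.d would request creation of channel 0 (shown only as an illustration of the option name; the default remains 0).
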
#if defined(NV_DEFINE_REGISTRY_KEY_TABLE)
/*
@ -927,6 +952,7 @@ NV_DEFINE_REG_STRING_ENTRY(__NV_EXCLUDED_GPUS, NULL);
NV_DEFINE_REG_ENTRY(__NV_DMA_REMAP_PEER_MMIO, NV_DMA_REMAP_PEER_MMIO_ENABLE);
NV_DEFINE_REG_STRING_ENTRY(__NV_RM_NVLINK_BW, NULL);
NV_DEFINE_REG_ENTRY_GLOBAL(__NV_IMEX_CHANNEL_COUNT, 2048);
NV_DEFINE_REG_ENTRY_GLOBAL(__NV_CREATE_IMEX_CHANNEL_0, 0);
/*
*----------------registry database definition----------------------
@ -974,6 +1000,7 @@ nv_parm_t nv_parms[] = {
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_OPENRM_ENABLE_UNSUPPORTED_GPUS),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_DMA_REMAP_PEER_MMIO),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_IMEX_CHANNEL_COUNT),
NV_DEFINE_PARAMS_TABLE_ENTRY(__NV_CREATE_IMEX_CHANNEL_0),
{NULL, NULL}
};

View File

@ -514,7 +514,6 @@ NV_STATUS nv_alloc_system_pages(
struct device *dev = at->dev;
dma_addr_t bus_addr;
// Order should be zero except for EGM allocations.
unsigned int alloc_page_size = PAGE_SIZE << at->order;
unsigned int alloc_num_pages = NV_CEIL(at->num_pages * PAGE_SIZE, alloc_page_size);
@ -523,7 +522,7 @@ NV_STATUS nv_alloc_system_pages(
unsigned int os_pages_in_page = alloc_page_size / PAGE_SIZE;
nv_printf(NV_DBG_MEMINFO,
"NVRM: VM: %u: %u order0 pages, %u order\n", __FUNCTION__, at->num_pages, at->order);
"NVRM: VM: %s: %u order0 pages, %u order\n", __FUNCTION__, at->num_pages, at->order);
gfp_mask = nv_compute_gfp_mask(nv, at);
@ -641,7 +640,6 @@ void nv_free_system_pages(
unsigned int i;
struct device *dev = at->dev;
// Order should be zero except for EGM allocations.
unsigned int alloc_page_size = PAGE_SIZE << at->order;
unsigned int os_pages_in_page = alloc_page_size / PAGE_SIZE;

View File

@ -29,7 +29,7 @@
NvU64 NV_API_CALL nv_get_kern_phys_address(NvU64 address)
{
/* direct-mapped kernel address */
if (virt_addr_valid(address))
if (virt_addr_valid((void *)address))
return __pa(address);
nv_printf(NV_DBG_ERRORS,

View File

@ -3131,6 +3131,7 @@ NV_STATUS NV_API_CALL
nv_alias_pages(
nv_state_t *nv,
NvU32 page_cnt,
NvU64 page_size,
NvU32 contiguous,
NvU32 cache_type,
NvU64 guest_id,
@ -3152,7 +3153,14 @@ nv_alias_pages(
at->cache_type = cache_type;
if (contiguous)
{
at->flags.contig = NV_TRUE;
at->order = get_order(at->num_pages * PAGE_SIZE);
}
else
{
at->order = get_order(page_size);
}
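
The order chosen above is just the power-of-two page count backing the allocation. A quick user-space re-creation of that calculation, assuming 4 KiB pages; this is not the kernel's get_order(), only a stand-in with the same rounding-up behaviour:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Smallest 'order' such that (PAGE_SIZE << order) >= size. */
static unsigned int order_for_size(unsigned long size)
{
    unsigned int order = 0;

    while ((PAGE_SIZE << order) < size)
        order++;
    return order;
}

int main(void)
{
    printf("64 KiB page_size    -> order %u\n", order_for_size(64 * 1024));   /* 4 */
    printf("2 MiB page_size     -> order %u\n", order_for_size(2 << 20));     /* 9 */
    printf("contiguous 33 pages -> order %u\n",
           order_for_size(33 * PAGE_SIZE));                                   /* 6, rounds up */
    return 0;
}
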
#if defined(NVCPU_AARCH64)
if (at->cache_type != NV_MEMORY_CACHED)
at->flags.aliased = NV_TRUE;
@ -3160,8 +3168,6 @@ nv_alias_pages(
at->flags.guest = NV_TRUE;
at->order = get_order(at->num_pages * PAGE_SIZE);
for (i=0; i < at->num_pages; ++i)
{
page_ptr = at->page_table[i];
@ -3271,7 +3277,7 @@ NV_STATUS NV_API_CALL nv_register_user_pages(
nv_linux_state_t *nvl;
nvidia_pte_t *page_ptr;
nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_register_user_pages: 0x%x\n", page_count);
nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_register_user_pages: 0x%" NvU64_fmtx"\n", page_count);
user_pages = *priv_data;
nvl = NV_GET_NVL_FROM_NV_STATE(nv);
@ -3332,7 +3338,7 @@ void NV_API_CALL nv_unregister_user_pages(
{
nv_alloc_t *at = *priv_data;
nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_unregister_user_pages: 0x%x\n", page_count);
nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_unregister_user_pages: 0x%" NvU64_fmtx "\n", page_count);
NV_PRINT_AT(NV_DBG_MEMINFO, at);
@ -6133,7 +6139,10 @@ void NV_API_CALL nv_get_screen_info(
{
NvU64 physAddr = screen_info.lfb_base;
#if defined(VIDEO_CAPABILITY_64BIT_BASE)
physAddr |= (NvU64)screen_info.ext_lfb_base << 32;
if (screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
{
physAddr |= (NvU64)screen_info.ext_lfb_base << 32;
}
#endif
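
A stand-alone sketch of the address composition guarded above; the field and flag names mirror the kernel's struct screen_info, the flag value is assumed to match it, and the sample numbers are made up:

#include <stdio.h>
#include <stdint.h>

#define VIDEO_CAPABILITY_64BIT_BASE (1 << 1)   /* assumed to match the kernel flag */

int main(void)
{
    /* Hypothetical firmware-provided framebuffer description. */
    uint32_t lfb_base     = 0xe0000000;
    uint32_t ext_lfb_base = 0x4;               /* upper 32 bits of the base */
    uint16_t capabilities = VIDEO_CAPABILITY_64BIT_BASE;

    uint64_t phys_addr = lfb_base;

    if (capabilities & VIDEO_CAPABILITY_64BIT_BASE)
        phys_addr |= (uint64_t)ext_lfb_base << 32;

    printf("framebuffer base: 0x%llx\n", (unsigned long long)phys_addr);   /* 0x4e0000000 */
    return 0;
}
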
/* Make sure base address is mapped to GPU BAR */

View File

@ -285,12 +285,15 @@ NV_STATUS nvGpuOpsFlushReplayableFaultBuffer(gpuFaultInfo *pFaultInfo,
NV_STATUS nvGpuOpsTogglePrefetchFaults(gpuFaultInfo *pFaultInfo,
NvBool bEnable);
// Interface used for CCSL
NV_STATUS nvGpuOpsKeyRotationChannelDisable(struct gpuChannel *channelList[],
NvU32 channelListCount);
// Interface used for CCSL
NV_STATUS nvGpuOpsCcslContextInit(struct ccslContext_t **ctx,
gpuChannelHandle channel);
NV_STATUS nvGpuOpsCcslContextClear(struct ccslContext_t *ctx);
NV_STATUS nvGpuOpsCcslContextUpdate(struct ccslContext_t *ctx);
NV_STATUS nvGpuOpsCcslContextUpdate(UvmCslContext *contextList[],
NvU32 contextListCount);
NV_STATUS nvGpuOpsCcslRotateIv(struct ccslContext_t *ctx,
NvU8 direction);
NV_STATUS nvGpuOpsCcslEncrypt(struct ccslContext_t *ctx,

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2013-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2013-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -1478,6 +1478,15 @@ NV_STATUS nvUvmInterfacePagingChannelPushStream(UvmGpuPagingChannelHandle channe
}
EXPORT_SYMBOL(nvUvmInterfacePagingChannelPushStream);
NV_STATUS nvUvmInterfaceKeyRotationChannelDisable(uvmGpuChannelHandle channelList[],
NvU32 channelListCount)
{
nvidia_stack_t *sp = nvUvmGetSafeStack();
return rm_gpu_ops_key_rotation_channel_disable(sp, ((gpuChannelHandle *)channelList), channelListCount);
}
EXPORT_SYMBOL(nvUvmInterfaceKeyRotationChannelDisable);
NV_STATUS nvUvmInterfaceCslInitContext(UvmCslContext *uvmCslContext,
uvmGpuChannelHandle channel)
{
@ -1516,12 +1525,13 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext)
}
EXPORT_SYMBOL(nvUvmInterfaceDeinitCslContext);
NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *uvmCslContext)
NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *contextList[],
NvU32 contextListCount)
{
NV_STATUS status;
nvidia_stack_t *sp = uvmCslContext->nvidia_stack;
nvidia_stack_t *sp = contextList[0]->nvidia_stack;
status = rm_gpu_ops_ccsl_context_update(sp, uvmCslContext->ctx);
status = rm_gpu_ops_ccsl_context_update(sp, contextList, contextListCount);
return status;
}

View File

@ -195,6 +195,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += devm_clk_bulk_get_all
NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_task_ioprio
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mdev_set_iommu_device
NV_CONFTEST_FUNCTION_COMPILE_TESTS += offline_and_remove_memory
NV_CONFTEST_FUNCTION_COMPILE_TESTS += stack_trace
NV_CONFTEST_FUNCTION_COMPILE_TESTS += crypto_tfm_ctx_aligned
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_of_node_to_nid
@ -227,6 +228,8 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_clear_in
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_alloc_mem_from_gscco
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tsec_comms_free_gscco_mem
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_memory_block_size_bytes
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_platform_is_fpga
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_platform_is_sim
NV_CONFTEST_SYMBOL_COMPILE_TESTS += crypto
NV_CONFTEST_TYPE_COMPILE_TESTS += dma_ops
@ -251,6 +254,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += pci_driver_has_driver_managed_dma
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_has_trapno_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += foll_longterm_present
NV_CONFTEST_TYPE_COMPILE_TESTS += bus_type_has_iommu_ops
NV_CONFTEST_GENERIC_COMPILE_TESTS += dom0_kernel_present
NV_CONFTEST_GENERIC_COMPILE_TESTS += nvidia_vgpu_kvm_build

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -464,6 +464,9 @@ namespace DisplayPort
virtual bool getStreamStatusChanged() = 0;
virtual void clearStreamStatusChanged() =0;
virtual bool getDpTunnelingIrq() = 0;
virtual void clearDpTunnelingIrq() = 0;
virtual void setDirtyLinkStatus(bool dirty) = 0;
virtual void refreshLinkStatus() = 0;
virtual bool isLinkStatusValid(unsigned lanes) = 0;
@ -529,6 +532,15 @@ namespace DisplayPort
virtual bool readPsrEvtIndicator(vesaPsrEventIndicator *psrErr) = 0;
virtual bool readPrSinkDebugInfo(panelReplaySinkDebugInfo *prDbgInfo) = 0;
virtual bool getDpTunnelBwAllocationSupported() = 0;
virtual bool getDpTunnelEstimatedBw(NvU8 &estimatedBw) = 0;
virtual bool getDpTunnelGranularityMultiplier(NvU8 &granularityMultiplier) = 0;
virtual TriState getDpTunnelBwRequestStatus() = 0;
virtual bool setDpTunnelBwAllocation(bool bEnable) = 0;
virtual bool hasDpTunnelEstimatedBwChanged() = 0;
virtual bool hasDpTunnelBwAllocationCapabilityChanged() = 0;
virtual bool writeDpTunnelRequestedBw(NvU8 requestedBw) = 0;
virtual ~DPCDHAL() {}
};
@ -536,7 +548,876 @@ namespace DisplayPort
//
// Implement interface
//
DPCDHAL * MakeDPCDHAL(AuxBus * bus, Timer * timer);
DPCDHAL * MakeDPCDHAL(AuxBus * bus, Timer * timer, MainLink * main);
struct DPCDHALImpl : DPCDHAL
{
AuxRetry bus;
Timer * timer;
bool dpcdOffline;
bool bGrantsPostLtRequest;
bool pc2Disabled;
bool uprequestEnable;
bool upstreamIsSource;
bool bMultistream;
bool bGpuFECSupported;
bool bLttprSupported;
bool bBypassILREdpRevCheck;
NvU32 overrideDpcdMaxLinkRate;
NvU32 overrideDpcdRev;
NvU32 overrideDpcdMaxLaneCount;
NvU32 gpuDPSupportedVersions;
struct _LegacyPort: public LegacyPort
{
DwnStreamPortType type;
DwnStreamPortAttribute nonEDID;
NvU64 maxTmdsClkRate;
DwnStreamPortType getDownstreamPortType()
{
return type;
}
DwnStreamPortAttribute getDownstreamNonEDIDPortAttribute()
{
return nonEDID;
}
NvU64 getMaxTmdsClkRate()
{
return maxTmdsClkRate;
}
} legacyPort[16];
struct
{
unsigned revisionMajor, revisionMinor; // DPCD offset 0
bool supportsESI;
LinkRate maxLinkRate; // DPCD offset 1
unsigned maxLaneCount; // DPCD offset 2
unsigned maxLanesAtHBR;
unsigned maxLanesAtRBR;
bool enhancedFraming;
bool bPostLtAdjustmentSupport;
bool supportsNoHandshakeTraining;
bool bSupportsTPS4;
unsigned NORP; // DPCD offset 4
bool detailedCapInfo; // DPCD offset 5
bool downStreamPortPresent;
NvU8 downStreamPortType;
unsigned downStreamPortCount; // DPCD offset 7
bool ouiSupported;
bool msaTimingParIgnored;
NvU16 linkRateTable[NV_DPCD_SUPPORTED_LINK_RATES__SIZE]; // DPCD offset 10 ~ 1F
bool supportsMultistream; // DPCD offset 21
unsigned numberAudioEndpoints; // DPCD offset 22
bool overrideToSST; // force to SST even if MST capable
bool noLinkTraining; // DPCD offset 330h
bool extendedRxCapsPresent; // DPCD offset 000Eh [7] - Extended Receiver Capability present
// DPCD Offset 2211h;
unsigned extendedSleepWakeTimeoutRequestMs;
// DPCD Offset 0119h [0] - If we grant the extendedSleepWakeTimeoutRequest
bool bExtendedSleepWakeTimeoutGranted;
bool bFECSupported;
// DPCD Offset F0002h - Number of Physical Repeaters present (after mapping) between Source and Sink
unsigned phyRepeaterCount;
// DPCD offset 700 - EDP_DPCD_REV
unsigned eDpRevision;
struct
{
unsigned revisionMajor, revisionMinor; // DPCD offset F0000h
LinkRate maxLinkRate; // DPCD offset F0001h
unsigned maxLaneCount; // DPCD offset F0004h
unsigned phyRepeaterExtendedWakeTimeoutMs; // DPCD offset F0005h
// The array to keep track of FEC capability of each LTTPR
bool bFECSupportedRepeater[NV_DPCD14_PHY_REPEATER_CNT_MAX];
// If all the LTTPRs supports FEC
bool bFECSupported;
} repeaterCaps;
struct
{
bool bIsSupported;
bool bUsb4DriverSupport;
bool bIsPanelReplayOptimizationSupported;
bool bIsBwAllocationSupported;
NvU8 maxLaneCount;
LinkRate maxLinkRate;
} dpInTunnelingCaps;
PCONCaps pconCaps;
vesaPsrSinkCaps psrCaps;
NvU32 videoFallbackFormats; // DPCD offset 0200h
} caps;
bool bIsDpTunnelBwAllocationEnabled;
struct
{
unsigned sinkCount; // DPCD offset 200
bool automatedTestRequest;
bool cpIRQ;
bool mccsIRQ;
bool downRepMsgRdy;
bool upReqMsgRdy;
bool prErrorStatus; // DPCD offset 2004h[3]
bool rxCapChanged; // DPCD offset 2005
bool linkStatusChanged; // DPCD offset 2005
bool streamStatusChanged; // DPCD offset 2005
bool hdmiLinkStatusChanged; // DPCD offset 2005
bool dpTunnelingIrq; // DPCD offset 2005
NvU8 eightyBitCustomPat[10]; // DPCD offset 250 - 259
struct
{
struct
{
bool clockRecoveryDone;
bool channelEqualizationDone;
bool symbolLocked;
} laneStatus[4]; // DPCD offset 202, 203
bool interlaneAlignDone; // DPCD offset 204
bool downstmPortChng;
bool linkStatusUpdated;
//
// (ESI specific) signifies that link training has completed and that we
// should update the link status in the next query to isLinkLost. Keep in
// mind that linkStatusChanged might still be zero.
//
bool linkStatusDirtied;
} laneStatusIntr;
struct
{
bool testRequestTraining; // DPCD offset 218
LinkRate testRequestLinkRate; // DPCD offset 219
unsigned testRequestLaneCount; // DPCD offset 220
} testTraining;
struct
{
bool testRequestEdidRead; // DPCD offset 218
} testEdid;
struct
{
bool testRequestPattern; // DPCD offset 218
TestPatternType testPatRequested; // DPCD offset 221
NvU16 testHorTotalPixels; // DPCD offset 222, 223
NvU16 testVerTotalLines; // DPCD offset 224, 225
NvU16 testHorStartPixels; // DPCD offset 226, 227
NvU16 testVerStartLines; // DPCD offset 228, 229
NvU16 testHsyncWidthPixels; // DPCD offset 22A, 22B
bool testHsyncPolarity;
NvU16 testVsyncWidthLines; // DPCD offset 22C, 22D
bool testVsyncPolarity;
NvU16 testActiveWidthPixels; // DPCD offset 22E, 22F
NvU16 testActiveHeightLines; // DPCD offset 230, 231
} testPattern;
struct
{
bool testRequestPhyCompliance; // DPCD offset 218
LinkQualityPatternType phyTestPattern; // DPCD offset 248
} testPhyCompliance;
} interrupts;
bool bIndexedLinkrateCapable, bIndexedLinkrateEnabled;
public:
DPCDHALImpl(AuxBus * bus, Timer * timer)
: bus(bus), timer(timer), bGrantsPostLtRequest(false), uprequestEnable(false),
upstreamIsSource(false), bMultistream(false), bGpuFECSupported(false),
bBypassILREdpRevCheck(false), overrideDpcdMaxLinkRate(0),
overrideDpcdRev(0), gpuDPSupportedVersions(0), bIsDpTunnelBwAllocationEnabled(false)
{
// start with default caps.
dpcdOffline = true;
//
// fill out the bare minimum caps required ...
// this should be extended for more DPCD offsets in the future.
//
caps.revisionMajor = 0x1;
caps.revisionMinor = 0x1;
caps.supportsESI = false;
caps.maxLinkRate = HBR3;
caps.maxLaneCount = 4;
caps.enhancedFraming = true;
caps.downStreamPortPresent = true;
caps.downStreamPortCount = 1;
// populate the sinkcount interrupt
interrupts.sinkCount = 1;
}
~DPCDHALImpl()
{
}
virtual void setAuxBus(AuxBus * bus)
{
this->bus = bus;
}
bool isDpcdOffline()
{
return dpcdOffline;
}
void setDPCDOffline(bool bOffline)
{
dpcdOffline = bOffline;
}
void updateDPCDOffline();
void setPC2Disabled(bool disabled)
{
pc2Disabled = disabled;
}
void setLttprSupported(bool isLttprSupported)
{
bLttprSupported = isLttprSupported;
}
bool isPC2Disabled()
{
return pc2Disabled;
}
virtual void parseAndReadCaps();
virtual PCONCaps * getPCONCaps()
{
return &(caps.pconCaps);
}
// DPCD offset 0
virtual unsigned getRevisionMajor()
{
return caps.revisionMajor;
}
virtual unsigned getRevisionMinor()
{
return caps.revisionMinor;
}
// DPCD offset F0000h
virtual unsigned lttprGetRevisionMajor()
{
return caps.repeaterCaps.revisionMajor;
}
virtual unsigned lttprGetRevisionMinor()
{
return caps.repeaterCaps.revisionMinor;
}
virtual LinkRate getMaxLinkRate();
// DPCD offset 2
virtual unsigned getMaxLaneCount();
virtual bool getNoLinkTraining()
{
return caps.noLinkTraining;
}
virtual unsigned getPhyRepeaterCount()
{
return caps.phyRepeaterCount;
}
// Max lanes supported at the desired link rate.
virtual unsigned getMaxLaneCountSupportedAtLinkRate(LinkRate linkRate);
virtual bool getEnhancedFraming()
{
return caps.enhancedFraming;
}
// DPCD offset 5
virtual bool getDownstreamPort(NvU8 *portType)
{
*portType = caps.downStreamPortType;
return caps.downStreamPortPresent;
}
virtual bool getSupportsNoHandshakeTraining()
{
return caps.supportsNoHandshakeTraining;
}
// DPCD offset 7
virtual unsigned getLegacyPortCount()
{
return caps.downStreamPortCount;
}
virtual LegacyPort * getLegacyPort(unsigned index)
{
return &legacyPort[index];
}
virtual bool getMsaTimingparIgnored()
{
return caps.msaTimingParIgnored;
}
virtual bool getOuiSupported()
{
return caps.ouiSupported;
}
virtual bool getSDPExtnForColorimetry();
virtual bool getRootAsyncSDPSupported();
virtual AuxRetry::status setOuiSource(unsigned ouiId, const char * model,
size_t modelNameLength, NvU8 chipRevision);
virtual bool getOuiSource(unsigned &ouiId, char * modelName,
size_t modelNameBufferSize, NvU8 & chipRevision);
virtual bool getOuiSink(unsigned &ouiId, char * modelName,
size_t modelNameBufferSize, NvU8 & chipRevision);
// DPCD offset 21h
virtual bool getSupportsMultistream()
{
return caps.supportsMultistream && (!caps.overrideToSST);
}
virtual void setSupportsESI(bool bIsESISupported)
{
caps.supportsESI = bIsESISupported;
}
//
// Single stream specific caps
// DPCD offset 22h
//
virtual unsigned getNumberOfAudioEndpoints();
// DPCD offset 30h
virtual bool getGUID(GUID & guid);
virtual AuxRetry::status setGUID(GUID & guid);
void parsePortDescriptors();
//
// Notifications of external events
//
virtual void notifyIRQ()
{
parseAndReadInterrupts();
}
virtual void populateFakeDpcd();
// DPCD override routine: Max link rate override.
void overrideMaxLinkRate(NvU32 overrideMaxLinkRate);
// DPCD override routine: Max lane count override.
void overrideMaxLaneCount(NvU32 maxLaneCount)
{
caps.maxLaneCount = maxLaneCount;
overrideDpcdMaxLaneCount = maxLaneCount;
}
// DPCD override routine: Max lane count override at a given link rate.
void skipCableBWCheck(NvU32 maxLaneAtHighRate, NvU32 maxLaneAtLowRate)
{
caps.maxLanesAtHBR = maxLaneAtHighRate;
caps.maxLanesAtRBR = maxLaneAtLowRate;
}
// DPCD override routine: Optimal link config (link rate and lane count) override.
void overrideOptimalLinkCfg(LinkRate optimalLinkRate,
NvU32 optimalLaneCount)
{
caps.maxLinkRate = optimalLinkRate;
caps.maxLaneCount = optimalLaneCount;
}
// DPCD override routine: Optimal link rate
void overrideOptimalLinkRate(LinkRate optimalLinkRate)
{
caps.maxLinkRate = optimalLinkRate;
}
virtual void notifyHPD(bool status, bool bSkipDPCDRead);
virtual bool isPostLtAdjustRequestSupported()
{
//
// If the upstream DPTX and downstream DPRX both support TPS4,
// TPS4 shall be used instead of POST_LT_ADJ_REQ.
//
NvBool bTps4Supported = FLD_TEST_DRF(0073_CTRL_CMD_DP, _GET_CAPS_DP_VERSIONS_SUPPORTED,
_DP1_4, _YES, gpuDPSupportedVersions) &&
caps.bSupportsTPS4;
return bGrantsPostLtRequest && !bTps4Supported;
}
virtual void setPostLtAdjustRequestGranted(bool bGrantPostLtRequest);
virtual bool getIsPostLtAdjRequestInProgress();
virtual TrainingPatternSelectType getTrainingPatternSelect();
virtual bool setTrainingMultiLaneSet(NvU8 numLanes,
NvU8 *voltSwingSet,
NvU8 *preEmphasisSet);
virtual AuxRetry::status setIgnoreMSATimingParamters(bool msaTimingParamIgnoreEn);
virtual AuxRetry::status setLinkQualPatternSet(LinkQualityPatternType linkQualPattern, unsigned laneCount);
virtual AuxRetry::status setLinkQualLaneSet(unsigned lane, LinkQualityPatternType linkQualPattern);
virtual AuxRetry::status setMessagingEnable(bool _uprequestEnable, bool _upstreamIsSource);
virtual AuxRetry::status setMultistreamLink(bool enable);
virtual AuxRetry::status setMultistreamHotplugMode(MultistreamHotplugMode notifyType);
bool parseTestRequestTraining(NvU8 * buffer /* 0x18-0x28 valid */);
void parseAutomatedTestRequest(bool testRequestPending);
virtual bool parseTestRequestPhy();
virtual bool interruptCapabilitiesChanged()
{
return interrupts.rxCapChanged;
}
virtual void clearInterruptCapabilitiesChanged()
{
NvU8 irqVector = 0;
irqVector = FLD_SET_DRF(_DPCD, _LINK_SERVICE_IRQ_VECTOR_ESI0, _RX_CAP_CHANGED, _YES, irqVector);
bus.write(NV_DPCD_LINK_SERVICE_IRQ_VECTOR_ESI0, &irqVector, sizeof irqVector);
}
virtual bool isPanelReplayErrorSet()
{
return interrupts.prErrorStatus;
}
virtual void readPanelReplayError();
virtual void clearPanelReplayError()
{
NvU8 irqVector = 0U;
irqVector = FLD_SET_DRF(_DPCD, _DEVICE_SERVICE_IRQ_VECTOR_ESI1,
_PANEL_REPLAY_ERROR_STATUS, _YES, irqVector);
bus.write(NV_DPCD_DEVICE_SERVICE_IRQ_VECTOR_ESI1, &irqVector,
sizeof irqVector);
}
virtual bool getLinkStatusChanged()
{
return interrupts.linkStatusChanged;
}
virtual void clearLinkStatusChanged()
{
NvU8 irqVector = 0;
irqVector = FLD_SET_DRF(_DPCD, _LINK_SERVICE_IRQ_VECTOR_ESI0, _LINK_STATUS_CHANGED, _YES, irqVector);
bus.write(NV_DPCD_LINK_SERVICE_IRQ_VECTOR_ESI0, &irqVector, sizeof irqVector);
}
virtual bool getHdmiLinkStatusChanged()
{
return interrupts.hdmiLinkStatusChanged;
}
virtual void clearHdmiLinkStatusChanged()
{
NvU8 irqVector = 0;
irqVector = FLD_SET_DRF(_DPCD, _LINK_SERVICE_IRQ_VECTOR_ESI0, _HDMI_LINK_STATUS_CHANGED, _YES, irqVector);
bus.write(NV_DPCD_LINK_SERVICE_IRQ_VECTOR_ESI0, &irqVector, sizeof irqVector);
}
virtual bool getStreamStatusChanged()
{
return interrupts.streamStatusChanged;
}
virtual void clearStreamStatusChanged()
{
NvU8 irqVector = 0;
irqVector = FLD_SET_DRF(_DPCD, _LINK_SERVICE_IRQ_VECTOR_ESI0, _STREAM_STATUS_CHANGED, _YES, irqVector);
bus.write(NV_DPCD_LINK_SERVICE_IRQ_VECTOR_ESI0, &irqVector, sizeof irqVector);
}
virtual bool getDpTunnelingIrq()
{
return interrupts.dpTunnelingIrq;
}
virtual void clearDpTunnelingIrq()
{
NvU8 irqVector = 0;
irqVector = FLD_SET_DRF(_DPCD20, _LINK_SERVICE_IRQ_VECTOR_ESI0, _DP_TUNNELING_IRQ, _YES, irqVector);
bus.write(NV_DPCD20_LINK_SERVICE_IRQ_VECTOR_ESI0, &irqVector, sizeof irqVector);
}
virtual bool isLinkStatusValid(unsigned lanes);
virtual void refreshLinkStatus();
virtual void setDirtyLinkStatus(bool dirty)
{
interrupts.laneStatusIntr.linkStatusDirtied = dirty;
}
void parseAndReadInterruptsESI();
void readLTTPRLinkStatus(NvS32 rxIndex, NvU8 *buffer);
void resetIntrLaneStatus();
void fetchLinkStatusESI();
void fetchLinkStatusLegacy();
virtual bool readTraining(NvU8* voltageSwingLane, NvU8* preemphasisLane,
NvU8* trainingScoreLane, NvU8* postCursor,
NvU8 activeLaneCount);
virtual bool isLaneSettingsChanged(NvU8* oldVoltageSwingLane,
NvU8* newVoltageSwingLane,
NvU8* oldPreemphasisLane,
NvU8* newPreemphasisLane,
NvU8 activeLaneCount);
void parseAndReadInterruptsLegacy();
void parseAndReadInterrupts()
{
if (caps.supportsESI)
parseAndReadInterruptsESI(); // DP 1.2 should use the new ESI region
else
parseAndReadInterruptsLegacy();
}
virtual int getSinkCount() // DPCD offset 200
{
return interrupts.sinkCount;
}
//
// This was introduced as part of a WAR for the HP SDC panel, whose
// TCON sets DPCD 0x200 SINK_COUNT=0. It should never be called to
// set the SinkCount in other cases, since SinkCount comes from DPCD.
//
virtual void setSinkCount(int sinkCount)
{
interrupts.sinkCount = sinkCount;
}
virtual bool interruptContentProtection()
{
return interrupts.cpIRQ;
}
virtual void clearInterruptContentProtection();
virtual bool intteruptMCCS()
{
return interrupts.mccsIRQ;
}
virtual void clearInterruptMCCS();
virtual bool interruptDownReplyReady()
{
return interrupts.downRepMsgRdy;
}
virtual bool interruptUpRequestReady()
{
return interrupts.upReqMsgRdy;
}
virtual void clearInterruptDownReplyReady();
virtual void clearInterruptUpRequestReady();
virtual bool getLaneStatusSymbolLock(int lane)
{
return interrupts.laneStatusIntr.laneStatus[lane].symbolLocked;
}
virtual bool getLaneStatusClockRecoveryDone(int lane)
{
return interrupts.laneStatusIntr.laneStatus[lane].clockRecoveryDone;
}
virtual bool getInterlaneAlignDone() // DPCD offset 204
{
return interrupts.laneStatusIntr.interlaneAlignDone;
}
virtual bool getDownStreamPortStatusChange()
{
return interrupts.laneStatusIntr.downstmPortChng;
}
virtual bool getPendingTestRequestTraining() // DPCD offset 218
{
return interrupts.testTraining.testRequestTraining;
}
virtual bool getPendingAutomatedTestRequest()
{
return interrupts.automatedTestRequest;
}
virtual bool getPendingTestRequestEdidRead()
{
return interrupts.testEdid.testRequestEdidRead;
}
virtual bool getPendingTestRequestPhyCompliance()
{
return interrupts.testPhyCompliance.testRequestPhyCompliance;
}
virtual void getTestRequestTraining(LinkRate & rate, unsigned & lanes) // DPCD offset 219, 220
{
rate = interrupts.testTraining.testRequestLinkRate;
lanes = interrupts.testTraining.testRequestLaneCount;
}
virtual LinkQualityPatternType getPhyTestPattern() // DPCD offset 248
{
return interrupts.testPhyCompliance.phyTestPattern;
}
virtual void getCustomTestPattern(NvU8 *testPattern) // DPCD offset 250 - 259
{
int i;
for (i = 0; i < 10; i++)
{
testPattern[i] = interrupts.eightyBitCustomPat[i];
}
}
virtual bool getBKSV(NvU8 *bKSV);
virtual bool getBCaps(BCaps &bCaps, NvU8 * rawByte);
virtual bool getHdcp22BCaps(BCaps &bCaps, NvU8 *rawByte);
virtual bool getBinfo(BInfo &bInfo);
virtual bool getRxStatus(const HDCPState &hdcpState, NvU8 *data);
virtual AuxRetry::status setTestResponseChecksum(NvU8 checksum)
{
if (caps.revisionMajor <= 0)
DP_ASSERT(0 && "Something is wrong, revision major should be > 0");
return bus.write(NV_DPCD_TEST_EDID_CHKSUM, &checksum, sizeof checksum);
}
virtual AuxRetry::status setTestResponse(bool ack, bool edidChecksumWrite);
// Message box encoding
virtual AuxRetry::status writeDownRequestMessageBox(NvU8 * data, size_t length)
{
//
// We can assume no message was sent if this fails.
// Reasoning:
// Sinks are not allowed to DEFER except on the first 16 byte write.
// If there isn't enough room for the 48 byte packet, that write
// will defer.
//
return bus.write(NV_DPCD_MBOX_DOWN_REQ, data, (unsigned)length);
}
virtual size_t getDownRequestMessageBoxSize()
{
return DP_MESSAGEBOX_SIZE;
}
virtual AuxRetry::status writeUpReplyMessageBox(NvU8 * data, size_t length)
{
if (caps.revisionMajor <= 0)
DP_ASSERT(0 && "Something is wrong, revision major should be > 0");
//
// We can assume no message was sent if this fails.
// Reasoning:
// Sinks are not allowed to DEFER except on the first 16 byte write.
// If there isn't enough room for the 48 byte packet, that write
// will defer.
//
return bus.write(NV_DPCD_MBOX_UP_REP, data, (unsigned)length);
}
virtual size_t getUpReplyMessageBoxSize()
{
return DP_MESSAGEBOX_SIZE;
}
virtual AuxRetry::status readDownReplyMessageBox(NvU32 offset, NvU8 * data, size_t length)
{
// if (caps.revisionMajor <= 0)
// DP_ASSERT(0 && "Something is wrong, revision major should be > 0");
DP_ASSERT(offset + length <= DP_MESSAGEBOX_SIZE);
return bus.read(NV_DPCD_MBOX_DOWN_REP + offset, data, (unsigned)length);
}
virtual size_t getDownReplyMessageBoxSize()
{
return DP_MESSAGEBOX_SIZE;
}
virtual AuxRetry::status readUpRequestMessageBox(NvU32 offset, NvU8 * data, size_t length)
{
if (caps.revisionMajor <= 0)
DP_ASSERT(0 && "Something is wrong, revision major should be > 0");
DP_ASSERT(offset + length <= DP_MESSAGEBOX_SIZE);
return bus.read(NV_DPCD_MBOX_UP_REQ + offset, data, (unsigned)length);
}
virtual size_t getUpRequestMessageBoxSize()
{
return DP_MESSAGEBOX_SIZE;
}
virtual size_t getTransactionSize()
{
return bus.getDirect()->transactionSize();
}
virtual PowerState getPowerState();
virtual bool setPowerState(PowerState newState);
virtual void payloadTableClearACT();
virtual bool payloadWaitForACTReceived();
virtual bool payloadAllocate(unsigned streamId, unsigned begin, unsigned count);
void overrideMultiStreamCap(bool mstCapable)
{
caps.overrideToSST = !mstCapable;
}
bool getMultiStreamCapOverride()
{
return caps.overrideToSST;
}
bool getDpcdMultiStreamCap(void)
{
return caps.supportsMultistream;
}
virtual void setGpuDPSupportedVersions(NvU32 _gpuDPSupportedVersions);
void setGpuFECSupported(bool bSupportFEC)
{
bGpuFECSupported = bSupportFEC;
}
void applyRegkeyOverrides(const DP_REGKEY_DATABASE& dpRegkeyDatabase);
// Clear any pending message {DOWN_REP/UP_REQ} and return true if one existed.
virtual bool clearPendingMsg();
virtual bool isMessagingEnabled();
virtual void setIndexedLinkrateEnabled(bool val)
{
bIndexedLinkrateEnabled = val;
}
virtual bool isIndexedLinkrateEnabled()
{
return bIndexedLinkrateEnabled;
}
virtual bool isIndexedLinkrateCapable()
{
return bIndexedLinkrateCapable;
}
virtual NvU16 *getLinkRateTable();
virtual NvU32 getVideoFallbackSupported()
{
return caps.videoFallbackFormats;
}
virtual bool getRawLinkRateTable(NvU8 *buffer);
virtual void resetProtocolConverter()
{
NvU8 data = 0;
bus.write(NV_DPCD14_PCON_FRL_LINK_CONFIG_1, &data, sizeof(data));
bus.write(NV_DPCD14_PCON_FRL_LINK_CONFIG_2, &data, sizeof(data));
}
virtual bool setSourceControlMode(bool bEnableSourceControlMode, bool bEnableFRLMode);
virtual bool checkPCONFrlReady(bool *bFrlReady);
virtual bool setupPCONFrlLinkAssessment(NvU32 linkBwMask,
bool bEnableExtendLTMode = false,
bool bEnableConcurrentMode = false);
virtual bool checkPCONFrlLinkStatus(NvU32 *frlRateMask);
virtual bool queryHdmiLinkStatus(bool *bLinkActive, bool *bLinkReady);
virtual NvU32 restorePCONFrlLink(NvU32 linkBwMask,
bool bEnableExtendLTMode = false,
bool bEnableConcurrentMode = false);
virtual void readPsrCapabilities(vesaPsrSinkCaps *caps)
{
dpMemCopy(caps, &this->caps.psrCaps, sizeof(vesaPsrSinkCaps));
}
virtual bool updatePsrConfiguration(vesaPsrConfig psrcfg);
virtual bool readPsrConfiguration(vesaPsrConfig *psrcfg);
virtual bool readPsrState(vesaPsrState *psrState);
virtual bool readPsrDebugInfo(vesaPsrDebugStatus *psrDbgState);
virtual bool writePsrErrorStatus(vesaPsrErrorStatus psrErr);
virtual bool readPsrErrorStatus(vesaPsrErrorStatus *psrErr);
virtual bool writePsrEvtIndicator(vesaPsrEventIndicator psrEvt);
virtual bool readPsrEvtIndicator(vesaPsrEventIndicator *psrEvt);
virtual bool readPrSinkDebugInfo(panelReplaySinkDebugInfo *prDbgInfo);
bool getDpTunnelBwAllocationSupported()
{
return false;
}
virtual bool getDpTunnelGranularityMultiplier(NvU8 &granularityMultiplier);
virtual TriState getDpTunnelBwRequestStatus();
virtual bool setDpTunnelBwAllocation(bool bEnable);
bool getDpTunnelEstimatedBw(NvU8 &estimatedBw);
bool hasDpTunnelEstimatedBwChanged();
bool hasDpTunnelBwAllocationCapabilityChanged();
bool writeDpTunnelRequestedBw(NvU8 requestedBw);
};
}
#endif //INCLUDED_DP_CONFIGCAPS_H

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -65,6 +65,7 @@ namespace DisplayPort
DP_IMP_ERROR_INSUFFICIENT_BANDWIDTH,
DP_IMP_ERROR_INSUFFICIENT_BANDWIDTH_DSC,
DP_IMP_ERROR_INSUFFICIENT_BANDWIDTH_NO_DSC,
DP_IMP_ERROR_INSUFFICIENT_DP_TUNNELING_BANDWIDTH,
DP_IMP_ERROR_WATERMARK_BLANKING,
DP_IMP_ERROR_PPS_COLOR_FORMAT_NOT_SUPPORTED,
DP_IMP_ERROR_PPS_INVALID_HBLANK,
@ -274,6 +275,10 @@ namespace DisplayPort
virtual DscCaps getDscCaps() = 0;
virtual NvBool isDynamicPPSSupported() = 0;
virtual NvBool isDynamicDscToggleSupported() = 0;
//
// This function returns the device itself or its parent device that is doing
// DSC decompression for it.
@ -321,8 +326,14 @@ namespace DisplayPort
virtual bool isMSAOverMSTCapable() = 0;
virtual bool isFakedMuxDevice() = 0;
virtual bool setPanelReplayConfig(panelReplayConfig prcfg) = 0;
virtual bool getPanelReplayConfig(panelReplayConfig *pPrcfg) = 0;
virtual bool isPanelReplaySupported() = 0;
virtual bool getPanelReplayStatus(PanelReplayStatus *pPrStatus) = 0;
virtual bool getDeviceSpecificData(NvU8 *oui, NvU8 *deviceIdString,
NvU8 *hwRevision, NvU8 *swMajorRevision,
NvU8 *swMinorRevision) = 0;
virtual bool setModeList(DisplayPort::DpModesetParams *pModeList, unsigned numModes) = 0;
protected:
virtual ~Device() {}
@ -594,6 +605,8 @@ namespace DisplayPort
virtual void notifyGPUCapabilityChange() = 0;
virtual void notifyHBR2WAREngage() = 0;
virtual bool dpUpdateDscStream(Group *target, NvU32 dscBpp) = 0;
// Create a new Group. Note that if you wish to do a modeset but send the
// stream nowhere, you may do a modeset with an EMPTY group. This is expected
// to be the mechanism by which monitor faking is implemented.
@ -710,6 +723,7 @@ namespace DisplayPort
virtual bool setTestPattern(NV0073_CTRL_DP_TESTPATTERN testPattern,
NvU8 laneMask, NV0073_CTRL_DP_CSTM cstm,
NvBool bIsHBR2, NvBool bSkipLaneDataOverride) = 0;
// "data" is an array of NV0073_CTRL_MAX_LANES unsigned ints
virtual bool getLaneConfig(NvU32 *numLanes, NvU32 *data) = 0;
// "data" is an array of NV0073_CTRL_MAX_LANES unsigned ints
@ -735,6 +749,7 @@ namespace DisplayPort
virtual bool updatePsrLinkState(bool bTurnOnLink) = 0;
virtual bool readPrSinkDebugInfo(panelReplaySinkDebugInfo *prDbgInfo) = 0;
virtual void enableDpTunnelingBwAllocationSupport() = 0;
protected:
virtual ~Connector() {}

View File

@ -49,6 +49,9 @@
#define HDCP_FLAGS_ABORT_DEVICE_INVALID 0x00080000 // Abort due to an invalid device in DP1.2 topology
#define HDCP_FLAGS_ABORT_HOP_LIMIT_EXCEEDED 0x80000000 // Abort, number of devices in DP1.2 topology exceeds supported limit
#define DP_TUNNEL_REQUEST_BW_MAX_TIME_MS (1000U)
#define DP_TUNNEL_REQUEST_BW_POLLING_INTERVAL_MS (10U)
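
These two constants appear to bound the DP tunnel bandwidth-request handshake: poll the request status roughly every 10 ms and give up after about one second. A hedged sketch of a poll loop shaped by them follows; pollBwRequestStatus() and sleepMs() are placeholders, not functions from this header:

    // Sketch only: wait for the sink's reply to a tunnel bandwidth request,
    // giving up after DP_TUNNEL_REQUEST_BW_MAX_TIME_MS.
    static bool waitForDpTunnelBwReply()
    {
        for (NvU32 elapsedMs = 0;
             elapsedMs < DP_TUNNEL_REQUEST_BW_MAX_TIME_MS;
             elapsedMs += DP_TUNNEL_REQUEST_BW_POLLING_INTERVAL_MS)
        {
            if (pollBwRequestStatus())                         // placeholder status query
                return true;
            sleepMs(DP_TUNNEL_REQUEST_BW_POLLING_INTERVAL_MS); // placeholder delay
        }
        return false;   // no reply within ~1000 ms
    }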
static inline unsigned getDataClockMultiplier(NvU64 linkRate, NvU64 laneCount)
{
//
@ -192,6 +195,7 @@ namespace DisplayPort
bool compoundQueryResult;
unsigned compoundQueryCount;
unsigned compoundQueryLocalLinkPBN;
NvU64 compoundQueryUsedTunnelingBw;
bool compoundQueryForceEnableFEC;
unsigned freeSlots;
@ -309,7 +313,6 @@ namespace DisplayPort
bool bNoFallbackInPostLQA;
bool bReportDeviceLostBeforeNew;
bool bEnableAudioBeyond48K;
bool bDisableSSC;
bool bEnableFastLT;
NvU32 maxLinkRateFromRegkey;
@ -348,9 +351,6 @@ namespace DisplayPort
//
bool bPowerDownPhyBeforeD3;
// Force DSC on sink irrespective of LT status
bool bForceDscOnSink;
//
// Reset the MSTM_CTRL registers on branch device irrespective of
// IRQ VECTOR register having stale message. Certain branch devices
@ -362,6 +362,11 @@ namespace DisplayPort
bool bForceClearPendingMsg;
bool bSkipFakeDeviceDpcdAccess;
NvU64 allocatedDpTunnelBw;
NvU64 allocatedDpTunnelBwShadow;
bool bForceDisableTunnelBwAllocation;
bool bClientRequestedDpTunnelBwAllocation;
bool bIsDpTunnelBwAllocationEnabled;
Group *perHeadAttachedGroup[NV_MAX_HEADS];
NvU32 inTransitionHeadMask;
@ -444,6 +449,9 @@ namespace DisplayPort
const DpModesetParams &modesetParams, // Modeset info
DscParams *pDscParams = NULL, // DSC parameters
DP_IMP_ERROR *pErrorCode = NULL); // Error Status code
virtual bool compoundQueryAttachTunneling(const DpModesetParams &modesetParams,
DscParams *pDscParams = NULL,
DP_IMP_ERROR *pErrorCode = NULL);
virtual bool endCompoundQuery();
@ -495,6 +503,7 @@ namespace DisplayPort
char tagHDCPReauthentication;
char tagDelayedHdcpCapRead;
char tagDelayedHDCPCPIrqHandling;
char tagDpBwAllocationChanged;
//
// Enable disable TMDS mode
@ -563,6 +572,18 @@ namespace DisplayPort
bool willLinkSupportModeSST(const LinkConfiguration & linkConfig, const ModesetInfo & modesetInfo);
void forceLinkTraining();
bool updateDpTunnelBwAllocation();
void configureDpTunnelBwAllocation();
TriState requestDpTunnelBw(NvU8 requestedBw);
bool allocateDpTunnelBw(NvU64 bandwidth);
bool allocateMaxDpTunnelBw();
NvU64 getMaxTunnelBw();
void enableDpTunnelingBwAllocationSupport()
{
bClientRequestedDpTunnelBwAllocation = true;
}
void assessLink(LinkTrainingType trainType = NORMAL_LINK_TRAINING);
bool isLinkInD3();
@ -594,8 +615,8 @@ namespace DisplayPort
void populateDscBranchCaps(DSC_INFO* dscInfo, DeviceImpl * dev);
void populateDscModesetInfo(MODESET_INFO * pModesetInfo, const DpModesetParams * pModesetParams);
bool train(const LinkConfiguration & lConfig, bool force, LinkTrainingType trainType = NORMAL_LINK_TRAINING);
bool validateLinkConfiguration(const LinkConfiguration & lConfig);
virtual bool train(const LinkConfiguration & lConfig, bool force, LinkTrainingType trainType = NORMAL_LINK_TRAINING);
virtual bool validateLinkConfiguration(const LinkConfiguration & lConfig);
virtual bool assessPCONLinkCapability(PCONLinkControl *params);
bool trainPCONFrlLink(PCONLinkControl *pConControl);
@ -606,12 +627,12 @@ namespace DisplayPort
// the lowest level function(nearest to the hal) for the connector.
bool rawTrain(const LinkConfiguration & lConfig, bool force, LinkTrainingType linkTrainingType);
bool enableFlush();
bool beforeAddStream(GroupImpl * group, bool force=false, bool forFlushMode = false);
void afterAddStream(GroupImpl * group);
void beforeDeleteStream(GroupImpl * group, bool forFlushMode = false);
void afterDeleteStream(GroupImpl * group);
void disableFlush(bool test=false);
virtual bool enableFlush();
virtual bool beforeAddStream(GroupImpl * group, bool force=false, bool forFlushMode = false);
virtual void afterAddStream(GroupImpl * group);
virtual void beforeDeleteStream(GroupImpl * group, bool forFlushMode = false);
virtual void afterDeleteStream(GroupImpl * group);
virtual void disableFlush(bool test=false);
bool beforeAddStreamMST(GroupImpl * group, bool force = false, bool forFlushMode = false);
@ -619,7 +640,7 @@ namespace DisplayPort
bool deleteAllVirtualChannels();
void clearTimeslices();
bool allocateTimeslice(GroupImpl * targetGroup);
virtual bool allocateTimeslice(GroupImpl * targetGroup);
void freeTimeslice(GroupImpl * targetGroup);
void flushTimeslotsToHardware();
bool getHDCPAbortCodesDP12(NvU32 &hdcpAbortCodesDP12);
@ -629,6 +650,7 @@ namespace DisplayPort
bool handleCPIRQ();
void handleSSC();
void handleMCCSIRQ();
void handleDpTunnelingIrq();
void handleHdmiLinkStatusChanged();
void sortActiveGroups(bool ascending);
void configInit();
@ -639,7 +661,7 @@ namespace DisplayPort
void notifyLongPulseInternal(bool statusConnected);
virtual void notifyLongPulse(bool status);
virtual void notifyShortPulse();
virtual Group * newGroup() ;
virtual Group * newGroup();
virtual void destroy();
virtual void createFakeMuxDevice(const NvU8 *buffer, NvU32 bufferSize);
virtual void deleteFakeMuxDevice();
@ -664,6 +686,7 @@ namespace DisplayPort
Group * createFirmwareGroup();
virtual void notifyGPUCapabilityChange();
virtual void notifyHBR2WAREngage();
bool dpUpdateDscStream(Group *target, NvU32 dscBpp);
bool getTestPattern(NV0073_CTRL_DP_TESTPATTERN *testPattern);
bool setTestPattern(NV0073_CTRL_DP_TESTPATTERN testPattern, NvU8 laneMask, NV0073_CTRL_DP_CSTM cstm, NvBool bIsHBR2, NvBool bSkipLaneDataOverride = false);
@ -707,16 +730,16 @@ namespace DisplayPort
//
struct DevicePendingEDIDRead : protected EdidReadMultistream::EdidReadMultistreamEventSink, public ListElement
{
EdidReadMultistream reader;
DiscoveryManager::Device device;
ConnectorImpl * parent;
DiscoveryManager::Device device;
EdidReadMultistream reader;
void mstEdidCompleted(EdidReadMultistream * from);
void mstEdidReadFailed(EdidReadMultistream * from);
public:
DevicePendingEDIDRead(ConnectorImpl * _parent, MessageManager * manager, DiscoveryManager::Device dev)
: reader(_parent->timer, manager, this, dev.address), device(dev), parent(_parent)
: parent(_parent), device(dev), reader(_parent->timer, manager, this, dev.address)
{
}
};

View File

@ -44,6 +44,7 @@ namespace DisplayPort
#define HDCP_BCAPS_DDC_EN_BIT 0x80
#define HDCP_BCAPS_DP_EN_BIT 0x01
#define HDCP_I2C_CLIENT_ADDR 0x74
#define DEVICE_OUI_SIZE 3
struct GroupImpl;
struct ConnectorImpl;
@ -170,7 +171,6 @@ namespace DisplayPort
// Panel replay Caps
PanelReplayCaps prCaps;
bool bIsFakedMuxDevice;
bool bIsPreviouslyFakedMuxDevice;
bool bisMarkedForDeletion;
@ -202,6 +202,8 @@ namespace DisplayPort
bool bSkipFakeDeviceDpcdAccess;
DeviceImpl(DPCDHAL * hal, ConnectorImpl * connector, DeviceImpl * parent, bool bSkipFakeDeviceDpcdAccess);
NvU64 maxModeBwRequired;
~DeviceImpl();
virtual bool isCableOk();
@ -380,6 +382,11 @@ namespace DisplayPort
return dpcdRevisionMinor >= minor;
}
NvU64 getMaxModeBwRequired()
{
return maxModeBwRequired;
}
virtual void queryGUID2();
virtual bool getSDPExtnForColorimetrySupported();
@ -445,6 +452,7 @@ namespace DisplayPort
bool isPanelReplaySupported(void);
void getPanelReplayCaps(void);
bool setPanelReplayConfig(panelReplayConfig prcfg);
bool getPanelReplayConfig(panelReplayConfig *pPrcfg);
bool getPanelReplayStatus(PanelReplayStatus *pPrStatus);
NvBool getDSCSupport();
@ -481,6 +489,11 @@ namespace DisplayPort
unsigned getDscMaxSliceWidth();
unsigned getDscDecoderColorDepthSupportMask();
void setDscDecompressionDevice(bool bDscCapBasedOnParent);
virtual bool getDeviceSpecificData(NvU8 *oui, NvU8 *deviceIdString,
NvU8 *hwRevision, NvU8 *swMajorRevision,
NvU8 *swMinorRevision);
virtual bool setModeList(DisplayPort::DpModesetParams *pModeList, unsigned numModes);
};
class DeviceHDCPDetection : public Object, MessageManager::Message::MessageEventSink, Timer::TimerCallback
{

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2010-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2010-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -124,26 +124,7 @@ namespace DisplayPort
return this->patchedChecksum;
}
bool isValidHeader() const
{
NvU8 validHeaderData[8] = {
0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0x00};
if (buffer.getLength() < 0x8)
return false;
for (unsigned i = 0; i < 8; i++)
{
if (buffer.data[i] != validHeaderData[i])
{
DP_LOG(("DP-EDID> Invalid EDID Header"));
return false;
}
}
return true;
}
bool isValidHeader() const;
unsigned getManufId() const
{

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -147,6 +147,7 @@ namespace DisplayPort
// Defines the same as NV0073_CTRL_CMD_DP_GET_CAPS_PARAMS.dpVersionsSupported
//
NvU32 _gpuSupportedDpVersions;
bool _isStreamCloningEnabled;
bool _needForceRmEdid;
bool _skipPowerdownEDPPanelWhenHeadDetach;
@ -156,10 +157,11 @@ namespace DisplayPort
bool _useDfpMaxLinkRateCaps;
bool _applyLinkBwOverrideWarRegVal;
bool _isDynamicMuxCapable;
bool _isMDMEnabled;
bool _enableMSAOverrideOverMST;
bool _isLTPhyRepeaterSupported;
bool _isMSTPCONCapsReadDisabled;
bool _isDownspreadSupported;
//
// LTTPR count reported by RM, it might not be the same with DPLib probe
// For example, some Intel LTTPR might not be ready to response 0xF0000 probe
@ -258,6 +260,16 @@ namespace DisplayPort
return (_isDynamicMuxCapable && _isEDP);
}
virtual bool isMDMEnabled()
{
return (_isMDMEnabled && _isEDP);
}
virtual bool isDownspreadSupported()
{
return _isDownspreadSupported;
}
// Get GPU DSC capabilities
virtual void getDscCaps(bool *pbDscSupported,
unsigned *pEncoderColorFormatMask,
@ -313,6 +325,11 @@ namespace DisplayPort
return this->_isLTPhyRepeaterSupported;
}
EvoInterface * getProvider()
{
return this->provider;
}
// Return the current mux state. Returns false if device is not mux capable
bool getDynamicMuxState(NvU32 *muxState);
@ -334,8 +351,8 @@ namespace DisplayPort
virtual bool getMaxLinkConfigFromUefi(NvU8 &linkRate, NvU8 &laneCount);
virtual bool setDpMSAParameters(bool bStereoEnable, const NV0073_CTRL_CMD_DP_SET_MSA_PROPERTIES_PARAMS &msaparams);
virtual bool setDpStereoMSAParameters(bool bStereoEnable, const NV0073_CTRL_CMD_DP_SET_MSA_PROPERTIES_PARAMS &msaparams);
virtual bool setFlushMode();
virtual void clearFlushMode(unsigned headMask, bool testMode=false);
bool setFlushMode();
void clearFlushMode(unsigned headMask, bool testMode=false);
virtual bool dscCrcTransaction(NvBool bEnable, gpuDscCrc *data, NvU16 *headIndex);

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2015-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2015-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -32,12 +32,14 @@
#include "nvtypes.h"
#include "dp_tracing.h"
#include "dp_printf.h"
extern "C" void * dpMalloc(NvLength size);
extern "C" void dpFree(void * ptr);
extern "C" void dpDebugBreakpoint();
// Note: dpPrint() implementations are expected to append a newline themselves.
extern "C" void dpPrint(const char * formatter, ...);
extern "C" void dpPrintf(DP_LOG_LEVEL severity, const char * formatter, ...);
extern "C" void dpTraceEvent(NV_DP_TRACING_EVENT event,
NV_DP_TRACING_PRIORITY priority, NvU32 numArgs, ...);

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -108,13 +108,6 @@ template <class T> void dp_used(const T & /*x*/) {}
//
#if NV_DP_ASSERT_ENABLED
#define DP_LOG(x) \
do \
{ \
dpPrint x; \
addDpLogRecord x; \
}while (false)
#define DP_ASSERT(x) \
if (!(x)) \
{ \
@ -123,9 +116,6 @@ template <class T> void dp_used(const T & /*x*/) {}
dpDebugBreakpoint(); \
}
#else
#define DP_LOG(x)
#define DP_ASSERT(x) \
{ \
DP_USED(x); \

Some files were not shown because too many files have changed in this diff.