From 3bf16b890caa8fd6b5db08b5c2437b51c758ac9d Mon Sep 17 00:00:00 2001 From: Bernhard Stoeckner Date: Tue, 19 Mar 2024 16:56:28 +0100 Subject: [PATCH] 550.67 --- CHANGELOG.md | 2 + README.md | 9 +- kernel-open/Kbuild | 4 +- kernel-open/common/inc/nv-linux.h | 27 +-- kernel-open/conftest.sh | 24 +- kernel-open/nvidia-drm/nvidia-drm-drv.c | 30 ++- .../nvidia-modeset/nvidia-modeset-linux.c | 12 +- kernel-open/nvidia-uvm/uvm_channel_test.c | 16 +- .../nvidia-uvm/uvm_fault_buffer_flush_test.c | 6 +- kernel-open/nvidia-uvm/uvm_global.h | 6 + kernel-open/nvidia-uvm/uvm_gpu.h | 19 +- .../nvidia-uvm/uvm_gpu_access_counters.c | 6 +- .../nvidia-uvm/uvm_gpu_replayable_faults.c | 22 +- kernel-open/nvidia-uvm/uvm_hmm.c | 141 ++++++------ kernel-open/nvidia-uvm/uvm_hmm.h | 52 +++-- kernel-open/nvidia-uvm/uvm_hopper.c | 26 ++- kernel-open/nvidia-uvm/uvm_hopper_ce.c | 2 - kernel-open/nvidia-uvm/uvm_map_external.c | 38 ++- kernel-open/nvidia-uvm/uvm_migrate.c | 48 ++-- kernel-open/nvidia-uvm/uvm_mmu.c | 2 + kernel-open/nvidia-uvm/uvm_perf_thrashing.c | 36 +-- kernel-open/nvidia-uvm/uvm_pmm_sysmem_test.c | 25 +- kernel-open/nvidia-uvm/uvm_policy.c | 3 +- kernel-open/nvidia-uvm/uvm_push.c | 53 ++++- kernel-open/nvidia-uvm/uvm_push.h | 28 ++- kernel-open/nvidia-uvm/uvm_range_group.c | 9 +- kernel-open/nvidia-uvm/uvm_tools.c | 28 ++- kernel-open/nvidia-uvm/uvm_tracker.c | 12 +- kernel-open/nvidia-uvm/uvm_va_block.c | 40 ++-- kernel-open/nvidia-uvm/uvm_va_block.h | 16 +- kernel-open/nvidia-uvm/uvm_va_block_types.h | 12 +- kernel-open/nvidia-uvm/uvm_va_range.c | 132 +++++++---- kernel-open/nvidia-uvm/uvm_va_range.h | 4 + kernel-open/nvidia-uvm/uvm_va_space.c | 217 ++++++++++++------ kernel-open/nvidia-uvm/uvm_va_space.h | 38 ++- kernel-open/nvidia-uvm/uvm_va_space_mm.c | 24 +- kernel-open/nvidia-uvm/uvm_va_space_mm.h | 5 + kernel-open/nvidia/nv.c | 6 +- kernel-open/nvidia/nvidia.Kbuild | 1 + kernel-open/nvidia/os-interface.c | 155 ++++++++++++- kernel-open/nvidia/os-mlock.c | 67 +++++- src/common/displayport/inc/dp_connectorimpl.h | 3 + src/common/displayport/inc/dp_linkconfig.h | 4 +- .../displayport/inc/dp_regkeydatabase.h | 6 + .../displayport/src/dp_connectorimpl.cpp | 35 ++- src/common/displayport/src/dp_evoadapter.cpp | 3 +- src/common/inc/nvBldVer.h | 20 +- src/common/inc/nvUnixVersion.h | 2 +- src/common/modeset/timing/nvt_edid.c | 5 +- src/common/modeset/timing/nvt_edidext_861.c | 34 +-- .../inband/interface/nvlink_inband_msg.h | 3 +- .../interface/nvlink_kern_shutdown_entry.c | 16 +- .../nvswitch/kernel/inc/boards_nvswitch.h | 2 + src/common/nvswitch/kernel/ls10/intr_ls10.c | 12 +- src/common/nvswitch/kernel/ls10/pmgr_ls10.c | 27 ++- src/common/nvswitch/kernel/nvswitch.c | 2 +- src/nvidia-modeset/Makefile | 1 + src/nvidia/Makefile | 1 + .../arch/nvalloc/unix/src/os-hypervisor.c | 16 +- src/nvidia/generated/g_gpu_nvoc.h | 1 + src/nvidia/generated/g_nv_name_released.h | 3 + src/nvidia/generated/g_spdm_nvoc.h | 5 +- src/nvidia/generated/g_vgpuconfigapi_nvoc.h | 2 +- .../inc/kernel/gpu/gsp/message_queue_priv.h | 20 ++ .../gpu/bif/arch/maxwell/kernel_bif_gm107.c | 7 + src/nvidia/src/kernel/gpu/device.c | 2 +- src/nvidia/src/kernel/gpu/fsp/kern_fsp.c | 40 +++- src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c | 8 + .../src/kernel/gpu/gsp/message_queue_cpu.c | 47 ++-- .../src/kernel/gpu/mem_sys/kern_mem_sys.c | 2 +- .../kernel/gpu/mem_sys/kern_mem_sys_ctrl.c | 11 +- .../src/kernel/gpu/nvenc/nvencsession.c | 49 +++- .../gpu/nvlink/kernel_nvlinkcorelibtrain.c | 19 +- .../kernel/gpu/spdm/arch/hopper/spdm_gh100.c | 159 
++++++++----- src/nvidia/src/kernel/gpu/spdm/spdm.c | 5 + src/nvidia/src/kernel/mem_mgr/mem_export.c | 3 +- .../kernel/virtualization/kernel_vgpu_mgr.c | 10 +- version.mk | 2 +- 78 files changed, 1400 insertions(+), 590 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c82e4c896..b093915ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Release 550 Entries +### [550.67] 2024-03-19 + ### [550.54.15] 2024-03-18 ### [550.54.14] 2024-02-23 diff --git a/README.md b/README.md index 77ad719f9..f075ea3fb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NVIDIA Linux Open GPU Kernel Module Source This is the source release of the NVIDIA Linux open GPU kernel modules, -version 550.54.15. +version 550.67. ## How to Build @@ -17,7 +17,7 @@ as root: Note that the kernel modules built here must be used with GSP firmware and user-space NVIDIA GPU driver components from a corresponding -550.54.15 driver release. This can be achieved by installing +550.67 driver release. This can be achieved by installing the NVIDIA GPU driver from the .run file using the `--no-kernel-modules` option. E.g., @@ -188,7 +188,7 @@ encountered specific to them. For details on feature support and limitations, see the NVIDIA GPU driver end user README here: -https://us.download.nvidia.com/XFree86/Linux-x86_64/550.54.15/README/kernel_open.html +https://us.download.nvidia.com/XFree86/Linux-x86_64/550.67/README/kernel_open.html For vGPU support, please refer to the README.vgpu packaged in the vGPU Host Package for more details. @@ -867,6 +867,7 @@ Subsystem Device ID. | NVIDIA GeForce RTX 4080 SUPER | 2702 | | NVIDIA GeForce RTX 4080 | 2704 | | NVIDIA GeForce RTX 4070 Ti SUPER | 2705 | +| NVIDIA GeForce RTX 4070 | 2709 | | NVIDIA GeForce RTX 4090 Laptop GPU | 2717 | | NVIDIA RTX 5000 Ada Generation Laptop GPU | 2730 | | NVIDIA GeForce RTX 4090 Laptop GPU | 2757 | @@ -874,6 +875,7 @@ Subsystem Device ID. | NVIDIA GeForce RTX 4070 Ti | 2782 | | NVIDIA GeForce RTX 4070 SUPER | 2783 | | NVIDIA GeForce RTX 4070 | 2786 | +| NVIDIA GeForce RTX 4060 Ti | 2788 | | NVIDIA GeForce RTX 4080 Laptop GPU | 27A0 | | NVIDIA RTX 4000 SFF Ada Generation | 27B0 1028 16FA | | NVIDIA RTX 4000 SFF Ada Generation | 27B0 103C 16FA | @@ -896,6 +898,7 @@ Subsystem Device ID. 
| NVIDIA RTX 3500 Ada Generation Embedded GPU | 27FB | | NVIDIA GeForce RTX 4060 Ti | 2803 | | NVIDIA GeForce RTX 4060 Ti | 2805 | +| NVIDIA GeForce RTX 4060 | 2808 | | NVIDIA GeForce RTX 4070 Laptop GPU | 2820 | | NVIDIA RTX 3000 Ada Generation Laptop GPU | 2838 | | NVIDIA GeForce RTX 4070 Laptop GPU | 2860 | diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild index 05b4c610a..294171806 100644 --- a/kernel-open/Kbuild +++ b/kernel-open/Kbuild @@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc EXTRA_CFLAGS += -I$(src) EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM -EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.54.15\" +EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.67\" ifneq ($(SYSSRCHOST1X),) EXTRA_CFLAGS += -I$(SYSSRCHOST1X) @@ -170,6 +170,8 @@ NV_CONFTEST_CMD := /bin/sh $(NV_CONFTEST_SCRIPT) \ NV_CFLAGS_FROM_CONFTEST := $(shell $(NV_CONFTEST_CMD) build_cflags) NV_CONFTEST_CFLAGS = $(NV_CFLAGS_FROM_CONFTEST) $(EXTRA_CFLAGS) -fno-pie +NV_CONFTEST_CFLAGS += $(call cc-disable-warning,pointer-sign) +NV_CONFTEST_CFLAGS += $(call cc-option,-fshort-wchar,) NV_CONFTEST_CFLAGS += -Wno-error NV_CONFTEST_COMPILE_TEST_HEADERS := $(obj)/conftest/macros.h diff --git a/kernel-open/common/inc/nv-linux.h b/kernel-open/common/inc/nv-linux.h index 893d9317c..43d7c80a9 100644 --- a/kernel-open/common/inc/nv-linux.h +++ b/kernel-open/common/inc/nv-linux.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2001-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -1989,31 +1989,6 @@ static inline NvBool nv_platform_use_auto_online(nv_linux_state_t *nvl) return nvl->numa_info.use_auto_online; } -typedef struct { - NvU64 base; - NvU64 size; - NvU32 nodeId; - int ret; -} remove_numa_memory_info_t; - -static void offline_numa_memory_callback -( - void *args -) -{ -#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT - remove_numa_memory_info_t *pNumaInfo = (remove_numa_memory_info_t *)args; -#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG - pNumaInfo->ret = offline_and_remove_memory(pNumaInfo->nodeId, - pNumaInfo->base, - pNumaInfo->size); -#else - pNumaInfo->ret = offline_and_remove_memory(pNumaInfo->base, - pNumaInfo->size); -#endif -#endif -} - typedef enum { NV_NUMA_STATUS_DISABLED = 0, diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh index 3b33a862d..964241ee6 100755 --- a/kernel-open/conftest.sh +++ b/kernel-open/conftest.sh @@ -3096,6 +3096,22 @@ compile_test() { ;; + foll_longterm_present) + # + # Determine if FOLL_LONGTERM enum is present or not + # + # Added by commit 932f4a630a69 ("mm/gup: replace + # get_user_pages_longterm() with FOLL_LONGTERM") in + # v5.2 + # + CODE=" + #include + int foll_longterm = FOLL_LONGTERM; + " + + compile_check_conftest "$CODE" "NV_FOLL_LONGTERM_PRESENT" "" "types" + ;; + vfio_pin_pages_has_vfio_device_arg) # # Determine if vfio_pin_pages() kABI accepts "struct vfio_device *" @@ -5152,11 +5168,15 @@ compile_test() { # commit 49a3f51dfeee ("drm/gem: Use struct dma_buf_map in GEM # vmap ops and convert GEM backends") in v5.11. # + # Note that the 'map' argument type is changed from 'struct dma_buf_map' + # to 'struct iosys_map' by commit 7938f4218168 ("dma-buf-map: Rename + # to iosys-map) in v5.18. 
+ # CODE=" #include int conftest_drm_gem_object_vmap_has_map_arg( - struct drm_gem_object *obj, struct dma_buf_map *map) { - return obj->funcs->vmap(obj, map); + struct drm_gem_object *obj) { + return obj->funcs->vmap(obj, NULL); }" compile_check_conftest "$CODE" "NV_DRM_GEM_OBJECT_VMAP_HAS_MAP_ARG" "" "types" diff --git a/kernel-open/nvidia-drm/nvidia-drm-drv.c b/kernel-open/nvidia-drm/nvidia-drm-drv.c index 7c8ee7ccb..9de3f3fa7 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-drv.c +++ b/kernel-open/nvidia-drm/nvidia-drm-drv.c @@ -1903,8 +1903,33 @@ void nv_drm_remove_devices(void) */ void nv_drm_suspend_resume(NvBool suspend) { + static DEFINE_MUTEX(nv_drm_suspend_mutex); + static NvU32 nv_drm_suspend_count = 0; + struct nv_drm_device *nv_dev; + + mutex_lock(&nv_drm_suspend_mutex); + + /* + * Count the number of times the driver is asked to suspend. Suspend all DRM + * devices on the first suspend call and resume them on the last resume + * call. This is necessary because the kernel may call nvkms_suspend() + * simultaneously for each GPU, but NVKMS itself also suspends all GPUs on + * the first call. + */ + if (suspend) { + if (nv_drm_suspend_count++ > 0) { + goto done; + } + } else { + BUG_ON(nv_drm_suspend_count == 0); + + if (--nv_drm_suspend_count > 0) { + goto done; + } + } + #if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE) - struct nv_drm_device *nv_dev = dev_list; + nv_dev = dev_list; /* * NVKMS shuts down all heads on suspend. Update DRM state accordingly. @@ -1930,6 +1955,9 @@ void nv_drm_suspend_resume(NvBool suspend) } } #endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */ + +done: + mutex_unlock(&nv_drm_suspend_mutex); } #endif /* NV_DRM_AVAILABLE */ diff --git a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c index db6f059b6..f1f40c41c 100644 --- a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c +++ b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c @@ -56,7 +56,11 @@ #include "nv-lock.h" #include "nv-chardev-numbers.h" -#if !defined(CONFIG_RETPOLINE) +/* + * Commit aefb2f2e619b ("x86/bugs: Rename CONFIG_RETPOLINE => + * CONFIG_MITIGATION_RETPOLINE) in v6.8 renamed CONFIG_RETPOLINE. 
+ */ +#if !defined(CONFIG_RETPOLINE) && !defined(CONFIG_MITIGATION_RETPOLINE) #include "nv-retpoline.h" #endif @@ -499,8 +503,9 @@ nvkms_event_queue_changed(nvkms_per_open_handle_t *pOpenKernel, static void nvkms_suspend(NvU32 gpuId) { + nvKmsKapiSuspendResume(NV_TRUE /* suspend */); + if (gpuId == 0) { - nvKmsKapiSuspendResume(NV_TRUE /* suspend */); nvkms_write_lock_pm_lock(); } @@ -517,8 +522,9 @@ static void nvkms_resume(NvU32 gpuId) if (gpuId == 0) { nvkms_write_unlock_pm_lock(); - nvKmsKapiSuspendResume(NV_FALSE /* suspend */); } + + nvKmsKapiSuspendResume(NV_FALSE /* suspend */); } diff --git a/kernel-open/nvidia-uvm/uvm_channel_test.c b/kernel-open/nvidia-uvm/uvm_channel_test.c index 88641e20a..4e46a9bfc 100644 --- a/kernel-open/nvidia-uvm/uvm_channel_test.c +++ b/kernel-open/nvidia-uvm/uvm_channel_test.c @@ -691,12 +691,16 @@ static NV_STATUS stress_test_all_gpus_in_va(uvm_va_space_t *va_space, if (uvm_test_rng_range_32(&rng, 0, 1) == 0) { NvU32 random_stream_index = uvm_test_rng_range_32(&rng, 0, num_streams - 1); uvm_test_stream_t *random_stream = &streams[random_stream_index]; - uvm_push_acquire_tracker(&stream->push, &random_stream->tracker); - snapshot_counter(&stream->push, - random_stream->counter_mem, - stream->other_stream_counter_snapshots_mem, - i, - random_stream->queued_counter_repeat); + + if ((random_stream->push.gpu == gpu) || uvm_push_allow_dependencies_across_gpus()) { + uvm_push_acquire_tracker(&stream->push, &random_stream->tracker); + + snapshot_counter(&stream->push, + random_stream->counter_mem, + stream->other_stream_counter_snapshots_mem, + i, + random_stream->queued_counter_repeat); + } } uvm_push_end(&stream->push); diff --git a/kernel-open/nvidia-uvm/uvm_fault_buffer_flush_test.c b/kernel-open/nvidia-uvm/uvm_fault_buffer_flush_test.c index d3739afdf..c9d58c90d 100644 --- a/kernel-open/nvidia-uvm/uvm_fault_buffer_flush_test.c +++ b/kernel-open/nvidia-uvm/uvm_fault_buffer_flush_test.c @@ -51,8 +51,10 @@ NV_STATUS uvm_test_fault_buffer_flush(UVM_TEST_FAULT_BUFFER_FLUSH_PARAMS *params uvm_va_space_up_read(va_space); - if (uvm_processor_mask_empty(retained_gpus)) - return NV_ERR_INVALID_DEVICE; + if (uvm_processor_mask_empty(retained_gpus)) { + status = NV_ERR_INVALID_DEVICE; + goto out; + } for (i = 0; i < params->iterations; i++) { if (fatal_signal_pending(current)) { diff --git a/kernel-open/nvidia-uvm/uvm_global.h b/kernel-open/nvidia-uvm/uvm_global.h index 3c0ed7a3e..810fc297e 100644 --- a/kernel-open/nvidia-uvm/uvm_global.h +++ b/kernel-open/nvidia-uvm/uvm_global.h @@ -409,4 +409,10 @@ NV_STATUS uvm_service_block_context_init(void); // Release fault service contexts if any exist. void uvm_service_block_context_exit(void); +// Allocate a service block context +uvm_service_block_context_t *uvm_service_block_context_alloc(struct mm_struct *mm); + +// Free a servic block context +void uvm_service_block_context_free(uvm_service_block_context_t *service_context); + #endif // __UVM_GLOBAL_H__ diff --git a/kernel-open/nvidia-uvm/uvm_gpu.h b/kernel-open/nvidia-uvm/uvm_gpu.h index 514493ad9..09335483b 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu.h +++ b/kernel-open/nvidia-uvm/uvm_gpu.h @@ -160,6 +160,10 @@ struct uvm_service_block_context_struct // Pages whose permissions need to be revoked from other processors uvm_page_mask_t revocation_mask; + // Temporary mask used in service_va_block_locked() in + // uvm_gpu_access_counters.c. 
+ uvm_processor_mask_t update_processors; + struct { // Per-processor mask with the pages that will be resident after @@ -593,16 +597,21 @@ typedef enum UVM_GPU_LINK_MAX } uvm_gpu_link_type_t; -// UVM does not support P2P copies on pre-Pascal GPUs. Pascal+ GPUs only -// support virtual addresses in P2P copies. Therefore, a peer identity mapping -// needs to be created. -// Ampere+ GPUs support physical peer copies, too, so identity mappings are not -// needed typedef enum { + // Peer copies can be disallowed for a variety of reasons. For example, + // P2P transfers are disabled in pre-Pascal GPUs because there is no + // compelling use case for direct peer migrations. UVM_GPU_PEER_COPY_MODE_UNSUPPORTED, + + // Pascal+ GPUs support virtual addresses in P2P copies. Virtual peer copies + // require the creation of peer identity mappings. UVM_GPU_PEER_COPY_MODE_VIRTUAL, + + // Ampere+ GPUs support virtual and physical peer copies. Physical peer + // copies do not depend on peer identity mappings. UVM_GPU_PEER_COPY_MODE_PHYSICAL, + UVM_GPU_PEER_COPY_MODE_COUNT } uvm_gpu_peer_copy_mode_t; diff --git a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c index f36876bbf..a24e405fa 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c @@ -1087,12 +1087,12 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor, // pages to be serviced if (page_count > 0) { uvm_processor_id_t id; - uvm_processor_mask_t update_processors; + uvm_processor_mask_t *update_processors = &service_context->update_processors; - uvm_processor_mask_and(&update_processors, &va_block->resident, &service_context->resident_processors); + uvm_processor_mask_and(update_processors, &va_block->resident, &service_context->resident_processors); // Remove pages that are already resident in the destination processors - for_each_id_in_mask(id, &update_processors) { + for_each_id_in_mask(id, update_processors) { bool migrate_pages; uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE); UVM_ASSERT(residency_mask); diff --git a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c index e91b5e5b8..4eb989dec 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c @@ -357,12 +357,18 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu, { NV_STATUS status; uvm_push_t push; - uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable; + uvm_tracker_t *replay_tracker = &gpu->parent->fault_buffer_info.replayable.replay_tracker; + + UVM_ASSERT(tracker != NULL); + + status = uvm_tracker_add_tracker_safe(tracker, replay_tracker); + if (status != NV_OK) + return status; if (global_cancel) { status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, - &replayable_faults->replay_tracker, + tracker, &push, "Cancel targeting instance_ptr {0x%llx:%s}\n", instance_ptr.address, @@ -371,7 +377,7 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu, else { status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, - &replayable_faults->replay_tracker, + tracker, &push, "Cancel targeting instance_ptr {0x%llx:%s} gpc %u client %u\n", instance_ptr.address, @@ -382,17 +388,15 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu, UVM_ASSERT(status == NV_OK); if (status != NV_OK) { - UVM_ERR_PRINT("Failed to 
create push and acquire replay tracker before pushing cancel: %s, GPU %s\n", + UVM_ERR_PRINT("Failed to create push and acquire trackers before pushing cancel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); return status; } - uvm_push_acquire_tracker(&push, tracker); - if (global_cancel) gpu->parent->host_hal->cancel_faults_global(&push, instance_ptr); - else + else gpu->parent->host_hal->cancel_faults_targeted(&push, instance_ptr, gpc_id, client_id); // We don't need to put the cancel in the GPU replay tracker since we wait @@ -403,7 +407,9 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu, if (status != NV_OK) UVM_ERR_PRINT("Failed to wait for pushed cancel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); - uvm_tracker_clear(&replayable_faults->replay_tracker); + // The cancellation is complete, so the input trackers must be complete too. + uvm_tracker_clear(tracker); + uvm_tracker_clear(replay_tracker); return status; } diff --git a/kernel-open/nvidia-uvm/uvm_hmm.c b/kernel-open/nvidia-uvm/uvm_hmm.c index 5fe277e2a..5060e6ca5 100644 --- a/kernel-open/nvidia-uvm/uvm_hmm.c +++ b/kernel-open/nvidia-uvm/uvm_hmm.c @@ -92,7 +92,7 @@ typedef struct { uvm_va_block_t *va_block; uvm_va_block_retry_t *va_block_retry; - uvm_va_block_context_t *va_block_context; + uvm_service_block_context_t *service_context; uvm_va_block_region_t region; uvm_processor_id_t dest_id; uvm_make_resident_cause_t cause; @@ -713,7 +713,7 @@ void uvm_hmm_migrate_finish(uvm_va_block_t *va_block) // Migrate the given range [start end] within a va_block to dest_id. static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_processor_id_t dest_id, NvU64 start, NvU64 end, @@ -737,7 +737,7 @@ static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block, va_block_retry, uvm_va_block_migrate_locked(va_block, va_block_retry, - va_block_context, + service_context, region, dest_id, mode, @@ -916,14 +916,14 @@ static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block, NvU64 end, uvm_va_block_t **out_va_block) { - uvm_va_block_context_t *va_block_context; + uvm_service_block_context_t *service_context; uvm_va_space_t *va_space; struct mm_struct *mm; struct vm_area_struct *vma; uvm_va_block_region_t region; NvU64 addr, from, to; uvm_va_block_t *new; - NV_STATUS status; + NV_STATUS status = NV_OK; if (va_block->start < start) { status = hmm_split_block(va_block, start - 1, &new); @@ -942,15 +942,18 @@ static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block, // Keep the right part, the left part will be deleted. } - *out_va_block = va_block; - // Migrate any GPU data to sysmem before destroying the HMM va_block. // We do this because the new va_range might be for a UVM external // allocation which could be converting an address range that was first // operated on by UVM-HMM and the exteral allocation should see that data. 
va_space = va_block->hmm.va_space; mm = va_space->va_space_mm.mm; - va_block_context = uvm_va_space_block_context(va_space, mm); + + service_context = uvm_service_block_context_alloc(mm); + if (!service_context) + return NV_ERR_NO_MEMORY; + + *out_va_block = va_block; for (addr = va_block->start; addr < va_block->end; addr = to + 1) { vma = find_vma_intersection(mm, addr, va_block->end); @@ -964,21 +967,23 @@ static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block, if (!uvm_hmm_vma_is_valid(vma, from, false)) continue; - va_block_context->hmm.vma = vma; + service_context->block_context->hmm.vma = vma; status = hmm_migrate_range(va_block, NULL, - va_block_context, + service_context, UVM_ID_CPU, from, to, UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, NULL); if (status != NV_OK) - return status; + break; } - return NV_OK; + uvm_service_block_context_free(service_context); + + return status; } // Normally, the HMM va_block is destroyed when the va_space is destroyed @@ -1089,12 +1094,17 @@ static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block, NvU64 end, uvm_tracker_t *out_tracker) { - uvm_processor_mask_t set_accessed_by_processors; + uvm_processor_mask_t *set_accessed_by_processors; const uvm_va_policy_t *old_policy; uvm_va_policy_node_t *node; uvm_va_block_region_t region; uvm_processor_id_t id; - NV_STATUS status, tracker_status; + NV_STATUS status = NV_OK; + NV_STATUS tracker_status; + + set_accessed_by_processors = uvm_processor_mask_cache_alloc(); + if (!set_accessed_by_processors) + return NV_ERR_NO_MEMORY; // Note that we can't just call uvm_va_policy_set_range() for the whole // range [addr end] because we need to examine the old value of @@ -1107,25 +1117,27 @@ static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block, // If the old preferred location is a valid processor ID, remote // mappings should be established to the new preferred location if // accessed-by is set. - uvm_processor_mask_zero(&set_accessed_by_processors); + uvm_processor_mask_zero(set_accessed_by_processors); if (UVM_ID_IS_VALID(old_policy->preferred_location) && uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location)) - uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location); + uvm_processor_mask_set(set_accessed_by_processors, old_policy->preferred_location); if (!uvm_va_policy_set_preferred_location(va_block, region, preferred_location, preferred_cpu_nid, - old_policy)) - return NV_ERR_NO_MEMORY; + old_policy)) { + status = NV_ERR_NO_MEMORY; + break; + } // Establish new remote mappings if the old preferred location had // accessed-by set. 
- for_each_id_in_mask(id, &set_accessed_by_processors) { + for_each_id_in_mask(id, set_accessed_by_processors) { status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker); if (status != NV_OK) - return status; + break; } // Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock @@ -1143,10 +1155,11 @@ static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block, status = tracker_status; if (status != NV_OK) - return status; + break; } - return NV_OK; + uvm_processor_mask_cache_free(set_accessed_by_processors); + return status; } NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space, @@ -2128,6 +2141,7 @@ static NV_STATUS migrate_alloc_on_cpu(uvm_va_block_t *va_block, return status; } + static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context) { uvm_processor_id_t processor_id; @@ -2400,6 +2414,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, { uvm_va_block_region_t region = service_context->region; struct page **pages = service_context->block_context->hmm.pages; + struct vm_area_struct *vma = service_context->block_context->hmm.vma; int npages; uvm_page_index_t page_index; uvm_make_resident_cause_t cause; @@ -2417,12 +2432,9 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, else cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; - status = uvm_hmm_va_block_migrate_locked(va_block, - va_block_retry, - service_context->block_context, - UVM_ID_CPU, - region, - cause); + UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, vma, region)); + + status = uvm_hmm_va_block_migrate_locked(va_block, va_block_retry, service_context, UVM_ID_CPU, region, cause); if (status != NV_OK) goto done; @@ -2439,7 +2451,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, // mmap() files so we check for that here and report a fatal fault. // Otherwise with the current Linux 6.1 make_device_exclusive_range(), // it doesn't make the page exclusive and we end up in an endless loop. 
- if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) { + if (vma->vm_flags & (VM_SHARED | VM_HUGETLB)) { status = NV_ERR_NOT_SUPPORTED; goto done; } @@ -2662,6 +2674,8 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, uvm_page_index_t page_index; NV_STATUS status = NV_OK; + UVM_ASSERT(service_context); + for_each_va_block_page_in_region_mask(page_index, page_mask, region) { struct page *src_page; @@ -2966,7 +2980,7 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, { uvm_va_block_t *va_block; uvm_va_block_retry_t *va_block_retry; - uvm_va_block_context_t *va_block_context; + uvm_service_block_context_t *service_context; const unsigned long *src_pfns; unsigned long *dst_pfns; uvm_va_block_region_t region; @@ -2976,9 +2990,9 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, va_block = uvm_hmm_migrate_event->va_block; va_block_retry = uvm_hmm_migrate_event->va_block_retry; - va_block_context = uvm_hmm_migrate_event->va_block_context; - src_pfns = va_block_context->hmm.src_pfns; - dst_pfns = va_block_context->hmm.dst_pfns; + service_context = uvm_hmm_migrate_event->service_context; + src_pfns = service_context->block_context->hmm.src_pfns; + dst_pfns = service_context->block_context->hmm.dst_pfns; region = uvm_hmm_migrate_event->region; dest_id = uvm_hmm_migrate_event->dest_id; page_mask = &uvm_hmm_migrate_event->page_mask; @@ -2994,7 +3008,7 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, region, page_mask, &uvm_hmm_migrate_event->same_devmem_page_mask, - va_block_context); + service_context->block_context); } else { status = dmamap_src_sysmem_pages(va_block, @@ -3004,14 +3018,15 @@ static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, region, page_mask, dest_id, - NULL); + service_context); } + if (status != NV_OK) return status; status = uvm_va_block_make_resident_copy(va_block, va_block_retry, - va_block_context, + service_context->block_context, dest_id, region, page_mask, @@ -3050,7 +3065,7 @@ static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migra va_block = uvm_hmm_migrate_event->va_block; va_block_retry = uvm_hmm_migrate_event->va_block_retry; - va_block_context = uvm_hmm_migrate_event->va_block_context; + va_block_context = uvm_hmm_migrate_event->service_context->block_context; region = uvm_hmm_migrate_event->region; dest_id = uvm_hmm_migrate_event->dest_id; page_mask = &uvm_hmm_migrate_event->page_mask; @@ -3090,12 +3105,13 @@ static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migra // TODO: Bug 3900785: investigate ways to implement async migration. 
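This refactor threads a uvm_service_block_context_t (which embeds the uvm_va_block_context_t these paths previously took directly) through the HMM migration entry points. Below is a minimal caller-side sketch of the intended lifecycle, modeled on the uvm_migrate() and split_block_if_needed() hunks elsewhere in this patch; example_hmm_migrate() and its elided locking are hypothetical and not part of the change.

static NV_STATUS example_hmm_migrate(uvm_va_block_t *va_block,
                                     uvm_va_block_region_t region,
                                     uvm_processor_id_t dest_id,
                                     struct mm_struct *mm)
{
    NV_STATUS status;
    uvm_service_block_context_t *service_context;

    // Allocate the service context up front; it wraps the block context
    // that callers used to allocate with uvm_va_block_context_alloc().
    service_context = uvm_service_block_context_alloc(mm);
    if (!service_context)
        return NV_ERR_NO_MEMORY;

    // Fields of the embedded block context are still reachable.
    service_context->block_context->make_resident.dest_nid = NUMA_NO_NODE;

    // Locking elided: real callers take the mmap lock, va_space lock and
    // va_block lock before migrating, as uvm_migrate() does.
    status = uvm_hmm_va_block_migrate_locked(va_block,
                                             NULL,
                                             service_context,
                                             dest_id,
                                             region,
                                             UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);

    uvm_service_block_context_free(service_context);
    return status;
}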
NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_processor_id_t dest_id, uvm_va_block_region_t region, uvm_make_resident_cause_t cause) { uvm_hmm_migrate_event_t uvm_hmm_migrate_event; + uvm_va_block_context_t *va_block_context = service_context->block_context; struct vm_area_struct *vma = va_block_context->hmm.vma; NvU64 start; NvU64 end; @@ -3106,6 +3122,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, UVM_ASSERT(vma); UVM_ASSERT(va_block_context->mm == vma->vm_mm); uvm_assert_mmap_lock_locked(va_block_context->mm); + UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, vma, region)); uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); uvm_assert_mutex_locked(&va_block->lock); @@ -3116,7 +3133,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_hmm_migrate_event.va_block = va_block; uvm_hmm_migrate_event.va_block_retry = va_block_retry; - uvm_hmm_migrate_event.va_block_context = va_block_context; + uvm_hmm_migrate_event.service_context = service_context; uvm_hmm_migrate_event.region = region; uvm_hmm_migrate_event.dest_id = dest_id; uvm_hmm_migrate_event.cause = cause; @@ -3202,7 +3219,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, } NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, NvU64 base, NvU64 length, uvm_processor_id_t dest_id, @@ -3214,11 +3231,12 @@ NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, uvm_va_block_retry_t va_block_retry; NvU64 addr, end, last_address; NV_STATUS status = NV_OK; + uvm_va_block_context_t *block_context = service_context->block_context; if (!uvm_hmm_is_enabled(va_space)) return NV_ERR_INVALID_ADDRESS; - mm = va_block_context->mm; + mm = block_context->mm; UVM_ASSERT(mm == va_space->va_space_mm.mm); uvm_assert_mmap_lock_locked(mm); uvm_assert_rwsem_locked(&va_space->lock); @@ -3228,7 +3246,7 @@ NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, for (addr = base; addr < last_address; addr = end + 1) { struct vm_area_struct *vma; - status = hmm_va_block_find_create(va_space, addr, false, &va_block_context->hmm.vma, &va_block); + status = hmm_va_block_find_create(va_space, addr, false, &block_context->hmm.vma, &va_block); if (status != NV_OK) return status; @@ -3236,18 +3254,11 @@ NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, if (end > last_address) end = last_address; - vma = va_block_context->hmm.vma; + vma = block_context->hmm.vma; if (end > vma->vm_end - 1) end = vma->vm_end - 1; - status = hmm_migrate_range(va_block, - &va_block_retry, - va_block_context, - dest_id, - addr, - end, - mode, - out_tracker); + status = hmm_migrate_range(va_block, &va_block_retry, service_context, dest_id, addr, end, mode, out_tracker); if (status != NV_OK) break; } @@ -3283,12 +3294,13 @@ NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block, // Note that the caller must initialize va_block_context->hmm.src_pfns by // calling uvm_hmm_va_block_evict_chunk_prep() before calling this. 
static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region, uvm_make_resident_cause_t cause, bool *out_accessed_by_set) { + uvm_va_block_context_t *va_block_context = service_context->block_context; NvU64 start = uvm_va_block_region_start(va_block, region); NvU64 end = uvm_va_block_region_end(va_block, region); unsigned long *src_pfns = va_block_context->hmm.src_pfns; @@ -3296,7 +3308,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, uvm_hmm_migrate_event_t uvm_hmm_migrate_event = { .va_block = va_block, .va_block_retry = NULL, - .va_block_context = va_block_context, + .service_context = service_context, .region = region, .dest_id = UVM_ID_CPU, .cause = cause, @@ -3329,13 +3341,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, // TODO: Bug 3660922: Need to handle read duplication at some point. UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region)); - status = migrate_alloc_on_cpu(va_block, - src_pfns, - dst_pfns, - region, - page_mask, - NULL, - va_block_context); + status = migrate_alloc_on_cpu(va_block, src_pfns, dst_pfns, region, page_mask, NULL, va_block_context); if (status != NV_OK) goto err; @@ -3369,13 +3375,13 @@ err: } NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region, bool *out_accessed_by_set) { return hmm_va_block_evict_chunks(va_block, - va_block_context, + service_context, pages_to_evict, region, UVM_MAKE_RESIDENT_CAUSE_EVICTION, @@ -3384,11 +3390,12 @@ NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block, NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region) { - unsigned long *src_pfns = va_block_context->hmm.src_pfns; + uvm_va_block_context_t *block_context = service_context->block_context; + unsigned long *src_pfns = block_context->hmm.src_pfns; uvm_va_block_gpu_state_t *gpu_state; uvm_page_index_t page_index; uvm_gpu_chunk_t *gpu_chunk; @@ -3401,7 +3408,7 @@ NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, UVM_ASSERT(gpu_state->chunks); // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU. - memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns)); + memset(src_pfns, 0, sizeof(block_context->hmm.src_pfns)); // TODO: Bug 3368756: add support for large GPU pages. 
for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) { @@ -3409,7 +3416,7 @@ NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index)); status = uvm_hmm_va_block_evict_chunk_prep(va_block, - va_block_context, + block_context, gpu_chunk, uvm_va_block_region_for_page(page_index)); if (status != NV_OK) @@ -3417,7 +3424,7 @@ NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, } return hmm_va_block_evict_chunks(va_block, - va_block_context, + service_context, pages_to_evict, region, UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE, diff --git a/kernel-open/nvidia-uvm/uvm_hmm.h b/kernel-open/nvidia-uvm/uvm_hmm.h index 8c6a4f58d..9e20b973d 100644 --- a/kernel-open/nvidia-uvm/uvm_hmm.h +++ b/kernel-open/nvidia-uvm/uvm_hmm.h @@ -287,16 +287,17 @@ typedef struct uvm_va_block_retry_t *va_block_retry, uvm_service_block_context_t *service_context); - // This is called to migrate a region within a HMM va_block. - // va_block_context must not be NULL and va_block_context->hmm.vma - // must be valid. + // This is called to migrate a region within a HMM va_block. service_context + // must not be NULL, service_context->va_block_context must not be NULL and + // service_context->va_block_context->hmm.vma must be valid. + // // Special return values (besides things like NV_ERR_NO_MEMORY): // NV_WARN_MORE_PROCESSING_REQUIRED indicates that one or more pages could // not be migrated and that a retry might succeed after unlocking the // va_block lock, va_space lock, and mmap lock. NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_processor_id_t dest_id, uvm_va_block_region_t region, uvm_make_resident_cause_t cause); @@ -304,13 +305,14 @@ typedef struct // This is called to migrate an address range of HMM allocations via // UvmMigrate(). // - // va_block_context must not be NULL. The caller is not required to set - // va_block_context->hmm.vma. + // service_context and service_context->va_block_context must not be NULL. + // The caller is not required to set + // service_context->va_block_context->hmm.vma. // // Locking: the va_space->va_space_mm.mm mmap_lock must be locked and // the va_space read lock must be held. NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, NvU64 base, NvU64 length, uvm_processor_id_t dest_id, @@ -329,27 +331,31 @@ typedef struct uvm_gpu_chunk_t *gpu_chunk, uvm_va_block_region_t chunk_region); - // Migrate pages to system memory for the given page mask. - // Note that the mmap lock is not held and there is no MM retained. - // This must be called after uvm_hmm_va_block_evict_chunk_prep() has - // initialized va_block_context->hmm.src_pfns[] for the source GPU physical - // PFNs being migrated. Note that the input mask 'pages_to_evict' can be - // modified. If any of the evicted pages has the accessed by policy set, - // then record that by setting out_accessed_by_set. + // Migrate pages to system memory for the given page mask. Note that the + // mmap lock is not held and there is no MM retained. This must be called + // after uvm_hmm_va_block_evict_chunk_prep() has initialized + // service_context->va_block_context->hmm.src_pfns[] for the source GPU + // physical PFNs being migrated. 
Note that the input mask 'pages_to_evict' + // can be modified. If any of the evicted pages has the accessed by policy + // set, then record that by setting out_accessed_by_set. + // The caller is not required to set + // service_context->va_block_context->hmm.vma, it will be cleared in + // uvm_hmm_va_block_evict_chunks(). // Locking: the va_block lock must be locked. NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region, bool *out_accessed_by_set); - // Migrate pages from the given GPU to system memory for the given page - // mask and region. va_block_context must not be NULL. - // Note that the mmap lock is not held and there is no MM retained. + // Migrate pages from the given GPU to system memory for the given page mask + // and region. uvm_service_block_context_t and + // uvm_service_block_context_t->va_block_context must not be NULL. Note that + // the mmap lock is not held and there is no MM retained. // Locking: the va_block lock must be locked. NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region); @@ -572,7 +578,7 @@ typedef struct static NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_processor_id_t dest_id, uvm_va_block_region_t region, uvm_make_resident_cause_t cause) @@ -581,7 +587,7 @@ typedef struct } static NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, NvU64 base, NvU64 length, uvm_processor_id_t dest_id, @@ -606,7 +612,7 @@ typedef struct } static NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region, bool *out_accessed_by_set) @@ -616,7 +622,7 @@ typedef struct static NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, const uvm_page_mask_t *pages_to_evict, uvm_va_block_region_t region) { diff --git a/kernel-open/nvidia-uvm/uvm_hopper.c b/kernel-open/nvidia-uvm/uvm_hopper.c index fdda40987..7d80faa16 100644 --- a/kernel-open/nvidia-uvm/uvm_hopper.c +++ b/kernel-open/nvidia-uvm/uvm_hopper.c @@ -27,6 +27,24 @@ #include "uvm_mem.h" #include "uvm_hopper_fault_buffer.h" +static uvm_gpu_peer_copy_mode_t hopper_peer_copy_mode(uvm_parent_gpu_t *parent_gpu) +{ + // In Confidential Computing the Copy Engine supports encrypted copies + // between peers. But in Hopper these transfers require significant + // software support (ex: unprotected vidmem), so in practice they are not + // allowed. + if (g_uvm_global.conf_computing_enabled) + return UVM_GPU_PEER_COPY_MODE_UNSUPPORTED; + + // TODO: Bug 4174553: In some Grace Hopper setups, physical peer copies + // result on errors. Force peer copies to use virtual addressing until the + // issue is clarified. 
+ if (uvm_parent_gpu_is_coherent(parent_gpu)) + return UVM_GPU_PEER_COPY_MODE_VIRTUAL; + + return g_uvm_global.peer_copy_mode; +} + void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu) { parent_gpu->tlb_batch.va_invalidate_supported = true; @@ -58,14 +76,10 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu) parent_gpu->flat_vidmem_va_base = (64 * UVM_SIZE_1PB) + (32 * UVM_SIZE_1TB); // Physical CE writes to vidmem are non-coherent with respect to the CPU on - // GH180. + // Grace Hopper. parent_gpu->ce_phys_vidmem_write_supported = !uvm_parent_gpu_is_coherent(parent_gpu); - // TODO: Bug 4174553: [HGX-SkinnyJoe][GH180] channel errors discussion/debug - // portion for the uvm tests became nonresponsive after - // some time and then failed even after reboot - parent_gpu->peer_copy_mode = uvm_parent_gpu_is_coherent(parent_gpu) ? - UVM_GPU_PEER_COPY_MODE_VIRTUAL : g_uvm_global.peer_copy_mode; + parent_gpu->peer_copy_mode = hopper_peer_copy_mode(parent_gpu); // All GR context buffers may be mapped to 57b wide VAs. All "compute" units // accessing GR context buffers support the 57-bit VA range. diff --git a/kernel-open/nvidia-uvm/uvm_hopper_ce.c b/kernel-open/nvidia-uvm/uvm_hopper_ce.c index 320205402..d938da70b 100644 --- a/kernel-open/nvidia-uvm/uvm_hopper_ce.c +++ b/kernel-open/nvidia-uvm/uvm_hopper_ce.c @@ -480,7 +480,6 @@ static NvU64 encrypt_iv_address(uvm_push_t *push, uvm_gpu_address_t dst) return iv_address; } -// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers void uvm_hal_hopper_ce_encrypt(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, @@ -530,7 +529,6 @@ void uvm_hal_hopper_ce_encrypt(uvm_push_t *push, encrypt_or_decrypt(push, dst, src, size); } -// TODO: Bug 3842953: adapt CE encrypt/decrypt for p2p encrypted transfers void uvm_hal_hopper_ce_decrypt(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, diff --git a/kernel-open/nvidia-uvm/uvm_map_external.c b/kernel-open/nvidia-uvm/uvm_map_external.c index bde906342..99c84134b 100644 --- a/kernel-open/nvidia-uvm/uvm_map_external.c +++ b/kernel-open/nvidia-uvm/uvm_map_external.c @@ -970,7 +970,7 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E { uvm_va_range_t *va_range = NULL; uvm_gpu_t *mapping_gpu; - uvm_processor_mask_t mapped_gpus; + uvm_processor_mask_t *mapped_gpus; NV_STATUS status = NV_OK; size_t i; uvm_map_rm_params_t map_rm_params; @@ -988,6 +988,10 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS_V2) return NV_ERR_INVALID_ARGUMENT; + mapped_gpus = uvm_processor_mask_cache_alloc(); + if (!mapped_gpus) + return NV_ERR_NO_MEMORY; + uvm_va_space_down_read_rm(va_space); va_range = uvm_va_range_find(va_space, params->base); @@ -995,10 +999,11 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL || va_range->node.end < params->base + params->length - 1) { uvm_va_space_up_read_rm(va_space); + uvm_processor_mask_cache_free(mapped_gpus); return NV_ERR_INVALID_ADDRESS; } - uvm_processor_mask_zero(&mapped_gpus); + uvm_processor_mask_zero(mapped_gpus); for (i = 0; i < params->gpuAttributesCount; i++) { if (uvm_api_mapping_type_invalid(params->perGpuAttributes[i].gpuMappingType) || uvm_api_caching_type_invalid(params->perGpuAttributes[i].gpuCachingType) || @@ -1034,7 +1039,7 @@ static NV_STATUS 
uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E if (status != NV_OK) goto error; - uvm_processor_mask_set(&mapped_gpus, mapping_gpu->id); + uvm_processor_mask_set(mapped_gpus, mapping_gpu->id); } // Wait for outstanding page table operations to finish across all GPUs. We @@ -1043,6 +1048,8 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E status = uvm_tracker_wait_deinit(&tracker); uvm_va_space_up_read_rm(va_space); + uvm_processor_mask_cache_free(mapped_gpus); + return status; error: @@ -1051,7 +1058,7 @@ error: (void)uvm_tracker_wait_deinit(&tracker); // Tear down only those mappings we created during this call - for_each_va_space_gpu_in_mask(mapping_gpu, va_space, &mapped_gpus) { + for_each_va_space_gpu_in_mask(mapping_gpu, va_space, mapped_gpus) { uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu); uvm_ext_gpu_map_t *ext_map, *ext_map_next; @@ -1067,6 +1074,7 @@ error: } uvm_va_space_up_read_rm(va_space); + uvm_processor_mask_cache_free(mapped_gpus); return status; } @@ -1356,9 +1364,7 @@ static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length) { uvm_va_range_t *va_range; NV_STATUS status = NV_OK; - // TODO: Bug 4351121: retained_mask should be pre-allocated, not on the - // stack. - uvm_processor_mask_t retained_mask; + uvm_processor_mask_t *retained_mask = NULL; LIST_HEAD(deferred_free_list); if (uvm_api_range_invalid_4k(base, length)) @@ -1391,17 +1397,25 @@ static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length) } if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) { + retained_mask = va_range->external.retained_mask; + + // Set the retained_mask to NULL to prevent + // uvm_va_range_destroy_external() from freeing the mask. + va_range->external.retained_mask = NULL; + + UVM_ASSERT(retained_mask); + // External ranges may have deferred free work, so the GPUs may have to // be retained. Construct the mask of all the GPUs that need to be // retained. - uvm_processor_mask_and(&retained_mask, &va_range->external.mapped_gpus, &va_space->registered_gpus); + uvm_processor_mask_and(retained_mask, &va_range->external.mapped_gpus, &va_space->registered_gpus); } uvm_va_range_destroy(va_range, &deferred_free_list); // If there is deferred work, retain the required GPUs. if (!list_empty(&deferred_free_list)) - uvm_global_gpu_retain(&retained_mask); + uvm_global_gpu_retain(retained_mask); out: uvm_va_space_up_write(va_space); @@ -1409,9 +1423,13 @@ out: if (!list_empty(&deferred_free_list)) { UVM_ASSERT(status == NV_OK); uvm_deferred_free_object_list(&deferred_free_list); - uvm_global_gpu_release(&retained_mask); + uvm_global_gpu_release(retained_mask); } + // Free the mask allocated in uvm_va_range_create_external() since + // uvm_va_range_destroy() won't free this mask. 
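uvm_processor_mask_t is large enough that this patch consistently moves it off the kernel stack, either through the mask cache (as in uvm_free() here and in uvm_map_external_allocation()) or through masks preallocated in a context structure. A minimal sketch of the cache-allocation variant follows; example_retained_gpus() is a hypothetical caller built only from helpers and fields that appear in this diff.

static NV_STATUS example_retained_gpus(uvm_va_space_t *va_space, uvm_va_range_t *va_range)
{
    // Heap-allocate the mask instead of placing it on the stack.
    uvm_processor_mask_t *retained = uvm_processor_mask_cache_alloc();

    if (!retained)
        return NV_ERR_NO_MEMORY;

    uvm_processor_mask_and(retained, &va_range->external.mapped_gpus, &va_space->registered_gpus);

    // ... use the mask here, e.g. uvm_global_gpu_retain(retained) ...

    // Unlike a stack mask, the allocation must be released on every exit
    // path, which is why the hunks above funnel errors to a single cleanup.
    uvm_processor_mask_cache_free(retained);
    return NV_OK;
}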
+ uvm_processor_mask_cache_free(retained_mask); + return status; } diff --git a/kernel-open/nvidia-uvm/uvm_migrate.c b/kernel-open/nvidia-uvm/uvm_migrate.c index 12f86feef..618b71867 100644 --- a/kernel-open/nvidia-uvm/uvm_migrate.c +++ b/kernel-open/nvidia-uvm/uvm_migrate.c @@ -214,13 +214,14 @@ static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block, NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_va_block_region_t region, uvm_processor_id_t dest_id, uvm_migrate_mode_t mode, uvm_tracker_t *out_tracker) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); + uvm_va_block_context_t *va_block_context = service_context->block_context; NV_STATUS status, tracker_status = NV_OK; uvm_assert_mutex_locked(&va_block->lock); @@ -229,7 +230,7 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block, if (uvm_va_block_is_hmm(va_block)) { status = uvm_hmm_va_block_migrate_locked(va_block, va_block_retry, - va_block_context, + service_context, dest_id, region, UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE); @@ -438,7 +439,7 @@ static void preunmap_multi_block(uvm_va_range_t *va_range, } static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, NvU64 start, NvU64 end, uvm_processor_id_t dest_id, @@ -470,10 +471,11 @@ static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range, max(start, va_block->start), min(end, va_block->end)); - status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry, + status = UVM_VA_BLOCK_LOCK_RETRY(va_block, + &va_block_retry, uvm_va_block_migrate_locked(va_block, &va_block_retry, - va_block_context, + service_context, region, dest_id, mode, @@ -486,7 +488,7 @@ static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range, } static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, NvU64 start, NvU64 end, uvm_processor_id_t dest_id, @@ -510,7 +512,7 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range, preunmap_range_end = min(preunmap_range_end - 1, end); preunmap_multi_block(va_range, - va_block_context, + service_context->block_context, preunmap_range_start, preunmap_range_end, dest_id); @@ -520,7 +522,7 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range, } status = uvm_va_range_migrate_multi_block(va_range, - va_block_context, + service_context, preunmap_range_start, preunmap_range_end, dest_id, @@ -536,7 +538,7 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range, } static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_va_range_t *first_va_range, NvU64 base, NvU64 length, @@ -552,13 +554,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space, if (!first_va_range) { // For HMM, we iterate over va_blocks since there is no va_range. 
- return uvm_hmm_migrate_ranges(va_space, - va_block_context, - base, - length, - dest_id, - mode, - out_tracker); + return uvm_hmm_migrate_ranges(va_space, service_context, base, length, dest_id, mode, out_tracker); } UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base)); @@ -587,7 +583,9 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space, if (!iter.migratable) { // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages aren't // already resident at dest_id. - if (!uvm_va_policy_preferred_location_equal(policy, dest_id, va_block_context->make_resident.dest_nid)) + if (!uvm_va_policy_preferred_location_equal(policy, + dest_id, + service_context->block_context->make_resident.dest_nid)) skipped_migrate = true; } else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) && @@ -599,7 +597,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space, } else { status = uvm_va_range_migrate(va_range, - va_block_context, + service_context, iter.start, iter.end, dest_id, @@ -636,7 +634,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space, uvm_tracker_t *out_tracker) { NV_STATUS status = NV_OK; - uvm_va_block_context_t *va_block_context; + uvm_service_block_context_t *service_context; bool do_mappings; bool do_two_passes; bool is_single_block; @@ -654,11 +652,11 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space, else if (!first_va_range) return NV_ERR_INVALID_ADDRESS; - va_block_context = uvm_va_block_context_alloc(mm); - if (!va_block_context) + service_context = uvm_service_block_context_alloc(mm); + if (!service_context) return NV_ERR_NO_MEMORY; - va_block_context->make_resident.dest_nid = dest_nid; + service_context->block_context->make_resident.dest_nid = dest_nid; // We perform two passes (unless the migration only covers a single VA // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). 
This helps in the @@ -688,7 +686,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space, should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block); status = uvm_migrate_ranges(va_space, - va_block_context, + service_context, first_va_range, base, length, @@ -706,7 +704,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space, should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block); status = uvm_migrate_ranges(va_space, - va_block_context, + service_context, first_va_range, base, length, @@ -716,7 +714,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space, out_tracker); } - uvm_va_block_context_free(va_block_context); + uvm_service_block_context_free(service_context); return status; } diff --git a/kernel-open/nvidia-uvm/uvm_mmu.c b/kernel-open/nvidia-uvm/uvm_mmu.c index 83f888451..a66b23a2b 100644 --- a/kernel-open/nvidia-uvm/uvm_mmu.c +++ b/kernel-open/nvidia-uvm/uvm_mmu.c @@ -2357,6 +2357,8 @@ NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer) NvU64 phys_offset; uvm_gpu_identity_mapping_t *peer_mapping; + UVM_ASSERT(gpu->parent->peer_copy_mode < UVM_GPU_PEER_COPY_MODE_COUNT); + if (gpu->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_VIRTUAL || peer->mem_info.size == 0) return NV_OK; diff --git a/kernel-open/nvidia-uvm/uvm_perf_thrashing.c b/kernel-open/nvidia-uvm/uvm_perf_thrashing.c index 29841ee93..2858e6f06 100644 --- a/kernel-open/nvidia-uvm/uvm_perf_thrashing.c +++ b/kernel-open/nvidia-uvm/uvm_perf_thrashing.c @@ -901,6 +901,7 @@ static pinned_page_t *find_pinned_page(block_thrashing_info_t *block_thrashing, // static NV_STATUS thrashing_pin_page(va_space_thrashing_info_t *va_space_thrashing, uvm_va_block_t *va_block, + uvm_va_block_context_t *va_block_context, block_thrashing_info_t *block_thrashing, page_thrashing_info_t *page_thrashing, uvm_page_index_t page_index, @@ -908,17 +909,17 @@ static NV_STATUS thrashing_pin_page(va_space_thrashing_info_t *va_space_thrashin uvm_processor_id_t residency, uvm_processor_id_t requester) { - uvm_processor_mask_t current_residency; + uvm_processor_mask_t *current_residency = &va_block_context->scratch_processor_mask; uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT(!uvm_processor_mask_test(&page_thrashing->throttled_processors, requester)); - uvm_va_block_page_resident_processors(va_block, page_index, ¤t_residency); + uvm_va_block_page_resident_processors(va_block, page_index, current_residency); // If we are pinning the page for the first time or we are pinning it on a // different location that the current location, reset the throttling state // to make sure that we flush any pending ThrottlingEnd events. 
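The thrashing changes use the second stack-avoidance variant: reusing masks embedded in the preallocated uvm_va_block_context_t (scratch_processor_mask, fast_access_mask, unmap_processors_mask) instead of allocating anything. A minimal sketch of that pattern follows; example_page_is_resident_on() is hypothetical.

static bool example_page_is_resident_on(uvm_va_block_t *va_block,
                                        uvm_va_block_context_t *va_block_context,
                                        uvm_page_index_t page_index,
                                        uvm_processor_id_t processor)
{
    // Borrow the scratch mask from the block context: no allocation and no
    // large object on the kernel stack.
    uvm_processor_mask_t *residency = &va_block_context->scratch_processor_mask;

    uvm_va_block_page_resident_processors(va_block, page_index, residency);

    return uvm_processor_mask_test(residency, processor);
}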
- if (!page_thrashing->pinned || !uvm_processor_mask_test(¤t_residency, residency)) + if (!page_thrashing->pinned || !uvm_processor_mask_test(current_residency, residency)) thrashing_throttling_reset_page(va_block, block_thrashing, page_thrashing, page_index); if (!page_thrashing->pinned) { @@ -1120,8 +1121,7 @@ static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block, continue; } else { - uvm_page_mask_copy(&va_block_context->caller_page_mask, - &block_thrashing->pinned_pages.mask); + uvm_page_mask_copy(&va_block_context->caller_page_mask, &block_thrashing->pinned_pages.mask); } status = uvm_va_block_unmap(va_block, @@ -1148,7 +1148,7 @@ NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_bl uvm_va_block_region_t region) { block_thrashing_info_t *block_thrashing; - uvm_processor_mask_t unmap_processors; + uvm_processor_mask_t *unmap_processors = &va_block_context->unmap_processors_mask; const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region); uvm_assert_mutex_locked(&va_block->lock); @@ -1162,9 +1162,9 @@ NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_bl // Unmap all mapped processors (that are not SetAccessedBy) with // no copy of the page - uvm_processor_mask_andnot(&unmap_processors, &va_block->mapped, &policy->accessed_by); + uvm_processor_mask_andnot(unmap_processors, &va_block->mapped, &policy->accessed_by); - return unmap_remote_pinned_pages(va_block, va_block_context, block_thrashing, region, &unmap_processors); + return unmap_remote_pinned_pages(va_block, va_block_context, block_thrashing, region, unmap_processors); } // Check that we are not migrating pages away from its pinned location and @@ -1391,22 +1391,23 @@ static bool thrashing_processors_can_access(uvm_va_space_t *va_space, } static bool thrashing_processors_have_fast_access_to(uvm_va_space_t *va_space, + uvm_va_block_context_t *va_block_context, page_thrashing_info_t *page_thrashing, uvm_processor_id_t to) { - uvm_processor_mask_t fast_to; + uvm_processor_mask_t *fast_to = &va_block_context->fast_access_mask; if (UVM_ID_IS_INVALID(to)) return false; // Combine NVLINK and native atomics mask since we could have PCIe // atomics in the future - uvm_processor_mask_and(&fast_to, + uvm_processor_mask_and(fast_to, &va_space->has_nvlink[uvm_id_value(to)], &va_space->has_native_atomics[uvm_id_value(to)]); - uvm_processor_mask_set(&fast_to, to); + uvm_processor_mask_set(fast_to, to); - return uvm_processor_mask_subset(&page_thrashing->processors, &fast_to); + return uvm_processor_mask_subset(&page_thrashing->processors, fast_to); } static void thrashing_processors_common_locations(uvm_va_space_t *va_space, @@ -1488,7 +1489,7 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras hint.pin.residency = preferred_location; } else if (!preferred_location_is_thrashing(preferred_location, page_thrashing) && - thrashing_processors_have_fast_access_to(va_space, page_thrashing, closest_resident_id)) { + thrashing_processors_have_fast_access_to(va_space, va_block_context, page_thrashing, closest_resident_id)){ // This is a fast path for those scenarios in which all thrashing // processors have fast (NVLINK + native atomics) access to the current // residency. 
This is skipped if the preferred location is thrashing and @@ -1545,15 +1546,15 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras hint.pin.residency = requester; } else { - uvm_processor_mask_t common_locations; + uvm_processor_mask_t *common_locations = &va_block_context->scratch_processor_mask; - thrashing_processors_common_locations(va_space, page_thrashing, &common_locations); - if (uvm_processor_mask_empty(&common_locations)) { + thrashing_processors_common_locations(va_space, page_thrashing, common_locations); + if (uvm_processor_mask_empty(common_locations)) { hint.pin.residency = requester; } else { // Find the common location that is closest to the requester - hint.pin.residency = uvm_processor_mask_find_closest_id(va_space, &common_locations, requester); + hint.pin.residency = uvm_processor_mask_find_closest_id(va_space, common_locations, requester); } } } @@ -1725,6 +1726,7 @@ done: if (hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { NV_STATUS status = thrashing_pin_page(va_space_thrashing, va_block, + va_block_context, block_thrashing, page_thrashing, page_index, diff --git a/kernel-open/nvidia-uvm/uvm_pmm_sysmem_test.c b/kernel-open/nvidia-uvm/uvm_pmm_sysmem_test.c index 66a9837a4..0d1b971a9 100644 --- a/kernel-open/nvidia-uvm/uvm_pmm_sysmem_test.c +++ b/kernel-open/nvidia-uvm/uvm_pmm_sysmem_test.c @@ -1207,35 +1207,38 @@ static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space) NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp) { uvm_va_space_t *va_space = uvm_va_space_get(filp); - uvm_processor_mask_t test_gpus; + uvm_processor_mask_t *test_gpus; uvm_gpu_t *gpu; NV_STATUS status = NV_OK; - uvm_va_space_down_read(va_space); - uvm_processor_mask_and(&test_gpus, - &va_space->registered_gpus, - &va_space->accessible_from[uvm_id_value(UVM_ID_CPU)]); + test_gpus = uvm_processor_mask_cache_alloc(); + if (!test_gpus) + return NV_ERR_NO_MEMORY; - for_each_va_space_gpu_in_mask(gpu, va_space, &test_gpus) { + uvm_va_space_down_read(va_space); + uvm_processor_mask_and(test_gpus, &va_space->registered_gpus, &va_space->accessible_from[uvm_id_value(UVM_ID_CPU)]); + + for_each_va_space_gpu_in_mask(gpu, va_space, test_gpus) { TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_basic(gpu, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE), done); TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_basic(gpu, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO), done); TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge(gpu), done); TEST_NV_CHECK_GOTO(test_cpu_chunk_dirty(gpu), done); } - TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, &test_gpus), done); + TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, test_gpus), done); TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done); - if (uvm_processor_mask_get_gpu_count(&test_gpus) >= 3) { + if (uvm_processor_mask_get_gpu_count(test_gpus) >= 3) { uvm_gpu_t *gpu2, *gpu3; - gpu = uvm_processor_mask_find_first_va_space_gpu(&test_gpus, va_space); - gpu2 = uvm_processor_mask_find_next_va_space_gpu(&test_gpus, va_space, gpu); - gpu3 = uvm_processor_mask_find_next_va_space_gpu(&test_gpus, va_space, gpu2); + gpu = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space); + gpu2 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu); + gpu3 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu2); TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_array(gpu, gpu2, gpu3), done); } done: uvm_va_space_up_read(va_space); + uvm_processor_mask_cache_free(test_gpus); return status; } diff --git 
a/kernel-open/nvidia-uvm/uvm_policy.c b/kernel-open/nvidia-uvm/uvm_policy.c index 38c53eeb6..f03e5578f 100644 --- a/kernel-open/nvidia-uvm/uvm_policy.c +++ b/kernel-open/nvidia-uvm/uvm_policy.c @@ -720,7 +720,6 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block uvm_page_mask_t *break_read_duplication_pages = &va_block_context->caller_page_mask; const uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range); uvm_processor_id_t preferred_location = policy->preferred_location; - uvm_processor_mask_t accessed_by = policy->accessed_by; uvm_assert_mutex_locked(&va_block->lock); @@ -779,7 +778,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block } // 2- Re-establish SetAccessedBy mappings - for_each_id_in_mask(processor_id, &accessed_by) { + for_each_id_in_mask(processor_id, &policy->accessed_by) { status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, processor_id, diff --git a/kernel-open/nvidia-uvm/uvm_push.c b/kernel-open/nvidia-uvm/uvm_push.c index 4445e520e..b9c3f17e0 100644 --- a/kernel-open/nvidia-uvm/uvm_push.c +++ b/kernel-open/nvidia-uvm/uvm_push.c @@ -25,6 +25,7 @@ #include "uvm_forward_decl.h" #include "uvm_push.h" #include "uvm_channel.h" +#include "uvm_global.h" #include "uvm_hal.h" #include "uvm_kvmalloc.h" #include "uvm_linux.h" @@ -55,6 +56,13 @@ static uvm_push_acquire_info_t *push_acquire_info_from_push(uvm_push_t *push) return &channel->push_acquire_infos[push->push_info_index]; } +bool uvm_push_allow_dependencies_across_gpus(void) +{ + // In Confidential Computing a GPU semaphore release cannot be waited on + // (acquired by) any other GPU, due to a mix of HW and SW constraints. + return !g_uvm_global.conf_computing_enabled; +} + // Acquire a single tracker entry. Subsequently pushed GPU work will not start // before the work tracked by tracker entry is complete. static void push_acquire_tracker_entry(uvm_push_t *push, @@ -77,9 +85,14 @@ static void push_acquire_tracker_entry(uvm_push_t *push, if (channel == entry_channel) return; - semaphore_va = uvm_channel_tracking_semaphore_get_gpu_va_in_channel(entry_channel, channel); gpu = uvm_channel_get_gpu(channel); + // If dependencies across GPUs are disallowed, the caller is required to + // previously wait on such dependencies. 
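The new assertion formalizes a contract the callers must uphold: when uvm_push_allow_dependencies_across_gpus() returns false (Confidential Computing), tracker entries from other GPUs have to be resolved on the CPU before the push acquires the tracker, which is what wait_for_other_gpus_if_needed() does for the push-begin paths below. A caller-side sketch; the helper name is hypothetical, the UVM functions are the ones used in this file:

    static NV_STATUS acquire_tracker_cross_gpu_safe(uvm_push_t *push,
                                                    uvm_tracker_t *tracker,
                                                    uvm_gpu_t *push_gpu)
    {
        // Resolve dependencies on other GPUs on the CPU first, since this
        // GPU cannot acquire their semaphores in this configuration.
        if (tracker != NULL && !uvm_push_allow_dependencies_across_gpus()) {
            NV_STATUS status = uvm_tracker_wait_for_other_gpus(tracker, push_gpu);
            if (status != NV_OK)
                return status;
        }

        // Remaining entries are either complete or belong to push_gpu, so the
        // assertion added in push_acquire_tracker_entry() holds.
        uvm_push_acquire_tracker(push, tracker);

        return NV_OK;
    }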
+ if (gpu != uvm_tracker_entry_gpu(tracker_entry)) + UVM_ASSERT(uvm_push_allow_dependencies_across_gpus()); + + semaphore_va = uvm_channel_tracking_semaphore_get_gpu_va_in_channel(entry_channel, channel); gpu->parent->host_hal->semaphore_acquire(push, semaphore_va, (NvU32)tracker_entry->value); if (push_acquire_info) { @@ -188,6 +201,17 @@ static void push_fill_info(uvm_push_t *push, push_set_description(push, format, args); } +static NV_STATUS wait_for_other_gpus_if_needed(uvm_tracker_t *tracker, uvm_gpu_t *gpu) +{ + if (tracker == NULL) + return NV_OK; + + if (uvm_push_allow_dependencies_across_gpus()) + return NV_OK; + + return uvm_tracker_wait_for_other_gpus(tracker, gpu); +} + static NV_STATUS push_begin_acquire_with_info(uvm_channel_t *channel, uvm_tracker_t *tracker, uvm_push_t *push, @@ -234,6 +258,10 @@ NV_STATUS __uvm_push_begin_acquire_with_info(uvm_channel_manager_t *manager, UVM_ASSERT(dst_gpu != manager->gpu); } + status = wait_for_other_gpus_if_needed(tracker, manager->gpu); + if (status != NV_OK) + return status; + status = push_reserve_channel(manager, type, dst_gpu, &channel); if (status != NV_OK) return status; @@ -262,6 +290,10 @@ NV_STATUS __uvm_push_begin_acquire_on_channel_with_info(uvm_channel_t *channel, va_list args; NV_STATUS status; + status = wait_for_other_gpus_if_needed(tracker, uvm_channel_get_gpu(channel)); + if (status != NV_OK) + return status; + status = uvm_channel_reserve(channel, 1); if (status != NV_OK) return status; @@ -276,20 +308,19 @@ NV_STATUS __uvm_push_begin_acquire_on_channel_with_info(uvm_channel_t *channel, return status; } -__attribute__ ((format(printf, 7, 8))) -NV_STATUS __uvm_push_begin_acquire_on_reserved_channel_with_info(uvm_channel_t *channel, - uvm_tracker_t *tracker, - uvm_push_t *push, - const char *filename, - const char *function, - int line, - const char *format, ...) +__attribute__ ((format(printf, 6, 7))) +NV_STATUS __uvm_push_begin_on_reserved_channel_with_info(uvm_channel_t *channel, + uvm_push_t *push, + const char *filename, + const char *function, + int line, + const char *format, ...) 
{ va_list args; NV_STATUS status; va_start(args, format); - status = push_begin_acquire_with_info(channel, tracker, push, filename, function, line, format, args); + status = push_begin_acquire_with_info(channel, NULL, push, filename, function, line, format, args); va_end(args); return status; @@ -308,6 +339,7 @@ bool uvm_push_info_is_tracking_acquires(void) void uvm_push_end(uvm_push_t *push) { uvm_push_flag_t flag; + uvm_channel_end_push(push); flag = find_first_bit(push->flags, UVM_PUSH_FLAG_COUNT); @@ -319,6 +351,7 @@ void uvm_push_end(uvm_push_t *push) NV_STATUS uvm_push_wait(uvm_push_t *push) { uvm_tracker_entry_t entry; + uvm_push_get_tracker_entry(push, &entry); return uvm_tracker_wait_for_entry(&entry); diff --git a/kernel-open/nvidia-uvm/uvm_push.h b/kernel-open/nvidia-uvm/uvm_push.h index b94af0b37..e246cbe12 100644 --- a/kernel-open/nvidia-uvm/uvm_push.h +++ b/kernel-open/nvidia-uvm/uvm_push.h @@ -208,14 +208,13 @@ NV_STATUS __uvm_push_begin_acquire_on_channel_with_info(uvm_channel_t *channel, const char *format, ...); // Internal helper for uvm_push_begin_on_reserved channel -__attribute__ ((format(printf, 7, 8))) -NV_STATUS __uvm_push_begin_acquire_on_reserved_channel_with_info(uvm_channel_t *channel, - uvm_tracker_t *tracker, - uvm_push_t *push, - const char *filename, - const char *function, - int line, - const char *format, ...); +__attribute__ ((format(printf, 6, 7))) +NV_STATUS __uvm_push_begin_on_reserved_channel_with_info(uvm_channel_t *channel, + uvm_push_t *push, + const char *filename, + const char *function, + int line, + const char *format, ...); // Begin a push on a channel of channel_type type // Picks the first available channel. If all channels of the given type are // busy, spin waits for one to become available. @@ -269,8 +268,8 @@ NV_STATUS __uvm_push_begin_acquire_on_reserved_channel_with_info(uvm_channel_t * // // Locking: on success acquires the concurrent push semaphore until // uvm_push_end() -#define uvm_push_begin_on_reserved_channel(channel, push, format, ...) \ - __uvm_push_begin_acquire_on_reserved_channel_with_info((channel), NULL, (push), \ +#define uvm_push_begin_on_reserved_channel(channel, push, format, ...) \ + __uvm_push_begin_on_reserved_channel_with_info((channel), (push), \ __FILE__, __FUNCTION__, __LINE__, (format), ##__VA_ARGS__) // Same as uvm_push_begin_on_channel except it also acquires the input tracker @@ -324,6 +323,11 @@ static void uvm_push_get_tracker_entry(uvm_push_t *push, uvm_tracker_entry_t *en // Subsequently pushed GPU work will not start before all the work tracked by // tracker is complete. // Notably a NULL tracker is handled the same way as an empty tracker. +// +// If dependencies across GPUs are not allowed in the current configuration +// (see uvm_push_allow_dependencies_across_gpus), the caller is responsible for +// ensuring that the input tracker does not contain dependencies on GPUs other +// than the one associated with the push. void uvm_push_acquire_tracker(uvm_push_t *push, uvm_tracker_t *tracker); // Set a push flag @@ -480,4 +484,8 @@ static uvm_push_info_t *uvm_push_info_from_push(uvm_push_t *push) return &channel->push_infos[push->push_info_index]; } +// Returns true if a push is allowed to depend on pushes on other GPUs: work +// dependencies across GPUs are permitted. 
+bool uvm_push_allow_dependencies_across_gpus(void); + #endif // __UVM_PUSH_H__ diff --git a/kernel-open/nvidia-uvm/uvm_range_group.c b/kernel-open/nvidia-uvm/uvm_range_group.c index 0509c62c9..3458b778f 100644 --- a/kernel-open/nvidia-uvm/uvm_range_group.c +++ b/kernel-open/nvidia-uvm/uvm_range_group.c @@ -182,7 +182,7 @@ static NV_STATUS uvm_range_group_va_range_migrate_block_locked(uvm_va_range_t *v NV_STATUS status; NV_STATUS tracker_status; uvm_gpu_id_t gpu_id; - uvm_processor_mask_t map_mask; + uvm_processor_mask_t *map_mask = &va_block_context->caller_processor_mask; uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range); // Set the migration CPU NUMA node from the policy. @@ -212,6 +212,7 @@ static NV_STATUS uvm_range_group_va_range_migrate_block_locked(uvm_va_range_t *v NULL, UVM_MAKE_RESIDENT_CAUSE_API_SET_RANGE_GROUP); } + if (status != NV_OK) return status; @@ -228,12 +229,12 @@ static NV_STATUS uvm_range_group_va_range_migrate_block_locked(uvm_va_range_t *v goto out; // 2- Map faultable SetAccessedBy GPUs. - uvm_processor_mask_and(&map_mask, + uvm_processor_mask_and(map_mask, &uvm_va_range_get_policy(va_range)->accessed_by, &va_range->va_space->can_access[uvm_id_value(policy->preferred_location)]); - uvm_processor_mask_andnot(&map_mask, &map_mask, &va_range->uvm_lite_gpus); + uvm_processor_mask_andnot(map_mask, map_mask, &va_range->uvm_lite_gpus); - for_each_gpu_id_in_mask(gpu_id, &map_mask) { + for_each_gpu_id_in_mask(gpu_id, map_mask) { status = uvm_va_block_add_mappings(va_block, va_block_context, gpu_id, diff --git a/kernel-open/nvidia-uvm/uvm_tools.c b/kernel-open/nvidia-uvm/uvm_tools.c index 2aa876e25..cb7bc4dca 100644 --- a/kernel-open/nvidia-uvm/uvm_tools.c +++ b/kernel-open/nvidia-uvm/uvm_tools.c @@ -1538,12 +1538,18 @@ void uvm_tools_record_read_duplicate(uvm_va_block_t *va_block, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask) { + uvm_processor_mask_t *resident_processors; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); if (!va_space->tools.enabled) return; + resident_processors = uvm_processor_mask_cache_alloc(); + if (!resident_processors) + return; + uvm_down_read(&va_space->tools.lock); + if (tools_is_event_enabled_version(va_space, UvmEventTypeReadDuplicate, UvmToolsEventQueueVersion_V1)) { UvmEventEntry_V1 entry; UvmEventReadDuplicateInfo_V1 *info_read_duplicate = &entry.eventData.readDuplicate; @@ -1556,20 +1562,20 @@ void uvm_tools_record_read_duplicate(uvm_va_block_t *va_block, info_read_duplicate->timeStamp = NV_GETTIME(); for_each_va_block_page_in_region_mask(page_index, page_mask, region) { - uvm_processor_mask_t resident_processors; uvm_processor_id_t id; info_read_duplicate->address = uvm_va_block_cpu_page_address(va_block, page_index); info_read_duplicate->processors = 0; - uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); - for_each_id_in_mask(id, &resident_processors) - __set_bit(uvm_parent_id_value_from_processor_id(id), - (unsigned long *)&info_read_duplicate->processors); + uvm_va_block_page_resident_processors(va_block, page_index, resident_processors); + + for_each_id_in_mask(id, resident_processors) + __set_bit(uvm_parent_id_value_from_processor_id(id), (unsigned long *)&info_read_duplicate->processors); uvm_tools_record_event_v1(va_space, &entry); } } + if (tools_is_event_enabled_version(va_space, UvmEventTypeReadDuplicate, UvmToolsEventQueueVersion_V2)) { UvmEventEntry_V2 entry; UvmEventReadDuplicateInfo_V2 *info_read_duplicate = &entry.eventData.readDuplicate; @@ -1582,21 
+1588,23 @@ void uvm_tools_record_read_duplicate(uvm_va_block_t *va_block, info_read_duplicate->timeStamp = NV_GETTIME(); for_each_va_block_page_in_region_mask(page_index, page_mask, region) { - uvm_processor_mask_t resident_processors; uvm_processor_id_t id; info_read_duplicate->address = uvm_va_block_cpu_page_address(va_block, page_index); memset(info_read_duplicate->processors, 0, sizeof(info_read_duplicate->processors)); - uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); - for_each_id_in_mask(id, &resident_processors) - __set_bit(uvm_id_value(id), - (unsigned long *)info_read_duplicate->processors); + uvm_va_block_page_resident_processors(va_block, page_index, resident_processors); + + for_each_id_in_mask(id, resident_processors) + __set_bit(uvm_id_value(id), (unsigned long *)info_read_duplicate->processors); uvm_tools_record_event_v2(va_space, &entry); } } + uvm_up_read(&va_space->tools.lock); + + uvm_processor_mask_cache_free(resident_processors); } void uvm_tools_record_read_duplicate_invalidate(uvm_va_block_t *va_block, diff --git a/kernel-open/nvidia-uvm/uvm_tracker.c b/kernel-open/nvidia-uvm/uvm_tracker.c index 542e5e154..0f8a04bc6 100644 --- a/kernel-open/nvidia-uvm/uvm_tracker.c +++ b/kernel-open/nvidia-uvm/uvm_tracker.c @@ -200,18 +200,26 @@ NV_STATUS uvm_tracker_add_tracker(uvm_tracker_t *dst, uvm_tracker_t *src) NV_STATUS status; uvm_tracker_entry_t *src_entry; + UVM_ASSERT(dst != NULL); + + if (src == NULL) + return NV_OK; + if (src == dst) return NV_OK; + if (uvm_tracker_is_empty(src)) + return NV_OK; + status = uvm_tracker_reserve(dst, src->size); if (status == NV_ERR_NO_MEMORY) { uvm_tracker_remove_completed(dst); uvm_tracker_remove_completed(src); status = reserve_for_entries_from_tracker(dst, src); } - if (status != NV_OK) { + + if (status != NV_OK) return status; - } for_each_tracker_entry(src_entry, src) { status = uvm_tracker_add_entry(dst, src_entry); diff --git a/kernel-open/nvidia-uvm/uvm_va_block.c b/kernel-open/nvidia-uvm/uvm_va_block.c index 42ce467d7..906a4dae7 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block.c +++ b/kernel-open/nvidia-uvm/uvm_va_block.c @@ -9341,35 +9341,43 @@ void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uv // // Notably the caller needs to support allocation-retry as // uvm_va_block_migrate_locked() requires that. 
-static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, - uvm_va_block_context_t *va_block_context, - uvm_gpu_t *gpu) +static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) { NV_STATUS status = NV_OK; const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE); uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); uvm_va_block_region_t subregion; + uvm_service_block_context_t *service_context; + + service_context = uvm_service_block_context_alloc(mm); + if (!service_context) + return NV_ERR_NO_MEMORY; // Move all subregions resident on the GPU to the CPU for_each_va_block_subregion_in_mask(subregion, resident, region) { if (uvm_va_block_is_hmm(va_block)) { - status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, gpu, va_block_context, resident, subregion); + status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, gpu, service_context, resident, subregion); } else { status = uvm_va_block_migrate_locked(va_block, NULL, - va_block_context, + service_context, subregion, UVM_ID_CPU, UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, NULL); } + if (status != NV_OK) - return status; + break; } - UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id)); - return NV_OK; + if (status == NV_OK) + UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id)); + + uvm_service_block_context_free(service_context); + + return status; } void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) @@ -9393,7 +9401,7 @@ void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu // we don't rely on any state of the block across the call. // TODO: Bug 4494289: Prevent setting the global error on allocation // failures. - status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, va_block_context, gpu)); + status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm)); if (status != NV_OK) { UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n", nvstatusToString(status), @@ -12981,6 +12989,7 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t chunk_region; size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu); size_t chunks_to_evict = 0; + uvm_service_block_context_t *service_context; uvm_va_block_context_t *block_context; uvm_page_mask_t *pages_to_evict; uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); @@ -13008,13 +13017,17 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, // allocations. If mappings need to be created, // block_add_eviction_mappings() will be scheduled below. 
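Both eviction paths above now allocate a uvm_service_block_context_t, which owns its uvm_va_block_context_t (see uvm_service_block_context_alloc()/uvm_service_block_context_free() added to uvm_va_space.c later in this patch), so callers manage one object instead of two. A minimal sketch of that lifecycle, with a hypothetical function name:

    static NV_STATUS evict_with_service_context(uvm_va_block_t *va_block, struct mm_struct *mm)
    {
        NV_STATUS status = NV_OK;
        uvm_service_block_context_t *service_context = uvm_service_block_context_alloc(mm);

        if (!service_context)
            return NV_ERR_NO_MEMORY;

        // The embedded block context is what older code allocated directly;
        // page masks and scratch masks are reached through it.
        uvm_page_mask_zero(&service_context->block_context->caller_page_mask);

        // ... migrate or evict using service_context, e.g. by passing it to
        // uvm_va_block_migrate_locked() or the HMM eviction helpers ...

        uvm_service_block_context_free(service_context);

        return status;
    }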
mm = uvm_va_space_mm_retain(va_space); - block_context = uvm_va_block_context_alloc(mm); - if (!block_context) { + + service_context = uvm_service_block_context_alloc(mm); + if (!service_context) { if (mm) uvm_va_space_mm_release(va_space); + return NV_ERR_NO_MEMORY; } + block_context = service_context->block_context; + pages_to_evict = &block_context->caller_page_mask; uvm_page_mask_zero(pages_to_evict); chunk_region.outer = 0; @@ -13051,7 +13064,7 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, if (uvm_va_block_is_hmm(va_block)) { status = uvm_hmm_va_block_evict_chunks(va_block, - block_context, + service_context, pages_to_evict, uvm_va_block_region_from_block(va_block), &accessed_by_set); @@ -13168,7 +13181,8 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, } out: - uvm_va_block_context_free(block_context); + uvm_service_block_context_free(service_context); + if (mm) uvm_va_space_mm_release(va_space); diff --git a/kernel-open/nvidia-uvm/uvm_va_block.h b/kernel-open/nvidia-uvm/uvm_va_block.h index 009c0d804..220df10b0 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block.h +++ b/kernel-open/nvidia-uvm/uvm_va_block.h @@ -1504,17 +1504,19 @@ uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu // The caller needs to handle allocation-retry. va_block_retry can be NULL if // the destination is the CPU. // -// va_block_context must not be NULL and policy for the region must match. See -// the comments for uvm_va_block_check_policy_is_valid(). If va_block is a HMM -// block, va_block_context->hmm.vma must be valid. See the comments for +// service_context and service_context->block_context must not be NULL and +// policy for the region must match. See the comments for +// uvm_va_block_check_policy_is_valid(). If va_block is a HMM block, +// service->block_context->hmm.vma must be valid. See the comments for // uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h. // -// LOCKING: The caller must hold the va_block lock. If va_block_context->mm != -// NULL, va_block_context->mm->mmap_lock must be held in at least -// read mode. +// LOCKING: The caller must hold the va_block lock. If +// service_context->va_block_context->mm != NULL, +// service_context->va_block_context->mm->mmap_lock must be held in at +// least read mode. NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, - uvm_va_block_context_t *va_block_context, + uvm_service_block_context_t *service_context, uvm_va_block_region_t region, uvm_processor_id_t dest_id, uvm_migrate_mode_t mode, diff --git a/kernel-open/nvidia-uvm/uvm_va_block_types.h b/kernel-open/nvidia-uvm/uvm_va_block_types.h index 9912c0b37..a2595f300 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block_types.h +++ b/kernel-open/nvidia-uvm/uvm_va_block_types.h @@ -167,6 +167,10 @@ typedef struct // block APIs. uvm_page_mask_t caller_page_mask; + // Available as scratch space for the caller. Not used by any of the VA + // block APIs. + uvm_processor_mask_t caller_processor_mask; + // Available as scratch space for the internal APIs. This is like a caller- // save register: it shouldn't be used across function calls which also take // this block_context. @@ -180,9 +184,15 @@ typedef struct // this va_block_context. uvm_processor_mask_t scratch_processor_mask; - // Temporary mask in block_add_eviction_mappings(). + // Temporary mask used in block_add_eviction_mappings(). 
uvm_processor_mask_t map_processors_eviction; + // Temporary mask used in uvm_perf_thrashing_unmap_remote_pinned_pages_all. + uvm_processor_mask_t unmap_processors_mask; + + // Temporary mask used in thrashing_processors_have_fast_access(). + uvm_processor_mask_t fast_access_mask; + // State used by uvm_va_block_make_resident struct uvm_make_resident_context_struct { diff --git a/kernel-open/nvidia-uvm/uvm_va_range.c b/kernel-open/nvidia-uvm/uvm_va_range.c index b4adc2542..c4bbaa8c2 100644 --- a/kernel-open/nvidia-uvm/uvm_va_range.c +++ b/kernel-open/nvidia-uvm/uvm_va_range.c @@ -222,6 +222,7 @@ NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space, { NV_STATUS status; uvm_va_range_t *va_range = NULL; + uvm_processor_mask_t *retained_mask = NULL; NvU32 i; status = uvm_va_range_alloc_reclaim(va_space, @@ -233,6 +234,16 @@ NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space, if (status != NV_OK) return status; + UVM_ASSERT(!va_range->external.retained_mask); + + retained_mask = uvm_processor_mask_cache_alloc(); + if (!retained_mask) { + status = NV_ERR_NO_MEMORY; + goto error; + } + + va_range->external.retained_mask = retained_mask; + for (i = 0; i < ARRAY_SIZE(va_range->external.gpu_ranges); i++) { uvm_mutex_init(&va_range->external.gpu_ranges[i].lock, UVM_LOCK_ORDER_EXT_RANGE_TREE); uvm_range_tree_init(&va_range->external.gpu_ranges[i].tree); @@ -249,6 +260,7 @@ NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space, error: uvm_va_range_destroy(va_range, NULL); + return status; } @@ -438,6 +450,8 @@ static void uvm_va_range_destroy_external(uvm_va_range_t *va_range, struct list_ { uvm_gpu_t *gpu; + uvm_processor_mask_cache_free(va_range->external.retained_mask); + if (uvm_processor_mask_empty(&va_range->external.mapped_gpus)) return; @@ -1318,7 +1332,6 @@ static NV_STATUS range_unmap_mask(uvm_va_range_t *va_range, if (uvm_processor_mask_empty(mask)) return NV_OK; - for_each_va_block_in_va_range(va_range, block) { NV_STATUS status; uvm_va_block_region_t region = uvm_va_block_region_from_block(block); @@ -1338,14 +1351,19 @@ static NV_STATUS range_unmap_mask(uvm_va_range_t *va_range, static NV_STATUS range_unmap(uvm_va_range_t *va_range, uvm_processor_id_t processor, uvm_tracker_t *out_tracker) { - uvm_processor_mask_t mask; + uvm_processor_mask_t *mask; + uvm_va_space_t *va_space = va_range->va_space; + + uvm_assert_rwsem_locked_write(&va_space->lock); + + mask = &va_space->unmap_mask; UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type 0x%x\n", va_range->type); - uvm_processor_mask_zero(&mask); - uvm_processor_mask_set(&mask, processor); + uvm_processor_mask_zero(mask); + uvm_processor_mask_set(mask, processor); - return range_unmap_mask(va_range, &mask, out_tracker); + return range_unmap_mask(va_range, mask, out_tracker); } static NV_STATUS range_map_uvm_lite_gpus(uvm_va_range_t *va_range, uvm_tracker_t *out_tracker) @@ -1434,10 +1452,10 @@ NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, struct mm_struct *mm, uvm_tracker_t *out_tracker) { - NV_STATUS status; - uvm_processor_mask_t all_uvm_lite_gpus; - uvm_processor_mask_t new_uvm_lite_gpus; - uvm_processor_mask_t set_accessed_by_processors; + NV_STATUS status = NV_OK; + uvm_processor_mask_t *all_uvm_lite_gpus = NULL; + uvm_processor_mask_t *new_uvm_lite_gpus = NULL; + uvm_processor_mask_t *set_accessed_by_processors = NULL; uvm_range_group_range_iter_t iter; uvm_range_group_range_t *rgr = NULL; uvm_va_space_t *va_space = va_range->va_space; @@ -1448,9 +1466,27 @@ 
NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, uvm_assert_rwsem_locked_write(&va_space->lock); UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); + all_uvm_lite_gpus = uvm_processor_mask_cache_alloc(); + if (!all_uvm_lite_gpus) { + status = NV_ERR_NO_MEMORY; + goto out; + } + + new_uvm_lite_gpus = uvm_processor_mask_cache_alloc(); + if (!new_uvm_lite_gpus) { + status = NV_ERR_NO_MEMORY; + goto out; + } + + set_accessed_by_processors = uvm_processor_mask_cache_alloc(); + if (!set_accessed_by_processors) { + status = NV_ERR_NO_MEMORY; + goto out; + } + va_range_policy = uvm_va_range_get_policy(va_range); if (uvm_va_policy_preferred_location_equal(va_range_policy, preferred_location, preferred_cpu_nid)) - return NV_OK; + goto out; // Mark all range group ranges within this VA range as migrated since the preferred location has changed. uvm_range_group_for_each_range_in(rgr, va_space, va_range->node.start, va_range->node.end) { @@ -1463,14 +1499,15 @@ NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, // Calculate the new UVM-Lite GPUs mask, but don't update va_range state so // that we can keep block_page_check_mappings() happy while updating the // mappings. - calc_uvm_lite_gpus_mask(va_space, preferred_location, &va_range_policy->accessed_by, &new_uvm_lite_gpus); + calc_uvm_lite_gpus_mask(va_space, preferred_location, &va_range_policy->accessed_by, new_uvm_lite_gpus); // If the range contains non-migratable range groups, check that new UVM-Lite GPUs // can all map the new preferred location. if (!uvm_range_group_all_migratable(va_space, va_range->node.start, va_range->node.end) && UVM_ID_IS_VALID(preferred_location) && - !uvm_processor_mask_subset(&new_uvm_lite_gpus, &va_space->accessible_from[uvm_id_value(preferred_location)])) { - return NV_ERR_INVALID_DEVICE; + !uvm_processor_mask_subset(new_uvm_lite_gpus, &va_space->accessible_from[uvm_id_value(preferred_location)])) { + status = NV_ERR_INVALID_DEVICE; + goto out; } if (UVM_ID_IS_INVALID(preferred_location)) { @@ -1479,7 +1516,7 @@ NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, // Clear the range group assocation for any unmigratable ranges if there is no preferred location status = uvm_range_group_assign_range(va_space, NULL, iter.start, iter.end); if (status != NV_OK) - return status; + goto out; } } } @@ -1489,33 +1526,33 @@ NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, // have stale mappings to the old preferred location. // - GPUs that will continue to be UVM-Lite GPUs or are new UVM-Lite GPUs // need to be unmapped so that the new preferred location can be mapped. - uvm_processor_mask_or(&all_uvm_lite_gpus, &va_range->uvm_lite_gpus, &new_uvm_lite_gpus); - status = range_unmap_mask(va_range, &all_uvm_lite_gpus, out_tracker); + uvm_processor_mask_or(all_uvm_lite_gpus, &va_range->uvm_lite_gpus, new_uvm_lite_gpus); + status = range_unmap_mask(va_range, all_uvm_lite_gpus, out_tracker); if (status != NV_OK) - return status; + goto out; // GPUs that stop being UVM-Lite, but are in the accessed_by mask need to // have any possible mappings established. - uvm_processor_mask_andnot(&set_accessed_by_processors, &va_range->uvm_lite_gpus, &new_uvm_lite_gpus); + uvm_processor_mask_andnot(set_accessed_by_processors, &va_range->uvm_lite_gpus, new_uvm_lite_gpus); // A GPU which had been in UVM-Lite mode before must still be in UVM-Lite // mode if it is the new preferred location. 
Otherwise we'd have to be more // careful below to not establish remote mappings to the new preferred // location. if (UVM_ID_IS_GPU(preferred_location)) - UVM_ASSERT(!uvm_processor_mask_test(&set_accessed_by_processors, preferred_location)); + UVM_ASSERT(!uvm_processor_mask_test(set_accessed_by_processors, preferred_location)); // The old preferred location should establish new remote mappings if it has // accessed-by set. if (UVM_ID_IS_VALID(va_range_policy->preferred_location)) - uvm_processor_mask_set(&set_accessed_by_processors, va_range_policy->preferred_location); + uvm_processor_mask_set(set_accessed_by_processors, va_range_policy->preferred_location); - uvm_processor_mask_and(&set_accessed_by_processors, &set_accessed_by_processors, &va_range_policy->accessed_by); + uvm_processor_mask_and(set_accessed_by_processors, set_accessed_by_processors, &va_range_policy->accessed_by); // Now update the va_range state va_range_policy->preferred_location = preferred_location; va_range_policy->preferred_nid = preferred_cpu_nid; - uvm_processor_mask_copy(&va_range->uvm_lite_gpus, &new_uvm_lite_gpus); + uvm_processor_mask_copy(&va_range->uvm_lite_gpus, new_uvm_lite_gpus); va_block_context = uvm_va_space_block_context(va_space, mm); @@ -1523,10 +1560,10 @@ NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, uvm_processor_id_t id; uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); - for_each_id_in_mask(id, &set_accessed_by_processors) { + for_each_id_in_mask(id, set_accessed_by_processors) { status = uvm_va_block_set_accessed_by(va_block, va_block_context, id); if (status != NV_OK) - return status; + goto out; } // Also, mark CPU pages as dirty and remove remote mappings from the new @@ -1549,13 +1586,20 @@ NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range, uvm_mutex_unlock(&va_block->lock); if (status != NV_OK) - return status; + goto out; } // And lastly map all of the current UVM-Lite GPUs to the resident pages on // the new preferred location. Anything that's not resident right now will // get mapped on the next PreventMigration(). - return range_map_uvm_lite_gpus(va_range, out_tracker); + status = range_map_uvm_lite_gpus(va_range, out_tracker); + +out: + uvm_processor_mask_cache_free(set_accessed_by_processors); + uvm_processor_mask_cache_free(new_uvm_lite_gpus); + uvm_processor_mask_cache_free(all_uvm_lite_gpus); + + return status; } NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range, @@ -1563,50 +1607,60 @@ NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range, struct mm_struct *mm, uvm_tracker_t *out_tracker) { - NV_STATUS status; + NV_STATUS status = NV_OK; uvm_va_block_t *va_block; - uvm_processor_mask_t new_uvm_lite_gpus; uvm_va_space_t *va_space = va_range->va_space; uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range); - uvm_va_block_context_t *va_block_context; + uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm); + uvm_processor_mask_t *new_uvm_lite_gpus; + + // va_block_context->scratch_processor_mask cannot be used since + // range_unmap() calls uvm_va_space_block_context(), which re- + // initializes the VA block context structure. 
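The comment above also records the limit of the scratch-mask approach: a scratch mask embedded in a context is only safe while no callee re-initializes that context. Where that cannot be guaranteed, the mask cache is used instead, as in the hunk that follows. A short sketch of the fallback pattern, with a hypothetical function name and the goto-free variant of the cleanup used in this file:

    static NV_STATUS with_cache_allocated_mask(uvm_va_space_t *va_space, uvm_processor_id_t id)
    {
        NV_STATUS status = NV_OK;
        uvm_processor_mask_t *mask = uvm_processor_mask_cache_alloc();

        if (!mask)
            return NV_ERR_NO_MEMORY;

        uvm_processor_mask_zero(mask);
        uvm_processor_mask_set(mask, id);

        // ... call helpers that may reuse or re-initialize the VA block
        // context; the cache-allocated mask is unaffected by that ...

        uvm_processor_mask_cache_free(mask);

        return status;
    }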
+ new_uvm_lite_gpus = uvm_processor_mask_cache_alloc(); + if (!new_uvm_lite_gpus) + return NV_ERR_NO_MEMORY; // If the range belongs to a non-migratable range group and that processor_id is a non-faultable GPU, // check it can map the preferred location if (!uvm_range_group_all_migratable(va_space, va_range->node.start, va_range->node.end) && UVM_ID_IS_GPU(processor_id) && !uvm_processor_mask_test(&va_space->faultable_processors, processor_id) && - !uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(policy->preferred_location)], processor_id)) - return NV_ERR_INVALID_DEVICE; + !uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(policy->preferred_location)], processor_id)) { + status = NV_ERR_INVALID_DEVICE; + goto out; + } uvm_processor_mask_set(&policy->accessed_by, processor_id); // If a GPU is already a UVM-Lite GPU then there is nothing else to do. if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id)) - return NV_OK; + goto out; // Calculate the new UVM-Lite GPUs mask, but don't update it in the va range // yet so that we can keep block_page_check_mappings() happy while updating // the mappings. - calc_uvm_lite_gpus_mask(va_space, policy->preferred_location, &policy->accessed_by, &new_uvm_lite_gpus); + calc_uvm_lite_gpus_mask(va_space, policy->preferred_location, &policy->accessed_by, new_uvm_lite_gpus); - if (uvm_processor_mask_test(&new_uvm_lite_gpus, processor_id)) { + if (uvm_processor_mask_test(new_uvm_lite_gpus, processor_id)) { // GPUs that become UVM-Lite GPUs need to unmap everything so that they // can map the preferred location. status = range_unmap(va_range, processor_id, out_tracker); if (status != NV_OK) - return status; + goto out; } - uvm_processor_mask_copy(&va_range->uvm_lite_gpus, &new_uvm_lite_gpus); - va_block_context = uvm_va_space_block_context(va_space, mm); + uvm_processor_mask_copy(&va_range->uvm_lite_gpus, new_uvm_lite_gpus); for_each_va_block_in_va_range(va_range, va_block) { status = uvm_va_block_set_accessed_by(va_block, va_block_context, processor_id); if (status != NV_OK) - return status; + goto out; } - return NV_OK; +out: + uvm_processor_mask_cache_free(new_uvm_lite_gpus); + return status; } void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range, diff --git a/kernel-open/nvidia-uvm/uvm_va_range.h b/kernel-open/nvidia-uvm/uvm_va_range.h index 8e9a8ef1a..7ca40f841 100644 --- a/kernel-open/nvidia-uvm/uvm_va_range.h +++ b/kernel-open/nvidia-uvm/uvm_va_range.h @@ -252,6 +252,10 @@ typedef struct // range because each GPU is able to map a completely different set of // allocations to the same VA range. uvm_ext_gpu_range_tree_t gpu_ranges[UVM_ID_MAX_GPUS]; + + // Dynamically allocated page mask allocated in + // uvm_va_range_create_external() and used and freed in uvm_free(). + uvm_processor_mask_t *retained_mask; } uvm_va_range_external_t; // va_range state when va_range.type == UVM_VA_RANGE_TYPE_CHANNEL. 
This diff --git a/kernel-open/nvidia-uvm/uvm_va_space.c b/kernel-open/nvidia-uvm/uvm_va_space.c index 10d7ed9bb..3645ab259 100644 --- a/kernel-open/nvidia-uvm/uvm_va_space.c +++ b/kernel-open/nvidia-uvm/uvm_va_space.c @@ -119,15 +119,27 @@ static NV_STATUS register_gpu_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu) static bool va_space_check_processors_masks(uvm_va_space_t *va_space) { uvm_processor_id_t processor; - uvm_processor_mask_t processors; + uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); + uvm_processor_mask_t *processors = &block_context->scratch_processor_mask; uvm_assert_rwsem_locked_write(&va_space->lock); - uvm_processor_mask_copy(&processors, &va_space->registered_gpus); - uvm_processor_mask_set(&processors, UVM_ID_CPU); + uvm_processor_mask_copy(processors, &va_space->registered_gpus); + uvm_processor_mask_set(processors, UVM_ID_CPU); - for_each_id_in_mask(processor, &processors) { + for_each_id_in_mask(processor, processors) { uvm_processor_id_t other_processor; + bool check_can_copy_from = true; + + if (UVM_ID_IS_GPU(processor)) { + uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, processor); + + // Peer copies between two processors can be disabled even when they + // are NvLink peers, or there is HW support for atomics between + // them. + if (gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_UNSUPPORTED) + check_can_copy_from = false; + } UVM_ASSERT(processor_mask_array_test(va_space->can_access, processor, processor)); UVM_ASSERT(processor_mask_array_test(va_space->accessible_from, processor, processor)); @@ -137,8 +149,11 @@ static bool va_space_check_processors_masks(uvm_va_space_t *va_space) // NVLINK UVM_ASSERT(!processor_mask_array_test(va_space->has_nvlink, processor, processor)); - UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_nvlink[uvm_id_value(processor)], - &va_space->can_copy_from[uvm_id_value(processor)])); + + if (check_can_copy_from) { + UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_nvlink[uvm_id_value(processor)], + &va_space->can_copy_from[uvm_id_value(processor)])); + } // Peers UVM_ASSERT(!processor_mask_array_test(va_space->indirect_peers, processor, processor)); @@ -147,8 +162,12 @@ static bool va_space_check_processors_masks(uvm_va_space_t *va_space) // Atomics UVM_ASSERT(processor_mask_array_test(va_space->has_native_atomics, processor, processor)); - UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)], - &va_space->can_copy_from[uvm_id_value(processor)])); + + if (check_can_copy_from) { + UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)], + &va_space->can_copy_from[uvm_id_value(processor)])); + } + UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)], &va_space->can_access[uvm_id_value(processor)])); @@ -178,6 +197,7 @@ NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va } uvm_init_rwsem(&va_space->lock, UVM_LOCK_ORDER_VA_SPACE); + uvm_mutex_init(&va_space->closest_processors.mask_mutex, UVM_LOCK_ORDER_LEAF); uvm_mutex_init(&va_space->serialize_writers_lock, UVM_LOCK_ORDER_VA_SPACE_SERIALIZE_WRITERS); uvm_mutex_init(&va_space->read_acquire_write_release_lock, UVM_LOCK_ORDER_VA_SPACE_READ_ACQUIRE_WRITE_RELEASE_LOCK); @@ -329,7 +349,6 @@ static void unregister_gpu(uvm_va_space_t *va_space, if (gpu->parent->isr.replayable_faults.handling) { UVM_ASSERT(uvm_processor_mask_test(&va_space->faultable_processors, gpu->id)); 
uvm_processor_mask_clear(&va_space->faultable_processors, gpu->id); - UVM_ASSERT(uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, gpu->id)); uvm_processor_mask_clear(&va_space->system_wide_atomics_enabled_processors, gpu->id); } else { @@ -427,7 +446,7 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space) uvm_va_range_t *va_range, *va_range_next; uvm_gpu_t *gpu; uvm_gpu_id_t gpu_id; - uvm_processor_mask_t retained_gpus; + uvm_processor_mask_t *retained_gpus = &va_space->registered_gpus_teardown; LIST_HEAD(deferred_free_list); // Remove the VA space from the global list before we start tearing things @@ -455,7 +474,7 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space) // registered GPUs in the VA space, so those faults will be canceled. uvm_va_space_down_write(va_space); - uvm_processor_mask_copy(&retained_gpus, &va_space->registered_gpus); + uvm_processor_mask_copy(retained_gpus, &va_space->registered_gpus); bitmap_copy(va_space->enabled_peers_teardown, va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS); @@ -507,7 +526,7 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space) nv_kthread_q_flush(&g_uvm_global.global_q); - for_each_gpu_in_mask(gpu, &retained_gpus) { + for_each_gpu_in_mask(gpu, retained_gpus) { if (!gpu->parent->isr.replayable_faults.handling) { UVM_ASSERT(!gpu->parent->isr.non_replayable_faults.handling); continue; @@ -523,6 +542,15 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space) if (gpu->parent->access_counters_supported) uvm_parent_gpu_access_counters_disable(gpu->parent, va_space); + + // Free the processor masks allocated in uvm_va_space_register_gpu(). + // The mask is also freed in uvm_va_space_unregister_gpu() but that + // function won't be called in uvm_release() and uvm_release_deferred() + // path. + uvm_processor_mask_cache_free(va_space->peers_to_release[uvm_id_value(gpu->id)]); + + // Set the pointer to NULL to avoid accidental re-use and double free. + va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL; } // Check that all CPU/GPU affinity masks are empty @@ -554,14 +582,14 @@ void uvm_va_space_destroy(uvm_va_space_t *va_space) // Release the GPUs and their peer counts. Do not use // for_each_gpu_in_mask for the outer loop as it reads the GPU // state, which might get destroyed. 
- for_each_gpu_id_in_mask(gpu_id, &retained_gpus) { + for_each_gpu_id_in_mask(gpu_id, retained_gpus) { uvm_gpu_t *peer_gpu; gpu = uvm_gpu_get(gpu_id); - uvm_processor_mask_clear(&retained_gpus, gpu_id); + uvm_processor_mask_clear(retained_gpus, gpu_id); - for_each_gpu_in_mask(peer_gpu, &retained_gpus) { + for_each_gpu_in_mask(peer_gpu, retained_gpus) { NvU32 peer_table_index = uvm_gpu_peer_table_index(gpu->id, peer_gpu->id); if (test_bit(peer_table_index, va_space->enabled_peers_teardown)) { uvm_gpu_peer_t *peer_caps = &g_uvm_global.peers[peer_table_index]; @@ -679,6 +707,7 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu; uvm_gpu_t *other_gpu; bool gpu_can_access_sysmem = true; + uvm_processor_mask_t *peers_to_release = NULL; status = uvm_gpu_retain_by_uuid(gpu_uuid, user_rm_device, &gpu); if (status != NV_OK) @@ -733,6 +762,16 @@ NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space, gpu_can_access_sysmem = false; } + UVM_ASSERT(!va_space->peers_to_release[uvm_id_value(gpu->id)]); + + peers_to_release = uvm_processor_mask_cache_alloc(); + if (!peers_to_release) { + status = NV_ERR_NO_MEMORY; + goto done; + } + + va_space->peers_to_release[uvm_id_value(gpu->id)] = peers_to_release; + uvm_processor_mask_set(&va_space->registered_gpus, gpu->id); va_space->registered_gpus_table[uvm_id_gpu_index(gpu->id)] = gpu; @@ -832,6 +871,10 @@ cleanup: // a deferred_free_list, mm, etc. unregister_gpu(va_space, gpu, NULL, NULL, NULL); + va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL; + + uvm_processor_mask_cache_free(peers_to_release); + done: UVM_ASSERT(va_space_check_processors_masks(va_space)); @@ -856,7 +899,7 @@ NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcesso uvm_gpu_va_space_t *gpu_va_space; struct mm_struct *mm; uvm_gpu_id_t peer_gpu_id; - uvm_processor_mask_t peers_to_release; + uvm_processor_mask_t *peers_to_release; LIST_HEAD(deferred_free_list); // Stopping channels requires holding the VA space lock in read mode, so do @@ -917,8 +960,12 @@ NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcesso if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id)) UVM_ASSERT(uvm_gpu_va_space_get(va_space, gpu) == gpu_va_space); + peers_to_release = va_space->peers_to_release[uvm_id_value(gpu->id)]; + + va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL; + // This will call disable_peers for all GPU's peers, including NVLink - unregister_gpu(va_space, gpu, mm, &deferred_free_list, &peers_to_release); + unregister_gpu(va_space, gpu, mm, &deferred_free_list, peers_to_release); UVM_ASSERT(uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id)); uvm_processor_mask_clear(&va_space->gpu_unregister_in_progress, gpu->id); @@ -939,12 +986,16 @@ NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcesso // Do not use for_each_gpu_in_mask as it reads the peer GPU state, // which might get destroyed when we release the peer entry. 
- for_each_gpu_id_in_mask(peer_gpu_id, &peers_to_release) { + UVM_ASSERT(peers_to_release); + + for_each_gpu_id_in_mask(peer_gpu_id, peers_to_release) { uvm_gpu_t *peer_gpu = uvm_gpu_get(peer_gpu_id); UVM_ASSERT(uvm_gpu_peer_caps(gpu, peer_gpu)->link_type == UVM_GPU_LINK_PCIE); uvm_gpu_release_pcie_peer_access(gpu, peer_gpu); } + uvm_processor_mask_cache_free(peers_to_release); + uvm_gpu_release_locked(gpu); uvm_mutex_unlock(&g_uvm_global.global_lock); @@ -1026,7 +1077,6 @@ static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu return NV_ERR_NOT_COMPATIBLE; } - // TODO: Bug 3848497: Disable GPU Peer Mapping when HCC is enabled processor_mask_array_set(va_space->can_access, gpu0->id, gpu1->id); processor_mask_array_set(va_space->can_access, gpu1->id, gpu0->id); processor_mask_array_set(va_space->accessible_from, gpu0->id, gpu1->id); @@ -1711,49 +1761,59 @@ uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space, const uvm_processor_mask_t *candidates, uvm_processor_id_t src) { - uvm_processor_mask_t mask; - uvm_processor_id_t id; + uvm_processor_mask_t *mask = &va_space->closest_processors.mask; + uvm_processor_id_t closest_id; // Highest priority: the local processor itself if (uvm_processor_mask_test(candidates, src)) return src; - // NvLink peers - if (uvm_processor_mask_and(&mask, candidates, &va_space->has_nvlink[uvm_id_value(src)])) { + uvm_mutex_lock(&va_space->closest_processors.mask_mutex); + + if (uvm_processor_mask_and(mask, candidates, &va_space->has_nvlink[uvm_id_value(src)])) { + // NvLink peers uvm_processor_mask_t *indirect_peers; - uvm_processor_mask_t direct_peers; + uvm_processor_mask_t *direct_peers = &va_space->closest_processors.direct_peers; indirect_peers = &va_space->indirect_peers[uvm_id_value(src)]; - // Direct peers, prioritizing GPU peers over CPU - if (uvm_processor_mask_andnot(&direct_peers, &mask, indirect_peers)) { - id = uvm_processor_mask_find_first_gpu_id(&direct_peers); - return UVM_ID_IS_INVALID(id)? UVM_ID_CPU : id; + if (uvm_processor_mask_andnot(direct_peers, mask, indirect_peers)) { + // Direct peers, prioritizing GPU peers over CPU + closest_id = uvm_processor_mask_find_first_gpu_id(direct_peers); + if (UVM_ID_IS_INVALID(closest_id)) + closest_id = UVM_ID_CPU; } + else { + // Indirect peers + UVM_ASSERT(UVM_ID_IS_GPU(src)); + UVM_ASSERT(!uvm_processor_mask_test(mask, UVM_ID_CPU)); - // Indirect peers - UVM_ASSERT(UVM_ID_IS_GPU(src)); - UVM_ASSERT(!uvm_processor_mask_test(&mask, UVM_ID_CPU)); - - return uvm_processor_mask_find_first_gpu_id(&mask); + closest_id = uvm_processor_mask_find_first_gpu_id(mask); + } } - - // If source is GPU, prioritize PCIe peers over CPU - if (uvm_processor_mask_and(&mask, candidates, &va_space->can_access[uvm_id_value(src)])) { + else if (uvm_processor_mask_and(mask, candidates, &va_space->can_access[uvm_id_value(src)])) { + // If source is GPU, prioritize PCIe peers over CPU // CPUs only have direct access to GPU memory over NVLINK, not PCIe, and // should have been selected above UVM_ASSERT(UVM_ID_IS_GPU(src)); - id = uvm_processor_mask_find_first_gpu_id(&mask); - return UVM_ID_IS_INVALID(id)? UVM_ID_CPU : id; + closest_id = uvm_processor_mask_find_first_gpu_id(mask); + if (UVM_ID_IS_INVALID(closest_id)) + closest_id = UVM_ID_CPU; + } + else { + // No GPUs with direct access are in the mask. Just pick the first + // processor in the mask, if any. + closest_id = uvm_processor_mask_find_first_id(candidates); } - // No GPUs with direct access are in the mask. 
Just pick the first - // processor in the mask, if any. - return uvm_processor_mask_find_first_id(candidates); + uvm_mutex_unlock(&va_space->closest_processors.mask_mutex); + + return closest_id; } -static void uvm_deferred_free_object_channel(uvm_deferred_free_object_t *object, uvm_processor_mask_t *flushed_gpus) +static void uvm_deferred_free_object_channel(uvm_deferred_free_object_t *object, + uvm_parent_processor_mask_t *flushed_parent_gpus) { uvm_user_channel_t *channel = container_of(object, uvm_user_channel_t, deferred_free); uvm_gpu_t *gpu = channel->gpu; @@ -1761,9 +1821,10 @@ static void uvm_deferred_free_object_channel(uvm_deferred_free_object_t *object, // Flush out any faults with this instance pointer still in the buffer. This // prevents us from re-allocating the same instance pointer for a new // channel and mis-attributing old faults to it. - if (gpu->parent->replayable_faults_supported && !uvm_processor_mask_test(flushed_gpus, gpu->id)) { + if (gpu->parent->replayable_faults_supported && + !uvm_parent_processor_mask_test(flushed_parent_gpus, gpu->parent->id)) { uvm_gpu_fault_buffer_flush(gpu); - uvm_processor_mask_set(flushed_gpus, gpu->id); + uvm_parent_processor_mask_set(flushed_parent_gpus, gpu->parent->id); } uvm_user_channel_destroy_detached(channel); @@ -1772,17 +1833,20 @@ static void uvm_deferred_free_object_channel(uvm_deferred_free_object_t *object, void uvm_deferred_free_object_list(struct list_head *deferred_free_list) { uvm_deferred_free_object_t *object, *next; - uvm_processor_mask_t flushed_gpus; + uvm_parent_processor_mask_t flushed_parent_gpus; - // Used if there are any channels in the list - uvm_processor_mask_zero(&flushed_gpus); + // flushed_parent_gpus prevents redundant fault buffer flushes by tracking + // the parent GPUs on which the flush already happened. Flushing the fault + // buffer on one GPU instance will flush it for all other instances on that + // parent GPU. 
+ uvm_parent_processor_mask_zero(&flushed_parent_gpus); list_for_each_entry_safe(object, next, deferred_free_list, list_node) { list_del(&object->list_node); switch (object->type) { case UVM_DEFERRED_FREE_OBJECT_TYPE_CHANNEL: - uvm_deferred_free_object_channel(object, &flushed_gpus); + uvm_deferred_free_object_channel(object, &flushed_parent_gpus); break; case UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE: destroy_gpu_va_space(container_of(object, uvm_gpu_va_space_t, deferred_free)); @@ -2169,6 +2233,31 @@ static LIST_HEAD(g_cpu_service_block_context_list); static uvm_spinlock_t g_cpu_service_block_context_list_lock; +uvm_service_block_context_t *uvm_service_block_context_alloc(struct mm_struct *mm) +{ + uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context)); + + if (!service_context) + return NULL; + + service_context->block_context = uvm_va_block_context_alloc(mm); + if (!service_context->block_context) { + uvm_kvfree(service_context); + service_context = NULL; + } + + return service_context; +} + +void uvm_service_block_context_free(uvm_service_block_context_t *service_context) +{ + if (!service_context) + return; + + uvm_va_block_context_free(service_context->block_context); + uvm_kvfree(service_context); +} + NV_STATUS uvm_service_block_context_init(void) { unsigned num_preallocated_contexts = 4; @@ -2177,17 +2266,11 @@ NV_STATUS uvm_service_block_context_init(void) // Pre-allocate some fault service contexts for the CPU and add them to the global list while (num_preallocated_contexts-- > 0) { - uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context)); + uvm_service_block_context_t *service_context = uvm_service_block_context_alloc(NULL); if (!service_context) return NV_ERR_NO_MEMORY; - service_context->block_context = uvm_va_block_context_alloc(NULL); - if (!service_context->block_context) { - uvm_kvfree(service_context); - return NV_ERR_NO_MEMORY; - } - list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list); } @@ -2199,11 +2282,13 @@ void uvm_service_block_context_exit(void) uvm_service_block_context_t *service_context, *service_context_tmp; // Free fault service contexts for the CPU and add clear the global list - list_for_each_entry_safe(service_context, service_context_tmp, &g_cpu_service_block_context_list, + list_for_each_entry_safe(service_context, + service_context_tmp, + &g_cpu_service_block_context_list, cpu_fault.service_context_list) { - uvm_va_block_context_free(service_context->block_context); - uvm_kvfree(service_context); + uvm_service_block_context_free(service_context); } + INIT_LIST_HEAD(&g_cpu_service_block_context_list); } @@ -2215,7 +2300,8 @@ static uvm_service_block_context_t *service_block_context_cpu_alloc(void) uvm_spin_lock(&g_cpu_service_block_context_list_lock); - service_context = list_first_entry_or_null(&g_cpu_service_block_context_list, uvm_service_block_context_t, + service_context = list_first_entry_or_null(&g_cpu_service_block_context_list, + uvm_service_block_context_t, cpu_fault.service_context_list); if (service_context) @@ -2223,17 +2309,10 @@ static uvm_service_block_context_t *service_block_context_cpu_alloc(void) uvm_spin_unlock(&g_cpu_service_block_context_list_lock); - if (!service_context) { - service_context = uvm_kvmalloc(sizeof(*service_context)); - service_context->block_context = uvm_va_block_context_alloc(NULL); - if (!service_context->block_context) { - uvm_kvfree(service_context); - service_context = NULL; - } - } - else { + if 
(!service_context) + service_context = uvm_service_block_context_alloc(NULL); + else uvm_va_block_context_init(service_context->block_context, NULL); - } return service_context; } diff --git a/kernel-open/nvidia-uvm/uvm_va_space.h b/kernel-open/nvidia-uvm/uvm_va_space.h index a7e2eb000..9aa7a96de 100644 --- a/kernel-open/nvidia-uvm/uvm_va_space.h +++ b/kernel-open/nvidia-uvm/uvm_va_space.h @@ -230,9 +230,11 @@ struct uvm_va_space_struct uvm_processor_mask_t accessible_from[UVM_ID_MAX_PROCESSORS]; // Pre-computed masks that contain, for each processor memory, a mask with - // the processors that can directly copy to and from its memory. This is - // almost the same as accessible_from masks, but also requires peer identity - // mappings to be supported for peer access. + // the processors that can directly copy to and from its memory, using the + // Copy Engine. These masks are usually the same as accessible_from masks. + // + // In certain configurations, peer identity mappings must be created to + // enable CE copies between peers. uvm_processor_mask_t can_copy_from[UVM_ID_MAX_PROCESSORS]; // Pre-computed masks that contain, for each processor, a mask of processors @@ -265,6 +267,22 @@ struct uvm_va_space_struct // Mask of processors that are participating in system-wide atomics uvm_processor_mask_t system_wide_atomics_enabled_processors; + // Temporary copy of registered_gpus used to avoid allocation during VA + // space destroy. + uvm_processor_mask_t registered_gpus_teardown; + + // Allocated in uvm_va_space_register_gpu(), used and free'd in + // uvm_va_space_unregister_gpu(). + uvm_processor_mask_t *peers_to_release[UVM_ID_MAX_PROCESSORS]; + + // Mask of processors to unmap. Used in range_unmap(). + uvm_processor_mask_t unmap_mask; + + // Available as scratch space for the internal APIs. This is like a caller- + // save register: it shouldn't be used across function calls which also take + // this va_space. + uvm_processor_mask_t scratch_processor_mask; + // Mask of physical GPUs where access counters are enabled on this VA space uvm_parent_processor_mask_t access_counters_enabled_processors; @@ -349,6 +367,20 @@ struct uvm_va_space_struct uvm_hmm_va_space_t hmm; }; + struct + { + // Temporary mask used to calculate closest_processors in + // uvm_processor_mask_find_closest_id. + uvm_processor_mask_t mask; + + // Temporary mask to hold direct_peers in + // uvm_processor_mask_find_closest_id. + uvm_processor_mask_t direct_peers; + + // Protects the mask and direct_peers above. + uvm_mutex_t mask_mutex; + } closest_processors; + struct { bool page_prefetch_enabled; diff --git a/kernel-open/nvidia-uvm/uvm_va_space_mm.c b/kernel-open/nvidia-uvm/uvm_va_space_mm.c index 26dc0b698..46d5f47e8 100644 --- a/kernel-open/nvidia-uvm/uvm_va_space_mm.c +++ b/kernel-open/nvidia-uvm/uvm_va_space_mm.c @@ -417,9 +417,7 @@ static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space) uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; uvm_gpu_va_space_t *gpu_va_space; uvm_gpu_t *gpu; - // TODO: Bug 4351121: retained_gpus should be pre-allocated, not on the - // stack. - uvm_processor_mask_t retained_gpus; + uvm_processor_mask_t *retained_gpus = &va_space_mm->scratch_processor_mask; uvm_parent_processor_mask_t flushed_parent_gpus; LIST_HEAD(deferred_free_list); @@ -443,32 +441,34 @@ static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space) // Detach all channels to prevent pending untranslated faults from getting // to this VA space. 
This also removes those channels from the VA space and - // puts them on the deferred free list, so only one thread will do this. + // puts them on the deferred free list. uvm_va_space_down_write(va_space); uvm_va_space_detach_all_user_channels(va_space, &deferred_free_list); - uvm_processor_mask_and(&retained_gpus, &va_space->registered_gpus, &va_space->faultable_processors); - uvm_global_gpu_retain(&retained_gpus); + uvm_processor_mask_and(retained_gpus, &va_space->registered_gpus, &va_space->faultable_processors); + uvm_global_gpu_retain(retained_gpus); uvm_va_space_up_write(va_space); + // It's ok to use retained_gpus outside the lock since there can only be one + // thread executing in uvm_va_space_mm_shutdown at a time. + // Flush the fault buffer on all registered faultable GPUs. // This will avoid spurious cancels of stale pending translated // faults after we set UVM_VA_SPACE_MM_STATE_RELEASED later. uvm_parent_processor_mask_zero(&flushed_parent_gpus); - for_each_gpu_in_mask(gpu, &retained_gpus) { + for_each_gpu_in_mask(gpu, retained_gpus) { if (!uvm_parent_processor_mask_test_and_set(&flushed_parent_gpus, gpu->parent->id)) uvm_gpu_fault_buffer_flush(gpu); } - uvm_global_gpu_release(&retained_gpus); + uvm_global_gpu_release(retained_gpus); // Call nvUvmInterfaceUnsetPageDirectory. This has no effect on non-MPS. // Under MPS this guarantees that no new GPU accesses will be made using // this mm. // - // We need only one thread to make this call, but two threads in here could - // race for it, or we could have one thread in here and one in - // destroy_gpu_va_space. Serialize these by starting in write mode then - // downgrading to read. + // We need only one thread to make this call, but we could have one thread + // in here and one in destroy_gpu_va_space. Serialize these by starting in + // write mode then downgrading to read. uvm_va_space_down_write(va_space); uvm_va_space_downgrade_write_rm(va_space); for_each_gpu_va_space(gpu_va_space, va_space) diff --git a/kernel-open/nvidia-uvm/uvm_va_space_mm.h b/kernel-open/nvidia-uvm/uvm_va_space_mm.h index 9186f8313..a8a46b807 100644 --- a/kernel-open/nvidia-uvm/uvm_va_space_mm.h +++ b/kernel-open/nvidia-uvm/uvm_va_space_mm.h @@ -83,6 +83,11 @@ struct uvm_va_space_mm_struct // Wait queue for threads waiting for retainers to finish (retained_count // going to 0 when not alive). wait_queue_head_t last_retainer_wait_queue; + + // Available as scratch space for the internal APIs. This is like a caller- + // save register: it shouldn't be used across function calls which also take + // this va_space_mm. + uvm_processor_mask_t scratch_processor_mask; }; static bool uvm_va_space_mm_alive(struct uvm_va_space_mm_struct *va_space_mm) diff --git a/kernel-open/nvidia/nv.c b/kernel-open/nvidia/nv.c index 43b7dab14..ae2b19d97 100644 --- a/kernel-open/nvidia/nv.c +++ b/kernel-open/nvidia/nv.c @@ -57,7 +57,11 @@ #include "nv-dmabuf.h" #include "nv-caps-imex.h" -#if !defined(CONFIG_RETPOLINE) +/* + * Commit aefb2f2e619b ("x86/bugs: Rename CONFIG_RETPOLINE => + * CONFIG_MITIGATION_RETPOLINE) in v6.8 renamed CONFIG_RETPOLINE. 
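The scratch_processor_mask members introduced above in uvm_va_space_struct and uvm_va_space_mm_struct follow a caller-save convention: a function may fill and consume the mask locally, but must not expect it to survive a call that also takes the same va_space (or va_space_mm), which is exactly how uvm_va_space_mm_shutdown() now uses its retained_gpus pointer. A minimal sketch of that usage pattern; the helper name is hypothetical, all other identifiers are the ones appearing in this patch:

// Hypothetical helper sketching the caller-save rule for the scratch masks:
// compute into the mask and consume it immediately, before calling anything
// else that takes this va_space and might reuse the same storage.
static void example_retain_faultable_gpus(uvm_va_space_t *va_space)
{
    uvm_processor_mask_t *scratch = &va_space->scratch_processor_mask;

    uvm_processor_mask_and(scratch,
                           &va_space->registered_gpus,
                           &va_space->faultable_processors);

    // Consumed right away; the GPUs are dropped later with uvm_global_gpu_release().
    uvm_global_gpu_retain(scratch);
}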
+ */ +#if !defined(CONFIG_RETPOLINE) && !defined(CONFIG_MITIGATION_RETPOLINE) #include "nv-retpoline.h" #endif diff --git a/kernel-open/nvidia/nvidia.Kbuild b/kernel-open/nvidia/nvidia.Kbuild index 7908ce716..5766c9223 100644 --- a/kernel-open/nvidia/nvidia.Kbuild +++ b/kernel-open/nvidia/nvidia.Kbuild @@ -250,6 +250,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += num_registered_fb NV_CONFTEST_TYPE_COMPILE_TESTS += pci_driver_has_driver_managed_dma NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags NV_CONFTEST_TYPE_COMPILE_TESTS += memory_failure_has_trapno_arg +NV_CONFTEST_TYPE_COMPILE_TESTS += foll_longterm_present NV_CONFTEST_GENERIC_COMPILE_TESTS += dom0_kernel_present NV_CONFTEST_GENERIC_COMPILE_TESTS += nvidia_vgpu_kvm_build diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c index 36a8185c2..27bf35d4b 100644 --- a/kernel-open/nvidia/os-interface.c +++ b/kernel-open/nvidia/os-interface.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -2201,6 +2201,8 @@ static int os_numa_verify_gpu_memory_zone(struct notifier_block *nb, return NOTIFY_OK; } +#define ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS 4 + NV_STATUS NV_API_CALL os_numa_add_gpu_memory ( void *handle, @@ -2214,7 +2216,12 @@ NV_STATUS NV_API_CALL os_numa_add_gpu_memory nv_linux_state_t *nvl = pci_get_drvdata(handle); nv_state_t *nv = NV_STATE_PTR(nvl); NvU64 base = offset + nvl->coherent_link_info.gpu_mem_pa; - int ret; + int ret = 0; + NvU64 memblock_size; + NvU64 size_remaining; + NvU64 calculated_segment_size; + NvU64 segment_size; + NvU64 segment_base; os_numa_gpu_mem_hotplug_notifier_t notifier = { .start_pa = base, @@ -2247,11 +2254,49 @@ NV_STATUS NV_API_CALL os_numa_add_gpu_memory goto failed; } + // + // Adding all memory at once can take a long time. Split up memory into segments + // with schedule() in between to prevent soft lockups. Memory segments for + // add_memory_driver_managed() need to be aligned to memblock size. + // + // If there are any issues splitting into segments, then add all memory at once. 
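A condensed sketch of the segmented-hotplug loop this comment describes: the caller derives a memblock-aligned segment size (falling back to the full size if os_numa_memblock_size() fails), adds one segment at a time with add_memory_driver_managed(), and yields the CPU between segments so the hotplug work does not trip the soft-lockup watchdog. The helper name is hypothetical; NV_ADD_MEMORY_DRIVER_MANAGED_HAS_MHP_FLAGS_ARG is the conftest switch used by the patch itself.

#include <linux/memory_hotplug.h>   // add_memory_driver_managed(), MHP_NONE
#include <linux/sched.h>            // schedule()

// Hypothetical condensed form of the add-side loop in os_numa_add_gpu_memory().
// 'segment_size' is expected to be aligned to the memblock size.
static int nv_example_add_gpu_memory_in_segments(int node, NvU64 base, NvU64 size,
                                                 NvU64 segment_size)
{
    NvU64 segment_base   = base;
    NvU64 size_remaining = size;
    int   ret            = 0;

    while ((size_remaining > 0) && (ret == 0))
    {
        if (segment_size > size_remaining)
            segment_size = size_remaining;

#ifdef NV_ADD_MEMORY_DRIVER_MANAGED_HAS_MHP_FLAGS_ARG
        ret = add_memory_driver_managed(node, segment_base, segment_size,
                                        "System RAM (NVIDIA)", MHP_NONE);
#else
        ret = add_memory_driver_managed(node, segment_base, segment_size,
                                        "System RAM (NVIDIA)");
#endif
        segment_base   += segment_size;
        size_remaining -= segment_size;

        // Yield the CPU between segments to avoid soft lockups.
        schedule();
    }

    return ret;
}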
+ // + if (os_numa_memblock_size(&memblock_size) == NV_OK) + { + calculated_segment_size = NV_ALIGN_UP(size / ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS, memblock_size); + } + else + { + // Don't split into segments, add all memory at once + calculated_segment_size = size; + } + + segment_size = calculated_segment_size; + segment_base = base; + size_remaining = size; + + while ((size_remaining > 0) && + (ret == 0)) + { + if (segment_size > size_remaining) + { + segment_size = size_remaining; + } + #ifdef NV_ADD_MEMORY_DRIVER_MANAGED_HAS_MHP_FLAGS_ARG - ret = add_memory_driver_managed(node, base, size, "System RAM (NVIDIA)", MHP_NONE); + ret = add_memory_driver_managed(node, segment_base, segment_size, "System RAM (NVIDIA)", MHP_NONE); #else - ret = add_memory_driver_managed(node, base, size, "System RAM (NVIDIA)"); + ret = add_memory_driver_managed(node, segment_base, segment_size, "System RAM (NVIDIA)"); #endif + nv_printf(NV_DBG_SETUP, "NVRM: add_memory_driver_managed() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n", + ret, segment_base, segment_size); + + segment_base += segment_size; + size_remaining -= segment_size; + + // Yield CPU to prevent soft lockups + schedule(); + } unregister_memory_notifier(¬ifier.memory_notifier); if (ret == 0) @@ -2265,14 +2310,33 @@ NV_STATUS NV_API_CALL os_numa_add_gpu_memory zone_end_pfn(zone) != end_pfn) { nv_printf(NV_DBG_ERRORS, "NVRM: GPU memory zone movable auto onlining failed!\n"); + #ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT -#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG - if (offline_and_remove_memory(node, base, size) != 0) -#else - if (offline_and_remove_memory(base, size) != 0) -#endif + // Since zone movable auto onlining failed, need to remove the added memory. + segment_size = calculated_segment_size; + segment_base = base; + size_remaining = size; + + while (size_remaining > 0) { - nv_printf(NV_DBG_ERRORS, "NVRM: offline_and_remove_memory failed\n"); + if (segment_size > size_remaining) + { + segment_size = size_remaining; + } + +#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG + ret = offline_and_remove_memory(node, segment_base, segment_size); +#else + ret = offline_and_remove_memory(segment_base, segment_size); +#endif + nv_printf(NV_DBG_SETUP, "NVRM: offline_and_remove_memory() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n", + ret, segment_base, segment_size); + + segment_base += segment_size; + size_remaining -= segment_size; + + // Yield CPU to prevent soft lockups + schedule(); } #endif goto failed; @@ -2323,6 +2387,77 @@ failed: return NV_ERR_NOT_SUPPORTED; } + +typedef struct { + NvU64 base; + NvU64 size; + NvU32 nodeId; + int ret; +} remove_numa_memory_info_t; + +static void offline_numa_memory_callback +( + void *args +) +{ +#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT + remove_numa_memory_info_t *pNumaInfo = (remove_numa_memory_info_t *)args; + int ret = 0; + NvU64 memblock_size; + NvU64 size_remaining; + NvU64 calculated_segment_size; + NvU64 segment_size; + NvU64 segment_base; + + // + // Removing all memory at once can take a long time. Split up memory into segments + // with schedule() in between to prevent soft lockups. Memory segments for + // offline_and_remove_memory() need to be aligned to memblock size. + // + // If there are any issues splitting into segments, then remove all memory at once. 
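The removal path described by this comment mirrors the add path: the same memblock-aligned segmentation and schedule() yields, but through offline_and_remove_memory(), whose signature differs across kernels (hence the NV_REMOVE_MEMORY_HAS_NID_ARG switch from the patch). The sketch below uses a hypothetical helper name, assumes NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT, and ORs per-segment failures together as offline_numa_memory_callback() does.

#ifdef NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT
// Hypothetical condensed form of the remove-side loop used for the rollback in
// os_numa_add_gpu_memory() and in offline_numa_memory_callback().
static int nv_example_remove_gpu_memory_in_segments(NvU32 node, NvU64 base, NvU64 size,
                                                    NvU64 segment_size)
{
    NvU64 segment_base   = base;
    NvU64 size_remaining = size;
    int   ret            = 0;

    while (size_remaining > 0)
    {
        if (segment_size > size_remaining)
            segment_size = size_remaining;

#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG
        ret |= offline_and_remove_memory(node, segment_base, segment_size);
#else
        ret |= offline_and_remove_memory(segment_base, segment_size);
#endif
        segment_base   += segment_size;
        size_remaining -= segment_size;

        // Yield the CPU between segments to avoid soft lockups.
        schedule();
    }

    return ret;
}
#endif // NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT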
+ // + if (os_numa_memblock_size(&memblock_size) == NV_OK) + { + calculated_segment_size = NV_ALIGN_UP(pNumaInfo->size / ADD_REMOVE_GPU_MEMORY_NUM_SEGMENTS, memblock_size); + } + else + { + // Don't split into segments, remove all memory at once + calculated_segment_size = pNumaInfo->size; + } + + segment_size = calculated_segment_size; + segment_base = pNumaInfo->base; + size_remaining = pNumaInfo->size; + + while (size_remaining > 0) + { + if (segment_size > size_remaining) + { + segment_size = size_remaining; + } + +#ifdef NV_REMOVE_MEMORY_HAS_NID_ARG + ret = offline_and_remove_memory(pNumaInfo->nodeId, + segment_base, + segment_size); +#else + ret = offline_and_remove_memory(segment_base, + segment_size); +#endif + nv_printf(NV_DBG_SETUP, "NVRM: offline_and_remove_memory() returns: %d for segment_base: 0x%llx, segment_size: 0x%llx\n", + ret, segment_base, segment_size); + pNumaInfo->ret |= ret; + + segment_base += segment_size; + size_remaining -= segment_size; + + // Yield CPU to prevent soft lockups + schedule(); + } +#endif +} + NV_STATUS NV_API_CALL os_numa_remove_gpu_memory ( void *handle, diff --git a/kernel-open/nvidia/os-mlock.c b/kernel-open/nvidia/os-mlock.c index 46f99a194..08271526e 100644 --- a/kernel-open/nvidia/os-mlock.c +++ b/kernel-open/nvidia/os-mlock.c @@ -26,6 +26,12 @@ #include "os-interface.h" #include "nv-linux.h" +#if defined(NVCPU_FAMILY_X86) && defined(NV_FOLL_LONGTERM_PRESENT) && \ + (defined(NV_PIN_USER_PAGES_HAS_ARGS_VMAS) || \ + defined(NV_GET_USER_PAGES_HAS_ARGS_FLAGS_VMAS)) +#define NV_NUM_PIN_PAGES_PER_ITERATION 0x80000 +#endif + static inline int nv_follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) @@ -163,9 +169,15 @@ NV_STATUS NV_API_CALL os_lock_user_pages( NV_STATUS rmStatus; struct mm_struct *mm = current->mm; struct page **user_pages; - NvU64 i, pinned; + NvU64 i; + NvU64 npages = page_count; + NvU64 pinned = 0; unsigned int gup_flags = DRF_VAL(_LOCK_USER_PAGES, _FLAGS, _WRITE, flags) ? FOLL_WRITE : 0; - int ret; + long ret; + +#if defined(NVCPU_FAMILY_X86) && defined(NV_FOLL_LONGTERM_PRESENT) + gup_flags |= FOLL_LONGTERM; +#endif if (!NV_MAY_SLEEP()) { @@ -185,16 +197,51 @@ NV_STATUS NV_API_CALL os_lock_user_pages( nv_mmap_read_lock(mm); ret = NV_PIN_USER_PAGES((unsigned long)address, - page_count, gup_flags, user_pages); - nv_mmap_read_unlock(mm); - pinned = ret; - - if (ret < 0) + npages, gup_flags, user_pages); + if (ret > 0) { - os_free_mem(user_pages); - return NV_ERR_INVALID_ADDRESS; + pinned = ret; } - else if (pinned < page_count) +#if defined(NVCPU_FAMILY_X86) && defined(NV_FOLL_LONGTERM_PRESENT) && \ + (defined(NV_PIN_USER_PAGES_HAS_ARGS_VMAS) || \ + defined(NV_GET_USER_PAGES_HAS_ARGS_FLAGS_VMAS)) + // + // NV_PIN_USER_PAGES() passes in NULL for the vmas parameter (if required) + // in pin_user_pages() (or get_user_pages() if pin_user_pages() does not + // exist). For kernels which do not contain the commit 52650c8b466b + // (mm/gup: remove the vma allocation from gup_longterm_locked()), if + // FOLL_LONGTERM is passed in, this results in the kernel trying to kcalloc + // the vmas array, and since the limit for kcalloc is 4 MB, it results in + // NV_PIN_USER_PAGES() failing with ENOMEM if more than + // NV_NUM_PIN_PAGES_PER_ITERATION pages are requested on 64-bit systems. + // + // As a workaround, if we requested more than + // NV_NUM_PIN_PAGES_PER_ITERATION pages and failed with ENOMEM, try again + // with multiple calls of NV_NUM_PIN_PAGES_PER_ITERATION pages at a time. 
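A condensed sketch of the workaround this comment describes: the whole-range FOLL_LONGTERM pin is attempted first, and only if it fails with -ENOMEM on an affected kernel is the range re-pinned in chunks of at most NV_NUM_PIN_PAGES_PER_ITERATION pages. NV_PIN_USER_PAGES is the driver's conftest-selected wrapper used in this patch; the helper below is hypothetical and assumes the caller already holds the mmap read lock (nv_mmap_read_lock()) and unpins on partial failure, as os_lock_user_pages() does.

// Hypothetical helper: pin 'page_count' user pages starting at 'address' in
// bounded chunks, storing them into 'user_pages'. Returns how many were pinned;
// the caller unpins that many and fails the request if the count is short.
static NvU64 nv_example_pin_user_pages_chunked(NvU64 address, NvU64 page_count,
                                               unsigned int gup_flags,
                                               struct page **user_pages)
{
    NvU64 pinned = 0;
    long  ret;

    while (pinned < page_count)
    {
        NvU64 npages = page_count - pinned;

        if (npages > NV_NUM_PIN_PAGES_PER_ITERATION)
            npages = NV_NUM_PIN_PAGES_PER_ITERATION;

        ret = NV_PIN_USER_PAGES(((unsigned long)address) + (pinned * PAGE_SIZE),
                                npages, gup_flags, &user_pages[pinned]);
        if (ret <= 0)
            break;

        pinned += (NvU64)ret;
    }

    return pinned;
}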
+ // + else if ((ret == -ENOMEM) && + (page_count > NV_NUM_PIN_PAGES_PER_ITERATION)) + { + for (pinned = 0; pinned < page_count; pinned += ret) + { + npages = page_count - pinned; + if (npages > NV_NUM_PIN_PAGES_PER_ITERATION) + { + npages = NV_NUM_PIN_PAGES_PER_ITERATION; + } + + ret = NV_PIN_USER_PAGES(((unsigned long) address) + (pinned * PAGE_SIZE), + npages, gup_flags, &user_pages[pinned]); + if (ret <= 0) + { + break; + } + } + } +#endif + nv_mmap_read_unlock(mm); + + if (pinned < page_count) { for (i = 0; i < pinned; i++) NV_UNPIN_USER_PAGE(user_pages[i]); diff --git a/src/common/displayport/inc/dp_connectorimpl.h b/src/common/displayport/inc/dp_connectorimpl.h index f62ff826c..8ebc7c42f 100644 --- a/src/common/displayport/inc/dp_connectorimpl.h +++ b/src/common/displayport/inc/dp_connectorimpl.h @@ -348,6 +348,9 @@ namespace DisplayPort // bool bPowerDownPhyBeforeD3; + // Force DSC on sink irrespective of LT status + bool bForceDscOnSink; + // // Reset the MSTM_CTRL registers on branch device irrespective of // IRQ VECTOR register having stale message. Certain branch devices diff --git a/src/common/displayport/inc/dp_linkconfig.h b/src/common/displayport/inc/dp_linkconfig.h index 4d741d4d5..3204b00f0 100644 --- a/src/common/displayport/inc/dp_linkconfig.h +++ b/src/common/displayport/inc/dp_linkconfig.h @@ -294,8 +294,8 @@ namespace DisplayPort else { // if FEC is not enabled, link overhead comprises only of - // 0.05% downspread. - return rate - 5 * rate/ 1000; + // 0.6% downspread. + return rate - 6 * rate/ 1000; } } diff --git a/src/common/displayport/inc/dp_regkeydatabase.h b/src/common/displayport/inc/dp_regkeydatabase.h index e79f783a0..99416c16e 100644 --- a/src/common/displayport/inc/dp_regkeydatabase.h +++ b/src/common/displayport/inc/dp_regkeydatabase.h @@ -79,6 +79,11 @@ // #define NV_DP_REGKEY_MST_PCON_CAPS_READ_DISABLED "DP_BUG_4388987_WAR" +// +// Bug 4459839 : This regkey will enable DSC irrespective of LT status. +// +#define NV_DP_REGKEY_FORCE_DSC_ON_SINK "DP_FORCE_DSC_ON_SINK" + // // Data Base used to store all the regkey values. // The actual data base is declared statically in dp_evoadapter.cpp. @@ -113,6 +118,7 @@ struct DP_REGKEY_DATABASE bool bPowerDownPhyBeforeD3; bool bReassessMaxLink; bool bMSTPCONCapsReadDisabled; + bool bForceDscOnSink; }; #endif //INCLUDED_DP_REGKEYDATABASE_H diff --git a/src/common/displayport/src/dp_connectorimpl.cpp b/src/common/displayport/src/dp_connectorimpl.cpp index 042275c1e..ba9838242 100644 --- a/src/common/displayport/src/dp_connectorimpl.cpp +++ b/src/common/displayport/src/dp_connectorimpl.cpp @@ -174,6 +174,7 @@ void ConnectorImpl::applyRegkeyOverrides(const DP_REGKEY_DATABASE& dpRegkeyDatab this->bDscMstCapBug3143315 = dpRegkeyDatabase.bDscMstCapBug3143315; this->bPowerDownPhyBeforeD3 = dpRegkeyDatabase.bPowerDownPhyBeforeD3; this->bReassessMaxLink = dpRegkeyDatabase.bReassessMaxLink; + this->bForceDscOnSink = dpRegkeyDatabase.bForceDscOnSink; } void ConnectorImpl::setPolicyModesetOrderMitigation(bool enabled) @@ -3129,7 +3130,7 @@ bool ConnectorImpl::notifyAttachBegin(Group * target, // Gr // if LT is successful, see if panel supports DSC and if so, set DSC enabled/disabled // according to the mode requested. 
- if(bLinkTrainingStatus) + if(bLinkTrainingStatus || bForceDscOnSink) { for (Device * dev = target->enumDevices(0); dev; dev = target->enumDevices(dev)) { @@ -4631,6 +4632,11 @@ bool ConnectorImpl::trainLinkOptimized(LinkConfiguration lConfig) } } + // + // There is no point in fallback here since we are link training + // to loweset link config that can support the mode. + // + lowestSelected.policy.setSkipFallBack(true); bLinkTrainingSuccessful = train(lowestSelected, false); // // If LT failed, check if skipLT was marked. If so, clear the flag and @@ -4648,16 +4654,37 @@ bool ConnectorImpl::trainLinkOptimized(LinkConfiguration lConfig) } if (!bLinkTrainingSuccessful) { - // Try fall back to max link config and if that fails try original assessed link configuration + // If optimized link config fails, try max link config with fallback. if (!train(getMaxLinkConfig(), false)) { + // + // Note here that if highest link config fails and a lower + // link config passes, link training will be returned as + // failure but activeLinkConfig will be set to that passing config. + // if (!willLinkSupportModeSST(activeLinkConfig, groupAttached->lastModesetInfo)) { + // + // If none of the link configs pass LT or a fall back link config passed LT + // but cannot support the mode, then we will force the optimized link config + // on the link and mark LT as fail. + // train(lowestSelected, true); - - // Mark link training as failed since we forced it bLinkTrainingSuccessful = false; } + else + { + // + // If a fallback link config pass LT and can support + // the mode, mark LT as pass. + // + bLinkTrainingSuccessful = true; + } + } + else + { + // If LT passes at max link config, mark LT as pass. + bLinkTrainingSuccessful = true; } } } diff --git a/src/common/displayport/src/dp_evoadapter.cpp b/src/common/displayport/src/dp_evoadapter.cpp index d5f306b86..652e9c580 100644 --- a/src/common/displayport/src/dp_evoadapter.cpp +++ b/src/common/displayport/src/dp_evoadapter.cpp @@ -94,7 +94,8 @@ const struct {NV_DP_DSC_MST_CAP_BUG_3143315, &dpRegkeyDatabase.bDscMstCapBug3143315, DP_REG_VAL_BOOL}, {NV_DP_REGKEY_POWER_DOWN_PHY, &dpRegkeyDatabase.bPowerDownPhyBeforeD3, DP_REG_VAL_BOOL}, {NV_DP_REGKEY_REASSESS_MAX_LINK, &dpRegkeyDatabase.bReassessMaxLink, DP_REG_VAL_BOOL}, - {NV_DP_REGKEY_MST_PCON_CAPS_READ_DISABLED, &dpRegkeyDatabase.bMSTPCONCapsReadDisabled, DP_REG_VAL_BOOL} + {NV_DP_REGKEY_MST_PCON_CAPS_READ_DISABLED, &dpRegkeyDatabase.bMSTPCONCapsReadDisabled, DP_REG_VAL_BOOL}, + {NV_DP_REGKEY_FORCE_DSC_ON_SINK, &dpRegkeyDatabase.bForceDscOnSink, DP_REG_VAL_BOOL}, }; EvoMainLink::EvoMainLink(EvoInterface * provider, Timer * timer) : diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h index 34e5de039..247a7c166 100644 --- a/src/common/inc/nvBldVer.h +++ b/src/common/inc/nvBldVer.h @@ -36,25 +36,25 @@ // and then checked back in. 
You cannot make changes to these sections without // corresponding changes to the buildmeister script #ifndef NV_BUILD_BRANCH - #define NV_BUILD_BRANCH r551_40 + #define NV_BUILD_BRANCH r550_00 #endif #ifndef NV_PUBLIC_BRANCH - #define NV_PUBLIC_BRANCH r551_40 + #define NV_PUBLIC_BRANCH r550_00 #endif #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) -#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r550/r551_40-171" -#define NV_BUILD_CHANGELIST_NUM (33992326) +#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r550/r550_00-204" +#define NV_BUILD_CHANGELIST_NUM (34025356) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "rel/gpu_drv/r550/r551_40-171" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33992326) +#define NV_BUILD_NAME "rel/gpu_drv/r550/r550_00-204" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (34025356) #else /* Windows builds */ -#define NV_BUILD_BRANCH_VERSION "r551_40-15" -#define NV_BUILD_CHANGELIST_NUM (33992326) +#define NV_BUILD_BRANCH_VERSION "r550_00-192" +#define NV_BUILD_CHANGELIST_NUM (34025356) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "551.78" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33992326) +#define NV_BUILD_NAME "551.86" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (34025356) #define NV_BUILD_BRANCH_BASE_VERSION R550 #endif // End buildmeister python edited section diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h index 908083972..8da8010ef 100644 --- a/src/common/inc/nvUnixVersion.h +++ b/src/common/inc/nvUnixVersion.h @@ -4,7 +4,7 @@ #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \ (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1) -#define NV_VERSION_STRING "550.54.15" +#define NV_VERSION_STRING "550.67" #else diff --git a/src/common/modeset/timing/nvt_edid.c b/src/common/modeset/timing/nvt_edid.c index e0122dd59..0d199f51d 100644 --- a/src/common/modeset/timing/nvt_edid.c +++ b/src/common/modeset/timing/nvt_edid.c @@ -2388,7 +2388,8 @@ NvU32 NvTiming_EDIDStrongValidationMask(NvU8 *pEdid, NvU32 length) if (parseCta861DataBlockInfo(pData_collection, (NvU32)ctaDTD_Offset - 4, NULL) == NVT_STATUS_SUCCESS) { - pData_collection++; + pData_collection++; // go to the next byte. skip Tag+Length byte + if (ctaBlockTag == NVT_CEA861_TAG_VIDEO) { for (i=0; i < ctaPayload; i++) @@ -2432,6 +2433,8 @@ NvU32 NvTiming_EDIDStrongValidationMask(NvU8 *pEdid, NvU32 length) } else { + pData_collection++; // go to the next byte. 
skip Tag+Length byte + ret |= NVT_EDID_VALIDATION_ERR_MASK(NVT_EDID_VALIDATION_ERR_EXT_CTA_INVALID_DATA_BLOCK); pData_collection += ctaPayload; } diff --git a/src/common/modeset/timing/nvt_edidext_861.c b/src/common/modeset/timing/nvt_edidext_861.c index 36a453153..d614bae31 100644 --- a/src/common/modeset/timing/nvt_edidext_861.c +++ b/src/common/modeset/timing/nvt_edidext_861.c @@ -1609,7 +1609,6 @@ void getEdidHDM1_4bVsdbTiming(NVT_EDID_INFO *pInfo) CODE_SEGMENT(PAGE_DD_CODE) NVT_STATUS get861ExtInfo(NvU8 *p, NvU32 size, NVT_EDID_CEA861_INFO *p861info) { - NvU32 dtd_offset; // sanity check if (p == NULL || size < sizeof(EDIDV1STRUC)) @@ -1725,8 +1724,8 @@ NVT_STATUS parseCta861DataBlockInfo(NvU8 *p, if (payload >= 1) { ext_tag = p[i]; - if (ext_tag == NVT_CEA861_EXT_TAG_VIDEO_CAP && payload < 2) return NVT_STATUS_ERR; - else if (ext_tag == NVT_CEA861_EXT_TAG_COLORIMETRY && payload < 3) return NVT_STATUS_ERR; + if (ext_tag == NVT_CEA861_EXT_TAG_VIDEO_CAP && (payload != 2)) return NVT_STATUS_ERR; + else if (ext_tag == NVT_CEA861_EXT_TAG_COLORIMETRY && payload != 3) return NVT_STATUS_ERR; else if (ext_tag == NVT_CEA861_EXT_TAG_VIDEO_FORMAT_PREFERENCE && payload < 2) return NVT_STATUS_ERR; else if (ext_tag == NVT_CEA861_EXT_TAG_YCBCR420_VIDEO && payload < 2) return NVT_STATUS_ERR; else if (ext_tag == NVT_CEA861_EXT_TAG_YCBCR420_CAP && payload < 1) return NVT_STATUS_ERR; @@ -1856,19 +1855,22 @@ NVT_STATUS parseCta861DataBlockInfo(NvU8 *p, } else if (tag == NVT_CTA861_TAG_VIDEO_FORMAT) { - p861info->vfdb[vfd_index].info.vfd_len = p[i] & 0x03; - p861info->vfdb[vfd_index].info.ntsc = (p[i] & 0x40) >> 6; - p861info->vfdb[vfd_index].info.y420 = (p[i] & 0x80) >> 7; - p861info->vfdb[vfd_index].total_vfd = (NvU8)(payload - 1) / (p861info->vfdb[vfd_index].info.vfd_len + 1); - - i++; payload--; - - for (j = 0; j < payload; j++, i++) + if (payload > 0) { - p861info->vfdb[vfd_index].video_format_desc[j] = p[i]; - } + p861info->vfdb[vfd_index].info.vfd_len = p[i] & 0x03; + p861info->vfdb[vfd_index].info.ntsc = (p[i] & 0x40) >> 6; + p861info->vfdb[vfd_index].info.y420 = (p[i] & 0x80) >> 7; + p861info->vfdb[vfd_index].total_vfd = (NvU8)(payload - 1) / (p861info->vfdb[vfd_index].info.vfd_len + 1); - p861info->total_vfdb = ++vfd_index; + i++; payload--; + + for (j = 0; (j < payload) && (p861info->vfdb[vfd_index].total_vfd != 0); j++, i++) + { + p861info->vfdb[vfd_index].video_format_desc[j] = p[i]; + } + + p861info->total_vfdb = ++vfd_index; + } } else if (tag == NVT_CEA861_TAG_EXTENDED_FLAG) { @@ -1879,14 +1881,14 @@ NVT_STATUS parseCta861DataBlockInfo(NvU8 *p, { p861info->video_capability = p[i + 1] & NVT_CEA861_VIDEO_CAPABILITY_MASK; p861info->valid.VCDB = 1; - i += 2; + i += payload; } else if (ext_tag == NVT_CEA861_EXT_TAG_COLORIMETRY && payload >= 3) { p861info->colorimetry.byte1 = p[i + 1] & NVT_CEA861_COLORIMETRY_MASK; p861info->colorimetry.byte2 = p[i + 2] & NVT_CEA861_GAMUT_METADATA_MASK; p861info->valid.colorimetry = 1; - i += 3; + i += payload; } else if (ext_tag == NVT_CEA861_EXT_TAG_VIDEO_FORMAT_PREFERENCE && payload >= 2) { diff --git a/src/common/nvlink/inband/interface/nvlink_inband_msg.h b/src/common/nvlink/inband/interface/nvlink_inband_msg.h index 4d0e61851..f0a39c016 100644 --- a/src/common/nvlink/inband/interface/nvlink_inband_msg.h +++ b/src/common/nvlink/inband/interface/nvlink_inband_msg.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -80,6 +80,7 @@ typedef struct #define NVLINK_INBAND_GPU_PROBE_CAPS_SRIOV_ENABLED NVBIT(0) #define NVLINK_INBAND_GPU_PROBE_CAPS_PROBE_UPDATE NVBIT(1) +#define NVLINK_INBAND_GPU_PROBE_CAPS_EGM_SUPPORT NVBIT(2) /* Add more caps as need in the future */ diff --git a/src/common/nvlink/kernel/nvlink/interface/nvlink_kern_shutdown_entry.c b/src/common/nvlink/kernel/nvlink/interface/nvlink_kern_shutdown_entry.c index b6e838f12..80c0f6527 100644 --- a/src/common/nvlink/kernel/nvlink/interface/nvlink_kern_shutdown_entry.c +++ b/src/common/nvlink/kernel/nvlink/interface/nvlink_kern_shutdown_entry.c @@ -378,6 +378,21 @@ nvlink_lib_powerdown_links_from_active_to_off lockLinkCount++; } + if (lockLinkCount == 0) + { + if (conns != NULL) + nvlink_free((void *)conns); + + if (lockLinks != NULL) + nvlink_free((void *)lockLinks); + + // Release the top-level lock + nvlink_lib_top_lock_release(); + NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS, + "%s: No conns were found\n", __FUNCTION__)); + return NVL_NOT_FOUND; + } + // Acquire the per-link locks for all links captured status = nvlink_lib_link_locks_acquire(lockLinks, lockLinkCount); if (status != NVL_SUCCESS) @@ -923,4 +938,3 @@ nvlink_core_powerdown_floorswept_conns_to_off_end: return status; } - diff --git a/src/common/nvswitch/kernel/inc/boards_nvswitch.h b/src/common/nvswitch/kernel/inc/boards_nvswitch.h index 984ffa455..96c63784f 100644 --- a/src/common/nvswitch/kernel/inc/boards_nvswitch.h +++ b/src/common/nvswitch/kernel/inc/boards_nvswitch.h @@ -38,6 +38,7 @@ #define NVSWITCH_BOARD_LS10_5612_0002_ES 0x03D6 #define NVSWITCH_BOARD_LS10_4697_0000_895 0x03B9 #define NVSWITCH_BOARD_LS10_4262_0000_895 0x04FE +#define NVSWITCH_BOARD_LS10_4300_0000_895 0x0571 #define NVSWITCH_BOARD_UNKNOWN_NAME "UNKNOWN" @@ -48,5 +49,6 @@ #define NVSWITCH_BOARD_LS10_5612_0002_ES_NAME "LS10_5612_0002_ES" #define NVSWITCH_BOARD_LS10_4697_0000_895_NAME "LS10_4697_0000_895" #define NVSWITCH_BOARD_LS10_4262_0000_895_NAME "LS10_4262_0000_895" +#define NVSWITCH_BOARD_LS10_4300_0000_895_NAME "LS10_4300_0000_895" #endif // _BOARDS_NVSWITCH_H_ diff --git a/src/common/nvswitch/kernel/ls10/intr_ls10.c b/src/common/nvswitch/kernel/ls10/intr_ls10.c index 6e12d6d30..055dc7268 100644 --- a/src/common/nvswitch/kernel/ls10/intr_ls10.c +++ b/src/common/nvswitch/kernel/ls10/intr_ls10.c @@ -894,9 +894,9 @@ _nvswitch_collect_error_info_ls10 { data->flags |= NVSWITCH_RAW_ERROR_LOG_DATA_FLAG_ROUTE_HDR; NVSWITCH_PRINT(device, INFO, - "ROUTE: HEADER: 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x,\n", - data->data[i-8], data->data[i-7], data->data[i-6], data->data[i-5], - data->data[i-4], data->data[i-3], data->data[i-2], data->data[i-1]); + "ROUTE: HEADER: 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x,\n", + data->data[i-7], data->data[i-6], data->data[i-5], data->data[i-4], + data->data[i-3], data->data[i-2], data->data[i-1]); } } } @@ -940,9 +940,9 @@ _nvswitch_collect_error_info_ls10 { data->flags |= NVSWITCH_RAW_ERROR_LOG_DATA_FLAG_INGRESS_HDR; NVSWITCH_PRINT(device, INFO, - "INGRESS: HEADER: 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x,\n", - data->data[i-7], data->data[i-6], data->data[i-5], data->data[i-4], - data->data[i-3], data->data[i-2], data->data[i-1]); + "INGRESS: HEADER: 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 
0x%08x,\n", + data->data[i-6], data->data[i-5], data->data[i-4], data->data[i-3], + data->data[i-2], data->data[i-1]); } } } diff --git a/src/common/nvswitch/kernel/ls10/pmgr_ls10.c b/src/common/nvswitch/kernel/ls10/pmgr_ls10.c index 2fa83b955..890b7c173 100644 --- a/src/common/nvswitch/kernel/ls10/pmgr_ls10.c +++ b/src/common/nvswitch/kernel/ls10/pmgr_ls10.c @@ -32,6 +32,7 @@ #include "export_nvswitch.h" #include "soe/soe_nvswitch.h" #include "soe/soeifcore.h" +#include "boards_nvswitch.h" #include "nvswitch/ls10/dev_pmgr.h" @@ -176,6 +177,16 @@ static const NVSWITCH_GPIO_INFO nvswitch_gpio_pin_Default[] = static const NvU32 nvswitch_gpio_pin_Default_size = NV_ARRAY_ELEMENTS(nvswitch_gpio_pin_Default); +static const NVSWITCH_GPIO_INFO nvswitch_gpio_pin_4300[] = +{ + NVSWITCH_DESCRIBE_GPIO_PIN( 0, _INSTANCE_ID0, 0, IN), // Instance ID bit 0 + NVSWITCH_DESCRIBE_GPIO_PIN( 1, _INSTANCE_ID1, 0, IN), // Instance ID bit 1 + NVSWITCH_DESCRIBE_GPIO_PIN( 2, _INSTANCE_ID2, 0, IN), // Instance ID bit 2 + NVSWITCH_DESCRIBE_GPIO_PIN( 6, _INSTANCE_ID3, 0, IN), // Instance ID bit 3 + NVSWITCH_DESCRIBE_GPIO_PIN( 7, _INSTANCE_ID4, 0, IN), // Instance ID bit 4 +}; +static const NvU32 nvswitch_gpio_pin_4300_size = NV_ARRAY_ELEMENTS(nvswitch_gpio_pin_4300); + // // Initialize the software state of the switch I2C & GPIO interface // Temporarily forcing default GPIO values. @@ -191,6 +202,8 @@ nvswitch_init_pmgr_devices_ls10 { ls10_device *chip_device = NVSWITCH_GET_CHIP_DEVICE_LS10(device); PNVSWITCH_OBJI2C pI2c = device->pI2c; + NvlStatus retval; + NvU16 boardId; if (IS_FMODEL(device) || IS_EMULATION(device) || IS_RTLSIM(device)) { @@ -200,8 +213,18 @@ nvswitch_init_pmgr_devices_ls10 } else { - chip_device->gpio_pin = nvswitch_gpio_pin_Default; - chip_device->gpio_pin_size = nvswitch_gpio_pin_Default_size; + retval = nvswitch_get_board_id(device, &boardId); + if (retval == NVL_SUCCESS && + boardId == NVSWITCH_BOARD_LS10_4300_0000_895) + { + chip_device->gpio_pin = nvswitch_gpio_pin_4300; + chip_device->gpio_pin_size = nvswitch_gpio_pin_4300_size; + } + else + { + chip_device->gpio_pin = nvswitch_gpio_pin_Default; + chip_device->gpio_pin_size = nvswitch_gpio_pin_Default_size; + } } pI2c->device_list = NULL; diff --git a/src/common/nvswitch/kernel/nvswitch.c b/src/common/nvswitch/kernel/nvswitch.c index 724c634da..dea84f65b 100644 --- a/src/common/nvswitch/kernel/nvswitch.c +++ b/src/common/nvswitch/kernel/nvswitch.c @@ -62,7 +62,7 @@ static NvlStatus _nvswitch_ctrl_inband_flush_data(nvswitch_device *device, NVSWI #define NVSWITCH_DEV_CMD_DISPATCH_RESERVED(cmd) \ case cmd: \ { \ - retval = -NVL_ERR_NOT_IMPLEMENTED; \ + retval = -NVL_ERR_NOT_SUPPORTED; \ break; \ } \ diff --git a/src/nvidia-modeset/Makefile b/src/nvidia-modeset/Makefile index 66edbf4e8..d49a3bfb4 100644 --- a/src/nvidia-modeset/Makefile +++ b/src/nvidia-modeset/Makefile @@ -95,6 +95,7 @@ endif ifeq ($(TARGET_ARCH),aarch64) CFLAGS += -mgeneral-regs-only CFLAGS += -march=armv8-a + CFLAGS += -ffixed-x18 CONDITIONAL_CFLAGS += $(call TEST_CC_ARG, -mno-outline-atomics) endif diff --git a/src/nvidia/Makefile b/src/nvidia/Makefile index e2f1c6728..0f70514b7 100644 --- a/src/nvidia/Makefile +++ b/src/nvidia/Makefile @@ -90,6 +90,7 @@ ifeq ($(TARGET_ARCH),aarch64) CFLAGS += -mgeneral-regs-only CFLAGS += -march=armv8-a CFLAGS += -mstrict-align + CFLAGS += -ffixed-x18 CONDITIONAL_CFLAGS += $(call TEST_CC_ARG, -mno-outline-atomics) endif diff --git a/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c b/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c index 
d26c60346..ee9f85b84 100644 --- a/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c +++ b/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c @@ -74,7 +74,7 @@ NV_STATUS hypervisorInjectInterrupt_IMPL NV_STATUS status = NV_ERR_NOT_SUPPORTED; if (pVgpuNsIntr->pVgpuVfioRef) - status = osVgpuInjectInterrupt(pVgpuNsIntr->pVgpuVfioRef); + return NV_ERR_NOT_SUPPORTED; else { if (pVgpuNsIntr->guestMSIAddr && pVgpuNsIntr->guestMSIData) @@ -142,14 +142,22 @@ static NV_STATUS get_available_instances( swizzIdInUseMask = kmigmgrGetSwizzIdInUseMask(pGpu, pKernelMIGManager); + if (!vgpuTypeInfo->gpuInstanceSize) + { + // Query for a non MIG vgpuType + NV_PRINTF(LEVEL_INFO, "%s Query for a non MIG vGPU type \n", + __FUNCTION__); + rmStatus = NV_OK; + goto exit; + } + rmStatus = kvgpumgrGetPartitionFlag(vgpuTypeInfo->vgpuTypeId, &partitionFlag); if (rmStatus != NV_OK) { // Query for a non MIG vgpuType - NV_PRINTF(LEVEL_ERROR, "%s Query for a non MIG vGPU type \n", + NV_PRINTF(LEVEL_ERROR, "%s failed to get partition flags.\n", __FUNCTION__); - rmStatus = NV_OK; goto exit; } @@ -192,7 +200,7 @@ static NV_STATUS get_available_instances( if (vgpuTypeInfo->gpuInstanceSize) { // Query for a MIG vgpuType - NV_PRINTF(LEVEL_ERROR, "%s Query for a MIG vGPU type \n", + NV_PRINTF(LEVEL_INFO, "%s Query for a MIG vGPU type \n", __FUNCTION__); rmStatus = NV_OK; goto exit; diff --git a/src/nvidia/generated/g_gpu_nvoc.h b/src/nvidia/generated/g_gpu_nvoc.h index 22a84d102..609001809 100644 --- a/src/nvidia/generated/g_gpu_nvoc.h +++ b/src/nvidia/generated/g_gpu_nvoc.h @@ -1255,6 +1255,7 @@ struct OBJGPU { TMR_EVENT *pVideoTimerEvent; NVENC_SESSION_LIST nvencSessionList; NvU32 encSessionStatsReportingState; + NvBool bNvEncSessionDataProcessingWorkItemPending; NVFBC_SESSION_LIST nvfbcSessionList; struct OBJVASPACE *pFabricVAS; NvBool bPipelinedPteMemEnabled; diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h index 39471b1fb..8802fed20 100644 --- a/src/nvidia/generated/g_nv_name_released.h +++ b/src/nvidia/generated/g_nv_name_released.h @@ -1014,6 +1014,7 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2702, 0x0000, 0x0000, "NVIDIA GeForce RTX 4080 SUPER" }, { 0x2704, 0x0000, 0x0000, "NVIDIA GeForce RTX 4080" }, { 0x2705, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070 Ti SUPER" }, + { 0x2709, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070" }, { 0x2717, 0x0000, 0x0000, "NVIDIA GeForce RTX 4090 Laptop GPU" }, { 0x2730, 0x0000, 0x0000, "NVIDIA RTX 5000 Ada Generation Laptop GPU" }, { 0x2757, 0x0000, 0x0000, "NVIDIA GeForce RTX 4090 Laptop GPU" }, @@ -1021,6 +1022,7 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x2782, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070 Ti" }, { 0x2783, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070 SUPER" }, { 0x2786, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070" }, + { 0x2788, 0x0000, 0x0000, "NVIDIA GeForce RTX 4060 Ti" }, { 0x27A0, 0x0000, 0x0000, "NVIDIA GeForce RTX 4080 Laptop GPU" }, { 0x27B0, 0x16fa, 0x1028, "NVIDIA RTX 4000 SFF Ada Generation" }, { 0x27B0, 0x16fa, 0x103c, "NVIDIA RTX 4000 SFF Ada Generation" }, @@ -1043,6 +1045,7 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x27FB, 0x0000, 0x0000, "NVIDIA RTX 3500 Ada Generation Embedded GPU" }, { 0x2803, 0x0000, 0x0000, "NVIDIA GeForce RTX 4060 Ti" }, { 0x2805, 0x0000, 0x0000, "NVIDIA GeForce RTX 4060 Ti" }, + { 0x2808, 0x0000, 0x0000, "NVIDIA GeForce RTX 4060" }, { 0x2820, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070 Laptop GPU" }, { 0x2838, 0x0000, 0x0000, "NVIDIA RTX 3000 Ada Generation Laptop GPU" }, 
{ 0x2860, 0x0000, 0x0000, "NVIDIA GeForce RTX 4070 Laptop GPU" }, diff --git a/src/nvidia/generated/g_spdm_nvoc.h b/src/nvidia/generated/g_spdm_nvoc.h index efd67d58b..f2404ca2b 100644 --- a/src/nvidia/generated/g_spdm_nvoc.h +++ b/src/nvidia/generated/g_spdm_nvoc.h @@ -7,7 +7,7 @@ extern "C" { #endif /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -108,6 +108,9 @@ struct Spdm { NvU32 sessionMsgCount; PTMR_EVENT pHeartbeatEvent; NvU32 heartbeatPeriodSec; + NvU8 *pTransportBuffer; + NvU32 transportBufferSize; + NvU32 pendingResponseSize; }; #ifndef __NVOC_CLASS_Spdm_TYPEDEF__ diff --git a/src/nvidia/generated/g_vgpuconfigapi_nvoc.h b/src/nvidia/generated/g_vgpuconfigapi_nvoc.h index a06627982..e90cc278f 100644 --- a/src/nvidia/generated/g_vgpuconfigapi_nvoc.h +++ b/src/nvidia/generated/g_vgpuconfigapi_nvoc.h @@ -7,7 +7,7 @@ extern "C" { #endif /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/src/nvidia/inc/kernel/gpu/gsp/message_queue_priv.h b/src/nvidia/inc/kernel/gpu/gsp/message_queue_priv.h index 4b21af185..f25a7dfcf 100644 --- a/src/nvidia/inc/kernel/gpu/gsp/message_queue_priv.h +++ b/src/nvidia/inc/kernel/gpu/gsp/message_queue_priv.h @@ -103,4 +103,24 @@ typedef struct MESSAGE_QUEUE_COLLECTION #define GSP_MSG_QUEUE_HEADER_SIZE RM_PAGE_SIZE #define GSP_MSG_QUEUE_HEADER_ALIGN 4 // 2 ^ 4 = 16 +/*! + * Calculate 32-bit checksum + * + * This routine assumes that the data is padded out with zeros to the next + * 8-byte alignment, and it is OK to read past the end to the 8-byte alignment. + */ +static NV_INLINE NvU32 _checkSum32(void *pData, NvU32 uLen) +{ + NvU64 *p = (NvU64 *)pData; + NvU64 *pEnd = (NvU64 *)((NvUPtr)pData + uLen); + NvU64 checkSum = 0; + + NV_ASSERT_CHECKED(uLen > 0); + + while (p < pEnd) + checkSum ^= *p++; + + return NvU64_HI32(checkSum) ^ NvU64_LO32(checkSum); +} + #endif // _MESSAGE_QUEUE_PRIV_H_ diff --git a/src/nvidia/src/kernel/gpu/bif/arch/maxwell/kernel_bif_gm107.c b/src/nvidia/src/kernel/gpu/bif/arch/maxwell/kernel_bif_gm107.c index 2605eb375..097417ad0 100644 --- a/src/nvidia/src/kernel/gpu/bif/arch/maxwell/kernel_bif_gm107.c +++ b/src/nvidia/src/kernel/gpu/bif/arch/maxwell/kernel_bif_gm107.c @@ -585,6 +585,13 @@ kbifRestorePcieConfigRegisters_GM107 NvU64 timeStampStart; NvU64 timeStampEnd; + if (pKernelBif->xveRegmapRef[0].bufBootConfigSpace == NULL) + { + NV_PRINTF(LEVEL_ERROR, "Config space buffer is NULL!\n"); + NV_ASSERT(0); + return NV_ERR_OBJECT_NOT_FOUND; + } + // Restore pcie config space for function 0 status = _kbifRestorePcieConfigRegisters_GM107(pGpu, pKernelBif, &pKernelBif->xveRegmapRef[0]); diff --git a/src/nvidia/src/kernel/gpu/device.c b/src/nvidia/src/kernel/gpu/device.c index c1b1ff08b..a8b49ba14 100644 --- a/src/nvidia/src/kernel/gpu/device.c +++ b/src/nvidia/src/kernel/gpu/device.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c b/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c index 5258cef0b..408fc27cb 100644 --- a/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c +++ b/src/nvidia/src/kernel/gpu/fsp/kern_fsp.c @@ -259,32 +259,50 @@ kfspPollForQueueEmpty_IMPL KernelFsp *pKernelFsp ) { + NV_STATUS status = NV_OK; RMTIMEOUT timeout; - gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, GPU_TIMEOUT_FLAGS_OSTIMER | GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE); + gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, + GPU_TIMEOUT_FLAGS_OSTIMER | + GPU_TIMEOUT_FLAGS_BYPASS_THREAD_STATE); while (!kfspIsQueueEmpty(pGpu, pKernelFsp)) { // - // For now we assume that any response from FSP before RM message send is complete - // indicates an error and we should abort. + // For now we assume that any response from FSP before RM message + // send is complete indicates an error and we should abort. + // + // Ongoing dicussion on usefullness of this check. Bug to be filed. // if (!kfspIsMsgQueueEmpty(pGpu, pKernelFsp)) { kfspReadMessage(pGpu, pKernelFsp, NULL, 0); - NV_PRINTF(LEVEL_ERROR, "Received error message from FSP while waiting for CMDQ to be empty.\n"); - return NV_ERR_GENERIC; + NV_PRINTF(LEVEL_ERROR, + "Received error message from FSP while waiting for CMDQ to be empty.\n"); + status = NV_ERR_GENERIC; + break; } - if (gpuCheckTimeout(pGpu, &timeout) == NV_ERR_TIMEOUT) - { - NV_PRINTF(LEVEL_ERROR, "Timed out waiting for FSP command queue to be empty.\n"); - return NV_ERR_TIMEOUT; - } osSpinLoop(); + + status = gpuCheckTimeout(pGpu, &timeout); + if (status != NV_OK) + { + if ((status == NV_ERR_TIMEOUT) && + kfspIsQueueEmpty(pGpu, pKernelFsp)) + { + status = NV_OK; + } + else + { + NV_PRINTF(LEVEL_ERROR, + "Timed out waiting for FSP command queue to be empty.\n"); + } + break; + } } - return NV_OK; + return status; } /*! diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c index f4541858a..f3adbba73 100644 --- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c +++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c @@ -846,6 +846,14 @@ _kgspRpcEventIsGpuDegradedCallback OBJRPC *pRpc ) { + RPC_PARAMS(nvlink_is_gpu_degraded, _v17_00); + KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); + NV2080_CTRL_NVLINK_IS_GPU_DEGRADED_PARAMS_v17_00 *dest = &rpc_params->params; + + if(dest->bIsGpuDegraded) + { + knvlinkSetDegradedMode(pGpu, pKernelNvlink, dest->linkId); + } } static void diff --git a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c index 9b8584c70..0eb247a32 100644 --- a/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c +++ b/src/nvidia/src/kernel/gpu/gsp/message_queue_cpu.c @@ -476,24 +476,6 @@ void GspMsgQueuesCleanup(MESSAGE_QUEUE_COLLECTION **ppMQCollection) *ppMQCollection = NULL; } -/*! - * Calculate 32-bit checksum - * - * This routine assumes that the data is padded out with zeros to the next - * 8-byte alignment, and it is OK to read past the end to the 8-byte alignment. - */ -static NV_INLINE NvU32 _checkSum32(void *pData, NvU32 uLen) -{ - NvU64 *p = (NvU64 *)pData; - NvU64 *pEnd = (NvU64 *)((NvUPtr)pData + uLen); - NvU64 checkSum = 0; - - while (p < pEnd) - checkSum ^= *p++; - - return NvU64_HI32(checkSum) ^ NvU64_LO32(checkSum); -} - /*! 
* GspMsgQueueSendCommand * @@ -532,7 +514,7 @@ NV_STATUS GspMsgQueueSendCommand(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu) pCQE->seqNum = pMQI->txSeqNum; pCQE->elemCount = GSP_MSG_QUEUE_BYTES_TO_ELEMENTS(uElementSize); - pCQE->checkSum = 0; + pCQE->checkSum = 0; // The checkSum field is included in the checksum calculation, so zero it. if (gpuIsCCFeatureEnabled(pGpu)) { @@ -666,7 +648,8 @@ NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu) NvU32 nRetries; NvU32 nMaxRetries = 3; NvU32 nElements = 1; // Assume record fits in one queue element for now. - NvU32 uElementSize = 0; + NvU32 uElementSize; + NvU32 checkSum; NvU32 seqMismatchDiff = NV_U32_MAX; NV_STATUS nvStatus = NV_OK; @@ -717,15 +700,23 @@ NV_STATUS GspMsgQueueReceiveStatus(MESSAGE_QUEUE_INFO *pMQI, OBJGPU *pGpu) // Retry if checksum fails. if (gpuIsCCFeatureEnabled(pGpu)) { - // In Confidential Compute scenario, checksum includes complete element range. - if (_checkSum32(pMQI->pCmdQueueElement, (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN)) != 0) - { - NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n"); - nvStatus = NV_ERR_INVALID_DATA; - continue; - } + // + // In the Confidential Compute scenario, the actual message length + // is inside the encrypted payload, and we can't access it before + // decryption, therefore the checksum encompasses the whole element + // range. This makes checksum verification significantly slower + // because messages are typically much smaller than element size. + // + checkSum = _checkSum32(pMQI->pCmdQueueElement, + (nElements * GSP_MSG_QUEUE_ELEMENT_SIZE_MIN)); } else - if (_checkSum32(pMQI->pCmdQueueElement, uElementSize) != 0) + { + checkSum = _checkSum32(pMQI->pCmdQueueElement, + (GSP_MSG_QUEUE_ELEMENT_HDR_SIZE + + pMQI->pCmdQueueElement->rpc.length)); + } + + if (checkSum != 0) { NV_PRINTF(LEVEL_ERROR, "Bad checksum.\n"); nvStatus = NV_ERR_INVALID_DATA; diff --git a/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys.c b/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys.c index 9ba9072ef..18e155ffb 100644 --- a/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys.c +++ b/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys_ctrl.c b/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys_ctrl.c index 7710b0ab6..970fa0480 100644 --- a/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys_ctrl.c +++ b/src/nvidia/src/kernel/gpu/mem_sys/kern_mem_sys_ctrl.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
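The GSP queue checksum handling above relies on XOR folding being self-cancelling: GspMsgQueueSendCommand() zeroes pCQE->checkSum, folds the element with _checkSum32(), and stores the result back into that field; because x ^ x = 0, GspMsgQueueReceiveStatus() recomputes the fold over the same range, stored checksum included, and expects exactly 0. A small sketch of that round trip; the element struct here is a hypothetical stand-in, and the summed range is assumed to be zero-padded to an 8-byte multiple as _checkSum32() requires.

// Hypothetical stand-in for a queue element; only the checkSum field matters.
typedef struct
{
    NvU32 checkSum;
    NvU8  payload[60];   // sized so the whole element is a multiple of 8 bytes
} EXAMPLE_QUEUE_ELEMENT;

static NvBool example_checksum_round_trip(EXAMPLE_QUEUE_ELEMENT *elem)
{
    // Sender: the checkSum field is part of the summed range, so zero it first,
    // then store the fold of the whole element into it.
    elem->checkSum = 0;
    elem->checkSum = _checkSum32(elem, sizeof(*elem));

    // Receiver: folding the stored checksum back in cancels it out, so an
    // intact element always verifies to 0.
    return _checkSum32(elem, sizeof(*elem)) == 0;
}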
* SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -475,11 +475,14 @@ _kmemsysGetFbInfos // It will be zero unless VGA display memory is reserved if (pKernelMemorySystem->fbOverrideStartKb != 0) { + status = NV_OK; data = NvU64_LO32(pKernelMemorySystem->fbOverrideStartKb); - NV_ASSERT(((NvU64) data << 10ULL) == pKernelMemorySystem->fbOverrideStartKb); + NV_ASSERT_OR_ELSE((NvU64) data == pKernelMemorySystem->fbOverrideStartKb, + status = NV_ERR_INVALID_DATA); + } - else - { + else + { // // Returns start of heap in kbytes. This is zero unless // VGA display memory is reserved. diff --git a/src/nvidia/src/kernel/gpu/nvenc/nvencsession.c b/src/nvidia/src/kernel/gpu/nvenc/nvencsession.c index 2e5bd2770..1f01e34a2 100644 --- a/src/nvidia/src/kernel/gpu/nvenc/nvencsession.c +++ b/src/nvidia/src/kernel/gpu/nvenc/nvencsession.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2012-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2012-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -159,6 +159,7 @@ nvencsessionConstruct_IMPL (listCount(&(pGpu->nvencSessionList)) == 1)) { // Register 1Hz timer callback for this GPU. + pGpu->bNvEncSessionDataProcessingWorkItemPending = NV_FALSE; status = osSchedule1HzCallback(pGpu, _gpuNvEncSessionDataProcessingCallback, NULL, @@ -379,8 +380,7 @@ _gpuNvEncSessionProcessBuffer(POBJGPU pGpu, NvencSession *pNvencSession) portMemFree(pLocalSessionInfoBuffer); } -static void -_gpuNvEncSessionDataProcessingCallback(POBJGPU pGpu, void *data) +static void _gpuNvEncSessionDataProcessing(OBJGPU *pGpu) { PNVENC_SESSION_LIST_ITEM pNvencSessionListItem; PNVENC_SESSION_LIST_ITEM pNvencSessionListItemNext; @@ -416,3 +416,46 @@ _gpuNvEncSessionDataProcessingCallback(POBJGPU pGpu, void *data) } } } + +static void _gpuNvEncSessionDataProcessingWorkItem(NvU32 gpuInstance, void *pArgs) +{ + OBJGPU *pGpu; + + pGpu = gpumgrGetGpu(gpuInstance); + if (pGpu == NULL) + { + NV_PRINTF(LEVEL_ERROR, "NVENC Sessions GPU instance is invalid\n"); + return; + } + + _gpuNvEncSessionDataProcessing(pGpu); + pGpu->bNvEncSessionDataProcessingWorkItemPending = NV_FALSE; +} + +static void +_gpuNvEncSessionDataProcessingCallback(POBJGPU pGpu, void *data) +{ + NV_STATUS status; + + if (!pGpu->bNvEncSessionDataProcessingWorkItemPending) + { + status = osQueueWorkItemWithFlags(pGpu, + _gpuNvEncSessionDataProcessingWorkItem, + NULL, + OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA + | OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_DEVICE_RW); + if (status != NV_OK) + { + NV_PRINTF(LEVEL_ERROR, + "NVENC session queuing async callback failed, status=%x\n", + status); + + // Call directly to do NVENC session data processing + _gpuNvEncSessionDataProcessing(pGpu); + } + else + { + pGpu->bNvEncSessionDataProcessingWorkItemPending = NV_TRUE; + } + } +} diff --git a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelibtrain.c b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelibtrain.c index 669a7a287..43df2eba4 100644 --- a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelibtrain.c +++ b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelibtrain.c @@ -1034,6 +1034,7 @@ knvlinkCoreShutdownDeviceLinks_IMPL OBJSYS *pSys = SYS_GET_INSTANCE(); NvU32 count = 0; NvU32 linkId; + NvlStatus status = NV_OK; // Skip link shutdown where fabric manager is present, for nvlink version bellow 4.0 if 
((pKernelNvlink->ipVerNvlink < NVLINK_VERSION_40 && @@ -1096,13 +1097,23 @@ knvlinkCoreShutdownDeviceLinks_IMPL // Trigger laneshutdown through core lib if shutdown is supported if (pKernelNvlink->getProperty(pKernelNvlink, PDB_PROP_KNVLINK_LANE_SHUTDOWN_ENABLED) && (count > 0)) { - if (nvlink_lib_powerdown_links_from_active_to_off( - pLinks, count, NVLINK_STATE_CHANGE_SYNC)) + status = nvlink_lib_powerdown_links_from_active_to_off( + pLinks, count, NVLINK_STATE_CHANGE_SYNC); + if (status != NVL_SUCCESS) { - NV_PRINTF(LEVEL_ERROR, "Unable to turn off links for the GPU%d\n", + if (status == NVL_NOT_FOUND) + { + // Bug 4419022 + NV_PRINTF(LEVEL_ERROR, "Need to shutdown all links unilaterally for GPU%d\n", + pGpu->gpuInstance); + } + else + { + NV_PRINTF(LEVEL_ERROR, "Unable to turn off links for the GPU%d\n", pGpu->gpuInstance); - return NV_ERR_INVALID_STATE; + return NV_ERR_INVALID_STATE; + } } } diff --git a/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c b/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c index 86e8c76da..2e5b6a310 100644 --- a/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c +++ b/src/nvidia/src/kernel/gpu/spdm/arch/hopper/spdm_gh100.c @@ -51,6 +51,14 @@ // Regardless of whether Requester is configured to support these, // we only expect Responder to provide these capabilities. // + +// +// TODO: SPDM_CAPABILITIES_FLAGS_GH100 and g_SpdmAlgoCheckTable_GH100 is expected capabilities flags +// and attributions what GH100 receive from responder. Currently, we have only 1 responder +// and return fixed capabilities flags and attributions. +// If we want to support different return capabilitis and attributions afterwards, we need +// to refactor spdmCheckConnection_GH100(). +// #define SPDM_CAPABILITIES_FLAGS_GH100 \ SPDM_GET_CAPABILITIES_RESPONSE_FLAGS_CERT_CAP | \ SPDM_GET_CAPABILITIES_RESPONSE_FLAGS_MEAS_CAP_SIG | \ @@ -64,21 +72,6 @@ SPDM_GET_CAPABILITIES_RESPONSE_FLAGS_HBEAT_CAP; /* ------------------------ Static Variables ------------------------------- */ -// -// For transport functionality, we require access to the GPU and Spdm objects, -// as well as additional state (temporary response buffer). -// -// However, libspdm transport layer is implemented via callbacks which currently -// do not support passing any custom parameters, meaning we must use static variables -// to access these objects. If we ever require multiple instances of the Spdm object, -// this will be an issue. -// -static OBJGPU *g_pGpu = NULL; -static Spdm *g_pSpdm = NULL; -static NvU8 *g_pTransportBuffer = NULL; -static NvU32 g_transportBufferSize = 0; -static NvU32 g_pendingResponseSize = 0; - static SPDM_ALGO_CHECK_ENTRY g_SpdmAlgoCheckTable_GH100[] = { { LIBSPDM_DATA_MEASUREMENT_SPEC, SPDM_MEASUREMENT_SPECIFICATION_DMTF }, @@ -127,7 +120,6 @@ static libspdm_return_t _spdmSendMessageGsp(void *spdm_context, size_t message_s static libspdm_return_t _spdmReceiveMessageGsp(void *spdm_context, size_t *message_size, void **message, uint64_t timeout); - /* ------------------------ Static Functions ------------------------------- */ // // Hardcoding check for libspdm secured message callbacks version. @@ -311,6 +303,8 @@ _spdmEncodeMessageGsp void *pSecuredMessageContext = NULL; NV_SPDM_DESC_HEADER *pNvSpdmDescHdr = NULL; NvU32 payloadSize = 0; + Spdm *pSpdm = NULL; + size_t dataSize = sizeof(void *); // Check libspdm parameters. 
if (spdm_context == NULL || message == NULL || message_size == 0 || @@ -332,6 +326,21 @@ _spdmEncodeMessageGsp return LIBSPDM_STATUS_INVALID_MSG_FIELD; } + status = libspdm_get_data(spdm_context, LIBSPDM_DATA_APP_CONTEXT_DATA, + NULL, (void *)&pSpdm, &dataSize); + + if (status != LIBSPDM_STATUS_SUCCESS) + { + NV_PRINTF(LEVEL_ERROR, ", spdmStatus != LIBSPDM_STATUS_SUCCESS \n "); + return status; + } + + if (pSpdm == NULL) + { + NV_PRINTF(LEVEL_ERROR, " pSpdm == NULL, SPDM context probably corrupted !! \n "); + return LIBSPDM_STATUS_INVALID_STATE_LOCAL; + } + // Initialize descriptor header. pNvSpdmDescHdr = (NV_SPDM_DESC_HEADER *)*transport_message; portMemSet(pNvSpdmDescHdr, 0, sizeof(NV_SPDM_DESC_HEADER)); @@ -401,7 +410,7 @@ _spdmEncodeMessageGsp } // Check final encrypted message size. - if (*transport_message_size > g_pSpdm->payloadBufferSize) + if (*transport_message_size > pSpdm->payloadBufferSize) { return LIBSPDM_STATUS_BUFFER_TOO_SMALL; } @@ -432,6 +441,8 @@ _spdmDecodeMessageGsp void *pSecuredMessageContext = NULL; libspdm_return_t status = LIBSPDM_STATUS_SUCCESS; spdm_secured_message_a_data_header1_t *pSpdmSecuredMsgHdr = NULL; + Spdm *pSpdm = NULL; + size_t dataSize = sizeof(void *); // Check libspdm parameters. if (spdm_context == NULL || session_id == NULL || is_app_message == NULL || @@ -447,10 +458,25 @@ _spdmDecodeMessageGsp return LIBSPDM_STATUS_INVALID_PARAMETER; } + status = libspdm_get_data(spdm_context, LIBSPDM_DATA_APP_CONTEXT_DATA, + NULL, (void *)&pSpdm, &dataSize); + + if (status != LIBSPDM_STATUS_SUCCESS) + { + NV_PRINTF(LEVEL_ERROR, " spdmStatus != LIBSPDM_STATUS_SUCCESS \n "); + return status; + } + + if (pSpdm == NULL) + { + NV_PRINTF(LEVEL_ERROR, " pSpdm == NULL, SPDM context probably corrupted !! \n "); + return LIBSPDM_STATUS_INVALID_STATE_LOCAL; + } + // Retrieve NV-header from message, and perform basic validation. pNvSpdmDescHdr = (NV_SPDM_DESC_HEADER *)transport_message; if (transport_message_size < sizeof(NV_SPDM_DESC_HEADER) || - transport_message_size > g_pSpdm->payloadBufferSize) + transport_message_size > pSpdm->payloadBufferSize) { return LIBSPDM_STATUS_INVALID_MSG_FIELD; } @@ -566,11 +592,11 @@ _spdmSendMessageGsp uint64_t timeout ) { - NV_STATUS nvStatus = NV_OK; - libspdm_return_t spdmStatus = LIBSPDM_STATUS_SUCCESS; - - // Ensure size is cleared to indicate no response pending in buffer yet - g_pendingResponseSize = 0; + NV_STATUS nvStatus = NV_OK; + libspdm_return_t spdmStatus = LIBSPDM_STATUS_SUCCESS; + Spdm *pSpdm = NULL; + OBJGPU *pGpu = NULL; + size_t dataSize = sizeof(void *); // Check libspdm parameters. if (message_size == 0 || message == NULL) @@ -578,23 +604,44 @@ _spdmSendMessageGsp return LIBSPDM_STATUS_INVALID_PARAMETER; } - if (g_pGpu == NULL || g_pSpdm == NULL) + spdmStatus = libspdm_get_data(spdm_context, LIBSPDM_DATA_APP_CONTEXT_DATA, + NULL, (void *)&pSpdm, &dataSize); + + if (spdmStatus != LIBSPDM_STATUS_SUCCESS) { + NV_PRINTF(LEVEL_ERROR," spdmStatus != LIBSPDM_STATUS_SUCCESS \n "); + return spdmStatus; + } + + if (pSpdm == NULL) + { + NV_PRINTF(LEVEL_ERROR, " pSpdm == NULL, SPDM context probably corrupted !! \n "); return LIBSPDM_STATUS_INVALID_STATE_LOCAL; } - if (g_transportBufferSize < message_size) + pGpu = ENG_GET_GPU(pSpdm); + + if (pGpu == NULL) + { + NV_PRINTF(LEVEL_ERROR, " pGpu == NULL, SPDM context probably corrupted !! 
\n "); + return LIBSPDM_STATUS_INVALID_STATE_LOCAL; + } + + // Ensure size is cleared to indicate no response pending in buffer yet + pSpdm->pendingResponseSize = 0; + + if (pSpdm->transportBufferSize < message_size) { return LIBSPDM_STATUS_BUFFER_TOO_SMALL; } // Fill transport buffer with message and send - g_pendingResponseSize = g_transportBufferSize; - portMemCopy(g_pTransportBuffer, g_transportBufferSize, message, message_size); + pSpdm->pendingResponseSize = pSpdm->transportBufferSize; + portMemCopy(pSpdm->pTransportBuffer, pSpdm->transportBufferSize, message, message_size); - nvStatus = spdmMessageProcess_HAL(g_pGpu, g_pSpdm, - g_pTransportBuffer, message_size, - g_pTransportBuffer, &g_pendingResponseSize); + nvStatus = spdmMessageProcess_HAL(pGpu, pSpdm, + pSpdm->pTransportBuffer, message_size, + pSpdm->pTransportBuffer, &pSpdm->pendingResponseSize); if (nvStatus != NV_OK) { spdmStatus = LIBSPDM_STATUS_SEND_FAIL; @@ -603,7 +650,7 @@ _spdmSendMessageGsp if (spdmStatus != LIBSPDM_STATUS_SUCCESS) { // If message failed, size is cleared to indicate no response pending - g_pendingResponseSize = 0; + pSpdm->pendingResponseSize = 0; } return spdmStatus; @@ -623,7 +670,9 @@ _spdmReceiveMessageGsp uint64_t timeout ) { - libspdm_return_t spdmStatus = LIBSPDM_STATUS_SUCCESS; + libspdm_return_t spdmStatus = LIBSPDM_STATUS_SUCCESS; + Spdm *pSpdm = NULL; + size_t dataSize = sizeof(void *); // Check libspdm parameters. if (message_size == NULL || message == NULL || *message == NULL) @@ -631,25 +680,36 @@ _spdmReceiveMessageGsp return LIBSPDM_STATUS_INVALID_PARAMETER; } - if (g_pGpu == NULL || g_pSpdm == NULL) + spdmStatus = libspdm_get_data(spdm_context, LIBSPDM_DATA_APP_CONTEXT_DATA, + NULL, (void *)&pSpdm, &dataSize); + + if (spdmStatus != LIBSPDM_STATUS_SUCCESS) { - return LIBSPDM_STATUS_INVALID_STATE_LOCAL; + NV_PRINTF(LEVEL_ERROR, " spdmStatus != LIBSPDM_STATUS_SUCCESS \n "); + return spdmStatus; } + if (pSpdm == NULL) + { + NV_PRINTF(LEVEL_ERROR, " pSpdm == NULL, SPDM context probably corrupted !! \n "); + return LIBSPDM_STATUS_INVALID_STATE_LOCAL; + } // Basic validation to ensure we have a real response. - if (g_pendingResponseSize == 0 || g_pendingResponseSize > *message_size) + if (pSpdm->pendingResponseSize == 0 || + pSpdm->pendingResponseSize > *message_size) { spdmStatus = LIBSPDM_STATUS_RECEIVE_FAIL; goto ErrorExit; } - portMemCopy(*message, *message_size, g_pTransportBuffer, g_pendingResponseSize); - *message_size = g_pendingResponseSize; + portMemCopy(*message, *message_size, + pSpdm->pTransportBuffer, pSpdm->pendingResponseSize); + *message_size = pSpdm->pendingResponseSize; ErrorExit: // Ensure size is cleared to indicate no response pending in buffer - g_pendingResponseSize = 0; + pSpdm->pendingResponseSize = 0; return spdmStatus; } @@ -673,18 +733,14 @@ spdmDeviceInit_GH100 return NV_ERR_INVALID_ARGUMENT; } - g_pGpu = pGpu; - g_pSpdm = pSpdm; - g_pendingResponseSize = 0; - g_pTransportBuffer = portMemAllocNonPaged(pSpdm->payloadBufferSize); - - if (g_pTransportBuffer == NULL) + pSpdm->pendingResponseSize = 0; + pSpdm->pTransportBuffer = portMemAllocNonPaged(pSpdm->payloadBufferSize); + if (pSpdm->pTransportBuffer == NULL) { - g_transportBufferSize = 0; + pSpdm->transportBufferSize = 0; return NV_ERR_NO_MEMORY; } - - g_transportBufferSize = pSpdm->payloadBufferSize; + pSpdm->transportBufferSize = pSpdm->payloadBufferSize; // Register transport layer functionality with library. 
libspdm_register_transport_layer_func(pSpdm->pLibspdmContext, @@ -703,7 +759,6 @@ spdmDeviceInit_GH100 return NV_OK; } - /*! * To deinitialize the GSP SPDM Responder, we need to release the surface for * SPDM communication. GSP-RM will handle the rest. @@ -717,10 +772,10 @@ spdmDeviceDeinit_GH100 ) { // Just-in-case, portMemFree handles NULL. - portMemFree(g_pTransportBuffer); - g_pTransportBuffer = NULL; - g_transportBufferSize = 0; - g_pendingResponseSize = 0; + portMemFree(pSpdm->pTransportBuffer); + pSpdm->pTransportBuffer = NULL; + pSpdm->transportBufferSize = 0; + pSpdm->pendingResponseSize = 0; return NV_OK; } diff --git a/src/nvidia/src/kernel/gpu/spdm/spdm.c b/src/nvidia/src/kernel/gpu/spdm/spdm.c index a8b415cc8..0b1543499 100644 --- a/src/nvidia/src/kernel/gpu/spdm/spdm.c +++ b/src/nvidia/src/kernel/gpu/spdm/spdm.c @@ -432,6 +432,11 @@ spdmContextInit_IMPL libspdm_init_msg_log(pSpdm->pLibspdmContext, pSpdm->pMsgLog, pSpdm->msgLogMaxSize); + + // Store SPDM object pointer to libspdm context + CHECK_SPDM_STATUS(libspdm_set_data(pSpdm->pLibspdmContext, LIBSPDM_DATA_APP_CONTEXT_DATA, + NULL, (void *)&pSpdm, sizeof(void *))); + // // Perform any device-specific initialization. spdmDeviceInit is also // responsible for registering transport layer functions with libspdm. diff --git a/src/nvidia/src/kernel/mem_mgr/mem_export.c b/src/nvidia/src/kernel/mem_mgr/mem_export.c index 7684b26eb..30d98ee55 100644 --- a/src/nvidia/src/kernel/mem_mgr/mem_export.c +++ b/src/nvidia/src/kernel/mem_mgr/mem_export.c @@ -606,7 +606,8 @@ _memoryexportVerifyMem if (pGpu == NULL) return NV_OK; - if (pKernelMIGGpuInstance != NULL) + // MIG is about vidmem partitioning, so limit the check. + if ((pKernelMIGGpuInstance != NULL) && (addrSpace == ADDR_FBMEM)) { if ((pKernelMIGGpuInstance->pMemoryPartitionHeap != pSrcMemory->pHeap)) return NV_ERR_INVALID_OBJECT_PARENT; diff --git a/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c b/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c index 8dc9cf040..adf9923d0 100644 --- a/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c +++ b/src/nvidia/src/kernel/virtualization/kernel_vgpu_mgr.c @@ -1396,15 +1396,9 @@ NvU32 kvgpumgrGetPgpuSubdevIdEncoding(OBJGPU *pGpu, NvU8 *pgpuString, return NV_U32_MAX; } - switch (chipID) - { - default: - // The encoding of the subdevice ID is its value converted to string - bytes = NvU32ToAsciiStr(subID, SUBDEVID_ENCODED_VALUE_SIZE, + // The encoding of the subdevice ID is its value converted to string + bytes = NvU32ToAsciiStr(subID, SUBDEVID_ENCODED_VALUE_SIZE, pgpuString, NV_FALSE); - break; - } - return bytes; } diff --git a/version.mk b/version.mk index 58b18ce38..220eb34bc 100644 --- a/version.mk +++ b/version.mk @@ -1,4 +1,4 @@ -NVIDIA_VERSION = 550.54.15 +NVIDIA_VERSION = 550.67 # This file. VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))
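The SPDM refactor in this patch replaces the g_pGpu/g_pSpdm/g_pTransportBuffer statics with per-object state: spdmContextInit_IMPL() parks the Spdm pointer in the libspdm context via libspdm_set_data(LIBSPDM_DATA_APP_CONTEXT_DATA, ...), and every transport callback recovers it with libspdm_get_data(), reaching the GPU through ENG_GET_GPU(). A condensed sketch of the retrieval side as a hypothetical callback, with error handling collapsed:

// Hypothetical callback body showing how the per-GPU objects are recovered from
// the libspdm context instead of from static globals.
static libspdm_return_t example_transport_callback(void *spdm_context)
{
    Spdm   *pSpdm    = NULL;
    OBJGPU *pGpu     = NULL;
    size_t  dataSize = sizeof(void *);

    // The Spdm pointer itself is the stored payload, so it is read by address.
    if (libspdm_get_data(spdm_context, LIBSPDM_DATA_APP_CONTEXT_DATA,
                         NULL, (void *)&pSpdm, &dataSize) != LIBSPDM_STATUS_SUCCESS ||
        pSpdm == NULL)
    {
        return LIBSPDM_STATUS_INVALID_STATE_LOCAL;
    }

    pGpu = ENG_GET_GPU(pSpdm);
    if (pGpu == NULL)
        return LIBSPDM_STATUS_INVALID_STATE_LOCAL;

    // ... use pSpdm->pTransportBuffer / pSpdm->pendingResponseSize exactly as the
    //     real _spdmSendMessageGsp()/_spdmReceiveMessageGsp() callbacks do ...
    return LIBSPDM_STATUS_SUCCESS;
}

Because the stored value is the pointer itself, the store side passes &pSpdm with size sizeof(void *), matching the libspdm_set_data() call added to spdmContextInit_IMPL() above.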