545.23.06

Andy Ritger 2023-10-17 09:25:29 -07:00
parent f59818b751
commit b5bf85a8e3
917 changed files with 132480 additions and 110015 deletions

View File

@ -1,5 +1,17 @@
# Changelog
## Release 545 Entries
### [545.23.06] 2023-10-17
#### Fixed
- Fix always-false conditional, [#493](https://github.com/NVIDIA/open-gpu-kernel-modules/pull/493) by @meme8383
#### Added
- Added beta-quality support for GeForce and Workstation GPUs. Please see the "Open Linux Kernel Modules" chapter in the NVIDIA GPU driver end user README for details.
## Release 535 Entries
### [535.113.01] 2023-09-21

View File

@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source
This is the source release of the NVIDIA Linux open GPU kernel modules,
version 535.113.01.
version 545.23.06.
## How to Build
@ -17,7 +17,7 @@ as root:
Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
535.113.01 driver release. This can be achieved by installing
545.23.06 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,
@ -179,16 +179,16 @@ software applications.
## Compatible GPUs
The open-gpu-kernel-modules can be used on any Turing or later GPU
(see the table below). However, in the 535.113.01 release,
GeForce and Workstation support is still considered alpha-quality.
The NVIDIA open kernel modules can be used on any Turing or later GPU
(see the table below). However, in the __DRIVER_VERSION__ release, GeForce and
Workstation support is considered to be Beta quality. The open kernel modules
are suitable for broad usage, and NVIDIA requests feedback on any issues
encountered specific to them.
To enable use of the open kernel modules on GeForce and Workstation GPUs,
set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
parameter to 1. For more details, see the NVIDIA GPU driver end user
README here:
For details on feature support and limitations, see the NVIDIA GPU driver
end user README here:
https://us.download.nvidia.com/XFree86/Linux-x86_64/535.113.01/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/545.23.06/README/kernel_open.html
In the below table, if three IDs are listed, the first is the PCI Device
ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI

View File

@ -72,12 +72,24 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.113.01\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"545.23.06\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
endif
# Some Android kernels prohibit driver use of filesystem functions like
# filp_open() and kernel_read(). Disable the NV_FILESYSTEM_ACCESS_AVAILABLE
# functionality that uses those functions when building for Android.
PLATFORM_IS_ANDROID ?= 0
ifeq ($(PLATFORM_IS_ANDROID),1)
EXTRA_CFLAGS += -DNV_FILESYSTEM_ACCESS_AVAILABLE=0
else
EXTRA_CFLAGS += -DNV_FILESYSTEM_ACCESS_AVAILABLE=1
endif
EXTRA_CFLAGS += -Wno-unused-function
ifneq ($(NV_BUILD_TYPE),debug)
@ -92,7 +104,6 @@ endif
ifeq ($(NV_BUILD_TYPE),debug)
EXTRA_CFLAGS += -g
EXTRA_CFLAGS += $(call cc-option,-gsplit-dwarf,)
endif
EXTRA_CFLAGS += -ffreestanding
@ -214,6 +225,7 @@ $(obj)/conftest/patches.h: $(NV_CONFTEST_SCRIPT)
NV_HEADER_PRESENCE_TESTS = \
asm/system.h \
drm/drmP.h \
drm/drm_aperture.h \
drm/drm_auth.h \
drm/drm_gem.h \
drm/drm_crtc.h \
@ -224,6 +236,7 @@ NV_HEADER_PRESENCE_TESTS = \
drm/drm_encoder.h \
drm/drm_atomic_uapi.h \
drm/drm_drv.h \
drm/drm_fbdev_generic.h \
drm/drm_framebuffer.h \
drm/drm_connector.h \
drm/drm_probe_helper.h \
@ -257,6 +270,7 @@ NV_HEADER_PRESENCE_TESTS = \
linux/sched/task_stack.h \
xen/ioemu.h \
linux/fence.h \
linux/dma-fence.h \
linux/dma-resv.h \
soc/tegra/chip-id.h \
soc/tegra/fuse.h \
@ -302,6 +316,7 @@ NV_HEADER_PRESENCE_TESTS = \
linux/mdev.h \
soc/tegra/bpmp-abi.h \
soc/tegra/bpmp.h \
linux/sync_file.h \
linux/cc_platform.h \
asm/cpufeature.h

View File

@ -0,0 +1,43 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _NV_CHARDEV_NUMBERS_H_
#define _NV_CHARDEV_NUMBERS_H_
// NVIDIA's reserved major character device number (Linux).
#define NV_MAJOR_DEVICE_NUMBER 195
// Minor numbers 0 to 247 reserved for regular devices
#define NV_MINOR_DEVICE_NUMBER_REGULAR_MAX 247
// Minor numbers 248 to 253 currently unused
// Minor number 254 reserved for the modeset device (provided by NVKMS)
#define NV_MINOR_DEVICE_NUMBER_MODESET_DEVICE 254
// Minor number 255 reserved for the control device
#define NV_MINOR_DEVICE_NUMBER_CONTROL_DEVICE 255
#endif // _NV_CHARDEV_NUMBERS_H_
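
A minimal user-space sketch of how these reserved numbers partition the minor space under major 195. The constants are mirrored locally so the snippet stands alone, and the /dev node names mentioned in the comments are the conventional ones, not something stated by this header.

#include <stdio.h>

/* Mirrors of the definitions above, so this sketch compiles standalone. */
#define NV_MAJOR_DEVICE_NUMBER                195
#define NV_MINOR_DEVICE_NUMBER_REGULAR_MAX    247
#define NV_MINOR_DEVICE_NUMBER_MODESET_DEVICE 254
#define NV_MINOR_DEVICE_NUMBER_CONTROL_DEVICE 255

/* Classify a minor number according to the layout described above. */
static const char *nv_minor_kind(unsigned int minor)
{
    if (minor <= NV_MINOR_DEVICE_NUMBER_REGULAR_MAX)
        return "regular GPU device (conventionally /dev/nvidiaN)";
    if (minor == NV_MINOR_DEVICE_NUMBER_MODESET_DEVICE)
        return "modeset device (provided by NVKMS)";
    if (minor == NV_MINOR_DEVICE_NUMBER_CONTROL_DEVICE)
        return "control device (conventionally /dev/nvidiactl)";
    return "currently unused";
}

int main(void)
{
    const unsigned int minors[] = { 0, 247, 250, 254, 255 };
    for (unsigned int i = 0; i < sizeof(minors) / sizeof(minors[0]); i++)
        printf("major %d, minor %u: %s\n",
               NV_MAJOR_DEVICE_NUMBER, minors[i], nv_minor_kind(minors[i]));
    return 0;
}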

View File

@ -25,14 +25,12 @@
#ifndef NV_IOCTL_NUMA_H
#define NV_IOCTL_NUMA_H
#if defined(NV_LINUX)
#include <nv-ioctl-numbers.h>
#if defined(NV_KERNEL_INTERFACE_LAYER)
#if defined(NV_KERNEL_INTERFACE_LAYER) && defined(NV_LINUX)
#include <linux/types.h>
#elif defined (NV_KERNEL_INTERFACE_LAYER) && defined(NV_BSD)
#include <sys/stdint.h>
#else
#include <stdint.h>
@ -81,5 +79,3 @@ typedef struct nv_ioctl_set_numa_status
#define NV_IOCTL_NUMA_STATUS_OFFLINE_FAILED 6
#endif
#endif

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2012-2013 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -21,27 +21,42 @@
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _NV_FRONTEND_H_
#define _NV_FRONTEND_H_
#ifndef __NV_KTHREAD_QUEUE_OS_H__
#define __NV_KTHREAD_QUEUE_OS_H__
#include "nvtypes.h"
#include "nv-linux.h"
#include "nv-register-module.h"
#include <linux/types.h> // atomic_t
#include <linux/list.h> // list
#include <linux/sched.h> // task_struct
#include <linux/numa.h> // NUMA_NO_NODE
#include <linux/semaphore.h>
#define NV_MAX_MODULE_INSTANCES 8
#include "conftest.h"
#define NV_FRONTEND_MINOR_NUMBER(x) minor((x)->i_rdev)
struct nv_kthread_q
{
struct list_head q_list_head;
spinlock_t q_lock;
#define NV_FRONTEND_CONTROL_DEVICE_MINOR_MAX 255
#define NV_FRONTEND_CONTROL_DEVICE_MINOR_MIN (NV_FRONTEND_CONTROL_DEVICE_MINOR_MAX - \
NV_MAX_MODULE_INSTANCES)
// This is a counting semaphore. It gets incremented and decremented
// exactly once for each item that is added to the queue.
struct semaphore q_sem;
atomic_t main_loop_should_exit;
#define NV_FRONTEND_IS_CONTROL_DEVICE(x) ((x <= NV_FRONTEND_CONTROL_DEVICE_MINOR_MAX) && \
(x > NV_FRONTEND_CONTROL_DEVICE_MINOR_MIN))
struct task_struct *q_kthread;
};
int nvidia_frontend_add_device(nvidia_module_t *, nv_linux_state_t *);
int nvidia_frontend_remove_device(nvidia_module_t *, nv_linux_state_t *);
struct nv_kthread_q_item
{
struct list_head q_list_node;
nv_q_func_t function_to_run;
void *function_args;
};
extern nvidia_module_t *nv_minor_num_table[];
#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
#endif
#define NV_KTHREAD_NO_NODE NUMA_NO_NODE
#endif

View File

@ -24,13 +24,14 @@
#ifndef __NV_KTHREAD_QUEUE_H__
#define __NV_KTHREAD_QUEUE_H__
#include <linux/types.h> // atomic_t
#include <linux/list.h> // list
#include <linux/sched.h> // task_struct
#include <linux/numa.h> // NUMA_NO_NODE
#include <linux/semaphore.h>
struct nv_kthread_q;
struct nv_kthread_q_item;
typedef struct nv_kthread_q nv_kthread_q_t;
typedef struct nv_kthread_q_item nv_kthread_q_item_t;
#include "conftest.h"
typedef void (*nv_q_func_t)(void *args);
#include "nv-kthread-q-os.h"
////////////////////////////////////////////////////////////////////////////////
// nv_kthread_q:
@ -85,38 +86,6 @@
//
////////////////////////////////////////////////////////////////////////////////
typedef struct nv_kthread_q nv_kthread_q_t;
typedef struct nv_kthread_q_item nv_kthread_q_item_t;
typedef void (*nv_q_func_t)(void *args);
struct nv_kthread_q
{
struct list_head q_list_head;
spinlock_t q_lock;
// This is a counting semaphore. It gets incremented and decremented
// exactly once for each item that is added to the queue.
struct semaphore q_sem;
atomic_t main_loop_should_exit;
struct task_struct *q_kthread;
};
struct nv_kthread_q_item
{
struct list_head q_list_node;
nv_q_func_t function_to_run;
void *function_args;
};
#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
#endif
#define NV_KTHREAD_NO_NODE NUMA_NO_NODE
//
// The queue must not be used before calling this routine.
//
@ -155,10 +124,7 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q,
// This routine is the same as nv_kthread_q_init_on_node() with the exception
// that the queue stack will be allocated on the NUMA node of the caller.
//
static inline int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname)
{
return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE);
}
int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname);
//
// The caller is responsible for stopping all queues, by calling this routine
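
A hypothetical consumer of this queue, sketched against the declarations above. Only nv_kthread_q_init() and nv_kthread_q_init_on_node() are visible in this excerpt; the item-init, schedule, flush, and stop helpers used below are assumed from the rest of the nv-kthread-q API, and the header name in the include is likewise an assumption.

/*
 * Hypothetical usage sketch. The helpers other than nv_kthread_q_init()
 * are assumed, not shown in this excerpt, and their signatures may differ
 * in the actual header.
 */
#include "nv-kthread-q.h"   /* assumed header name */

static void my_deferred_work(void *args)
{
    /* Runs on the queue's dedicated kernel thread. */
    long *counter = args;
    (*counter)++;
}

static int nv_kthread_q_example(void)
{
    static nv_kthread_q_t q;
    static nv_kthread_q_item_t item;
    static long counter;
    int ret;

    /* Spawns the backing kthread; the queue must not be used before this. */
    ret = nv_kthread_q_init(&q, "nv_example_q");
    if (ret != 0)
        return ret;

    /* Assumed helpers: bind a callback to an item, then enqueue the item. */
    nv_kthread_q_item_init(&item, my_deferred_work, &counter);
    nv_kthread_q_schedule_q_item(&q, &item);

    /* Assumed helpers: wait for queued items, then stop the kthread. */
    nv_kthread_q_flush(&q);
    nv_kthread_q_stop(&q);

    return 0;
}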

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2001-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2001-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -248,7 +248,7 @@ NV_STATUS nvos_forward_error_to_cray(struct pci_dev *, NvU32,
#undef NV_SET_PAGES_UC_PRESENT
#endif
#if !defined(NVCPU_AARCH64) && !defined(NVCPU_PPC64LE)
#if !defined(NVCPU_AARCH64) && !defined(NVCPU_PPC64LE) && !defined(NVCPU_RISCV64)
#if !defined(NV_SET_MEMORY_UC_PRESENT) && !defined(NV_SET_PAGES_UC_PRESENT)
#error "This driver requires the ability to change memory types!"
#endif
@ -430,6 +430,11 @@ extern NvBool nvos_is_chipset_io_coherent(void);
#define CACHE_FLUSH() asm volatile("sync; \n" \
"isync; \n" ::: "memory")
#define WRITE_COMBINE_FLUSH() CACHE_FLUSH()
#elif defined(NVCPU_RISCV64)
#define CACHE_FLUSH() mb()
#define WRITE_COMBINE_FLUSH() CACHE_FLUSH()
#else
#error "CACHE_FLUSH() and WRITE_COMBINE_FLUSH() need to be defined for this architecture."
#endif
typedef enum
@ -440,7 +445,7 @@ typedef enum
NV_MEMORY_TYPE_DEVICE_MMIO, /* All kinds of MMIO referred by NVRM e.g. BARs and MCFG of device */
} nv_memory_type_t;
#if defined(NVCPU_AARCH64) || defined(NVCPU_PPC64LE)
#if defined(NVCPU_AARCH64) || defined(NVCPU_PPC64LE) || defined(NVCPU_RISCV64)
#define NV_ALLOW_WRITE_COMBINING(mt) 1
#elif defined(NVCPU_X86_64)
#if defined(NV_ENABLE_PAT_SUPPORT)
@ -753,7 +758,6 @@ static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
#define NV_VMA_FILE(vma) ((vma)->vm_file)
#define NV_DEVICE_MINOR_NUMBER(x) minor((x)->i_rdev)
#define NV_CONTROL_DEVICE_MINOR 255
#define NV_PCI_DISABLE_DEVICE(pci_dev) \
{ \
@ -1646,20 +1650,11 @@ typedef struct nvidia_event
nv_event_t event;
} nvidia_event_t;
typedef enum
{
NV_FOPS_STACK_INDEX_MMAP,
NV_FOPS_STACK_INDEX_IOCTL,
NV_FOPS_STACK_INDEX_COUNT
} nvidia_entry_point_index_t;
typedef struct
{
nv_file_private_t nvfp;
nvidia_stack_t *sp;
nvidia_stack_t *fops_sp[NV_FOPS_STACK_INDEX_COUNT];
struct semaphore fops_sp_lock[NV_FOPS_STACK_INDEX_COUNT];
nv_alloc_t *free_list;
void *nvptr;
nvidia_event_t *event_data_head, *event_data_tail;
@ -1689,28 +1684,6 @@ static inline nv_linux_file_private_t *nv_get_nvlfp_from_nvfp(nv_file_private_t
#define NV_STATE_PTR(nvl) &(((nv_linux_state_t *)(nvl))->nv_state)
static inline nvidia_stack_t *nv_nvlfp_get_sp(nv_linux_file_private_t *nvlfp, nvidia_entry_point_index_t which)
{
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
down(&nvlfp->fops_sp_lock[which]);
return nvlfp->fops_sp[which];
}
#endif
return NULL;
}
static inline void nv_nvlfp_put_sp(nv_linux_file_private_t *nvlfp, nvidia_entry_point_index_t which)
{
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
up(&nvlfp->fops_sp_lock[which]);
}
#endif
}
#define NV_ATOMIC_READ(data) atomic_read(&(data))
#define NV_ATOMIC_SET(data,val) atomic_set(&(data), (val))
#define NV_ATOMIC_INC(data) atomic_inc(&(data))

View File

@ -119,6 +119,13 @@ static inline pgprot_t pgprot_modify_writecombine(pgprot_t old_prot)
#define NV_PGPROT_WRITE_COMBINED(old_prot) old_prot
#define NV_PGPROT_READ_ONLY(old_prot) \
__pgprot(pgprot_val((old_prot)) & ~NV_PAGE_RW)
#elif defined(NVCPU_RISCV64)
#define NV_PGPROT_WRITE_COMBINED_DEVICE(old_prot) \
pgprot_writecombine(old_prot)
/* Don't attempt to mark sysmem pages as write combined on riscv */
#define NV_PGPROT_WRITE_COMBINED(old_prot) old_prot
#define NV_PGPROT_READ_ONLY(old_prot) \
__pgprot(pgprot_val((old_prot)) & ~_PAGE_WRITE)
#else
/* Writecombine is not supported */
#undef NV_PGPROT_WRITE_COMBINED_DEVICE(old_prot)

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -25,10 +25,8 @@
#define _NV_PROTO_H_
#include "nv-pci.h"
#include "nv-register-module.h"
extern const char *nv_device_name;
extern nvidia_module_t nv_fops;
void nv_acpi_register_notifier (nv_linux_state_t *);
void nv_acpi_unregister_notifier (nv_linux_state_t *);
@ -86,7 +84,7 @@ void nv_shutdown_adapter(nvidia_stack_t *, nv_state_t *, nv_linux_state
void nv_dev_free_stacks(nv_linux_state_t *);
NvBool nv_lock_init_locks(nvidia_stack_t *, nv_state_t *);
void nv_lock_destroy_locks(nvidia_stack_t *, nv_state_t *);
void nv_linux_add_device_locked(nv_linux_state_t *);
int nv_linux_add_device_locked(nv_linux_state_t *);
void nv_linux_remove_device_locked(nv_linux_state_t *);
NvBool nv_acpi_power_resource_method_present(struct pci_dev *);

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -42,6 +42,7 @@
#include <nv-caps.h>
#include <nv-firmware.h>
#include <nv-ioctl.h>
#include <nv-ioctl-numa.h>
#include <nvmisc.h>
extern nv_cap_t *nvidia_caps_root;
@ -50,9 +51,6 @@ extern const NvBool nv_is_rm_firmware_supported_os;
#include <nv-kernel-interface-api.h>
/* NVIDIA's reserved major character device number (Linux). */
#define NV_MAJOR_DEVICE_NUMBER 195
#define GPU_UUID_LEN (16)
/*
@ -478,8 +476,6 @@ typedef struct nv_state_t
/* Bool to check if dma-buf is supported */
NvBool dma_buf_supported;
NvBool printed_openrm_enable_unsupported_gpus_error;
/* Check if NVPCF DSM function is implemented under NVPCF or GPU device scope */
NvBool nvpcf_dsm_in_gpu_scope;
@ -505,6 +501,7 @@ struct nv_file_private_t
NvHandle *handles;
NvU16 maxHandles;
NvU32 deviceInstance;
NvU32 gpuInstanceId;
NvU8 metadata[64];
nv_file_private_t *ctl_nvfp;
@ -765,7 +762,7 @@ nv_state_t* NV_API_CALL nv_get_ctl_state (void);
void NV_API_CALL nv_set_dma_address_size (nv_state_t *, NvU32 );
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvU64, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_free_pages (nv_state_t *, NvU32, NvBool, NvU32, void *);
NV_STATUS NV_API_CALL nv_register_user_pages (nv_state_t *, NvU64, NvU64 *, void *, void **);
@ -981,7 +978,7 @@ NV_STATUS NV_API_CALL rm_dma_buf_dup_mem_handle (nvidia_stack_t *, nv_state_t
void NV_API_CALL rm_dma_buf_undup_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle);
NV_STATUS NV_API_CALL rm_dma_buf_map_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvU64, NvU64, void *, nv_phys_addr_range_t **, NvU32 *);
void NV_API_CALL rm_dma_buf_unmap_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvU64, nv_phys_addr_range_t **, NvU32);
NV_STATUS NV_API_CALL rm_dma_buf_get_client_and_device(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle *, NvHandle *, NvHandle *, void **, NvBool *);
NV_STATUS NV_API_CALL rm_dma_buf_get_client_and_device(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle *, NvHandle *, NvHandle *, void **, NvBool *);
void NV_API_CALL rm_dma_buf_put_client_and_device(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle, void *);
NV_STATUS NV_API_CALL rm_log_gpu_crash (nv_stack_t *, nv_state_t *);
@ -993,7 +990,7 @@ NvBool NV_API_CALL rm_gpu_need_4k_page_isolation(nv_state_t *);
NvBool NV_API_CALL rm_is_chipset_io_coherent(nv_stack_t *);
NvBool NV_API_CALL rm_init_event_locks(nvidia_stack_t *, nv_state_t *);
void NV_API_CALL rm_destroy_event_locks(nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_get_gpu_numa_info(nvidia_stack_t *, nv_state_t *, NvS32 *, NvU64 *, NvU64 *, NvU64 *, NvU32 *);
NV_STATUS NV_API_CALL rm_get_gpu_numa_info(nvidia_stack_t *, nv_state_t *, nv_ioctl_numa_info_t *);
NV_STATUS NV_API_CALL rm_gpu_numa_online(nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_gpu_numa_offline(nvidia_stack_t *, nv_state_t *);
NvBool NV_API_CALL rm_is_device_sequestered(nvidia_stack_t *, nv_state_t *);
@ -1008,7 +1005,7 @@ void NV_API_CALL rm_cleanup_dynamic_power_management(nvidia_stack_t *, nv_
void NV_API_CALL rm_enable_dynamic_power_management(nvidia_stack_t *, nv_state_t *);
NV_STATUS NV_API_CALL rm_ref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
void NV_API_CALL rm_unref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
NV_STATUS NV_API_CALL rm_transition_dynamic_power(nvidia_stack_t *, nv_state_t *, NvBool);
NV_STATUS NV_API_CALL rm_transition_dynamic_power(nvidia_stack_t *, nv_state_t *, NvBool, NvBool *);
const char* NV_API_CALL rm_get_vidmem_power_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_dynamic_power_management_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_gpu_gcx_support(nvidia_stack_t *, nv_state_t *, NvBool);
@ -1023,7 +1020,8 @@ NV_STATUS NV_API_CALL nv_vgpu_create_request(nvidia_stack_t *, nv_state_t *, c
NV_STATUS NV_API_CALL nv_vgpu_delete(nvidia_stack_t *, const NvU8 *, NvU16);
NV_STATUS NV_API_CALL nv_vgpu_get_type_ids(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 *, NvBool, NvU8, NvBool);
NV_STATUS NV_API_CALL nv_vgpu_get_type_info(nvidia_stack_t *, nv_state_t *, NvU32, char *, int, NvU8);
NV_STATUS NV_API_CALL nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU32, void *);
NV_STATUS NV_API_CALL nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU32, void *, NvBool *);
NV_STATUS NV_API_CALL nv_vgpu_get_hbm_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU64 *);
NV_STATUS NV_API_CALL nv_vgpu_start(nvidia_stack_t *, const NvU8 *, void *, NvS32 *, NvU8 *, NvU32);
NV_STATUS NV_API_CALL nv_vgpu_get_sparse_mmap(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 **, NvU64 **, NvU32 *);
NV_STATUS NV_API_CALL nv_vgpu_process_vf_info(nvidia_stack_t *, nv_state_t *, NvU8, NvU32, NvU8, NvU8, NvU8, NvBool, void *);

View File

@ -86,7 +86,7 @@
/* Not currently implemented for MSVC/ARM64. See bug 3366890. */
# define nv_speculation_barrier()
# define speculation_barrier() nv_speculation_barrier()
#elif defined(NVCPU_NVRISCV64) && NVOS_IS_LIBOS
#elif defined(NVCPU_IS_RISCV64)
# define nv_speculation_barrier()
#else
#error "Unknown compiler/chip family"

View File

@ -104,6 +104,10 @@ typedef struct UvmGpuMemoryInfo_tag
// Out: Set to TRUE, if the allocation is in sysmem.
NvBool sysmem;
// Out: Set to TRUE, if this allocation is treated as EGM.
// sysmem is also TRUE when egm is TRUE.
NvBool egm;
// Out: Set to TRUE, if the allocation is constructed
// under a Device or Subdevice.
// All permutations of sysmem and deviceDescendant are valid.
@ -125,6 +129,8 @@ typedef struct UvmGpuMemoryInfo_tag
// Out: Uuid of the GPU to which the allocation belongs.
// This is only valid if deviceDescendant is NV_TRUE.
// When egm is NV_TRUE, this is also the UUID of the GPU
// for which EGM is local.
// Note: If the allocation is owned by a device in
// an SLI group and the allocation is broadcast
// across the SLI group, this UUID will be any one
@ -332,7 +338,7 @@ typedef struct UvmGpuPagingChannelAllocParams_tag
// The max number of Copy Engines supported by a GPU.
// The gpu ops build has a static assert that this is the correct number.
#define UVM_COPY_ENGINE_COUNT_MAX 10
#define UVM_COPY_ENGINE_COUNT_MAX 64
typedef struct
{
@ -566,11 +572,8 @@ typedef struct UvmPlatformInfo_tag
// Out: ATS (Address Translation Services) is supported
NvBool atsSupported;
// Out: True if HW trusted execution, such as AMD's SEV-SNP or Intel's TDX,
// is enabled in the VM, indicating that Confidential Computing must be
// also enabled in the GPU(s); these two security features are either both
// enabled, or both disabled.
NvBool confComputingEnabled;
// Out: AMD SEV (Secure Encrypted Virtualization) is enabled
NvBool sevEnabled;
} UvmPlatformInfo;
typedef struct UvmGpuClientInfo_tag
@ -683,6 +686,10 @@ typedef struct UvmGpuInfo_tag
// to NVSwitch peers.
NvBool connectedToSwitch;
NvU64 nvswitchMemoryWindowStart;
// local EGM properties
NvBool egmEnabled;
NvU8 egmPeerId;
} UvmGpuInfo;
typedef struct UvmGpuFbInfo_tag

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -45,6 +45,11 @@
#define NVKMS_DEVICE_ID_TEGRA 0x0000ffff
#define NVKMS_MAX_SUPERFRAME_VIEWS 4
#define NVKMS_LOG2_LUT_ARRAY_SIZE 10
#define NVKMS_LUT_ARRAY_SIZE (1 << NVKMS_LOG2_LUT_ARRAY_SIZE)
typedef NvU32 NvKmsDeviceHandle;
typedef NvU32 NvKmsDispHandle;
typedef NvU32 NvKmsConnectorHandle;
@ -179,6 +184,14 @@ enum NvKmsEventType {
NVKMS_EVENT_TYPE_FLIP_OCCURRED,
};
enum NvKmsFlipResult {
NV_KMS_FLIP_RESULT_SUCCESS = 0, /* Success */
NV_KMS_FLIP_RESULT_INVALID_PARAMS, /* Parameter validation failed */
NV_KMS_FLIP_RESULT_IN_PROGRESS, /* Flip would fail because an outstanding
flip containing changes that cannot be
queued is in progress */
};
typedef enum {
NV_EVO_SCALER_1TAP = 0,
NV_EVO_SCALER_2TAPS = 1,
@ -221,6 +234,16 @@ struct NvKmsUsageBounds {
} layer[NVKMS_MAX_LAYERS_PER_HEAD];
};
/*!
* Per-component arrays of NvU16s describing the LUT; used for both the input
* LUT and output LUT.
*/
struct NvKmsLutRamps {
NvU16 red[NVKMS_LUT_ARRAY_SIZE]; /*! in */
NvU16 green[NVKMS_LUT_ARRAY_SIZE]; /*! in */
NvU16 blue[NVKMS_LUT_ARRAY_SIZE]; /*! in */
};
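
To make the layout concrete, a minimal sketch that fills a linear (identity) ramp into these arrays; mapping each of the NVKMS_LUT_ARRAY_SIZE entries onto the full NvU16 range is an assumption made for illustration, not something this header specifies.

/* Illustrative only: populate an identity ramp across all LUT entries. */
static void fill_identity_lut(struct NvKmsLutRamps *ramps)
{
    for (NvU32 i = 0; i < NVKMS_LUT_ARRAY_SIZE; i++) {
        /* Scale index 0..1023 onto 0..0xFFFF (assumed interpretation). */
        const NvU16 v = (NvU16)((i * 0xFFFF) / (NVKMS_LUT_ARRAY_SIZE - 1));

        ramps->red[i]   = v;
        ramps->green[i] = v;
        ramps->blue[i]  = v;
    }
}
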
/*
* A 3x4 row-major colorspace conversion matrix.
*
@ -531,6 +554,18 @@ typedef struct {
NvBool noncoherent;
} NvKmsDispIOCoherencyModes;
enum NvKmsInputColorRange {
/*
* If DEFAULT is provided, driver will assume full range for RGB formats
* and limited range for YUV formats.
*/
NVKMS_INPUT_COLORRANGE_DEFAULT = 0,
NVKMS_INPUT_COLORRANGE_LIMITED = 1,
NVKMS_INPUT_COLORRANGE_FULL = 2,
};
enum NvKmsInputColorSpace {
/* Unknown colorspace; no de-gamma will be applied */
NVKMS_INPUT_COLORSPACE_NONE = 0,
@ -542,6 +577,12 @@ enum NvKmsInputColorSpace {
NVKMS_INPUT_COLORSPACE_BT2100_PQ = 2,
};
enum NvKmsOutputColorimetry {
NVKMS_OUTPUT_COLORIMETRY_DEFAULT = 0,
NVKMS_OUTPUT_COLORIMETRY_BT2100 = 1,
};
enum NvKmsOutputTf {
/*
* NVKMS itself won't apply any OETF (clients are still
@ -552,6 +593,17 @@ enum NvKmsOutputTf {
NVKMS_OUTPUT_TF_PQ = 2,
};
/*!
* EOTF Data Byte 1 as per CTA-861-G spec.
* This is expected to match exactly with the spec.
*/
enum NvKmsInfoFrameEOTF {
NVKMS_INFOFRAME_EOTF_SDR_GAMMA = 0,
NVKMS_INFOFRAME_EOTF_HDR_GAMMA = 1,
NVKMS_INFOFRAME_EOTF_ST2084 = 2,
NVKMS_INFOFRAME_EOTF_HLG = 3,
};
/*!
* HDR Static Metadata Type1 Descriptor as per CEA-861.3 spec.
* This is expected to match exactly with the spec.
@ -605,4 +657,29 @@ struct NvKmsHDRStaticMetadata {
NvU16 maxFALL;
};
/*!
* A superframe is made of two or more video streams that are combined in
* a specific way. A DP serializer (an external device connected to a Tegra
* ARM SOC over DP or HDMI) can receive a video stream comprising multiple
* videos combined into a single frame and then split it into multiple
* video streams. The following structure describes the number of views
* and dimensions of each view inside a superframe.
*/
struct NvKmsSuperframeInfo {
NvU8 numViews;
struct {
/* x offset inside superframe at which this view starts */
NvU16 x;
/* y offset inside superframe at which this view starts */
NvU16 y;
/* Horizontal active width in pixels for this view */
NvU16 width;
/* Vertical active height in lines for this view */
NvU16 height;
} view[NVKMS_MAX_SUPERFRAME_VIEWS];
};
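
As a small illustration (not part of NVKMS) of how the view descriptions above are meant to be read, the helper below checks that every view lies within a superframe of a given size, using only the fields defined in NvKmsSuperframeInfo.

/* Illustrative helper: do all views fit inside frameWidth x frameHeight? */
static NvBool views_fit_in_superframe(const struct NvKmsSuperframeInfo *info,
                                      NvU16 frameWidth, NvU16 frameHeight)
{
    for (NvU8 i = 0; i < info->numViews; i++) {
        const NvU32 right  = (NvU32)info->view[i].x + info->view[i].width;
        const NvU32 bottom = (NvU32)info->view[i].y + info->view[i].height;

        if (right > frameWidth || bottom > frameHeight) {
            return NV_FALSE;
        }
    }
    return NV_TRUE;
}
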
#endif /* NVKMS_API_TYPES_H */

View File

@ -49,6 +49,8 @@ struct NvKmsKapiDevice;
struct NvKmsKapiMemory;
struct NvKmsKapiSurface;
struct NvKmsKapiChannelEvent;
struct NvKmsKapiSemaphoreSurface;
struct NvKmsKapiSemaphoreSurfaceCallback;
typedef NvU32 NvKmsKapiConnector;
typedef NvU32 NvKmsKapiDisplay;
@ -67,6 +69,14 @@ typedef NvU32 NvKmsKapiDisplay;
*/
typedef void NvKmsChannelEventProc(void *dataPtr, NvU32 dataU32);
/*
* Note: Same as above, this function must not call back into NVKMS-KAPI, nor
* directly into RM. Doing so could cause deadlocks given the notification
* function will most likely be called from within RM's interrupt handler
* callchain.
*/
typedef void NvKmsSemaphoreSurfaceCallbackProc(void *pData);
/** @} */
/**
@ -126,6 +136,11 @@ struct NvKmsKapiDeviceResourcesInfo {
NvU32 validCursorCompositionModes;
NvU64 supportedCursorSurfaceMemoryFormats;
struct {
NvU64 maxSubmittedOffset;
NvU64 stride;
} semsurf;
struct {
NvU16 validRRTransforms;
NvU32 validCompositionModes;
@ -218,8 +233,10 @@ struct NvKmsKapiLayerConfig {
struct NvKmsRRParams rrParams;
struct NvKmsKapiSyncpt syncptParams;
struct NvKmsHDRStaticMetadata hdrMetadata;
NvBool hdrMetadataSpecified;
struct {
struct NvKmsHDRStaticMetadata val;
NvBool enabled;
} hdrMetadata;
enum NvKmsOutputTf tf;
@ -233,16 +250,21 @@ struct NvKmsKapiLayerConfig {
NvU16 dstWidth, dstHeight;
enum NvKmsInputColorSpace inputColorSpace;
struct NvKmsCscMatrix csc;
NvBool cscUseMain;
};
struct NvKmsKapiLayerRequestedConfig {
struct NvKmsKapiLayerConfig config;
struct {
NvBool surfaceChanged : 1;
NvBool srcXYChanged : 1;
NvBool srcWHChanged : 1;
NvBool dstXYChanged : 1;
NvBool dstWHChanged : 1;
NvBool surfaceChanged : 1;
NvBool srcXYChanged : 1;
NvBool srcWHChanged : 1;
NvBool dstXYChanged : 1;
NvBool dstWHChanged : 1;
NvBool cscChanged : 1;
NvBool tfChanged : 1;
NvBool hdrMetadataChanged : 1;
} flags;
};
@ -286,14 +308,41 @@ struct NvKmsKapiHeadModeSetConfig {
struct NvKmsKapiDisplayMode mode;
NvBool vrrEnabled;
struct {
NvBool enabled;
enum NvKmsInfoFrameEOTF eotf;
struct NvKmsHDRStaticMetadata staticMetadata;
} hdrInfoFrame;
enum NvKmsOutputColorimetry colorimetry;
struct {
struct {
NvBool specified;
NvU32 depth;
NvU32 start;
NvU32 end;
struct NvKmsLutRamps *pRamps;
} input;
struct {
NvBool specified;
NvBool enabled;
struct NvKmsLutRamps *pRamps;
} output;
} lut;
};
struct NvKmsKapiHeadRequestedConfig {
struct NvKmsKapiHeadModeSetConfig modeSetConfig;
struct {
NvBool activeChanged : 1;
NvBool displaysChanged : 1;
NvBool modeChanged : 1;
NvBool activeChanged : 1;
NvBool displaysChanged : 1;
NvBool modeChanged : 1;
NvBool hdrInfoFrameChanged : 1;
NvBool colorimetryChanged : 1;
NvBool lutChanged : 1;
} flags;
struct NvKmsKapiCursorRequestedConfig cursorRequestedConfig;
@ -318,6 +367,7 @@ struct NvKmsKapiHeadReplyConfig {
};
struct NvKmsKapiModeSetReplyConfig {
enum NvKmsFlipResult flipResult;
struct NvKmsKapiHeadReplyConfig
headReplyConfig[NVKMS_KAPI_MAX_HEADS];
};
@ -434,6 +484,12 @@ enum NvKmsKapiAllocationType {
NVKMS_KAPI_ALLOCATION_TYPE_OFFSCREEN = 2,
};
typedef enum NvKmsKapiRegisterWaiterResultRec {
NVKMS_KAPI_REG_WAITER_FAILED,
NVKMS_KAPI_REG_WAITER_SUCCESS,
NVKMS_KAPI_REG_WAITER_ALREADY_SIGNALLED,
} NvKmsKapiRegisterWaiterResult;
struct NvKmsKapiFunctionsTable {
/*!
@ -519,8 +575,8 @@ struct NvKmsKapiFunctionsTable {
);
/*!
* Revoke permissions previously granted. Only one (dispIndex, head,
* display) is currently supported.
* Revoke modeset permissions previously granted. Only one (dispIndex,
* head, display) is currently supported.
*
* \param [in] device A device returned by allocateDevice().
*
@ -537,6 +593,34 @@ struct NvKmsKapiFunctionsTable {
NvKmsKapiDisplay display
);
/*!
* Grant modeset sub-owner permissions to fd. This is used by clients to
* convert drm 'master' permissions into nvkms sub-owner permission.
*
* \param [in] fd fd from opening /dev/nvidia-modeset.
*
* \param [in] device A device returned by allocateDevice().
*
* \return NV_TRUE on success, NV_FALSE on failure.
*/
NvBool (*grantSubOwnership)
(
NvS32 fd,
struct NvKmsKapiDevice *device
);
/*!
* Revoke sub-owner permissions previously granted.
*
* \param [in] device A device returned by allocateDevice().
*
* \return NV_TRUE on success, NV_FALSE on failure.
*/
NvBool (*revokeSubOwnership)
(
struct NvKmsKapiDevice *device
);
/*!
* Registers for notification, via
* NvKmsKapiAllocateDeviceParams::eventCallback, of the events specified
@ -1122,6 +1206,199 @@ struct NvKmsKapiFunctionsTable {
NvP64 dmaBuf,
NvU32 limit);
/*!
* Import a semaphore surface allocated elsewhere to NVKMS and return a
* handle to the new object.
*
* \param [in] device A device allocated using allocateDevice().
*
* \param [in] nvKmsParamsUser Userspace pointer to driver-specific
* parameters describing the semaphore
* surface being imported.
*
* \param [in] nvKmsParamsSize Size of the driver-specific parameter
* struct.
*
* \param [out] pSemaphoreMap Returns a CPU mapping of the semaphore
* surface's semaphore memory to the client.
*
* \param [out] pMaxSubmittedMap Returns a CPU mapping of the semaphore
* surface's max-submitted-value tracking memory to the client.
*
* \return struct NvKmsKapiSemaphoreSurface* on success, NULL on failure.
*/
struct NvKmsKapiSemaphoreSurface* (*importSemaphoreSurface)
(
struct NvKmsKapiDevice *device,
NvU64 nvKmsParamsUser,
NvU64 nvKmsParamsSize,
void **pSemaphoreMap,
void **pMaxSubmittedMap
);
/*!
* Free an imported semaphore surface.
*
* \param [in] device The device passed to
* importSemaphoreSurface() when creating
* semaphoreSurface.
*
* \param [in] semaphoreSurface A semaphore surface returned by
* importSemaphoreSurface().
*/
void (*freeSemaphoreSurface)
(
struct NvKmsKapiDevice *device,
struct NvKmsKapiSemaphoreSurface *semaphoreSurface
);
/*!
* Register a callback to be called when a semaphore reaches a value.
*
* The callback will be called when the semaphore at index in
* semaphoreSurface reaches the value wait_value. The callback will
* be called at most once and is automatically unregistered when called.
* It may also be unregistered (i.e., cancelled) explicitly using the
* unregisterSemaphoreSurfaceCallback() function. To avoid leaking the
* memory used to track the registered callback, callers must ensure one
* of these methods of unregistration is used for every successful
* callback registration that returns a non-NULL pCallbackHandle.
*
* \param [in] device The device passed to
* importSemaphoreSurface() when creating
* semaphoreSurface.
*
* \param [in] semaphoreSurface A semaphore surface returned by
* importSemaphoreSurface().
*
* \param [in] pCallback A pointer to the function to call when
* the specified value is reached. NULL
* means no callback.
*
* \param [in] pData Arbitrary data to be passed back to the
* callback as its sole parameter.
*
* \param [in] index The index of the semaphore within
* semaphoreSurface.
*
* \param [in] wait_value The value the semaphore must reach or
* exceed before the callback is called.
*
* \param [in] new_value The value the semaphore will be set to
* when it reaches or exceeds <wait_value>.
* 0 means do not update the value.
*
* \param [out] pCallbackHandle On success, the value pointed to will
* contain an opaque handle to the
* registered callback that may be used to
* cancel it if needed. Unused if pCallback
* is NULL.
*
* \return NVKMS_KAPI_REG_WAITER_SUCCESS if the waiter was registered or if
* no callback was requested and the semaphore at <index> has
* already reached or exceeded <wait_value>
*
* NVKMS_KAPI_REG_WAITER_ALREADY_SIGNALLED if a callback was
* requested and the semaphore at <index> has already reached or
* exceeded <wait_value>
*
* NVKMS_KAPI_REG_WAITER_FAILED if waiter registration failed.
*/
NvKmsKapiRegisterWaiterResult
(*registerSemaphoreSurfaceCallback)
(
struct NvKmsKapiDevice *device,
struct NvKmsKapiSemaphoreSurface *semaphoreSurface,
NvKmsSemaphoreSurfaceCallbackProc *pCallback,
void *pData,
NvU64 index,
NvU64 wait_value,
NvU64 new_value,
struct NvKmsKapiSemaphoreSurfaceCallback **pCallbackHandle
);
/*!
* Unregister a callback registered via registerSemaphoreSurfaceCallback()
*
* If the callback has not yet been called, this function will cancel the
* callback and free its associated resources.
*
* Note this function treats the callback handle as a pointer. While this
* function does not dereference that pointer itself, the underlying call
* to RM does within a properly guarded critical section that first ensures
* it is not in the process of being used within a callback. This means
* the callstack must take into consideration that pointers are not in
* general unique handles if they may have been freed, since a subsequent
* malloc could return the same pointer value at that point. This callchain
* avoids that by leveraging the behavior of the underlying RM APIs:
*
* 1) A callback handle is referenced relative to its corresponding
* (semaphore surface, index, wait_value) tuple here and within RM. It
* is not a valid handle outside of that scope.
*
* 2) A callback can not be registered against an already-reached value
* for a given semaphore surface index.
*
* 3) A given callback handle can not be registered twice against the same
* (semaphore surface, index, wait_value) tuple, so unregistration will
* never race with registration at the RM level, and would only race at
* a higher level if used incorrectly. Since this is kernel code, we
* can safely assume there won't be malicious clients purposely misusing
* the API, but the burden is placed on the caller to ensure its usage
* does not lead to races at higher levels.
*
* These factors considered together ensure any valid registered handle is
* either still in the relevant waiter list and refers to the same event/
* callback as when it was registered, or has been removed from the list
* as part of a critical section that also destroys the list itself and
* makes future lookups in that list impossible, and hence eliminates the
* chance of comparing a stale handle with a new handle of the same value
* as part of a lookup.
*
* \param [in] device The device passed to
* importSemaphoreSurface() when creating
* semaphoreSurface.
*
* \param [in] semaphoreSurface The semaphore surface passed to
* registerSemaphoreSurfaceCallback() when
* registering the callback.
*
* \param [in] index The index passed to
* registerSemaphoreSurfaceCallback() when
* registering the callback.
*
* \param [in] wait_value The wait_value passed to
* registerSemaphoreSurfaceCallback() when
* registering the callback.
*
* \param [in] callbackHandle The callback handle returned by
* registerSemaphoreSurfaceCallback().
*/
NvBool
(*unregisterSemaphoreSurfaceCallback)
(
struct NvKmsKapiDevice *device,
struct NvKmsKapiSemaphoreSurface *semaphoreSurface,
NvU64 index,
NvU64 wait_value,
struct NvKmsKapiSemaphoreSurfaceCallback *callbackHandle
);
/*!
* Update the value of a semaphore surface from the CPU.
*
* Update the semaphore value at the specified index from the CPU, then
* wake up any pending CPU waiters associated with that index that are
* waiting on it reaching a value <= the new value.
*/
NvBool
(*setSemaphoreSurfaceValue)
(
struct NvKmsKapiDevice *device,
struct NvKmsKapiSemaphoreSurface *semaphoreSurface,
NvU64 index,
NvU64 new_value
);
};
/** @} */
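
A hypothetical caller-side flow tying together the semaphore-surface entry points documented above (importSemaphoreSurface, registerSemaphoreSurfaceCallback, unregisterSemaphoreSurfaceCallback, freeSemaphoreSurface). How the NvKmsKapiFunctionsTable pointer and the driver-specific import parameters are obtained is outside this excerpt, so the sketch simply takes them as arguments.

static void example_on_semaphore_reached(void *pData)
{
    /*
     * Runs from RM's interrupt callchain; per the note on
     * NvKmsSemaphoreSurfaceCallbackProc it must not call back into
     * NVKMS-KAPI or RM. Real code needs proper synchronization; a bare
     * flag write keeps the sketch short.
     */
    *(volatile NvBool *)pData = NV_TRUE;
}

static int example_register_and_cancel(
    const struct NvKmsKapiFunctionsTable *nvKms,
    struct NvKmsKapiDevice *device,
    NvU64 nvKmsParamsUser, NvU64 nvKmsParamsSize,
    NvU64 index, NvU64 wait_value)
{
    struct NvKmsKapiSemaphoreSurface *surf;
    struct NvKmsKapiSemaphoreSurfaceCallback *handle = NULL;
    static volatile NvBool signalled = NV_FALSE;
    void *semMap, *maxSubmittedMap;
    NvKmsKapiRegisterWaiterResult res;

    surf = nvKms->importSemaphoreSurface(device, nvKmsParamsUser,
                                         nvKmsParamsSize,
                                         &semMap, &maxSubmittedMap);
    if (surf == NULL)
        return -1;

    /* semMap / maxSubmittedMap could be used to read semaphore state directly. */
    (void)semMap;
    (void)maxSubmittedMap;

    res = nvKms->registerSemaphoreSurfaceCallback(
              device, surf, example_on_semaphore_reached, (void *)&signalled,
              index, wait_value, 0 /* don't update the value */, &handle);

    if (res == NVKMS_KAPI_REG_WAITER_ALREADY_SIGNALLED) {
        /* wait_value was already reached; no callback will fire. */
        signalled = NV_TRUE;
    } else if (res == NVKMS_KAPI_REG_WAITER_SUCCESS) {
        /*
         * If the caller later gives up before the callback fires, it must
         * cancel the registration so the tracking memory is not leaked.
         */
        nvKms->unregisterSemaphoreSurfaceCallback(device, surf, index,
                                                  wait_value, handle);
    }

    nvKms->freeSemaphoreSurface(device, surf);
    return (res == NVKMS_KAPI_REG_WAITER_FAILED) ? -1 : 0;
}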

View File

@ -162,7 +162,7 @@ NvBool NV_API_CALL os_is_vgx_hyper (void);
NV_STATUS NV_API_CALL os_inject_vgx_msi (NvU16, NvU64, NvU32);
NvBool NV_API_CALL os_is_grid_supported (void);
NvU32 NV_API_CALL os_get_grid_csp_support (void);
void NV_API_CALL os_get_screen_info (NvU64 *, NvU16 *, NvU16 *, NvU16 *, NvU16 *, NvU64, NvU64);
void NV_API_CALL os_get_screen_info (NvU64 *, NvU32 *, NvU32 *, NvU32 *, NvU32 *, NvU64, NvU64);
void NV_API_CALL os_bug_check (NvU32, const char *);
NV_STATUS NV_API_CALL os_lock_user_pages (void *, NvU64, void **, NvU32);
NV_STATUS NV_API_CALL os_lookup_user_io_memory (void *, NvU64, NvU64 **, void**);
@ -230,12 +230,14 @@ extern NvBool os_dma_buf_enabled;
* ---------------------------------------------------------------------------
*/
#define NV_DBG_INFO 0x0
#define NV_DBG_SETUP 0x1
#define NV_DBG_USERERRORS 0x2
#define NV_DBG_INFO 0x1
#define NV_DBG_SETUP 0x2
#define NV_DBG_WARNINGS 0x3
#define NV_DBG_ERRORS 0x4
#define NV_DBG_HW_ERRORS 0x5
#define NV_DBG_FATAL 0x6
#define NV_DBG_FORCE_LEVEL(level) ((level) | (1 << 8))
void NV_API_CALL out_string(const char *str);
int NV_API_CALL nv_printf(NvU32 debuglevel, const char *printf_format, ...);
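
A brief sketch of how the renumbered verbosity levels and the new NV_DBG_FORCE_LEVEL() macro might be used with the nv_printf() prototype above. The assumption that the force bit makes a message bypass the configured verbosity threshold is an inference from the macro's name, not something stated in this excerpt.

/* Illustrative only; relies on the prototypes and macros above. */
static void report_probe_result(int status)
{
    if (status != 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: probe failed, status %d\n", status);
    }
    else
    {
        /* Assumed: force this informational message past the verbosity filter. */
        nv_printf(NV_DBG_FORCE_LEVEL(NV_DBG_INFO), "NVRM: probe succeeded\n");
    }
}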

View File

@ -316,7 +316,7 @@ export_symbol_present_conftest() {
SYMBOL="$1"
TAB=' '
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL.*\$" \
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL\(_GPL\)\?\s*\$" \
"$OUTPUT/Module.symvers" >/dev/null 2>&1; then
echo "#define NV_IS_EXPORT_SYMBOL_PRESENT_$SYMBOL 1" |
append_conftest "symbols"
@ -337,7 +337,7 @@ export_symbol_gpl_conftest() {
SYMBOL="$1"
TAB=' '
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\$" \
if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\s*\$" \
"$OUTPUT/Module.symvers" >/dev/null 2>&1; then
echo "#define NV_IS_EXPORT_SYMBOL_GPL_$SYMBOL 1" |
append_conftest "symbols"
@ -549,10 +549,8 @@ compile_test() {
# Determine if the set_pages_array_uc() function is present.
# It does not exist on all architectures.
#
# set_pages_array_uc() was added by commit
# 0f3507555f6fa4acbc85a646d6e8766230db38fc ("x86, CPA: Add
# set_pages_arrayuc and set_pages_array_wb") in v2.6.30-rc1 (Thu Mar
# 19 14:51:15 2009)
# Added by commit 0f3507555f6f ("x86, CPA: Add set_pages_arrayuc
# and set_pages_array_wb") in v2.6.30.
#
CODE="
#include <linux/types.h>
@ -597,8 +595,8 @@ compile_test() {
#
# Added by commit 3c299dc22635 ("PCI: add
# pci_get_domain_bus_and_slot function") in 2.6.33 but aarch64
# support was added by commit d1e6dc91b532
# ("arm64: Add architectural support for PCI") in 3.18-rc1
# support was added by commit d1e6dc91b532 ("arm64: Add
# architectural support for PCI") in 3.18.
#
CODE="
#include <linux/pci.h>
@ -1242,26 +1240,6 @@ compile_test() {
compile_check_conftest "$CODE" "NV_VFIO_DEVICE_GFX_PLANE_INFO_PRESENT" "" "types"
;;
vfio_device_migration_has_start_pfn)
#
# Determine if the 'vfio_device_migration_info' structure has
# a 'start_pfn' field.
#
# This member was present in proposed interface for vGPU Migration
# ("[PATCH v3 0/5] Add migration support for VFIO device ")
# https://lists.gnu.org/archive/html/qemu-devel/2019-02/msg05176.html
# which is not present in upstreamed commit a8a24f3f6e38 (vfio: UAPI
# for migration interface for device state) in v5.8 (2020-05-29)
#
CODE="
#include <linux/vfio.h>
int conftest_vfio_device_migration_has_start_pfn(void) {
return offsetof(struct vfio_device_migration_info, start_pfn);
}"
compile_check_conftest "$CODE" "NV_VFIO_DEVICE_MIGRATION_HAS_START_PFN" "" "types"
;;
vfio_uninit_group_dev)
#
# Determine if vfio_uninit_group_dev() function is present or not.
@ -1411,9 +1389,8 @@ compile_test() {
#
# Determine if the pde_data() function is present.
#
# The commit c28198889c15 removed the function
# 'PDE_DATA()', and replaced it with 'pde_data()'
# ("proc: remove PDE_DATA() completely") in v5.17-rc1.
# PDE_DATA() was replaced with pde_data() by commit 359745d78351
# ("proc: remove PDE_DATA() completely") in v5.17.
#
CODE="
#include <linux/proc_fs.h>
@ -1554,8 +1531,8 @@ compile_test() {
# based implementation") in v4.5
#
# Commit 0a0f0d8be76d ("dma-mapping: split <linux/dma-mapping.h>")
# in v5.10-rc1 (2020-09-22), moved get_dma_ops() function
# prototype from <linux/dma-mapping.h> to <linux/dma-map-ops.h>.
# in v5.10 moved get_dma_ops() function prototype from
# <linux/dma-mapping.h> to <linux/dma-map-ops.h>.
#
CODE="
#if defined(NV_LINUX_DMA_MAP_OPS_H_PRESENT)
@ -1725,9 +1702,8 @@ compile_test() {
kernel_write_has_pointer_pos_arg)
#
# Determine the pos argument type, which was changed by
# commit e13ec939e96b1 (fs: fix kernel_write prototype) on
# 9/1/2017.
# Determine the pos argument type, which was changed by commit
# e13ec939e96b ("fs: fix kernel_write prototype") in v4.14.
#
echo "$CONFTEST_PREAMBLE
#include <linux/fs.h>
@ -1750,9 +1726,8 @@ compile_test() {
kernel_read_has_pointer_pos_arg)
#
# Determine the pos argument type, which was changed by
# commit bdd1d2d3d251c (fs: fix kernel_read prototype) on
# 9/1/2017.
# Determine the pos argument type, which was changed by commit
# bdd1d2d3d251 ("fs: fix kernel_read prototype") in v4.14.
#
echo "$CONFTEST_PREAMBLE
#include <linux/fs.h>
@ -1777,8 +1752,8 @@ compile_test() {
#
# Determine if vm_insert_pfn_prot function is present
#
# Added by commit 1745cbc5d0de ("mm: Add vm_insert_pfn_prot()") in
# v3.16.59
# Added by commit 1745cbc5d0de ("mm: Add vm_insert_pfn_prot()")
# in v4.6.
#
# Removed by commit f5e6d1d5f8f3 ("mm: introduce
# vmf_insert_pfn_prot()") in v4.20.
@ -1995,7 +1970,7 @@ compile_test() {
# attached drivers") in v3.14 (2013-12-11)
#
# The commit 57bb1ee60340 ("drm: Compile out legacy chunks from
# struct drm_device") compiles out the legacy chunks like
# struct drm_device") in v5.11 compiles out the legacy chunks like
# drm_driver::legacy_dev_list.
#
CODE="
@ -2018,14 +1993,14 @@ compile_test() {
#
# Determine if jiffies_to_timespec() is present
#
# removed by commit 751addac78b6
# ("y2038: remove obsolete jiffies conversion functions")
# in v5.6-rc1 (2019-12-13).
CODE="
#include <linux/jiffies.h>
void conftest_jiffies_to_timespec(void){
jiffies_to_timespec();
}"
# Removed by commit 751addac78b6 ("y2038: remove obsolete jiffies
# conversion functions") in v5.6.
#
CODE="
#include <linux/jiffies.h>
void conftest_jiffies_to_timespec(void){
jiffies_to_timespec();
}"
compile_check_conftest "$CODE" "NV_JIFFIES_TO_TIMESPEC_PRESENT" "" "functions"
;;
@ -2035,14 +2010,21 @@ compile_test() {
# drm_universal_plane_init()
# drm_crtc_init_with_planes()
# drm_encoder_init()
# have a 'name' argument, which was added by these commits:
# drm_universal_plane_init: 2015-12-09 b0b3b7951114315d65398c27648705ca1c322faa
# drm_crtc_init_with_planes: 2015-12-09 f98828769c8838f526703ef180b3088a714af2f9
# drm_encoder_init: 2015-12-09 13a3d91f17a5f7ed2acd275d18b6acfdb131fb15
# have a 'name' argument.
#
# Additionally determine whether drm_universal_plane_init() has a
# 'format_modifiers' argument, which was added by:
# 2017-07-23 e6fc3b68558e4c6d8d160b5daf2511b99afa8814
# drm_universal_plane_init was updated by commit b0b3b7951114
# ("drm: Pass 'name' to drm_universal_plane_init()") in v4.5.
#
# drm_crtc_init_with_planes was updated by commit f98828769c88
# ("drm: Pass 'name' to drm_crtc_init_with_planes()") in v4.5.
#
# drm_encoder_init was updated by commit 13a3d91f17a5 ("drm: Pass
# 'name' to drm_encoder_init()") in v4.5.
#
# Additionally, determine whether drm_universal_plane_init() has
# a 'format_modifiers' argument, which was added by commit
# e6fc3b68558e ("drm: Plumb modifiers through plane init") in
# v4.14.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -2239,7 +2221,7 @@ compile_test() {
# correction properties") in v4.6 (2016-03-08).
#
# Removed by commit f8ed34ac7b45 ("drm: drm_helper_crtc_enable_color_mgmt()
# => drm_crtc_enable_color_mgmt()") in v4.8-rc1 (2016-06-07).
# => drm_crtc_enable_color_mgmt()") in v4.8.
#
CODE="
#include <drm/drm_crtc_helper.h>
@ -2257,11 +2239,11 @@ compile_test() {
# present.
#
# Added by commit f8ed34ac7b45 ("drm: drm_helper_crtc_enable_color_mgmt()
# => drm_crtc_enable_color_mgmt()") in v4.8-rc1 (2016-06-07), replacing
# => drm_crtc_enable_color_mgmt()") in v4.8, replacing
# drm_helper_crtc_enable_color_mgmt().
#
# Moved to drm_color_mgmt.[ch] by commit f1e2f66ce2d9 ("drm: Extract
# drm_color_mgmt.[hc]") in v4.9-rc1 (2016-09-22)
# drm_color_mgmt.[hc]") in v4.9.
#
CODE="
#if defined(NV_DRM_DRM_CRTC_H_PRESENT)
@ -2288,8 +2270,7 @@ compile_test() {
# Accidentally moved to drm_atomic_state_helper.[ch] by commit
# 9ef8a9dc4b21 ("drm: Extract drm_atomic_state_helper.[ch]")
# and moved back to drm_atomic_helper.[ch] by commit 1d8224e790c7
# ("drm: Fix up drm_atomic_state_helper.[hc] extraction") in
# v5.0-rc1
# ("drm: Fix up drm_atomic_state_helper.[hc] extraction") in v5.0.
#
# Removed by commit 6ca2ab8086af ("drm: automatic legacy gamma
# support") in v5.12 (2020-12-15)
@ -2353,8 +2334,8 @@ compile_test() {
#
# Added by commit 210647af897a ("PCI: Rename pci_remove_bus_device
# to pci_stop_and_remove_bus_device") in v3.4 (2012-02-25) but
# aarch64 support was added by commit d1e6dc91b532
# ("arm64: Add architectural support for PCI") in v3.18-rc1.
# aarch64 support was added by commit d1e6dc91b532 ("arm64: Add
# architectural support for PCI") in v3.18.
#
CODE="
#include <linux/types.h>
@ -2451,8 +2432,8 @@ compile_test() {
#
# Determine if the 'pci_dev' data type has a 'ats_enabled' member.
#
# Added by commit d544d75ac96aa ("PCI: Embed ATS info directly
# into struct pci_dev") in v4.3-rc1 (2015-08-14)
# Added by commit d544d75ac96a ("PCI: Embed ATS info directly
# into struct pci_dev") in v4.3.
#
CODE="
#include <linux/pci.h>
@ -2483,9 +2464,9 @@ compile_test() {
# commit 768ae309a961 ("mm: replace get_user_pages() write/force
# parameters with gup_flags") in v4.9 (2016-10-13)
#
# Removed vmas parameter from get_user_pages() by commit 7bbf9c8c99
# Removed vmas parameter from get_user_pages() by commit 54d020692b34
# ("mm/gup: remove unused vmas parameter from get_user_pages()")
# in linux-next, expected in v6.5-rc1
# in v6.5.
#
# linux-4.4.168 cherry-picked commit 768ae309a961 without
# c12d2da56d0e which is covered in Conftest #3.
@ -2653,11 +2634,11 @@ compile_test() {
#
# get_user_pages_remote() removed 'tsk' parameter by
# commit 64019a2e467a ("mm/gup: remove task_struct pointer for
# all gup code") in v5.9-rc1 (2020-08-11).
# all gup code") in v5.9.
#
# Removed vmas parameter from get_user_pages_remote() by commit
# a4bde14d549 ("mm/gup: remove vmas parameter from get_user_pages_remote()")
# in linux-next, expected in v6.5-rc1
# ca5e863233e8 ("mm/gup: remove vmas parameter from
# get_user_pages_remote()") in v6.5.
#
#
@ -2856,15 +2837,14 @@ compile_test() {
#
# Determine if the function pin_user_pages() is present.
# Presence of pin_user_pages() also implies the presence of
# unpin-user_page(). Both were added in the v5.6-rc1
# unpin_user_page().
#
# pin_user_pages() was added by commit eddb1c228f7951d399240
# ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in
# v5.6-rc1 (2020-01-30)
# pin_user_pages() was added by commit eddb1c228f79 ("mm/gup:
# introduce pin_user_pages*() and FOLL_PIN") in v5.6.
#
# Removed vmas parameter from pin_user_pages() by commit
# 40896a02751("mm/gup: remove vmas parameter from pin_user_pages()")
# in linux-next, expected in v6.5-rc1
# 4c630f307455 ("mm/gup: remove vmas parameter from
# pin_user_pages()") in v6.5.
set_pin_user_pages_defines () {
if [ "$1" = "" ]; then
@ -2929,13 +2909,13 @@ compile_test() {
# ("mm/gup: introduce pin_user_pages*() and FOLL_PIN")
# in v5.6 (2020-01-30)
# pin_user_pages_remote() removed 'tsk' parameter by
# commit 64019a2e467a ("mm/gup: remove task_struct pointer for
# all gup code") in v5.9-rc1 (2020-08-11).
# pin_user_pages_remote() removed 'tsk' parameter by commit
# 64019a2e467a ("mm/gup: remove task_struct pointer for all gup
# code") in v5.9.
#
# Removed unused vmas parameter from pin_user_pages_remote() by
# commit 83bcc2e132 ("mm/gup: remove unused vmas parameter from
# pin_user_pages_remote()") in linux-next, expected in v6.5-rc1
# commit 0b295316b3a9 ("mm/gup: remove unused vmas parameter from
# pin_user_pages_remote()") in v6.5.
#
# This function sets the NV_PIN_USER_PAGES_REMOTE_* macros as per
@ -3098,8 +3078,8 @@ compile_test() {
#
# Determine if enable_apicv boolean is exported by kernel.
#
# Added by commit fdf513e37a3bd ("KVM: x86: Use common 'enable_apicv'
# variable for both APICv and AVIC")
# Added by commit fdf513e37a3b ("KVM: x86: Use common
# 'enable_apicv' variable for both APICv and AVIC") in v5.14.
#
CODE="
$CONFTEST_PREAMBLE
@ -4027,9 +4007,8 @@ compile_test() {
# Determine if drm_connector_attach_vrr_capable_property and
# drm_connector_set_vrr_capable_property is present
#
# Added by commit ba1b0f6c73d4ea1390f0d5381f715ffa20c75f09 ("drm:
# Add vrr_capable property to the drm connector") in v5.0-rc1
# (2018-11-28)
# Added by commit ba1b0f6c73d4 ("drm: Add vrr_capable property to
# the drm connector") in v5.0.
#
CODE="
#if defined(NV_DRM_DRM_CONNECTOR_H_PRESENT)
@ -4339,16 +4318,21 @@ compile_test() {
# with the logic of "functions" the presence of
# *either*_alpha_property or _blend_mode_property would be enough
# to cause NV_DRM_ALPHA_BLENDING_AVAILABLE to be defined.
# drm_plane_create_alpha_property was added by commit
# ae0e28265e21 ("drm/blend: Add a generic alpha property") in
# v4.18.
#
# drm_plane_create_blend_mode_property was added by commit
# a5ec8332d428 ("drm: Add per-plane pixel blend mode property")
# in v4.20.
#
CODE="
#if defined(NV_DRM_DRM_BLEND_H_PRESENT)
#include <drm/drm_blend.h>
#endif
void conftest_drm_alpha_blending_available(void) {
/* 2018-04-11 ae0e28265e216dad11d4cbde42fc15e92919af78 */
(void)drm_plane_create_alpha_property;
/* 2018-08-23 a5ec8332d4280500544e316f76c04a7adc02ce03 */
(void)drm_plane_create_blend_mode_property;
}"
@ -4359,10 +4343,10 @@ compile_test() {
#
# Determine if the DRM subsystem supports rotation.
#
# drm_plane_create_rotation_property() was added on 2016-09-26 by
# d138dd3c0c70979215f3184cf36f95875e37932e (drm: Add support for
# optional per-plane rotation property) in linux kernel. Presence
# of it is sufficient to say that DRM subsystem support rotation.
# drm_plane_create_rotation_property() was added by commit
# d138dd3c0c70 ("drm: Add support for optional per-plane rotation
# property") in v4.10. Presence of it is sufficient to say that
# DRM subsystem support rotation.
#
CODE="
#if defined(NV_DRM_DRM_BLEND_H_PRESENT)
@ -4381,8 +4365,8 @@ compile_test() {
#
# The DRIVER_PRIME flag was added by commit 3248877ea179 (drm:
# base prime/dma-buf support (v5)) in v3.4 (2011-11-25) and is
# removed by commit 0424fdaf883a (drm/prime: Actually remove
# DRIVER_PRIME everywhere) on 2019-06-17.
# removed by commit 0424fdaf883a ("drm/prime: Actually remove
# DRIVER_PRIME everywhere") in v5.4.
#
# DRIVER_PRIME definition moved from drmP.h to drm_drv.h by
# commit 85e634bce01a (drm: Extract drm_drv.h) in v4.10
@ -4415,10 +4399,10 @@ compile_test() {
#
# drm_connector_for_each_possible_encoder() is added by commit
# 83aefbb887b5 (drm: Add drm_connector_for_each_possible_encoder())
# in v4.19. The definition and prorotype is changed to take only
# two arguments connector and encoder, by commit 62afb4ad425a
# (drm/connector: Allow max possible encoders to attach to a
# connector) in v5.5rc1.
# in v4.19. The definition and prototype were changed to take only
# two arguments, connector and encoder, by commit 62afb4ad425a
# ("drm/connector: Allow max possible encoders to attach to a
# connector") in v5.5.
#
echo "$CONFTEST_PREAMBLE
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -4468,6 +4452,24 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE" "" "types"
;;
mmu_notifier_ops_arch_invalidate_secondary_tlbs)
#
# Determine if the mmu_notifier_ops struct has the
# 'arch_invalidate_secondary_tlbs' member.
#
# struct mmu_notifier_ops.invalidate_range was renamed to
# arch_invalidate_secondary_tlbs by commit 1af5a8109904
# ("mmu_notifiers: rename invalidate_range notifier") due to be
# added in v6.6
CODE="
#include <linux/mmu_notifier.h>
int conftest_mmu_notifier_ops_arch_invalidate_secondary_tlbs(void) {
return offsetof(struct mmu_notifier_ops, arch_invalidate_secondary_tlbs);
}"
compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS" "" "types"
;;
drm_format_num_planes)
#
# Determine if drm_format_num_planes() function is present.
@ -4523,8 +4525,8 @@ compile_test() {
#
# Determine if the 'struct proc_ops' type is present.
#
# Added by commit d56c0d45f0e2 ("proc: decouple proc from VFS with
# "struct proc_ops"") in 5.6-rc1
# Added by commit d56c0d45f0e2 ("proc: decouple proc from VFS
# with "struct proc_ops"") in v5.6.
#
CODE="
#include <linux/proc_fs.h>
@ -4582,8 +4584,8 @@ compile_test() {
# Determine if 'drm_crtc_state' structure has a
# 'vrr_enabled' field.
#
# Added by commit 1398958cfd8d331342d657d37151791dd7256b40 ("drm:
# Add vrr_enabled property to drm CRTC") in v5.0-rc1 (2018-11-28)
# Added by commit 1398958cfd8d ("drm: Add vrr_enabled property to
# drm CRTC") in v5.0.
#
CODE="
#if defined(NV_DRM_DRM_CRTC_H_PRESENT)
@ -4604,11 +4606,11 @@ compile_test() {
# Added by commit fb7fcc96a86cf ("timekeeping: Standardize on
# ktime_get_*() naming") in 4.18 (2018-04-27)
#
CODE="
#include <linux/ktime.h>
void conftest_ktime_get_raw_ts64(void){
ktime_get_raw_ts64();
}"
CODE="
#include <linux/ktime.h>
void conftest_ktime_get_raw_ts64(void){
ktime_get_raw_ts64();
}"
compile_check_conftest "$CODE" "NV_KTIME_GET_RAW_TS64_PRESENT" "" "functions"
;;
@ -4619,11 +4621,11 @@ compile_test() {
# Added by commit d6d29896c665d ("timekeeping: Provide timespec64
# based interfaces") in 3.17 (2014-07-16)
#
CODE="
#include <linux/ktime.h>
void conftest_ktime_get_real_ts64(void){
ktime_get_real_ts64();
}"
CODE="
#include <linux/ktime.h>
void conftest_ktime_get_real_ts64(void){
ktime_get_real_ts64();
}"
compile_check_conftest "$CODE" "NV_KTIME_GET_REAL_TS64_PRESENT" "" "functions"
;;
@ -4643,8 +4645,9 @@ compile_test() {
# -the "modifier[]" member of the AddFB2 ioctl's parameter
# structure.
#
# All these were added by commit e3eb3250d84e (drm: add support for
# tiled/compressed/etc modifier in addfb2) in 4.1-rc1 (2015-02-05).
# All these were added by commit e3eb3250d84e ("drm: add support
# for tiled/compressed/etc modifier in addfb2") in v4.1.
#
CODE="
#include <drm/drm_mode.h>
#include <drm/drm_fourcc.h>
@ -4664,11 +4667,11 @@ compile_test() {
# Added by commit 361a3bf00582 ("time64: Add time64.h header and
# define struct timespec64") in 3.17 (2014-07-16)
#
CODE="
#include <linux/time.h>
CODE="
#include <linux/time.h>
struct timespec64 ts64;
"
struct timespec64 ts64;
"
compile_check_conftest "$CODE" "NV_TIMESPEC64_PRESENT" "" "types"
;;
@ -4680,15 +4683,15 @@ compile_test() {
# The third argument to __vmalloc, page protection
# 'pgprot_t prot', was removed by commit 88dca4ca5a93
# (mm: remove the pgprot argument to __vmalloc)
# in v5.8-rc1 (2020-06-01).
CODE="
#include <linux/vmalloc.h>
void conftest_vmalloc_has_pgprot_t_arg(void) {
pgprot_t prot;
(void)__vmalloc(0, 0, prot);
}"
# in v5.8.
#
CODE="
#include <linux/vmalloc.h>
void conftest_vmalloc_has_pgprot_t_arg(void) {
pgprot_t prot;
(void)__vmalloc(0, 0, prot);
}"
compile_check_conftest "$CODE" "NV_VMALLOC_HAS_PGPROT_T_ARG" "" "types"
;;
@ -4699,7 +4702,8 @@ compile_test() {
#
# Kernel commit da1c55f1b272 ("mmap locking API: rename mmap_sem
# to mmap_lock") replaced the field 'mmap_sem' by 'mmap_lock'
# in v5.8-rc1 (2020-06-08).
# in v5.8.
#
CODE="
#include <linux/mm_types.h>
@ -4789,9 +4793,9 @@ compile_test() {
;;
pci_enable_atomic_ops_to_root)
# pci_enable_atomic_ops_to_root was added by
# commit 430a23689dea ("PCI: Add pci_enable_atomic_ops_to_root()")
# in v4.16-rc1 (2018-01-05)
#
# pci_enable_atomic_ops_to_root was added by commit 430a23689dea
# ("PCI: Add pci_enable_atomic_ops_to_root()") in v4.16.
#
CODE="
#include <linux/pci.h>
@ -4808,11 +4812,11 @@ compile_test() {
# Added by commit a7c3e901a46ff54c016d040847eda598a9e3e653 ("mm:
# introduce kv[mz]alloc helpers") in v4.12 (2017-05-08).
#
CODE="
#include <linux/mm.h>
void conftest_kvmalloc(void){
kvmalloc();
}"
CODE="
#include <linux/mm.h>
void conftest_kvmalloc(void){
kvmalloc();
}"
compile_check_conftest "$CODE" "NV_KVMALLOC_PRESENT" "" "functions"
;;
@ -4821,12 +4825,11 @@ compile_test() {
#
# Determine if the function drm_gem_object_put_unlocked() is present.
#
# In v5.9-rc1, commit 2f4dd13d4bb8 ("drm/gem: add
# drm_gem_object_put helper") removes drm_gem_object_put_unlocked()
# function and replace its definition by transient macro. Commit
# ab15d56e27be ("drm: remove transient
# drm_gem_object_put_unlocked()") finally removes
# drm_gem_object_put_unlocked() macro.
# Replaced with a transient macro by commit 2f4dd13d4bb8 ("drm/gem:
# add drm_gem_object_put helper") in v5.9.
#
# Finally removed by commit ab15d56e27be ("drm: remove transient
# drm_gem_object_put_unlocked()") in v5.9.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -4849,7 +4852,7 @@ compile_test() {
# field.
#
# Removed by commit 0425662fdf05 ("drm: Nuke mode->vrefresh") in
# v5.9-rc1.
# v5.9.
#
CODE="
#include <drm/drm_modes.h>
@ -4867,7 +4870,7 @@ compile_test() {
# Determine if drm_driver::master_set() returns integer value
#
# Changed to void by commit 907f53200f98 ("drm: vmwgfx: remove
# drm_driver::master_set() return type") in v5.9-rc1.
# drm_driver::master_set() return type") in v5.9.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -4893,7 +4896,7 @@ compile_test() {
# function pointer.
#
# drm_driver::gem_free_object is removed by commit 1a9458aeb8eb
# ("drm: remove drm_driver::gem_free_object") in v5.9-rc1.
# ("drm: remove drm_driver::gem_free_object") in v5.9.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -4916,7 +4919,7 @@ compile_test() {
# Determine if vga_tryget() is present
#
# vga_tryget() was removed by commit f369bc3f9096 ("vgaarb: mark
# vga_tryget static") in v5.9-rc1 (2020-08-01).
# vga_tryget static") in v5.9.
#
CODE="
#include <linux/vgaarb.h>
@ -4933,7 +4936,7 @@ compile_test() {
#
# pci_channel_state was removed by commit 16d79cd4e23b ("PCI: Use
# 'pci_channel_state_t' instead of 'enum pci_channel_state'") in
# v5.9-rc1 (2020-07-02).
# v5.9.
#
CODE="
#include <linux/pci.h>
@ -4949,7 +4952,8 @@ compile_test() {
# Determine if 'cc_platform_has()' is present.
#
# Added by commit aa5a461171f9 ("x86/sev: Add an x86 version of
# cc_platform_has()") in v5.15.3 (2021-10-04)
# cc_platform_has()") in v5.16.
#
CODE="
#if defined(NV_LINUX_CC_PLATFORM_H_PRESENT)
#include <linux/cc_platform.h>
@ -4966,8 +4970,9 @@ compile_test() {
#
# Determine if drm_prime_pages_to_sg() has 'dev' argument.
#
# drm_prime_pages_to_sg() is updated to take 'dev' argument by commit
# 707d561f77b5 ("drm: allow limiting the scatter list size.").
# drm_prime_pages_to_sg() is updated to take 'dev' argument by
# commit 707d561f77b5 ("drm: allow limiting the scatter list
# size.") in v5.10.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -4991,9 +4996,9 @@ compile_test() {
# Determine if drm_driver structure has the GEM and PRIME callback
# function pointers.
#
# The GEM and PRIME callback are removed from drm_driver
# structure, by commit d693def4fd1c ("drm: Remove obsolete GEM and
# PRIME callbacks from struct drm_driver").
# The GEM and PRIME callbacks are removed from drm_driver
# structure by commit d693def4fd1c ("drm: Remove obsolete GEM and
# PRIME callbacks from struct drm_driver") in v5.11.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -5022,8 +5027,8 @@ compile_test() {
# Determine if drm_crtc_helper_funcs::atomic_check takes 'state'
# argument of 'struct drm_atomic_state' type.
#
# The commit 29b77ad7b9ca ("drm/atomic: Pass the full state to CRTC
# atomic_check") passed the full atomic state to
# Commit 29b77ad7b9ca ("drm/atomic: Pass the full state to CRTC
# atomic_check") in v5.11 passed the full atomic state to
# drm_crtc_helper_funcs::atomic_check()
#
# To test the signature of drm_crtc_helper_funcs::atomic_check(),
@ -5059,9 +5064,9 @@ compile_test() {
# Determine if drm_gem_object_funcs::vmap takes 'map'
# argument of 'struct dma_buf_map' type.
#
# The commit 49a3f51dfeee ("drm/gem: Use struct dma_buf_map in GEM
# vmap ops and convert GEM backends") update
# drm_gem_object_funcs::vmap to take 'map' argument.
# drm_gem_object_funcs::vmap is updated to take 'map' argument by
# commit 49a3f51dfeee ("drm/gem: Use struct dma_buf_map in GEM
# vmap ops and convert GEM backends") in v5.11.
#
CODE="
#include <drm/drm_gem.h>
@ -5078,7 +5083,7 @@ compile_test() {
# Determine if seq_read_iter() is present
#
# seq_read_iter() was added by commit d4d50710a8b4 ("seq_file:
# add seq_read_iter") in v5.10-rc1 (2020-11-04).
# add seq_read_iter") in v5.10.
#
CODE="
#include <linux/seq_file.h>
@ -5096,7 +5101,7 @@ compile_test() {
#
# The commit 07f4f97d7b4b ("vga_switcheroo: Use device link for HDA
# controller") has moved 'PCI_CLASS_MULTIMEDIA_HD_AUDIO' macro from
# <sound/hdaudio.h> to <linux/pci_ids.h> in v4.17-rc1 (2018-03-03).
# <sound/hdaudio.h> to <linux/pci_ids.h> in v4.17.
#
CODE="
#include <linux/pci_ids.h>
@ -5114,6 +5119,9 @@ compile_test() {
# unsafe_follow_pfn() was added by commit 69bacee7f9ad
# ("mm: Add unsafe_follow_pfn") in v5.13-rc1.
#
# Note: this commit never made it to the linux kernel, so
# unsafe_follow_pfn() never existed.
#
CODE="
#include <linux/mm.h>
void conftest_unsafe_follow_pfn(void) {
@ -5128,8 +5136,8 @@ compile_test() {
# Determine if drm_plane_helper_funcs::atomic_check takes 'state'
# argument of 'struct drm_atomic_state' type.
#
# The commit 7c11b99a8e58 ("drm/atomic: Pass the full state to
# planes atomic_check") passed the full atomic state to
# Commit 7c11b99a8e58 ("drm/atomic: Pass the full state to planes
# atomic_check") in v5.13 passes the full atomic state to
# drm_plane_helper_funcs::atomic_check()
#
# To test the signature of drm_plane_helper_funcs::atomic_check(),
@ -5193,7 +5201,7 @@ compile_test() {
# Determine if the add_memory_driver_managed function is present
#
# Added by commit 7b7b27214bba ("mm/memory_hotplug: introduce
# add_memory_driver_managed()") in v5.8-rc1 (2020-06-05)
# add_memory_driver_managed()") in v5.8.
#
CODE="
#include <linux/memory_hotplug.h>
@ -5208,8 +5216,8 @@ compile_test() {
#
# Check if add_memory_driver_managed() has mhp_flags arg.
#
# Added by commit b6117199787c ("mm/memory_hotplug: prepare passing flags to
# add_memory() and friends") in v5.10-rc1 (2020-10-16)
# Added by commit b6117199787c ("mm/memory_hotplug: prepare
# passing flags to add_memory() and friends") in v5.10.
#
CODE="
#include <linux/memory_hotplug.h>
@ -5226,8 +5234,8 @@ compile_test() {
#
# Check if remove_memory() has nid parameter.
#
# Removed by commit e1c158e4956612e7 ("mm/memory_hotplug: remove nid
# parameter from remove_memory() and friends") in v5.15-rc1 (2021-09-09)
# Removed by commit e1c158e49566 ("mm/memory_hotplug: remove nid
# parameter from remove_memory() and friends") in v5.15.
#
CODE="
#include <linux/memory_hotplug.h>
@ -5242,8 +5250,8 @@ compile_test() {
#
# Determine if the offline_and_remove_memory function is present.
#
# Added by commit 08b3acd7a68fc179 ("mm/memory_hotplug: Introduce
# offline_and_remove_memory()") in v5.8-rc1 (2020-06-05)
# Added by commit 08b3acd7a68f ("mm/memory_hotplug: Introduce
# offline_and_remove_memory()") in v5.8.
#
CODE="
#include <linux/memory_hotplug.h>
@ -5258,8 +5266,8 @@ compile_test() {
#
# Determine if the device_property_read_u64 function is present
#
# Added by commit b31384fa5de37a1 ("Driver core: Unified device
# properties interface for platform firmware") in v3.19-rc1 (2014-11-05)
# Added by commit b31384fa5de3 ("Driver core: Unified device
# properties interface for platform firmware") in v3.19.
#
CODE="
#include <linux/acpi.h>
@ -5274,8 +5282,12 @@ compile_test() {
#
# Determine if of_property_count_elems_of_size is present
#
# Added by commit 1df09bcof (" Move OF property and graph API from
# base.c to property.c"
# Added by commit ad54a0cfbeb4 ("of: add functions to count
# number of elements in a property") in v3.15.
#
# Moved from base.c to property.c by commit 1df09bc66f9b ("of:
# Move OF property and graph API from base.c to property.c") in
# v4.13.
#
# Test if linux/of.h header file inclusion is successful or not,
# depending on that check, for of_property_count_elems_of_size
@ -5306,8 +5318,12 @@ compile_test() {
#
# Determine if of_property_read_variable_u8_array is present
#
# Added by commit 1df09bcof (" Move OF property and graph API from
# base.c to property.c"
# Added by commit a67e9472da42 ("of: Add array read functions
# with min/max size limits") in v4.9.
#
# Moved from base.c to property.c by commit 1df09bc66f9b ("of:
# Move OF property and graph API from base.c to property.c") in
# v4.13.
#
# Test if linux/of.h header file inclusion is successful or not,
# depending on that, check for of_property_read_variable_u8_array
@ -5338,8 +5354,15 @@ compile_test() {
#
# Determine if of_property_read_variable_u32_array is present
#
# Added by commit 1df09bcof (" Move OF property and graph API from
# base.c to property.c"
# Added by commit a67e9472da42 ("of: Add array read functions
# with min/max size limits") in v4.9.
#
# Moved from base.c to property.c by commit 1df09bc66f9b ("of:
# Move OF property and graph API from base.c to property.c") in
# v4.13.
#
# Note: this can probably be combined with the
# of_property_read_variable_u8_array conftest above.
#
# Test if linux/of.h header file inclusion is successful or not,
# depending on that, check for of_property_read_variable_u32_array
@ -5370,8 +5393,8 @@ compile_test() {
#
# Determine if devm_of_platform_populate() function is present
#
# Added by commit 38b0b21of (add devm_ functions for populate and
# depopulate")
# Added by commit 38b0b219fbe8 ("of: add devm_ functions for
# populate and depopulate") in v4.12.
#
CODE="
#if defined(NV_LINUX_OF_PLATFORM_H_PRESENT)
@ -5389,8 +5412,13 @@ compile_test() {
#
# Determine if of_dma_configure() function is present
#
# Added by commit 591c1eeof ("configure the platform device
# dma parameters")
# Added by commit 591c1ee465ce ("of: configure the platform
# device dma parameters") in v3.16. However, it was a static,
# non-exported function at that time.
#
# It was moved from platform.c to device.c and made public by
# commit 1f5c69aa51f9 ("of: Move of_dma_configure() to device.c
# to help re-use") in v4.1.
#
CODE="
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
@ -5409,8 +5437,8 @@ compile_test() {
#
# Determine if icc_get() function is present
#
# Added by commit 11f1cec ("interconnect: Add generic on-chip
# interconnect API")
# Added by commit 11f1ceca7031 ("interconnect: Add generic
# on-chip interconnect API") in v5.1.
#
CODE="
#if defined(NV_LINUX_INTERCONNECT_H_PRESENT)
@ -5429,8 +5457,8 @@ compile_test() {
#
# Determine if icc_set_bw() function is present
#
# Added by commit 11f1cec ("interconnect: Add generic on-chip
# interconnect API")
# Added by commit 11f1ceca7031 ("interconnect: Add generic
# on-chip interconnect API") in v5.1.
#
CODE="
#if defined(NV_LINUX_INTERCONNECT_H_PRESENT)
@ -5449,8 +5477,8 @@ compile_test() {
#
# Determine if icc_put() function is present
#
# Added by commit 11f1cec ("interconnect: Add generic on-chip
# interconnect API")
# Added by commit 11f1ceca7031 ("interconnect: Add generic
# on-chip interconnect API") in v5.1.
#
CODE="
#if defined(NV_LINUX_INTERCONNECT_H_PRESENT)
@ -5469,7 +5497,8 @@ compile_test() {
#
# Determine if i2c_new_client_device() function is present
#
# Added by commit 390fd04i2c ("remove deprecated i2c_new_device API")
# Added by commit 390fd0475af5 ("i2c: remove deprecated
# i2c_new_device API") in v5.8.
#
CODE="
#include <linux/i2c.h>
@ -5486,7 +5515,8 @@ compile_test() {
#
# Determine if i2c_unregister_device() function is present
#
# Added by commit 9c1600ei2c ("Add i2c_board_info and i2c_new_device()")
# Added by commit 9c1600eda42e ("i2c: Add i2c_board_info and
# i2c_new_device()") in v2.6.22.
#
CODE="
#include <linux/i2c.h>
@ -5503,8 +5533,8 @@ compile_test() {
#
# Determine if of_get_named_gpio() function is present
#
# Added by commit a6b0919 ("of/gpio: Add new method for getting gpios
# under different property names")
# Added by commit a6b0919140b4 ("of/gpio: Add new method for
# getting gpios under different property names") in v3.1.
#
CODE="
#if defined(NV_LINUX_OF_GPIO_H_PRESENT)
@ -5523,7 +5553,8 @@ compile_test() {
#
# Determine if devm_gpio_request_one() function is present
#
# Added by commit 09d71ff (gpiolib: Implement devm_gpio_request_one()")
# Added by commit 09d71ff19404 ("gpiolib: Implement
# devm_gpio_request_one()") in v3.5.
#
CODE="
#if defined(NV_LINUX_GPIO_H_PRESENT)
@ -5542,7 +5573,8 @@ compile_test() {
#
# Determine if gpio_direction_input() function is present
#
# Added by commit c7caf86 (gpio: remove gpio_ensure_requested()")
# Added by commit c7caf86823c7 ("gpio: remove
# gpio_ensure_requested()") in v3.17.
#
CODE="
#if defined(NV_LINUX_GPIO_H_PRESENT)
@ -5561,7 +5593,8 @@ compile_test() {
#
# Determine if gpio_direction_output() function is present
#
# Added by commit c7caf86 (gpio: remove gpio_ensure_requested()")
# Added by commit c7caf86823c7 ("gpio: remove
# gpio_ensure_requested()") in v3.17.
#
CODE="
#if defined(NV_LINUX_GPIO_H_PRESENT)
@ -5580,8 +5613,8 @@ compile_test() {
#
# Determine if gpio_get_value() function is present
#
# Added by commit 7563bbf ("gpiolib/arches: Centralise bolierplate
# asm/gpio.h")
# Added by commit 7563bbf89d06 ("gpiolib/arches: Centralise
# bolierplate asm/gpio.h") in v3.5.
#
CODE="
#if defined(NV_LINUX_GPIO_H_PRESENT)
@ -5600,8 +5633,8 @@ compile_test() {
#
# Determine if gpio_set_value() function is present
#
# Added by commit 7563bbf ("gpiolib/arches: Centralise bolierplate
# asm/gpio.h")
# Added by commit 7563bbf89d06 ("gpiolib/arches: Centralise
# bolierplate asm/gpio.h") in v3.5.
#
CODE="
#if defined(NV_LINUX_GPIO_H_PRESENT)
@ -5620,8 +5653,8 @@ compile_test() {
#
# Determine if gpio_to_irq() function is present
#
# Added by commit 7563bbf ("gpiolib/arches: Centralise bolierplate
# asm/gpio.h")
# Added by commit 7563bbf89d06 ("gpiolib/arches: Centralise
# bolierplate asm/gpio.h") in v3.5.
#
CODE="
#if defined(NV_LINUX_GPIO_H_PRESENT)
@ -5636,14 +5669,29 @@ compile_test() {
compile_check_conftest "$CODE" "NV_GPIO_TO_IRQ_PRESENT" "" "functions"
;;
migrate_vma_setup)
#
# Determine if migrate_vma_setup() function is present
#
# Added by commit a7d1f22bb74f ("mm: turn migrate_vma upside
# down") in v5.4.
#
CODE="
#include <linux/migrate.h>
int conftest_migrate_vma_setup(void) {
migrate_vma_setup();
}"
compile_check_conftest "$CODE" "NV_MIGRATE_VMA_SETUP_PRESENT" "" "functions"
;;
migrate_vma_added_flags)
#
# Determine if migrate_vma structure has flags
#
# flags were added to struct migrate_vma by commit
# 5143192cd410c4fc83be09a2e73423765aee072b ("mm/migrate: add a flags
# parameter to_migrate_vma) in v5.9.
# (2020-07-28).
# Added by commit 5143192cd410 ("mm/migrate: add a flags
# parameter to migrate_vma") in v5.9.
#
CODE="
#include <linux/migrate.h>
int conftest_migrate_vma_added_flags(void) {
@ -5658,7 +5706,7 @@ compile_test() {
# Determine if the 'drm_device' structure has a 'pdev' field.
#
# Removed by commit b347e04452ff ("drm: Remove pdev field from
# struct drm_device") in v5.14-rc1.
# struct drm_device") in v5.14.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -5712,9 +5760,9 @@ compile_test() {
#
# Determine if ioasid_get() function is present
#
# ioasid_get() function was added by commit
# cb4789b0d19ff231ce9f73376a023341300aed96 (iommu/ioasid: Add ioasidreferences) in v5.11.
# (2020-11-23).
# Added by commit cb4789b0d19f ("iommu/ioasid: Add ioasid
# references") in v5.11.
#
CODE="
#if defined(NV_LINUX_IOASID_H_PRESENT)
#include <linux/ioasid.h>
@ -5751,9 +5799,8 @@ compile_test() {
#
# Determine if the 'drm_crtc_state' structure has 'no_vblank'.
#
# drm_crtc_state::no_vblank was added by commit b25c60af7a877
# ("drm/crtc: Add a generic infrastructure to fake VBLANK events")
# in 4.18.0-rc3 (2018-07-03).
# Added by commit b25c60af7a87 ("drm/crtc: Add a generic
# infrastructure to fake VBLANK events") in v4.19.
#
CODE="
#include <drm/drm_crtc.h>
@ -5773,7 +5820,7 @@ compile_test() {
# an 'allow_fb_modifiers' field in the 'drm_mode_config' structure,
# is added by commit e3eb3250d84e ("drm: add support for
# tiled/compressed/etc modifier in addfb2") in v4.1, and removed by
# commit 3d082157a242 ("drm: remove allow_fb_modifiers") in v5.18-rc1.
# commit 3d082157a242 ("drm: remove allow_fb_modifiers") in v5.18.
#
# The 'struct drm_mode_config' definition, is moved to
# drm_mode_config.h file by commit 28575f165d36 ("drm: Extract
@ -5811,9 +5858,8 @@ compile_test() {
#
# Determine if drm_mode.h has 'hdr_output_metadata' structure.
#
# struct hdr_output_metadata was added by commit fbb5d0353c62d
# ("drm: Add HDR source metadata property") in 5.1.0-rc5
# (2019-05-16)
# Added by commit fbb5d0353c62 ("drm: Add HDR source metadata
# property") in v5.3.
#
CODE="
#include <drm/drm_mode.h>
@ -5840,9 +5886,8 @@ compile_test() {
#
# Determine if the platform_irq_count() function is present
#
# platform_irq_count was added by commit
# 4b83555d5098e73cf2c5ca7f86c17ca0ba3b968e ("driver-core: platform: Add platform_irq_count()")
# in 4.5-rc1 (2016-01-07)
# Added by commit 4b83555d5098 ("driver-core: platform: Add
# platform_irq_count()") in v4.5.
#
CODE="
#include <linux/platform_device.h>
@ -5856,7 +5901,8 @@ compile_test() {
#
# Determine if devm_clk_bulk_get_all() function is present
#
# Added by commit f08c2e286 ("clk: add managed version of clk_bulk_get_all")
# Added by commit f08c2e2865f6 ("clk: add managed version of
# clk_bulk_get_all") in v4.20.
#
CODE="
#if defined(NV_LINUX_CLK_H_PRESENT)
@ -5923,7 +5969,7 @@ compile_test() {
# dma_resv_add_excl_fence() and dma_resv_add_shared_fence() were
# removed and replaced with dma_resv_add_fence() by commit
# 73511edf8b19 ("dma-buf: specify usage while adding fences to
# dma_resv obj v7") in linux-next, expected in v5.19-rc1.
# dma_resv obj v7") in v5.19.
#
CODE="
#if defined(NV_LINUX_DMA_RESV_H_PRESENT)
@ -5943,7 +5989,7 @@ compile_test() {
# dma_resv_reserve_shared() was removed and replaced with
# dma_resv_reserve_fences() by commit c8d4c18bfbc4
# ("dma-buf/drivers: make reserving a shared slot mandatory v4") in
# linux-next, expected in v5.19-rc1.
# v5.19.
#
CODE="
#if defined(NV_LINUX_DMA_RESV_H_PRESENT)
@ -5963,8 +6009,7 @@ compile_test() {
#
# reservation_object_reserve_shared() function prototype was updated
# to take 'num_fences' argument by commit ca05359f1e64 ("dma-buf:
# allow reserving more than one shared fence slot") in v4.21-rc1
# (2018-12-14).
# allow reserving more than one shared fence slot") in v5.0.
#
CODE="
#include <linux/reservation.h>
@ -5981,9 +6026,8 @@ compile_test() {
#
# Determine if the __get_task_ioprio() function is present.
#
# __get_task_ioprio was added by commit 893e5d32d583
# ("block: Generalize get_current_ioprio() for any task") for
# v5.20 linux-next (2022-06-23).
# Added by commit 893e5d32d583 ("block: Generalize
# get_current_ioprio() for any task") in v6.0.
#
CODE="
#include <linux/ioprio.h>
@ -5998,9 +6042,8 @@ compile_test() {
#
# Determine if 'num_registered_fb' variable is present.
#
# 'num_registered_fb' was removed by commit 5727dcfd8486
# ("fbdev: Make registered_fb[] private to fbmem.c") for
# v5.20 linux-next (2022-07-27).
# Removed by commit 5727dcfd8486 ("fbdev: Make registered_fb[]
# private to fbmem.c") in v6.1.
#
CODE="
#include <linux/fb.h>
@ -6146,9 +6189,8 @@ compile_test() {
#
# Determine if 'struct drm_connector' has an 'override_edid' member.
#
# Removed by commit 90b575f52c6ab ("drm/edid: detach debugfs EDID
# override from EDID property update") in linux-next, expected in
# v6.2-rc1.
# Removed by commit 90b575f52c6a ("drm/edid: detach debugfs EDID
# override from EDID property update") in v6.2.
#
CODE="
#if defined(NV_DRM_DRM_CRTC_H_PRESENT)
@ -6190,10 +6232,9 @@ compile_test() {
# Determine if the 'vm_area_struct' structure has
# const 'vm_flags'.
#
# A union of '__vm_flags' and 'const vm_flags' was added
# by commit bc292ab00f6c ("mm: introduce vma->vm_flags
# wrapper functions") in mm-stable branch (2023-02-09)
# of the akpm/mm maintainer tree.
# A union of '__vm_flags' and 'const vm_flags' was added by
# commit bc292ab00f6c ("mm: introduce vma->vm_flags wrapper
# functions") in v6.3.
#
CODE="
#include <linux/mm_types.h>
@ -6209,8 +6250,8 @@ compile_test() {
# Determine if the 'drm_driver' structure has a 'dumb_destroy'
# function pointer.
#
# Removed by commit 96a7b60f6ddb2 ("drm: remove dumb_destroy
# callback") in v6.3 linux-next (2023-02-10).
# Removed by commit 96a7b60f6ddb ("drm: remove dumb_destroy
# callback") in v6.4.
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
@ -6232,9 +6273,8 @@ compile_test() {
#
# Check if memory_failure() has trapno parameter.
#
# trapno argument was removed by commit
# 83b57531c58f4173d1c0d0b2c0bc88c853c32ea5 ("mm/memory_failure:
# Remove unused trapno from memory_failure") in v4.15.0 (2017-7-9)
# Removed by commit 83b57531c58f ("mm/memory_failure: Remove
# unused trapno from memory_failure") in v4.16.
#
CODE="
#include <linux/mm.h>
@ -6251,9 +6291,8 @@ compile_test() {
#
# Check if memory_failure() flag MF_SW_SIMULATED is defined.
#
# MF_SW_SIMULATED was added by commit
# 67f22ba7750f940bcd7e1b12720896c505c2d63f ("mm/hwpoison:
# fix unpoison_memory()") in v5.19.0-rc2 (2022-6-16)
# Added by commit 67f22ba7750f ("mm/memory-failure: disable
# unpoison once hw error happens") in v5.19.
#
CODE="
#include <linux/mm.h>
@ -6264,6 +6303,186 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_MF_SW_SIMULATED_DEFINED" "" "types"
;;
sync_file_get_fence)
#
# Determine if sync_file_get_fence() function is present
#
# Added by commit 972526a40932 ("dma-buf/sync_file: add
# sync_file_get_fence()") in v4.9.
#
CODE="
#if defined(NV_LINUX_SYNC_FILE_H_PRESENT)
#include <linux/sync_file.h>
#endif
void conftest_sync_file_get_fence(void)
{
sync_file_get_fence();
}"
compile_check_conftest "$CODE" "NV_SYNC_FILE_GET_FENCE_PRESENT" "" "functions"
;;
dma_fence_set_error)
#
# Determine if dma_fence_set_error() function is present
#
# Added by commit a009e975da5c ("dma-fence: Introduce
# drm_fence_set_error() helper") in v4.11.
#
CODE="
#if defined(NV_LINUX_DMA_FENCE_H_PRESENT)
#include <linux/dma-fence.h>
#endif
void conftest_dma_fence_set_error(void)
{
dma_fence_set_error();
}"
compile_check_conftest "$CODE" "NV_DMA_FENCE_SET_ERROR_PRESENT" "" "functions"
;;
fence_set_error)
#
# Determine if fence_set_error() function is present
#
# fence_set_error is a different name for dma_fence_set_error
# present in kernels where commit a009e975da5c ("dma-fence:
# Introduce drm_fence_set_error() helper") from v4.11 was
# backported, but commit f54d1867005c ("dma-buf: Rename struct fence
# to dma_fence") from v4.10 was not. In particular, Tegra v4.9
# kernels, such as commit f5e0724e76c2 ("dma-fence: Introduce
# drm_fence_set_error() helper") in NVIDIA Linux for Tegra (L4T) r31
# and r32 kernels in the L4T kernel repo
# git://nv-tegra.nvidia.com/linux-4.9.git, contain this function.
#
CODE="
#if defined(NV_LINUX_FENCE_H_PRESENT)
#include <linux/fence.h>
#endif
void conftest_fence_set_error(void)
{
fence_set_error();
}"
compile_check_conftest "$CODE" "NV_FENCE_SET_ERROR_PRESENT" "" "functions"
;;
fence_ops_use_64bit_seqno)
#
# Determine if dma_fence_ops has the use_64bit_seqno member
#
# 64-bit fence seqno support was actually added by commit
# b312d8ca3a7c ("dma-buf: make fence sequence numbers 64 bit v2")
# in v5.1, but the field to explicitly declare support for it
# didn't get added until commit 5e498abf1485 ("dma-buf:
# explicitely note that dma-fence-chains use 64bit seqno") in
# v5.2. Since it is currently trivial to work around the lack of
# native 64-bit seqno in our driver, we'll use the work-around path
# for kernels prior to v5.2 to avoid further ifdefing of the code.
#
CODE="
#if defined(NV_LINUX_DMA_FENCE_H_PRESENT)
#include <linux/dma-fence.h>
#endif
int conftest_fence_ops(void)
{
return offsetof(struct dma_fence_ops, use_64bit_seqno);
}"
compile_check_conftest "$CODE" "NV_DMA_FENCE_OPS_HAS_USE_64BIT_SEQNO" "" "types"
;;
drm_fbdev_generic_setup)
#
# Determine whether drm_fbdev_generic_setup is present.
#
# Added by commit 9060d7f49376 ("drm/fb-helper: Finish the
# generic fbdev emulation") in v4.19.
#
CODE="
#include <drm/drm_fb_helper.h>
#if defined(NV_DRM_DRM_FBDEV_GENERIC_H_PRESENT)
#include <drm/drm_fbdev_generic.h>
#endif
void conftest_drm_fbdev_generic_setup(void) {
drm_fbdev_generic_setup();
}"
compile_check_conftest "$CODE" "NV_DRM_FBDEV_GENERIC_SETUP_PRESENT" "" "functions"
;;
drm_aperture_remove_conflicting_pci_framebuffers)
#
# Determine whether drm_aperture_remove_conflicting_pci_framebuffers is present.
#
# Added by commit 2916059147ea ("drm/aperture: Add infrastructure
# for aperture ownership") in v5.14.
#
CODE="
#if defined(NV_DRM_DRM_APERTURE_H_PRESENT)
#include <drm/drm_aperture.h>
#endif
void conftest_drm_aperture_remove_conflicting_pci_framebuffers(void) {
drm_aperture_remove_conflicting_pci_framebuffers();
}"
compile_check_conftest "$CODE" "NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT" "" "functions"
;;
drm_aperture_remove_conflicting_pci_framebuffers_has_driver_arg)
#
# Determine whether drm_aperture_remove_conflicting_pci_framebuffers
# takes a struct drm_driver * as its second argument.
#
# Prior to commit 97c9bfe3f660, the second argument was a char *
# pointer to the driver's name.
#
# To test if drm_aperture_remove_conflicting_pci_framebuffers() has
# a 'req_driver' argument, declare a function prototype with
# typeof drm_aperture_remove_conflicting_pci_framebuffers and then
# define the corresponding function implementation with the
# expected signature. Successful compilation indicates that this
# function has the expected signature.
#
# This change occurred in commit 97c9bfe3f660 ("drm/aperture: Pass
# DRM driver structure instead of driver name") in v5.15
# (2021-06-29).
#
CODE="
#if defined(NV_DRM_DRM_DRV_H_PRESENT)
#include <drm/drm_drv.h>
#endif
#if defined(NV_DRM_DRM_APERTURE_H_PRESENT)
#include <drm/drm_aperture.h>
#endif
typeof(drm_aperture_remove_conflicting_pci_framebuffers) conftest_drm_aperture_remove_conflicting_pci_framebuffers;
int conftest_drm_aperture_remove_conflicting_pci_framebuffers(struct pci_dev *pdev,
const struct drm_driver *req_driver)
{
return 0;
}"
compile_check_conftest "$CODE" "NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_HAS_DRIVER_ARG" "" "types"
;;
find_next_bit_wrap)
# Determine if 'find_next_bit_wrap' is defined.
#
# The function was added by commit 6cc18331a987 ("lib/find_bit:
# add find_next{,_and}_bit_wrap") in v6.1-rc1 (2022-09-19).
#
# Ideally, we would want to be able to include linux/find.h.
# However, linux/find.h does not allow direct inclusion. Rather
# it has to be included through linux/bitmap.h.
#
CODE="
#include <linux/bitmap.h>
void conftest_find_next_bit_wrap(void) {
(void)find_next_bit_wrap();
}"
compile_check_conftest "$CODE" "NV_FIND_NEXT_BIT_WRAP_PRESENT" "" "functions"
;;
crypto)
#
# Determine if we support various crypto functions.
@ -6342,6 +6561,29 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MPOL_PREFERRED_MANY_PRESENT" "" "types"
;;
drm_connector_attach_hdr_output_metadata_property)
#
# Determine if the function
# drm_connector_attach_hdr_output_metadata_property() is present.
#
# Added by commit e057b52c1d90 ("drm/connector: Create a helper to
# attach the hdr_output_metadata property") in v5.14.
#
CODE="
#if defined(NV_DRM_DRM_CRTC_H_PRESENT)
#include <drm/drm_crtc.h>
#endif
#if defined(NV_DRM_DRM_CONNECTOR_H_PRESENT)
#include <drm/drm_connector.h>
#endif
void conftest_drm_connector_attach_hdr_output_metadata_property(void) {
drm_connector_attach_hdr_output_metadata_property();
}"
compile_check_conftest "$CODE" "NV_DRM_CONNECTOR_ATTACH_HDR_OUTPUT_METADATA_PROPERTY_PRESENT" "" "functions"
;;
mmu_interval_notifier)
#
# Determine if mmu_interval_notifier struct is present or not
@ -6357,11 +6599,48 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MMU_INTERVAL_NOTIFIER" "" "types"
;;
drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg)
# Determine if drm_mode_create_dp_colorspace_property() takes the
# 'supported_colorspaces' argument.
#
# The 'u32 supported_colorspaces' argument was added to
# drm_mode_create_dp_colorspace_property() by linux-next commit
# c265f340eaa8 ("drm/connector: Allow drivers to pass list of
# supported colorspaces").
#
# To test if drm_mode_create_dp_colorspace_property() has the
# 'supported_colorspaces' argument, declare a function prototype
# with typeof drm_mode_create_dp_colorspace_property and then
# define the corresponding function implementation with the
# expected signature. Successful compilation indicates that
# drm_mode_create_dp_colorspace_property() has the
# 'supported_colorspaces' argument.
#
CODE="
#if defined(NV_DRM_DRM_CRTC_H_PRESENT)
#include <drm/drm_crtc.h>
#endif
#if defined(NV_DRM_DRM_CONNECTOR_H_PRESENT)
#include <drm/drm_connector.h>
#endif
typeof(drm_mode_create_dp_colorspace_property) conftest_drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg;
int conftest_drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg(struct drm_connector *connector,
u32 supported_colorspaces)
{
return 0;
}"
compile_check_conftest "$CODE" "NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG" "" "types"
;;
# When adding a new conftest entry, please use the correct format for
# specifying the relevant upstream Linux kernel commit.
# specifying the relevant upstream Linux kernel commit. Please
# avoid specifying -rc kernels, and only use SHAs that actually exist
# in the upstream Linux kernel git repository.
#
# <function> was added|removed|etc by commit <sha> ("<commit message")
# in <kernel-version> (<commit date>).
# Added|Removed|etc by commit <short-sha> ("<commit message>") in
# <kernel-version>.
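#
# A minimal, hypothetical entry illustrating that layout (the function
# name, header, and conftest macro below are placeholders for this
# example, not real kernel symbols):
#
#    foo_bar)
#        #
#        # Determine if foo_bar() is present.
#        #
#        # Added by commit <short-sha> ("subsys: add foo_bar()") in
#        # <kernel-version>.
#        #
#        CODE="
#        #include <linux/foo.h>
#        void conftest_foo_bar(void) {
#            foo_bar();
#        }"
#
#        compile_check_conftest "$CODE" "NV_FOO_BAR_PRESENT" "" "functions"
#    ;;
#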
*)
# Unknown test name given

View File

@ -0,0 +1,334 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "nv-kthread-q.h"
#include "nv-list-helpers.h"
#include <linux/kthread.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/module.h>
#include <linux/mm.h>
#if defined(NV_LINUX_BUG_H_PRESENT)
#include <linux/bug.h>
#else
#include <asm/bug.h>
#endif
// Today's implementation is a little simpler and more limited than the
// API description allows for in nv-kthread-q.h. Details include:
//
// 1. Each nv_kthread_q instance is a first-in, first-out queue.
//
// 2. Each nv_kthread_q instance is serviced by exactly one kthread.
//
// You can create any number of queues, each of which gets its own
// named kernel thread (kthread). You can then insert arbitrary functions
// into the queue, and those functions will be run in the context of the
// queue's kthread.
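//
// A minimal usage sketch (illustrative only; the callback, queue name, and
// calling function below are hypothetical and not part of this file):
//
//     static void example_work(void *args)
//     {
//         // nv_q_func_t callbacks take a single void * argument.
//         (void)args;
//         printk(KERN_INFO "nv_kthread_q: example work item ran\n");
//     }
//
//     static int example_caller(void)
//     {
//         static nv_kthread_q_t q;
//         static nv_kthread_q_item_t item;
//         int ret = nv_kthread_q_init(&q, "example_q");
//         if (ret != 0)
//             return ret;
//
//         nv_kthread_q_item_init(&item, example_work, NULL);
//         nv_kthread_q_schedule_q_item(&q, &item);
//
//         // Wait for the item to run; nv_kthread_q_stop() also flushes
//         // before stopping the kthread.
//         nv_kthread_q_flush(&q);
//         nv_kthread_q_stop(&q);
//         return 0;
//     }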
#ifndef WARN
// Only *really* old kernels (2.6.9) end up here. Just use a simple printk
// to implement this, because such kernels won't be supported much longer.
#define WARN(condition, format...) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
printk(KERN_ERR format); \
unlikely(__ret_warn_on); \
})
#endif
#define NVQ_WARN(fmt, ...) \
do { \
if (in_interrupt()) { \
WARN(1, "nv_kthread_q: [in interrupt]: " fmt, \
##__VA_ARGS__); \
} \
else { \
WARN(1, "nv_kthread_q: task: %s: " fmt, \
current->comm, \
##__VA_ARGS__); \
} \
} while (0)
static int _main_loop(void *args)
{
nv_kthread_q_t *q = (nv_kthread_q_t *)args;
nv_kthread_q_item_t *q_item = NULL;
unsigned long flags;
while (1) {
// Normally this thread is never interrupted. However,
// down_interruptible (instead of down) is called here,
// in order to avoid being classified as a potentially
// hung task, by the kernel watchdog.
while (down_interruptible(&q->q_sem))
NVQ_WARN("Interrupted during semaphore wait\n");
if (atomic_read(&q->main_loop_should_exit))
break;
spin_lock_irqsave(&q->q_lock, flags);
// The q_sem semaphore prevents us from getting here unless there is
// at least one item in the list, so an empty list indicates a bug.
if (unlikely(list_empty(&q->q_list_head))) {
spin_unlock_irqrestore(&q->q_lock, flags);
NVQ_WARN("_main_loop: Empty queue: q: 0x%p\n", q);
continue;
}
// Consume one item from the queue
q_item = list_first_entry(&q->q_list_head,
nv_kthread_q_item_t,
q_list_node);
list_del_init(&q_item->q_list_node);
spin_unlock_irqrestore(&q->q_lock, flags);
// Run the item
q_item->function_to_run(q_item->function_args);
// Make debugging a little simpler by clearing this between runs:
q_item = NULL;
}
while (!kthread_should_stop())
schedule();
return 0;
}
void nv_kthread_q_stop(nv_kthread_q_t *q)
{
// check if queue has been properly initialized
if (unlikely(!q->q_kthread))
return;
nv_kthread_q_flush(q);
// If this assertion fires, then a caller likely either broke the API rules,
// by adding items after calling nv_kthread_q_stop, or possibly messed up
// with inadequate flushing of self-rescheduling q_items.
if (unlikely(!list_empty(&q->q_list_head)))
NVQ_WARN("list not empty after flushing\n");
if (likely(!atomic_read(&q->main_loop_should_exit))) {
atomic_set(&q->main_loop_should_exit, 1);
// Wake up the kthread so that it can see that it needs to stop:
up(&q->q_sem);
kthread_stop(q->q_kthread);
q->q_kthread = NULL;
}
}
// When CONFIG_VMAP_STACK is defined, the kernel thread stack allocator used by
// kthread_create_on_node relies on a 2 entry, per-core cache to minimize
// vmalloc invocations. The cache is NUMA-unaware, so when there is a hit, the
// stack location ends up being a function of the core assigned to the current
// thread, instead of being a function of the specified NUMA node. The cache was
// added to the kernel in commit ac496bf48d97f2503eaa353996a4dd5e4383eaf0
// ("fork: Optimize task creation by caching two thread stacks per CPU if
// CONFIG_VMAP_STACK=y")
//
// To work around the problematic cache, we create up to three kernel threads
// -If the first thread's stack is resident on the preferred node, return this
// thread.
// -Otherwise, create a second thread. If its stack is resident on the
// preferred node, stop the first thread and return this one.
// -Otherwise, create a third thread. The stack allocator does not find a
// cached stack, and so falls back to vmalloc, which takes the NUMA hint into
// consideration. The first two threads are then stopped.
//
// When CONFIG_VMAP_STACK is not defined, the first kernel thread is returned.
//
// This function is never invoked when there is no NUMA preference (preferred
// node is NUMA_NO_NODE).
static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
nv_kthread_q_t *q,
int preferred_node,
const char *q_name)
{
unsigned i, j;
const static unsigned attempts = 3;
struct task_struct *thread[3];
for (i = 0;; i++) {
struct page *stack;
thread[i] = kthread_create_on_node(threadfn, q, preferred_node, q_name);
if (unlikely(IS_ERR(thread[i]))) {
// Instead of failing, pick the previous thread, even if its
// stack is not allocated on the preferred node.
if (i > 0)
i--;
break;
}
// vmalloc is not used to allocate the stack, so simply return the
// thread, even if its stack may not be allocated on the preferred node
if (!is_vmalloc_addr(thread[i]->stack))
break;
// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if ((i == (attempts - 1)))
break;
// Get the NUMA node where the first page of the stack is resident. If
// it is the preferred node, select this thread.
stack = vmalloc_to_page(thread[i]->stack);
if (page_to_nid(stack) == preferred_node)
break;
}
for (j = i; j > 0; j--)
kthread_stop(thread[j - 1]);
return thread[i];
}
int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node)
{
memset(q, 0, sizeof(*q));
INIT_LIST_HEAD(&q->q_list_head);
spin_lock_init(&q->q_lock);
sema_init(&q->q_sem, 0);
if (preferred_node == NV_KTHREAD_NO_NODE) {
q->q_kthread = kthread_create(_main_loop, q, q_name);
}
else {
q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name);
}
if (IS_ERR(q->q_kthread)) {
int err = PTR_ERR(q->q_kthread);
// Clear q_kthread before returning so that nv_kthread_q_stop() can be
// safely called on it making error handling easier.
q->q_kthread = NULL;
return err;
}
wake_up_process(q->q_kthread);
return 0;
}
int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname)
{
return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE);
}
// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)
{
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&q->q_lock, flags);
if (likely(list_empty(&q_item->q_list_node)))
list_add_tail(&q_item->q_list_node, &q->q_list_head);
else
ret = 0;
spin_unlock_irqrestore(&q->q_lock, flags);
if (likely(ret))
up(&q->q_sem);
return ret;
}
void nv_kthread_q_item_init(nv_kthread_q_item_t *q_item,
nv_q_func_t function_to_run,
void *function_args)
{
INIT_LIST_HEAD(&q_item->q_list_node);
q_item->function_to_run = function_to_run;
q_item->function_args = function_args;
}
// Returns true (non-zero) if the q_item got scheduled, false otherwise.
int nv_kthread_q_schedule_q_item(nv_kthread_q_t *q,
nv_kthread_q_item_t *q_item)
{
if (unlikely(atomic_read(&q->main_loop_should_exit))) {
NVQ_WARN("Not allowed: nv_kthread_q_schedule_q_item was "
"called with a non-alive q: 0x%p\n", q);
return 0;
}
return _raw_q_schedule(q, q_item);
}
static void _q_flush_function(void *args)
{
struct completion *completion = (struct completion *)args;
complete(completion);
}
static void _raw_q_flush(nv_kthread_q_t *q)
{
nv_kthread_q_item_t q_item;
DECLARE_COMPLETION_ONSTACK(completion);
nv_kthread_q_item_init(&q_item, _q_flush_function, &completion);
_raw_q_schedule(q, &q_item);
// Wait for the flush item to run. Once it has run, then all of the
// previously queued items in front of it will have run, so that means
// the flush is complete.
wait_for_completion(&completion);
}
void nv_kthread_q_flush(nv_kthread_q_t *q)
{
if (unlikely(atomic_read(&q->main_loop_should_exit))) {
NVQ_WARN("Not allowed: nv_kthread_q_flush was called after "
"nv_kthread_q_stop. q: 0x%p\n", q);
return;
}
// This 2x flush is not a typing mistake. The queue really does have to be
// flushed twice, in order to take care of the case of a q_item that
// reschedules itself.
_raw_q_flush(q);
_raw_q_flush(q);
}

View File

@ -43,9 +43,13 @@
#if defined(NV_LINUX_FENCE_H_PRESENT)
typedef struct fence nv_dma_fence_t;
typedef struct fence_ops nv_dma_fence_ops_t;
typedef struct fence_cb nv_dma_fence_cb_t;
typedef fence_func_t nv_dma_fence_func_t;
#else
typedef struct dma_fence nv_dma_fence_t;
typedef struct dma_fence_ops nv_dma_fence_ops_t;
typedef struct dma_fence_cb nv_dma_fence_cb_t;
typedef dma_fence_func_t nv_dma_fence_func_t;
#endif
#if defined(NV_LINUX_FENCE_H_PRESENT)
@ -97,6 +101,14 @@ static inline int nv_dma_fence_signal(nv_dma_fence_t *fence) {
#endif
}
static inline int nv_dma_fence_signal_locked(nv_dma_fence_t *fence) {
#if defined(NV_LINUX_FENCE_H_PRESENT)
return fence_signal_locked(fence);
#else
return dma_fence_signal_locked(fence);
#endif
}
static inline u64 nv_dma_fence_context_alloc(unsigned num) {
#if defined(NV_LINUX_FENCE_H_PRESENT)
return fence_context_alloc(num);
@ -108,7 +120,7 @@ static inline u64 nv_dma_fence_context_alloc(unsigned num) {
static inline void
nv_dma_fence_init(nv_dma_fence_t *fence,
const nv_dma_fence_ops_t *ops,
spinlock_t *lock, u64 context, unsigned seqno) {
spinlock_t *lock, u64 context, uint64_t seqno) {
#if defined(NV_LINUX_FENCE_H_PRESENT)
fence_init(fence, ops, lock, context, seqno);
#else
@ -116,6 +128,29 @@ nv_dma_fence_init(nv_dma_fence_t *fence,
#endif
}
static inline void
nv_dma_fence_set_error(nv_dma_fence_t *fence,
int error) {
#if defined(NV_DMA_FENCE_SET_ERROR_PRESENT)
return dma_fence_set_error(fence, error);
#elif defined(NV_FENCE_SET_ERROR_PRESENT)
return fence_set_error(fence, error);
#else
fence->status = error;
#endif
}
static inline int
nv_dma_fence_add_callback(nv_dma_fence_t *fence,
nv_dma_fence_cb_t *cb,
nv_dma_fence_func_t func) {
#if defined(NV_LINUX_FENCE_H_PRESENT)
return fence_add_callback(fence, cb, func);
#else
return dma_fence_add_callback(fence, cb, func);
#endif
}
#endif /* defined(NV_DRM_FENCE_AVAILABLE) */
#endif /* __NVIDIA_DMA_FENCE_HELPER_H__ */

View File

@ -121,6 +121,20 @@ static inline void nv_dma_resv_add_excl_fence(nv_dma_resv_t *obj,
#endif
}
static inline void nv_dma_resv_add_shared_fence(nv_dma_resv_t *obj,
nv_dma_fence_t *fence)
{
#if defined(NV_LINUX_DMA_RESV_H_PRESENT)
#if defined(NV_DMA_RESV_ADD_FENCE_PRESENT)
dma_resv_add_fence(obj, fence, DMA_RESV_USAGE_READ);
#else
dma_resv_add_shared_fence(obj, fence);
#endif
#else
reservation_object_add_shared_fence(obj, fence);
#endif
}
#endif /* defined(NV_DRM_FENCE_AVAILABLE) */
#endif /* __NVIDIA_DMA_RESV_HELPER_H__ */

View File

@ -61,4 +61,15 @@
#undef NV_DRM_FENCE_AVAILABLE
#endif
/*
* We can support color management if either drm_helper_crtc_enable_color_mgmt()
* or drm_crtc_enable_color_mgmt() exists.
*/
#if defined(NV_DRM_HELPER_CRTC_ENABLE_COLOR_MGMT_PRESENT) || \
defined(NV_DRM_CRTC_ENABLE_COLOR_MGMT_PRESENT)
#define NV_DRM_COLOR_MGMT_AVAILABLE
#else
#undef NV_DRM_COLOR_MGMT_AVAILABLE
#endif
#endif /* defined(__NVIDIA_DRM_CONFTEST_H__) */

View File

@ -349,10 +349,125 @@ nv_drm_connector_best_encoder(struct drm_connector *connector)
return NULL;
}
#if defined(NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG)
static const NvU32 __nv_drm_connector_supported_colorspaces =
BIT(DRM_MODE_COLORIMETRY_BT2020_RGB) |
BIT(DRM_MODE_COLORIMETRY_BT2020_YCC);
#endif
#if defined(NV_DRM_CONNECTOR_ATTACH_HDR_OUTPUT_METADATA_PROPERTY_PRESENT)
static int
__nv_drm_connector_atomic_check(struct drm_connector *connector,
struct drm_atomic_state *state)
{
struct drm_connector_state *new_connector_state =
drm_atomic_get_new_connector_state(state, connector);
struct drm_connector_state *old_connector_state =
drm_atomic_get_old_connector_state(state, connector);
struct nv_drm_device *nv_dev = to_nv_device(connector->dev);
struct drm_crtc *crtc = new_connector_state->crtc;
struct drm_crtc_state *crtc_state;
struct nv_drm_crtc_state *nv_crtc_state;
struct NvKmsKapiHeadRequestedConfig *req_config;
if (!crtc) {
return 0;
}
crtc_state = drm_atomic_get_new_crtc_state(state, crtc);
nv_crtc_state = to_nv_crtc_state(crtc_state);
req_config = &nv_crtc_state->req_config;
/*
* Override metadata for the entire head instead of allowing NVKMS to derive
* it from the layers' metadata.
*
* This is the metadata that will be sent to the display, and if applicable,
* layers will be tone mapped to this metadata rather than that of the
* display.
*/
req_config->flags.hdrInfoFrameChanged =
!drm_connector_atomic_hdr_metadata_equal(old_connector_state,
new_connector_state);
if (new_connector_state->hdr_output_metadata &&
new_connector_state->hdr_output_metadata->data) {
/*
* Note that HDMI definitions are used here even though we might not
* be using HDMI. While that seems odd, it is consistent with
* upstream behavior.
*/
struct hdr_output_metadata *hdr_metadata =
new_connector_state->hdr_output_metadata->data;
struct hdr_metadata_infoframe *info_frame =
&hdr_metadata->hdmi_metadata_type1;
unsigned int i;
if (hdr_metadata->metadata_type != HDMI_STATIC_METADATA_TYPE1) {
return -EINVAL;
}
for (i = 0; i < ARRAY_SIZE(info_frame->display_primaries); i++) {
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.displayPrimaries[i].x =
info_frame->display_primaries[i].x;
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.displayPrimaries[i].y =
info_frame->display_primaries[i].y;
}
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.whitePoint.x =
info_frame->white_point.x;
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.whitePoint.y =
info_frame->white_point.y;
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.maxDisplayMasteringLuminance =
info_frame->max_display_mastering_luminance;
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.minDisplayMasteringLuminance =
info_frame->min_display_mastering_luminance;
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.maxCLL =
info_frame->max_cll;
req_config->modeSetConfig.hdrInfoFrame.staticMetadata.maxFALL =
info_frame->max_fall;
req_config->modeSetConfig.hdrInfoFrame.eotf = info_frame->eotf;
req_config->modeSetConfig.hdrInfoFrame.enabled = NV_TRUE;
} else {
req_config->modeSetConfig.hdrInfoFrame.enabled = NV_FALSE;
}
req_config->flags.colorimetryChanged =
(old_connector_state->colorspace != new_connector_state->colorspace);
// When adding a case here, also add to __nv_drm_connector_supported_colorspaces
switch (new_connector_state->colorspace) {
case DRM_MODE_COLORIMETRY_DEFAULT:
req_config->modeSetConfig.colorimetry =
NVKMS_OUTPUT_COLORIMETRY_DEFAULT;
break;
case DRM_MODE_COLORIMETRY_BT2020_RGB:
case DRM_MODE_COLORIMETRY_BT2020_YCC:
// Ignore RGB/YCC
// See https://patchwork.freedesktop.org/patch/525496/?series=111865&rev=4
req_config->modeSetConfig.colorimetry =
NVKMS_OUTPUT_COLORIMETRY_BT2100;
break;
default:
// XXX HDR TODO: Add support for more color spaces
NV_DRM_DEV_LOG_ERR(nv_dev, "Unsupported color space");
return -EINVAL;
}
return 0;
}
#endif /* defined(NV_DRM_CONNECTOR_ATTACH_HDR_OUTPUT_METADATA_PROPERTY_PRESENT) */
static const struct drm_connector_helper_funcs nv_connector_helper_funcs = {
.get_modes = nv_drm_connector_get_modes,
.mode_valid = nv_drm_connector_mode_valid,
.best_encoder = nv_drm_connector_best_encoder,
#if defined(NV_DRM_CONNECTOR_ATTACH_HDR_OUTPUT_METADATA_PROPERTY_PRESENT)
.atomic_check = __nv_drm_connector_atomic_check,
#endif
};
static struct drm_connector*
@ -405,6 +520,32 @@ nv_drm_connector_new(struct drm_device *dev,
DRM_CONNECTOR_POLL_CONNECT | DRM_CONNECTOR_POLL_DISCONNECT;
}
#if defined(NV_DRM_CONNECTOR_ATTACH_HDR_OUTPUT_METADATA_PROPERTY_PRESENT)
if (nv_connector->type == NVKMS_CONNECTOR_TYPE_HDMI) {
#if defined(NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG)
if (drm_mode_create_hdmi_colorspace_property(
&nv_connector->base,
__nv_drm_connector_supported_colorspaces) == 0) {
#else
if (drm_mode_create_hdmi_colorspace_property(&nv_connector->base) == 0) {
#endif
drm_connector_attach_colorspace_property(&nv_connector->base);
}
drm_connector_attach_hdr_output_metadata_property(&nv_connector->base);
} else if (nv_connector->type == NVKMS_CONNECTOR_TYPE_DP) {
#if defined(NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG)
if (drm_mode_create_dp_colorspace_property(
&nv_connector->base,
__nv_drm_connector_supported_colorspaces) == 0) {
#else
if (drm_mode_create_dp_colorspace_property(&nv_connector->base) == 0) {
#endif
drm_connector_attach_colorspace_property(&nv_connector->base);
}
drm_connector_attach_hdr_output_metadata_property(&nv_connector->base);
}
#endif /* defined(NV_DRM_CONNECTOR_ATTACH_HDR_OUTPUT_METADATA_PROPERTY_PRESENT) */
/* Register connector with DRM subsystem */
ret = drm_connector_register(&nv_connector->base);

View File

@ -48,6 +48,11 @@
#include <linux/host1x-next.h>
#endif
#if defined(NV_DRM_DRM_COLOR_MGMT_H_PRESENT)
#include <drm/drm_color_mgmt.h>
#endif
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
static int
nv_drm_atomic_replace_property_blob_from_id(struct drm_device *dev,
@ -399,27 +404,25 @@ plane_req_config_update(struct drm_plane *plane,
}
for (i = 0; i < ARRAY_SIZE(info_frame->display_primaries); i ++) {
req_config->config.hdrMetadata.displayPrimaries[i].x =
req_config->config.hdrMetadata.val.displayPrimaries[i].x =
info_frame->display_primaries[i].x;
req_config->config.hdrMetadata.displayPrimaries[i].y =
req_config->config.hdrMetadata.val.displayPrimaries[i].y =
info_frame->display_primaries[i].y;
}
req_config->config.hdrMetadata.whitePoint.x =
req_config->config.hdrMetadata.val.whitePoint.x =
info_frame->white_point.x;
req_config->config.hdrMetadata.whitePoint.y =
req_config->config.hdrMetadata.val.whitePoint.y =
info_frame->white_point.y;
req_config->config.hdrMetadata.maxDisplayMasteringLuminance =
req_config->config.hdrMetadata.val.maxDisplayMasteringLuminance =
info_frame->max_display_mastering_luminance;
req_config->config.hdrMetadata.minDisplayMasteringLuminance =
req_config->config.hdrMetadata.val.minDisplayMasteringLuminance =
info_frame->min_display_mastering_luminance;
req_config->config.hdrMetadata.maxCLL =
req_config->config.hdrMetadata.val.maxCLL =
info_frame->max_cll;
req_config->config.hdrMetadata.maxFALL =
req_config->config.hdrMetadata.val.maxFALL =
info_frame->max_fall;
req_config->config.hdrMetadataSpecified = true;
switch (info_frame->eotf) {
case HDMI_EOTF_SMPTE_ST2084:
req_config->config.tf = NVKMS_OUTPUT_TF_PQ;
@ -432,10 +435,21 @@ plane_req_config_update(struct drm_plane *plane,
NV_DRM_DEV_LOG_ERR(nv_dev, "Unsupported EOTF");
return -1;
}
req_config->config.hdrMetadata.enabled = true;
} else {
req_config->config.hdrMetadataSpecified = false;
req_config->config.hdrMetadata.enabled = false;
req_config->config.tf = NVKMS_OUTPUT_TF_NONE;
}
req_config->flags.hdrMetadataChanged =
((old_config.hdrMetadata.enabled !=
req_config->config.hdrMetadata.enabled) ||
memcmp(&old_config.hdrMetadata.val,
&req_config->config.hdrMetadata.val,
sizeof(struct NvKmsHDRStaticMetadata)));
req_config->flags.tfChanged = (old_config.tf != req_config->config.tf);
#endif
/*
@ -692,9 +706,11 @@ static inline void __nv_drm_plane_atomic_destroy_state(
#endif
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state(state);
drm_property_blob_put(nv_drm_plane_state->hdr_output_metadata);
{
struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state(state);
drm_property_blob_put(nv_drm_plane_state->hdr_output_metadata);
}
#endif
}
@ -800,6 +816,9 @@ nv_drm_atomic_crtc_duplicate_state(struct drm_crtc *crtc)
&(to_nv_crtc_state(crtc->state)->req_config),
&nv_state->req_config);
nv_state->ilut_ramps = NULL;
nv_state->olut_ramps = NULL;
return &nv_state->base;
}
@ -823,6 +842,9 @@ static void nv_drm_atomic_crtc_destroy_state(struct drm_crtc *crtc,
__nv_drm_atomic_helper_crtc_destroy_state(crtc, &nv_state->base);
nv_drm_free(nv_state->ilut_ramps);
nv_drm_free(nv_state->olut_ramps);
nv_drm_free(nv_state);
}
@ -833,6 +855,9 @@ static struct drm_crtc_funcs nv_crtc_funcs = {
.destroy = nv_drm_crtc_destroy,
.atomic_duplicate_state = nv_drm_atomic_crtc_duplicate_state,
.atomic_destroy_state = nv_drm_atomic_crtc_destroy_state,
#if defined(NV_DRM_ATOMIC_HELPER_LEGACY_GAMMA_SET_PRESENT)
.gamma_set = drm_atomic_helper_legacy_gamma_set,
#endif
};
/*
@ -866,6 +891,198 @@ static int head_modeset_config_attach_connector(
return 0;
}
#if defined(NV_DRM_COLOR_MGMT_AVAILABLE)
static int color_mgmt_config_copy_lut(struct NvKmsLutRamps *nvkms_lut,
struct drm_color_lut *drm_lut,
uint64_t lut_len)
{
uint64_t i = 0;
if (lut_len != NVKMS_LUT_ARRAY_SIZE) {
return -EINVAL;
}
/*
* Both NvKms and drm LUT values are 16-bit linear values. NvKms LUT ramps
* are in arrays in a single struct while drm LUT ramps are an array of
* structs.
*/
for (i = 0; i < lut_len; i++) {
nvkms_lut->red[i] = drm_lut[i].red;
nvkms_lut->green[i] = drm_lut[i].green;
nvkms_lut->blue[i] = drm_lut[i].blue;
}
return 0;
}
static void color_mgmt_config_ctm_to_csc(struct NvKmsCscMatrix *nvkms_csc,
struct drm_color_ctm *drm_ctm)
{
int y;
/* CTM is a 3x3 matrix while ours is 3x4. Zero out the last column. */
nvkms_csc->m[0][3] = nvkms_csc->m[1][3] = nvkms_csc->m[2][3] = 0;
for (y = 0; y < 3; y++) {
int x;
for (x = 0; x < 3; x++) {
/*
* Values in the CTM are encoded in S31.32 sign-magnitude fixed-
* point format, while NvKms CSC values are signed 2's-complement
* S15.16 (Ssign-extend12-3.16?) fixed-point format.
*/
NvU64 ctmVal = drm_ctm->matrix[y*3 + x];
NvU64 signBit = ctmVal & (1ULL << 63);
NvU64 magnitude = ctmVal & ~signBit;
/*
* Drop the low 16 bits of the fractional part and the high 17 bits
* of the integral part. Drop 17 bits to avoid corner cases where
* the highest resulting bit is a 1, causing the `cscVal = -cscVal`
* line to result in a positive number.
*/
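/*
 * Illustrative example (not from the original source): a CTM entry of
 * 1.0 is 0x0000000100000000 in S31.32; (magnitude >> 16) yields
 * 0x00010000, which is 1.0 in S15.16. An entry of -0.5 has the sign
 * bit set and magnitude 0x0000000080000000, giving 0x00008000 (0.5 in
 * S15.16) before the negation below.
 */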
NvS32 cscVal = (magnitude >> 16) & ((1ULL << 31) - 1);
if (signBit) {
cscVal = -cscVal;
}
nvkms_csc->m[y][x] = cscVal;
}
}
}
static int color_mgmt_config_set(struct nv_drm_crtc_state *nv_crtc_state,
struct NvKmsKapiHeadRequestedConfig *req_config)
{
struct NvKmsKapiHeadModeSetConfig *modeset_config =
&req_config->modeSetConfig;
struct drm_crtc_state *crtc_state = &nv_crtc_state->base;
int ret = 0;
struct drm_color_lut *degamma_lut = NULL;
struct drm_color_ctm *ctm = NULL;
struct drm_color_lut *gamma_lut = NULL;
uint64_t degamma_len = 0;
uint64_t gamma_len = 0;
int i;
struct drm_plane *plane;
struct drm_plane_state *plane_state;
/*
* According to the comment in the Linux kernel's
* drivers/gpu/drm/drm_color_mgmt.c, if any of these properties are NULL,
* that LUT or CTM needs to be changed to a linear LUT or identity matrix
* respectively.
*/
req_config->flags.lutChanged = NV_TRUE;
if (crtc_state->degamma_lut) {
nv_crtc_state->ilut_ramps = nv_drm_calloc(1, sizeof(*nv_crtc_state->ilut_ramps));
if (!nv_crtc_state->ilut_ramps) {
ret = -ENOMEM;
goto fail;
}
degamma_lut = (struct drm_color_lut *)crtc_state->degamma_lut->data;
degamma_len = crtc_state->degamma_lut->length /
sizeof(struct drm_color_lut);
if ((ret = color_mgmt_config_copy_lut(nv_crtc_state->ilut_ramps,
degamma_lut,
degamma_len)) != 0) {
goto fail;
}
modeset_config->lut.input.specified = NV_TRUE;
modeset_config->lut.input.depth = 30; /* specify the full LUT */
modeset_config->lut.input.start = 0;
modeset_config->lut.input.end = degamma_len - 1;
modeset_config->lut.input.pRamps = nv_crtc_state->ilut_ramps;
} else {
/* setting input.end to 0 is equivalent to disabling the LUT, which
* should be equivalent to a linear LUT */
modeset_config->lut.input.specified = NV_TRUE;
modeset_config->lut.input.depth = 30; /* specify the full LUT */
modeset_config->lut.input.start = 0;
modeset_config->lut.input.end = 0;
modeset_config->lut.input.pRamps = NULL;
}
nv_drm_for_each_new_plane_in_state(crtc_state->state, plane,
plane_state, i) {
struct nv_drm_plane *nv_plane = to_nv_plane(plane);
uint32_t layer = nv_plane->layer_idx;
struct NvKmsKapiLayerRequestedConfig *layer_config;
if (layer == NVKMS_KAPI_LAYER_INVALID_IDX || plane_state->crtc != crtc_state->crtc) {
continue;
}
layer_config = &req_config->layerRequestedConfig[layer];
if (layer == NVKMS_KAPI_LAYER_PRIMARY_IDX && crtc_state->ctm) {
ctm = (struct drm_color_ctm *)crtc_state->ctm->data;
color_mgmt_config_ctm_to_csc(&layer_config->config.csc, ctm);
layer_config->config.cscUseMain = NV_FALSE;
} else {
/* When crtc_state->ctm is unset, this also sets the main layer to
* the identity matrix.
*/
layer_config->config.csc = NVKMS_IDENTITY_CSC_MATRIX;
}
layer_config->flags.cscChanged = NV_TRUE;
}
if (crtc_state->gamma_lut) {
nv_crtc_state->olut_ramps = nv_drm_calloc(1, sizeof(*nv_crtc_state->olut_ramps));
if (!nv_crtc_state->olut_ramps) {
ret = -ENOMEM;
goto fail;
}
gamma_lut = (struct drm_color_lut *)crtc_state->gamma_lut->data;
gamma_len = crtc_state->gamma_lut->length /
sizeof(struct drm_color_lut);
if ((ret = color_mgmt_config_copy_lut(nv_crtc_state->olut_ramps,
gamma_lut,
gamma_len)) != 0) {
goto fail;
}
modeset_config->lut.output.specified = NV_TRUE;
modeset_config->lut.output.enabled = NV_TRUE;
modeset_config->lut.output.pRamps = nv_crtc_state->olut_ramps;
} else {
/* disabling the output LUT should be equivalent to setting a linear
* LUT */
modeset_config->lut.output.specified = NV_TRUE;
modeset_config->lut.output.enabled = NV_FALSE;
modeset_config->lut.output.pRamps = NULL;
}
return 0;
fail:
/* free allocated state */
nv_drm_free(nv_crtc_state->ilut_ramps);
nv_drm_free(nv_crtc_state->olut_ramps);
/* remove dangling pointers */
nv_crtc_state->ilut_ramps = NULL;
nv_crtc_state->olut_ramps = NULL;
modeset_config->lut.input.pRamps = NULL;
modeset_config->lut.output.pRamps = NULL;
/* prevent attempts at reading NULLs */
modeset_config->lut.input.specified = NV_FALSE;
modeset_config->lut.output.specified = NV_FALSE;
return ret;
}
#endif /* NV_DRM_COLOR_MGMT_AVAILABLE */
/**
* nv_drm_crtc_atomic_check() can fail after it has modified
* the 'nv_drm_crtc_state::req_config', that is fine because 'nv_drm_crtc_state'
@@ -887,6 +1104,9 @@ static int nv_drm_crtc_atomic_check(struct drm_crtc *crtc,
struct NvKmsKapiHeadRequestedConfig *req_config =
&nv_crtc_state->req_config;
int ret = 0;
#if defined(NV_DRM_COLOR_MGMT_AVAILABLE)
struct nv_drm_device *nv_dev = to_nv_device(crtc_state->crtc->dev);
#endif
if (crtc_state->mode_changed) {
drm_mode_to_nvkms_display_mode(&crtc_state->mode,
@@ -925,6 +1145,25 @@ static int nv_drm_crtc_atomic_check(struct drm_crtc *crtc,
req_config->flags.activeChanged = NV_TRUE;
}
#if defined(NV_DRM_CRTC_STATE_HAS_VRR_ENABLED)
req_config->modeSetConfig.vrrEnabled = crtc_state->vrr_enabled;
#endif
#if defined(NV_DRM_COLOR_MGMT_AVAILABLE)
if (nv_dev->drmMasterChangedSinceLastAtomicCommit &&
(crtc_state->degamma_lut ||
crtc_state->ctm ||
crtc_state->gamma_lut)) {
crtc_state->color_mgmt_changed = NV_TRUE;
}
if (crtc_state->color_mgmt_changed) {
if ((ret = color_mgmt_config_set(nv_crtc_state, req_config)) != 0) {
return ret;
}
}
#endif
return ret;
}
@@ -1156,6 +1395,8 @@ nv_drm_plane_create(struct drm_device *dev,
plane,
validLayerRRTransforms);
nv_drm_free(formats);
return plane;
failed_plane_init:
@@ -1220,6 +1461,22 @@ static struct drm_crtc *__nv_drm_crtc_create(struct nv_drm_device *nv_dev,
drm_crtc_helper_add(&nv_crtc->base, &nv_crtc_helper_funcs);
#if defined(NV_DRM_COLOR_MGMT_AVAILABLE)
#if defined(NV_DRM_CRTC_ENABLE_COLOR_MGMT_PRESENT)
drm_crtc_enable_color_mgmt(&nv_crtc->base, NVKMS_LUT_ARRAY_SIZE, true,
NVKMS_LUT_ARRAY_SIZE);
#else
drm_helper_crtc_enable_color_mgmt(&nv_crtc->base, NVKMS_LUT_ARRAY_SIZE,
NVKMS_LUT_ARRAY_SIZE);
#endif
ret = drm_mode_crtc_set_gamma_size(&nv_crtc->base, NVKMS_LUT_ARRAY_SIZE);
if (ret != 0) {
NV_DRM_DEV_LOG_WARN(
nv_dev,
"Failed to initialize legacy gamma support for head %u", head);
}
#endif
return &nv_crtc->base;
failed_init_crtc:
@@ -1328,10 +1585,16 @@ static void NvKmsKapiCrcsToDrm(const struct NvKmsKapiCrcs *crcs,
{
drmCrcs->outputCrc32.value = crcs->outputCrc32.value;
drmCrcs->outputCrc32.supported = crcs->outputCrc32.supported;
drmCrcs->outputCrc32.__pad0 = 0;
drmCrcs->outputCrc32.__pad1 = 0;
drmCrcs->rasterGeneratorCrc32.value = crcs->rasterGeneratorCrc32.value;
drmCrcs->rasterGeneratorCrc32.supported = crcs->rasterGeneratorCrc32.supported;
drmCrcs->rasterGeneratorCrc32.__pad0 = 0;
drmCrcs->rasterGeneratorCrc32.__pad1 = 0;
drmCrcs->compositorCrc32.value = crcs->compositorCrc32.value;
drmCrcs->compositorCrc32.supported = crcs->compositorCrc32.supported;
drmCrcs->compositorCrc32.__pad0 = 0;
drmCrcs->compositorCrc32.__pad1 = 0;
}
int nv_drm_get_crtc_crc32_v2_ioctl(struct drm_device *dev,


@@ -129,6 +129,9 @@ struct nv_drm_crtc_state {
*/
struct NvKmsKapiHeadRequestedConfig req_config;
struct NvKmsLutRamps *ilut_ramps;
struct NvKmsLutRamps *olut_ramps;
/**
* @nv_flip:
*


@@ -44,6 +44,10 @@
#include <drm/drmP.h>
#endif
#if defined(NV_DRM_DRM_ATOMIC_UAPI_H_PRESENT)
#include <drm/drm_atomic_uapi.h>
#endif
#if defined(NV_DRM_DRM_VBLANK_H_PRESENT)
#include <drm/drm_vblank.h>
#endif
@@ -60,6 +64,15 @@
#include <drm/drm_ioctl.h>
#endif
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
#include <drm/drm_aperture.h>
#include <drm/drm_fb_helper.h>
#endif
#if defined(NV_DRM_DRM_FBDEV_GENERIC_H_PRESENT)
#include <drm/drm_fbdev_generic.h>
#endif
#include <linux/pci.h>
/*
@@ -84,6 +97,11 @@
#include <drm/drm_atomic_helper.h>
#endif
static int nv_drm_revoke_modeset_permission(struct drm_device *dev,
struct drm_file *filep,
NvU32 dpyId);
static int nv_drm_revoke_sub_ownership(struct drm_device *dev);
static struct nv_drm_device *dev_list = NULL;
static const char* nv_get_input_colorspace_name(
@@ -460,6 +478,11 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags)
nv_dev->supportsSyncpts = resInfo.caps.supportsSyncpts;
nv_dev->semsurf_stride = resInfo.caps.semsurf.stride;
nv_dev->semsurf_max_submitted_offset =
resInfo.caps.semsurf.maxSubmittedOffset;
#if defined(NV_DRM_FORMAT_MODIFIERS_PRESENT)
gen = nv_dev->pageKindGeneration;
kind = nv_dev->genericPageKind;
@@ -546,6 +569,8 @@ static void __nv_drm_unload(struct drm_device *dev)
mutex_lock(&nv_dev->lock);
WARN_ON(nv_dev->subOwnershipGranted);
/* Disable event handling */
atomic_set(&nv_dev->enable_event_handling, false);
@@ -595,9 +620,15 @@ static int __nv_drm_master_set(struct drm_device *dev,
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
if (!nvKms->grabOwnership(nv_dev->pDevice)) {
/*
* If this device is driving a framebuffer, then nvidia-drm already has
* modeset ownership. Otherwise, grab ownership now.
*/
if (!nv_dev->hasFramebufferConsole &&
!nvKms->grabOwnership(nv_dev->pDevice)) {
return -EINVAL;
}
nv_dev->drmMasterChangedSinceLastAtomicCommit = NV_TRUE;
return 0;
}
@@ -631,6 +662,9 @@ void nv_drm_master_drop(struct drm_device *dev, struct drm_file *file_priv)
struct nv_drm_device *nv_dev = to_nv_device(dev);
int err;
nv_drm_revoke_modeset_permission(dev, file_priv, 0);
nv_drm_revoke_sub_ownership(dev);
/*
* After dropping nvkms modeset ownership, it is not guaranteed that
* drm and nvkms modeset state will remain in sync. Therefore, disable
@@ -655,7 +689,9 @@ void nv_drm_master_drop(struct drm_device *dev, struct drm_file *file_priv)
drm_modeset_unlock_all(dev);
nvKms->releaseOwnership(nv_dev->pDevice);
if (!nv_dev->hasFramebufferConsole) {
nvKms->releaseOwnership(nv_dev->pDevice);
}
}
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */
@@ -693,15 +729,24 @@ static int nv_drm_get_dev_info_ioctl(struct drm_device *dev,
params->gpu_id = nv_dev->gpu_info.gpu_id;
params->primary_index = dev->primary->index;
params->generic_page_kind = 0;
params->page_kind_generation = 0;
params->sector_layout = 0;
params->supports_sync_fd = false;
params->supports_semsurf = false;
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
params->generic_page_kind = nv_dev->genericPageKind;
params->page_kind_generation = nv_dev->pageKindGeneration;
params->sector_layout = nv_dev->sectorLayout;
#else
params->generic_page_kind = 0;
params->page_kind_generation = 0;
params->sector_layout = 0;
#endif
/* Semaphore surfaces are only supported if the modeset = 1 parameter is set */
if ((nv_dev->pDevice) != NULL && (nv_dev->semsurf_stride != 0)) {
params->supports_semsurf = true;
#if defined(NV_SYNC_FILE_GET_FENCE_PRESENT)
params->supports_sync_fd = true;
#endif /* defined(NV_SYNC_FILE_GET_FENCE_PRESENT) */
}
#endif /* defined(NV_DRM_ATOMIC_MODESET_AVAILABLE) */
return 0;
}
@@ -833,10 +878,10 @@ static NvU32 nv_drm_get_head_bit_from_connector(struct drm_connector *connector)
return 0;
}
static int nv_drm_grant_permission_ioctl(struct drm_device *dev, void *data,
struct drm_file *filep)
static int nv_drm_grant_modeset_permission(struct drm_device *dev,
struct drm_nvidia_grant_permissions_params *params,
struct drm_file *filep)
{
struct drm_nvidia_grant_permissions_params *params = data;
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct nv_drm_connector *target_nv_connector = NULL;
struct nv_drm_crtc *target_nv_crtc = NULL;
@@ -958,26 +1003,102 @@ done:
return ret;
}
static bool nv_drm_revoke_connector(struct nv_drm_device *nv_dev,
struct nv_drm_connector *nv_connector)
static int nv_drm_grant_sub_ownership(struct drm_device *dev,
struct drm_nvidia_grant_permissions_params *params)
{
bool ret = true;
if (nv_connector->modeset_permission_crtc) {
if (nv_connector->nv_detected_encoder) {
ret = nvKms->revokePermissions(
nv_dev->pDevice, nv_connector->modeset_permission_crtc->head,
nv_connector->nv_detected_encoder->hDisplay);
}
nv_connector->modeset_permission_crtc->modeset_permission_filep = NULL;
nv_connector->modeset_permission_crtc = NULL;
int ret = -EINVAL;
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_modeset_acquire_ctx *pctx;
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
struct drm_modeset_acquire_ctx ctx;
DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE,
ret);
pctx = &ctx;
#else
mutex_lock(&dev->mode_config.mutex);
pctx = dev->mode_config.acquire_ctx;
#endif
if (nv_dev->subOwnershipGranted ||
!nvKms->grantSubOwnership(params->fd, nv_dev->pDevice)) {
goto done;
}
nv_connector->modeset_permission_filep = NULL;
return ret;
/*
* When creating an ownership grant, shut down all heads and disable flip
* notifications.
*/
ret = nv_drm_atomic_helper_disable_all(dev, pctx);
if (ret != 0) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"nv_drm_atomic_helper_disable_all failed with error code %d!",
ret);
}
atomic_set(&nv_dev->enable_event_handling, false);
nv_dev->subOwnershipGranted = NV_TRUE;
ret = 0;
done:
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
DRM_MODESET_LOCK_ALL_END(dev, ctx, ret);
#else
mutex_unlock(&dev->mode_config.mutex);
#endif
return ret;
}
static int nv_drm_revoke_permission(struct drm_device *dev,
struct drm_file *filep, NvU32 dpyId)
static int nv_drm_grant_permission_ioctl(struct drm_device *dev, void *data,
struct drm_file *filep)
{
struct drm_nvidia_grant_permissions_params *params = data;
if (params->type == NV_DRM_PERMISSIONS_TYPE_MODESET) {
return nv_drm_grant_modeset_permission(dev, params, filep);
} else if (params->type == NV_DRM_PERMISSIONS_TYPE_SUB_OWNER) {
return nv_drm_grant_sub_ownership(dev, params);
}
return -EINVAL;
}
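/*
 * Add to @state whatever is needed to stop scanout through @nv_connector:
 * deactivate and clear the mode on the CRTC it was granted on (if any), then
 * detach the connector from its CRTC.
 */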
static int
nv_drm_atomic_disable_connector(struct drm_atomic_state *state,
struct nv_drm_connector *nv_connector)
{
struct drm_crtc_state *crtc_state;
struct drm_connector_state *connector_state;
int ret = 0;
if (nv_connector->modeset_permission_crtc) {
crtc_state = drm_atomic_get_crtc_state(
state, &nv_connector->modeset_permission_crtc->base);
if (!crtc_state) {
return -EINVAL;
}
crtc_state->active = false;
ret = drm_atomic_set_mode_prop_for_crtc(crtc_state, NULL);
if (ret < 0) {
return ret;
}
}
connector_state = drm_atomic_get_connector_state(state, &nv_connector->base);
if (!connector_state) {
return -EINVAL;
}
return drm_atomic_set_crtc_for_connector(connector_state, NULL);
}
static int nv_drm_revoke_modeset_permission(struct drm_device *dev,
struct drm_file *filep, NvU32 dpyId)
{
struct drm_modeset_acquire_ctx *pctx;
struct drm_atomic_state *state;
struct drm_connector *connector;
struct drm_crtc *crtc;
int ret = 0;
@@ -988,10 +1109,19 @@ static int nv_drm_revoke_permission(struct drm_device *dev,
struct drm_modeset_acquire_ctx ctx;
DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE,
ret);
pctx = &ctx;
#else
mutex_lock(&dev->mode_config.mutex);
pctx = dev->mode_config.acquire_ctx;
#endif
state = drm_atomic_state_alloc(dev);
if (!state) {
ret = -ENOMEM;
goto done;
}
state->acquire_ctx = pctx;
/*
* If dpyId is set, only revoke those specific resources. Otherwise,
* it is from closing the file so revoke all resources for that filep.
@@ -1003,10 +1133,13 @@ static int nv_drm_revoke_permission(struct drm_device *dev,
struct nv_drm_connector *nv_connector = to_nv_connector(connector);
if (nv_connector->modeset_permission_filep == filep &&
(!dpyId || nv_drm_connector_is_dpy_id(connector, dpyId))) {
if (!nv_drm_connector_revoke_permissions(dev, nv_connector)) {
ret = -EINVAL;
// Continue trying to revoke as much as possible.
ret = nv_drm_atomic_disable_connector(state, nv_connector);
if (ret < 0) {
goto done;
}
// Continue trying to revoke as much as possible.
nv_drm_connector_revoke_permissions(dev, nv_connector);
}
}
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
@@ -1020,6 +1153,25 @@ static int nv_drm_revoke_permission(struct drm_device *dev,
}
}
ret = drm_atomic_commit(state);
done:
#if defined(NV_DRM_ATOMIC_STATE_REF_COUNTING_PRESENT)
drm_atomic_state_put(state);
#else
if (ret != 0) {
drm_atomic_state_free(state);
} else {
/*
* In case of success, drm_atomic_commit() takes care to cleanup and
* free @state.
*
* Comment placed above drm_atomic_commit() says: The caller must not
* free or in any other way access @state. If the function fails then
* the caller must clean up @state itself.
*/
}
#endif
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
DRM_MODESET_LOCK_ALL_END(dev, ctx, ret);
#else
@@ -1029,14 +1181,55 @@ static int nv_drm_revoke_permission(struct drm_device *dev,
return ret;
}
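/*
 * Revoke NVKMS sub-ownership previously granted through the grant-permissions
 * ioctl, and re-enable DRM event handling for this device.
 */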
static int nv_drm_revoke_sub_ownership(struct drm_device *dev)
{
int ret = -EINVAL;
struct nv_drm_device *nv_dev = to_nv_device(dev);
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
struct drm_modeset_acquire_ctx ctx;
DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE,
ret);
#else
mutex_lock(&dev->mode_config.mutex);
#endif
if (!nv_dev->subOwnershipGranted) {
goto done;
}
if (!nvKms->revokeSubOwnership(nv_dev->pDevice)) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to revoke sub-ownership from NVKMS");
goto done;
}
nv_dev->subOwnershipGranted = NV_FALSE;
atomic_set(&nv_dev->enable_event_handling, true);
ret = 0;
done:
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
DRM_MODESET_LOCK_ALL_END(dev, ctx, ret);
#else
mutex_unlock(&dev->mode_config.mutex);
#endif
return ret;
}
static int nv_drm_revoke_permission_ioctl(struct drm_device *dev, void *data,
struct drm_file *filep)
{
struct drm_nvidia_revoke_permissions_params *params = data;
if (!params->dpyId) {
return -EINVAL;
if (params->type == NV_DRM_PERMISSIONS_TYPE_MODESET) {
if (!params->dpyId) {
return -EINVAL;
}
return nv_drm_revoke_modeset_permission(dev, filep, params->dpyId);
} else if (params->type == NV_DRM_PERMISSIONS_TYPE_SUB_OWNER) {
return nv_drm_revoke_sub_ownership(dev);
}
return nv_drm_revoke_permission(dev, filep, params->dpyId);
return -EINVAL;
}
static void nv_drm_postclose(struct drm_device *dev, struct drm_file *filep)
@@ -1051,7 +1244,7 @@ static void nv_drm_postclose(struct drm_device *dev, struct drm_file *filep)
dev->mode_config.num_connector > 0 &&
dev->mode_config.connector_list.next != NULL &&
dev->mode_config.connector_list.prev != NULL) {
nv_drm_revoke_permission(dev, filep, 0);
nv_drm_revoke_modeset_permission(dev, filep, 0);
}
}
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */
@@ -1310,6 +1503,18 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(NVIDIA_GEM_PRIME_FENCE_ATTACH,
nv_drm_gem_prime_fence_attach_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_SEMSURF_FENCE_CTX_CREATE,
nv_drm_semsurf_fence_ctx_create_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_SEMSURF_FENCE_CREATE,
nv_drm_semsurf_fence_create_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_SEMSURF_FENCE_WAIT,
nv_drm_semsurf_fence_wait_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_SEMSURF_FENCE_ATTACH,
nv_drm_semsurf_fence_attach_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
#endif
DRM_IOCTL_DEF_DRV(NVIDIA_GET_CLIENT_CAPABILITY,
@@ -1513,6 +1718,30 @@ static void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
goto failed_drm_register;
}
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (nv_drm_fbdev_module_param &&
drm_core_check_feature(dev, DRIVER_MODESET)) {
if (!nvKms->grabOwnership(nv_dev->pDevice)) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
goto failed_grab_ownership;
}
if (device->bus == &pci_bus_type) {
struct pci_dev *pdev = to_pci_dev(device);
#if defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_HAS_DRIVER_ARG)
drm_aperture_remove_conflicting_pci_framebuffers(pdev, &nv_drm_driver);
#else
drm_aperture_remove_conflicting_pci_framebuffers(pdev, nv_drm_driver.name);
#endif
}
drm_fbdev_generic_setup(dev, 32);
nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */
/* Add NVIDIA-DRM device into list */
nv_dev->next = dev_list;
@@ -1520,6 +1749,12 @@ static void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
return; /* Success */
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
failed_grab_ownership:
drm_dev_unregister(dev);
#endif
failed_drm_register:
nv_drm_dev_free(dev);
@@ -1582,9 +1817,16 @@ void nv_drm_remove_devices(void)
{
while (dev_list != NULL) {
struct nv_drm_device *next = dev_list->next;
struct drm_device *dev = dev_list->dev;
drm_dev_unregister(dev_list->dev);
nv_drm_dev_free(dev_list->dev);
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (dev_list->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(dev_list->pDevice);
}
#endif
drm_dev_unregister(dev);
nv_drm_dev_free(dev);
nv_drm_free(dev_list);


@@ -38,6 +38,8 @@
#include "nvidia-dma-fence-helper.h"
#define NV_DRM_SEMAPHORE_SURFACE_FENCE_MAX_TIMEOUT_MS 5000
struct nv_drm_fence_context;
struct nv_drm_fence_context_ops {
@@ -45,17 +47,19 @@ struct nv_drm_fence_context_ops {
};
struct nv_drm_fence_context {
struct nv_drm_gem_object base;
const struct nv_drm_fence_context_ops *ops;
struct nv_drm_device *nv_dev;
uint32_t context;
uint64_t context;
NvU64 fenceSemIndex; /* Index into semaphore surface */
};
struct nv_drm_prime_fence_context {
struct nv_drm_fence_context base;
NvU64 fenceSemIndex; /* Index into semaphore surface */
/* Mapped semaphore surface */
struct NvKmsKapiMemory *pSemSurface;
NvU32 *pLinearAddress;
@@ -181,7 +185,7 @@ static void nv_drm_gem_prime_fence_event
/* Index into surface with 16 byte stride */
unsigned int seqno = *((nv_fence_context->pLinearAddress) +
(nv_fence_context->fenceSemIndex * 4));
(nv_fence_context->base.fenceSemIndex * 4));
if (nv_fence->base.seqno > seqno) {
/*
@@ -199,8 +203,8 @@ static void nv_drm_gem_prime_fence_event
}
static inline struct nv_drm_prime_fence_context*
to_prime_fence_context(struct nv_drm_fence_context *nv_fence_context) {
return (struct nv_drm_prime_fence_context *)nv_fence_context;
to_nv_prime_fence_context(struct nv_drm_fence_context *nv_fence_context) {
return container_of(nv_fence_context, struct nv_drm_prime_fence_context, base);
}
static void __nv_drm_prime_fence_context_destroy(
@@ -208,7 +212,7 @@ static void __nv_drm_prime_fence_context_destroy(
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
struct nv_drm_prime_fence_context *nv_prime_fence_context =
to_prime_fence_context(nv_fence_context);
to_nv_prime_fence_context(nv_fence_context);
/*
* Free channel event before destroying the fence context, otherwise event
@@ -293,9 +297,9 @@ __nv_drm_prime_fence_context_new(
.base.ops = &nv_drm_prime_fence_context_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pLinearAddress = pLinearAddress,
.fenceSemIndex = p->index,
};
INIT_LIST_HEAD(&nv_prime_fence_context->pending);
@@ -391,47 +395,37 @@ int nv_drm_fence_supported_ioctl(struct drm_device *dev,
return nv_dev->pDevice ? 0 : -EINVAL;
}
struct nv_drm_gem_fence_context {
struct nv_drm_gem_object base;
struct nv_drm_fence_context *nv_fence_context;
};
static inline struct nv_drm_gem_fence_context *to_gem_fence_context(
static inline struct nv_drm_fence_context *to_nv_fence_context(
struct nv_drm_gem_object *nv_gem)
{
if (nv_gem != NULL) {
return container_of(nv_gem, struct nv_drm_gem_fence_context, base);
return container_of(nv_gem, struct nv_drm_fence_context, base);
}
return NULL;
}
/*
* Tear down of the 'struct nv_drm_gem_fence_context' object is not expected
* Tear down of the 'struct nv_drm_fence_context' object is not expected
* to happen from any worker thread; if that happens it causes a deadlock
* because the tear-down sequence flushes all existing
* worker threads.
*/
static void
__nv_drm_gem_fence_context_free(struct nv_drm_gem_object *nv_gem)
__nv_drm_fence_context_gem_free(struct nv_drm_gem_object *nv_gem)
{
struct nv_drm_gem_fence_context *nv_gem_fence_context =
to_gem_fence_context(nv_gem);
struct nv_drm_fence_context *nv_fence_context =
nv_gem_fence_context->nv_fence_context;
struct nv_drm_fence_context *nv_fence_context = to_nv_fence_context(nv_gem);
nv_fence_context->ops->destroy(nv_fence_context);
nv_drm_free(nv_gem_fence_context);
}
const struct nv_drm_gem_object_funcs nv_gem_fence_context_ops = {
.free = __nv_drm_gem_fence_context_free,
const struct nv_drm_gem_object_funcs nv_fence_context_gem_ops = {
.free = __nv_drm_fence_context_gem_free,
};
static inline
struct nv_drm_gem_fence_context *
__nv_drm_gem_object_fence_context_lookup(
struct nv_drm_fence_context *
__nv_drm_fence_context_lookup(
struct drm_device *dev,
struct drm_file *filp,
u32 handle)
@@ -439,43 +433,31 @@ __nv_drm_gem_object_fence_context_lookup(
struct nv_drm_gem_object *nv_gem =
nv_drm_gem_object_lookup(dev, filp, handle);
if (nv_gem != NULL && nv_gem->ops != &nv_gem_fence_context_ops) {
if (nv_gem != NULL && nv_gem->ops != &nv_fence_context_gem_ops) {
nv_drm_gem_object_unreference_unlocked(nv_gem);
return NULL;
}
return to_gem_fence_context(nv_gem);
return to_nv_fence_context(nv_gem);
}
static int
__nv_drm_gem_fence_context_create(struct drm_device *dev,
struct nv_drm_fence_context *nv_fence_context,
u32 *handle,
struct drm_file *filep)
__nv_drm_fence_context_gem_init(struct drm_device *dev,
struct nv_drm_fence_context *nv_fence_context,
u32 *handle,
struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct nv_drm_gem_fence_context *nv_gem_fence_context = NULL;
if ((nv_gem_fence_context = nv_drm_calloc(
1,
sizeof(struct nv_drm_gem_fence_context))) == NULL) {
goto done;
}
nv_gem_fence_context->nv_fence_context = nv_fence_context;
nv_drm_gem_object_init(nv_dev,
&nv_gem_fence_context->base,
&nv_gem_fence_context_ops,
&nv_fence_context->base,
&nv_fence_context_gem_ops,
0 /* size */,
NULL /* pMemory */);
return nv_drm_gem_handle_create_drop_reference(filep,
&nv_gem_fence_context->base,
&nv_fence_context->base,
handle);
done:
return -ENOMEM;
}
int nv_drm_prime_fence_context_create_ioctl(struct drm_device *dev,
@@ -491,10 +473,10 @@ int nv_drm_prime_fence_context_create_ioctl(struct drm_device *dev,
goto done;
}
err = __nv_drm_gem_fence_context_create(dev,
&nv_prime_fence_context->base,
&p->handle,
filep);
err = __nv_drm_fence_context_gem_init(dev,
&nv_prime_fence_context->base,
&p->handle,
filep);
if (err) {
__nv_drm_prime_fence_context_destroy(&nv_prime_fence_context->base);
}
@@ -505,6 +487,31 @@ done:
return -ENOMEM;
}
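/*
 * Attach @fence to the reservation object of @nv_gem, as a shared fence when
 * @shared is true and as the exclusive fence otherwise.
 */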
static int __nv_drm_gem_attach_fence(struct nv_drm_gem_object *nv_gem,
nv_dma_fence_t *fence,
bool shared)
{
nv_dma_resv_t *resv = nv_drm_gem_res_obj(nv_gem);
int ret;
nv_dma_resv_lock(resv, NULL);
ret = nv_dma_resv_reserve_fences(resv, 1, shared);
if (ret == 0) {
if (shared) {
nv_dma_resv_add_shared_fence(resv, fence);
} else {
nv_dma_resv_add_excl_fence(resv, fence);
}
} else {
NV_DRM_LOG_ERR("Failed to reserve fence. Error code: %d", ret);
}
nv_dma_resv_unlock(resv);
return ret;
}
int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
@@ -513,10 +520,13 @@ int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
struct drm_nvidia_gem_prime_fence_attach_params *p = data;
struct nv_drm_gem_object *nv_gem;
struct nv_drm_gem_fence_context *nv_gem_fence_context;
struct nv_drm_fence_context *nv_fence_context;
nv_dma_fence_t *fence;
nv_dma_resv_t *resv;
if (p->__pad != 0) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Padding fields must be zeroed");
goto done;
}
nv_gem = nv_drm_gem_object_lookup(nv_dev->dev, filep, p->handle);
@@ -529,7 +539,7 @@ int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
goto done;
}
if((nv_gem_fence_context = __nv_drm_gem_object_fence_context_lookup(
if((nv_fence_context = __nv_drm_fence_context_lookup(
nv_dev->dev,
filep,
p->fence_context_handle)) == NULL) {
@@ -542,7 +552,7 @@ int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
goto fence_context_lookup_failed;
}
if (nv_gem_fence_context->nv_fence_context->ops !=
if (nv_fence_context->ops !=
&nv_drm_prime_fence_context_ops) {
NV_DRM_DEV_LOG_ERR(
@@ -554,7 +564,7 @@ int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
}
fence = __nv_drm_prime_fence_context_create_fence(
to_prime_fence_context(nv_gem_fence_context->nv_fence_context),
to_nv_prime_fence_context(nv_fence_context),
p->sem_thresh);
if (IS_ERR(fence)) {
@@ -567,26 +577,12 @@ int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
goto fence_context_create_fence_failed;
}
resv = nv_drm_gem_res_obj(nv_gem);
ret = __nv_drm_gem_attach_fence(nv_gem, fence, false /* shared */);
nv_dma_resv_lock(resv, NULL);
ret = nv_dma_resv_reserve_fences(resv, 1, false);
if (ret == 0) {
nv_dma_resv_add_excl_fence(resv, fence);
} else {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to reserve fence. Error code: %d", ret);
}
nv_dma_resv_unlock(resv);
/* dma_resv_add_excl_fence takes its own reference to the fence. */
nv_dma_fence_put(fence);
fence_context_create_fence_failed:
nv_drm_gem_object_unreference_unlocked(&nv_gem_fence_context->base);
nv_drm_gem_object_unreference_unlocked(&nv_fence_context->base);
fence_context_lookup_failed:
nv_drm_gem_object_unreference_unlocked(nv_gem);
@@ -595,6 +591,1224 @@ done:
return ret;
}
struct nv_drm_semsurf_fence {
nv_dma_fence_t base;
spinlock_t lock;
/*
* When unsignaled, node in the associated fence context's pending fence
* list. The list holds a reference to the fence
*/
struct list_head pending_node;
#if !defined(NV_DMA_FENCE_OPS_HAS_USE_64BIT_SEQNO)
/* 64-bit version of base.seqno on kernels with 32-bit fence seqno */
NvU64 wait_value;
#endif
/*
* Raw absolute kernel time (time domain and scale are treated as opaque)
* when this fence times out.
*/
unsigned long timeout;
};
struct nv_drm_semsurf_fence_callback {
struct nv_drm_semsurf_fence_ctx *ctx;
nv_drm_work work;
NvU64 wait_value;
};
struct nv_drm_sync_fd_wait_data {
nv_dma_fence_cb_t dma_fence_cb;
struct nv_drm_semsurf_fence_ctx *ctx;
nv_drm_work work; /* Deferred second half of fence wait callback */
/* Could use a lockless list data structure here instead */
struct list_head pending_node;
NvU64 pre_wait_value;
NvU64 post_wait_value;
};
struct nv_drm_semsurf_fence_ctx {
struct nv_drm_fence_context base;
/* The NVKMS KAPI reference to the context's semaphore surface */
struct NvKmsKapiSemaphoreSurface *pSemSurface;
/* CPU mapping of the semaphore slot values */
union {
volatile void *pVoid;
volatile NvU32 *p32;
volatile NvU64 *p64;
} pSemMapping;
volatile NvU64 *pMaxSubmittedMapping;
/* work thread for fence timeouts and waits */
nv_drm_workthread worker;
/* Timeout timer and associated workthread work */
nv_drm_timer timer;
nv_drm_work timeout_work;
/* Protects access to everything below */
spinlock_t lock;
/* List of pending fences which are not yet signaled */
struct list_head pending_fences;
/* List of pending fence wait operations */
struct list_head pending_waits;
/*
* Tracking data for the single in-flight callback associated with this
* context. Either both pointers will be valid, or both will be NULL.
*
* Note it is not safe to dereference these values outside of the context
* lock unless it is certain the associated callback is not yet active,
* or has been canceled. Their memory is owned by the callback itself as
* soon as it is registered. Subtly, this means these variables can not
* be used as output parameters to the function that registers the callback.
*/
struct {
struct nv_drm_semsurf_fence_callback *local;
struct NvKmsKapiSemaphoreSurfaceCallback *nvKms;
} callback;
/*
* Wait value associated with either the above or a being-registered
* callback. May differ from callback->local->wait_value if it is the
* latter. Zero if no callback is currently needed.
*/
NvU64 current_wait_value;
};
static inline struct nv_drm_semsurf_fence_ctx*
to_semsurf_fence_ctx(
struct nv_drm_fence_context *nv_fence_context
)
{
return container_of(nv_fence_context,
struct nv_drm_semsurf_fence_ctx,
base);
}
static inline NvU64
__nv_drm_get_semsurf_fence_seqno(const struct nv_drm_semsurf_fence *nv_fence)
{
#if defined(NV_DMA_FENCE_OPS_HAS_USE_64BIT_SEQNO)
return nv_fence->base.seqno;
#else
return nv_fence->wait_value;
#endif
}
#ifndef READ_ONCE
#define READ_ONCE(x) ACCESS_ONCE(x)
#endif
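/*
 * Read the context's current 64-bit semaphore value. For 32-bit semaphores,
 * the payload is extended to 64 bits using the max-submitted value, borrowing
 * from its upper half when the 32-bit payload has advanced past its low bits.
 */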
static inline NvU64
__nv_drm_get_semsurf_ctx_seqno(struct nv_drm_semsurf_fence_ctx *ctx)
{
NvU64 semVal;
if (ctx->pMaxSubmittedMapping) {
/* 32-bit GPU semaphores */
NvU64 maxSubmitted = READ_ONCE(*ctx->pMaxSubmittedMapping);
/*
* Must happen after the max submitted read! See
* NvTimeSemFermiGetPayload() for full details.
*/
semVal = READ_ONCE(*ctx->pSemMapping.p32);
if ((maxSubmitted & 0xFFFFFFFFull) < semVal) {
maxSubmitted -= 0x100000000ull;
}
semVal |= (maxSubmitted & 0xffffffff00000000ull);
} else {
/* 64-bit GPU semaphores */
semVal = READ_ONCE(*ctx->pSemMapping.p64);
}
return semVal;
}
static void
__nv_drm_semsurf_force_complete_pending(struct nv_drm_semsurf_fence_ctx *ctx)
{
unsigned long flags;
/*
* No locks are needed for the pending_fences list. This code runs after all
* other possible references to the fence context have been removed. The
* fences have their own individual locks to protect themselves.
*/
while (!list_empty(&ctx->pending_fences)) {
struct nv_drm_semsurf_fence *nv_fence = list_first_entry(
&ctx->pending_fences,
typeof(*nv_fence),
pending_node);
nv_dma_fence_t *fence = &nv_fence->base;
list_del(&nv_fence->pending_node);
nv_dma_fence_set_error(fence, -ETIMEDOUT);
nv_dma_fence_signal(fence);
/* Remove the pending list's reference */
nv_dma_fence_put(fence);
}
/*
* The pending waits are also referenced by the fences they are waiting on,
* but those fences are guaranteed to complete in finite time. Just keep
* the context alive until they do so.
*/
spin_lock_irqsave(&ctx->lock, flags);
while (!list_empty(&ctx->pending_waits)) {
spin_unlock_irqrestore(&ctx->lock, flags);
nv_drm_yield();
spin_lock_irqsave(&ctx->lock, flags);
}
spin_unlock_irqrestore(&ctx->lock, flags);
}
/* Forward declaration */
static void
__nv_drm_semsurf_ctx_reg_callbacks(struct nv_drm_semsurf_fence_ctx *ctx);
static void
__nv_drm_semsurf_ctx_fence_callback_work(void *data)
{
struct nv_drm_semsurf_fence_callback *callback = data;
__nv_drm_semsurf_ctx_reg_callbacks(callback->ctx);
nv_drm_free(callback);
}
static struct nv_drm_semsurf_fence_callback*
__nv_drm_semsurf_new_callback(struct nv_drm_semsurf_fence_ctx *ctx)
{
struct nv_drm_semsurf_fence_callback *newCallback =
nv_drm_calloc(1, sizeof(*newCallback));
if (!newCallback) {
return NULL;
}
newCallback->ctx = ctx;
nv_drm_workthread_work_init(&newCallback->work,
__nv_drm_semsurf_ctx_fence_callback_work,
newCallback);
return newCallback;
}
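/*
 * Scan the context's pending-fence list: signal fences whose wait value has
 * been reached, signal expired fences with -ETIMEDOUT, and, if requested,
 * report the wait value and timeout of the next remaining fence so the caller
 * can register a new waiter for it.
 */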
static void
__nv_drm_semsurf_ctx_process_completed(struct nv_drm_semsurf_fence_ctx *ctx,
NvU64 *newWaitValueOut,
unsigned long *newTimeoutOut)
{
struct list_head finished;
struct list_head timed_out;
struct nv_drm_semsurf_fence *nv_fence;
nv_dma_fence_t *fence;
NvU64 currentSeqno = __nv_drm_get_semsurf_ctx_seqno(ctx);
NvU64 fenceSeqno = 0;
unsigned long flags;
unsigned long fenceTimeout = 0;
unsigned long now = nv_drm_timer_now();
INIT_LIST_HEAD(&finished);
INIT_LIST_HEAD(&timed_out);
spin_lock_irqsave(&ctx->lock, flags);
while (!list_empty(&ctx->pending_fences)) {
nv_fence = list_first_entry(&ctx->pending_fences,
typeof(*nv_fence),
pending_node);
fenceSeqno = __nv_drm_get_semsurf_fence_seqno(nv_fence);
fenceTimeout = nv_fence->timeout;
if (fenceSeqno <= currentSeqno) {
list_move_tail(&nv_fence->pending_node, &finished);
} else if (fenceTimeout <= now) {
list_move_tail(&nv_fence->pending_node, &timed_out);
} else {
break;
}
}
/*
* If the caller passes non-NULL newWaitValueOut and newTimeoutOut
* parameters, it establishes a contract. If the returned values are
* non-zero, the caller must attempt to register a callback associated with
* the new wait value and reset the context's timer to the specified
* timeout.
*/
if (newWaitValueOut && newTimeoutOut) {
if (list_empty(&ctx->pending_fences)) {
/* No pending fences, so no waiter is needed. */
ctx->current_wait_value = fenceSeqno = 0;
fenceTimeout = 0;
} else if (fenceSeqno == ctx->current_wait_value) {
/*
* The context already has a waiter registered, or in the process of
* being registered, for this fence. Indicate to the caller no new
* waiter registration is needed, and leave the ctx state alone.
*/
fenceSeqno = 0;
fenceTimeout = 0;
} else {
/* A new waiter must be registered. Prep the context */
ctx->current_wait_value = fenceSeqno;
}
*newWaitValueOut = fenceSeqno;
*newTimeoutOut = fenceTimeout;
}
spin_unlock_irqrestore(&ctx->lock, flags);
while (!list_empty(&finished)) {
nv_fence = list_first_entry(&finished, typeof(*nv_fence), pending_node);
list_del_init(&nv_fence->pending_node);
fence = &nv_fence->base;
nv_dma_fence_signal(fence);
nv_dma_fence_put(fence); /* Drops the pending list's reference */
}
while (!list_empty(&timed_out)) {
nv_fence = list_first_entry(&timed_out, typeof(*nv_fence),
pending_node);
list_del_init(&nv_fence->pending_node);
fence = &nv_fence->base;
nv_dma_fence_set_error(fence, -ETIMEDOUT);
nv_dma_fence_signal(fence);
nv_dma_fence_put(fence); /* Drops the pending list's reference */
}
}
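/*
 * NVKMS semaphore surface callback, invoked when the semaphore reaches the
 * registered wait value. Clears the context's callback tracking, signals any
 * now-completed fences, and queues work to register the next waiter.
 */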
static void
__nv_drm_semsurf_ctx_callback(void *data)
{
struct nv_drm_semsurf_fence_callback *callback = data;
struct nv_drm_semsurf_fence_ctx *ctx = callback->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->lock, flags);
/* If this was the context's currently registered callback, clear it. */
if (ctx->callback.local == callback) {
ctx->callback.local = NULL;
ctx->callback.nvKms = NULL;
}
/* If storing of this callback may have been pending, prevent it. */
if (ctx->current_wait_value == callback->wait_value) {
ctx->current_wait_value = 0;
}
spin_unlock_irqrestore(&ctx->lock, flags);
/*
* This is redundant with the __nv_drm_semsurf_ctx_reg_callbacks() call from
* __nv_drm_semsurf_ctx_fence_callback_work(), which will be called by the
* work enqueued below, but calling it here as well allows unblocking
* waiters with less latency.
*/
__nv_drm_semsurf_ctx_process_completed(ctx, NULL, NULL);
if (!nv_drm_workthread_add_work(&ctx->worker, &callback->work)) {
/*
* The context is shutting down. It will force-signal all fences when
* doing so, so there's no need for any more callback handling.
*/
nv_drm_free(callback);
}
}
/*
* Take spin lock, attempt to stash newNvKmsCallback/newCallback in ctx.
* If current_wait_value in fence context != new_wait_value, we raced with
* someone registering a newer waiter. Release spin lock, and unregister our
* waiter. It isn't needed anymore.
*/
static bool
__nv_drm_semsurf_ctx_store_callback(
struct nv_drm_semsurf_fence_ctx *ctx,
NvU64 new_wait_value,
struct NvKmsKapiSemaphoreSurfaceCallback *newNvKmsCallback,
struct nv_drm_semsurf_fence_callback *newCallback)
{
struct nv_drm_device *nv_dev = ctx->base.nv_dev;
struct NvKmsKapiSemaphoreSurfaceCallback *oldNvKmsCallback;
struct nv_drm_semsurf_fence_callback *oldCallback = NULL;
NvU64 oldWaitValue;
unsigned long flags;
bool installed = false;
spin_lock_irqsave(&ctx->lock, flags);
if (ctx->current_wait_value == new_wait_value) {
oldCallback = ctx->callback.local;
oldNvKmsCallback = ctx->callback.nvKms;
oldWaitValue = oldCallback ? oldCallback->wait_value : 0;
ctx->callback.local = newCallback;
ctx->callback.nvKms = newNvKmsCallback;
installed = true;
}
spin_unlock_irqrestore(&ctx->lock, flags);
if (oldCallback) {
if (nvKms->unregisterSemaphoreSurfaceCallback(nv_dev->pDevice,
ctx->pSemSurface,
ctx->base.fenceSemIndex,
oldWaitValue,
oldNvKmsCallback)) {
/*
* The old callback was successfully canceled, and its NVKMS and RM
* resources have been freed. Free its local tracking data.
*/
nv_drm_free(oldCallback);
} else {
/*
* The new callback is already running. It will do no harm, and free
* itself.
*/
}
}
return installed;
}
/*
* Processes completed fences and registers an RM callback and a timeout timer
* for the next incomplete fence, if any. To avoid calling in to RM while
* holding a spinlock, this is done in a loop until the state settles.
*
* Can NOT be called from in an atomic context/interrupt handler.
*/
static void
__nv_drm_semsurf_ctx_reg_callbacks(struct nv_drm_semsurf_fence_ctx *ctx)
{
struct nv_drm_device *nv_dev = ctx->base.nv_dev;
struct nv_drm_semsurf_fence_callback *newCallback =
__nv_drm_semsurf_new_callback(ctx);
struct NvKmsKapiSemaphoreSurfaceCallback *newNvKmsCallback;
NvU64 newWaitValue;
unsigned long newTimeout;
NvKmsKapiRegisterWaiterResult kapiRet;
if (!newCallback) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to allocate new fence signal callback data");
return;
}
do {
/*
* Process any completed or timed out fences. This returns the wait
* value and timeout of the first remaining pending fence, or 0/0
* if no pending fences remain. It will also tag the context as
* waiting for the value returned.
*/
__nv_drm_semsurf_ctx_process_completed(ctx,
&newWaitValue,
&newTimeout);
if (newWaitValue == 0) {
/* No fences remain, so no callback is needed. */
nv_drm_free(newCallback);
newCallback = NULL;
return;
}
newCallback->wait_value = newWaitValue;
/*
* Attempt to register a callback for the remaining fences. Note this
* code may be running concurrently in multiple places, attempting to
* register a callback for the same value, a value greater than
* newWaitValue if more fences have since completed, or a value less
* than newWaitValue if new fences have been created tracking lower
* values than the previously lowest pending one. Hence, even if this
* registration succeeds, the callback may be discarded
*/
kapiRet =
nvKms->registerSemaphoreSurfaceCallback(nv_dev->pDevice,
ctx->pSemSurface,
__nv_drm_semsurf_ctx_callback,
newCallback,
ctx->base.fenceSemIndex,
newWaitValue,
0,
&newNvKmsCallback);
} while (kapiRet == NVKMS_KAPI_REG_WAITER_ALREADY_SIGNALLED);
/* Can't deref newCallback at this point unless kapiRet indicates failure */
if (kapiRet != NVKMS_KAPI_REG_WAITER_SUCCESS) {
/*
* This is expected if another thread concurrently registered a callback
* for the same value, which is fine. That thread's callback will do the
* same work this thread's would have. Clean this one up and return.
*
* Another possibility is that an allocation or some other low-level
* operation that can spuriously fail has caused this failure, or of
* course a bug resulting in invalid usage of the
* registerSemaphoreSurfaceCallback() API. There is no good way to
* handle such failures, so the fence timeout will be relied upon to
* guarantee forward progress in those cases.
*/
nv_drm_free(newCallback);
return;
}
nv_drm_mod_timer(&ctx->timer, newTimeout);
if (!__nv_drm_semsurf_ctx_store_callback(ctx,
newWaitValue,
newNvKmsCallback,
newCallback)) {
/*
* Another thread registered a callback for a different value before
* this thread's callback could be stored in the context, or the
* callback is already running. That's OK. One of the following is true:
*
* -A new fence with a lower value has been registered, and the callback
* associated with that fence is now active and associated with the
* context.
*
* -This fence has already completed, and a new callback associated with
* a higher value has been registered and associated with the context.
* This lower-value callback is no longer needed, as any fences
* associated with it must have been marked completed before
* registering the higher-value callback.
*
* -The callback started running and cleared ctx->current_wait_value
* before the callback could be stored in the context. Work to signal
* the fence is now pending.
*
* Hence, it is safe to request cancellation of the callback and free
* the associated data if cancellation succeeds.
*/
if (nvKms->unregisterSemaphoreSurfaceCallback(nv_dev->pDevice,
ctx->pSemSurface,
ctx->base.fenceSemIndex,
newWaitValue,
newNvKmsCallback)) {
/* RM callback successfully canceled. Free local tracking data */
nv_drm_free(newCallback);
}
}
}
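/*
 * Fence context destructor: stop the worker thread and timeout timer, cancel
 * any callback still registered with NVKMS, free the semaphore surface, and
 * force-complete whatever fences and waits remain before freeing the context.
 */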
static void __nv_drm_semsurf_fence_ctx_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
struct nv_drm_semsurf_fence_ctx *ctx =
to_semsurf_fence_ctx(nv_fence_context);
struct NvKmsKapiSemaphoreSurfaceCallback *pendingNvKmsCallback;
NvU64 pendingWaitValue;
unsigned long flags;
/*
* The workthread must be shut down before the timer is stopped to ensure
* the timer does not queue work that restarts itself.
*/
nv_drm_workthread_shutdown(&ctx->worker);
nv_drm_del_timer_sync(&ctx->timer);
/*
* The semaphore surface could still be sending callbacks, so it is still
* not safe to dereference the ctx->callback pointers. However,
* unregistering a callback via its handle is safe, as that code in NVKMS
* takes care to avoid dereferencing the handle until it knows the callback
* has been canceled in RM. This unregistration must be done to ensure the
* callback data is not leaked in NVKMS if it is still pending, as freeing
* the semaphore surface only cleans up RM's callback data.
*/
spin_lock_irqsave(&ctx->lock, flags);
pendingNvKmsCallback = ctx->callback.nvKms;
pendingWaitValue = ctx->callback.local ?
ctx->callback.local->wait_value : 0;
spin_unlock_irqrestore(&ctx->lock, flags);
if (pendingNvKmsCallback) {
WARN_ON(pendingWaitValue == 0);
nvKms->unregisterSemaphoreSurfaceCallback(nv_dev->pDevice,
ctx->pSemSurface,
ctx->base.fenceSemIndex,
pendingWaitValue,
pendingNvKmsCallback);
}
nvKms->freeSemaphoreSurface(nv_dev->pDevice, ctx->pSemSurface);
/*
* Now that the semaphore surface, the timer, and the workthread are gone:
*
* -No more RM/NVKMS callbacks will arrive, nor are any in progress. Freeing
* the semaphore surface cancels all its callbacks associated with this
* instance of it, and idles any pending callbacks.
*
* -No more timer callbacks will arrive, nor are any in flight.
*
* -The workthread has been idled and is no longer running.
*
* Further, given the destructor is running, no other references to the
* fence context exist, so this code can assume no concurrent access to the
* fence context's data will happen from here on out.
*/
if (ctx->callback.local) {
nv_drm_free(ctx->callback.local);
ctx->callback.local = NULL;
ctx->callback.nvKms = NULL;
}
__nv_drm_semsurf_force_complete_pending(ctx);
nv_drm_free(nv_fence_context);
}
static void
__nv_drm_semsurf_ctx_timeout_work(void *data)
{
struct nv_drm_semsurf_fence_ctx *ctx = data;
__nv_drm_semsurf_ctx_reg_callbacks(ctx);
}
static void
__nv_drm_semsurf_ctx_timeout_callback(nv_drm_timer *timer)
{
struct nv_drm_semsurf_fence_ctx *ctx =
container_of(timer, typeof(*ctx), timer);
/*
* Schedule work to register new waiter & timer on a worker thread.
*
* It does not matter if this fails. There are two possible failure cases:
*
* - ctx->timeout_work is already scheduled. That existing scheduled work
* will do at least as much as work scheduled right now and executed
* immediately, which is sufficient.
*
* - The context is shutting down. In this case, all fences will be force-
* signalled, so no further callbacks or timeouts are needed.
*
* Note this work may schedule a new timeout timer. To ensure that doesn't
* happen while context teardown is stopping and idling the timer, the
* worker thread must be shut down before the timer is stopped.
*/
nv_drm_workthread_add_work(&ctx->worker, &ctx->timeout_work);
}
static struct nv_drm_fence_context_ops
nv_drm_semsurf_fence_ctx_ops = {
.destroy = __nv_drm_semsurf_fence_ctx_destroy,
};
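/*
 * Import the NVKMS semaphore surface described by the ioctl parameters and
 * build a new fence context around the requested semaphore index: map the
 * payload (and, for 32-bit semaphores, the max-submitted slot), then set up
 * the lock, pending lists, worker thread, and timeout timer.
 */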
static struct nv_drm_semsurf_fence_ctx*
__nv_drm_semsurf_fence_ctx_new(
struct nv_drm_device *nv_dev,
struct drm_nvidia_semsurf_fence_ctx_create_params *p
)
{
struct nv_drm_semsurf_fence_ctx *ctx;
struct NvKmsKapiSemaphoreSurface *pSemSurface;
uint8_t *semMapping;
uint8_t *maxSubmittedMapping;
char worker_name[20+16+1]; /* strlen(nvidia-drm/timeline-) + 16 for %llx + NUL */
pSemSurface = nvKms->importSemaphoreSurface(nv_dev->pDevice,
p->nvkms_params_ptr,
p->nvkms_params_size,
(void **)&semMapping,
(void **)&maxSubmittedMapping);
if (!pSemSurface) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to import semaphore surface");
goto failed;
}
/*
* Allocate a fence context object and initialize it.
*/
if ((ctx = nv_drm_calloc(1, sizeof(*ctx))) == NULL) {
goto failed_alloc_fence_context;
}
semMapping += (p->index * nv_dev->semsurf_stride);
if (maxSubmittedMapping) {
maxSubmittedMapping += (p->index * nv_dev->semsurf_stride) +
nv_dev->semsurf_max_submitted_offset;
}
/*
* nv_dma_fence_context_alloc() cannot fail, so we do not need
* to check a return value.
*/
*ctx = (struct nv_drm_semsurf_fence_ctx) {
.base.ops = &nv_drm_semsurf_fence_ctx_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pSemMapping.pVoid = semMapping,
.pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping,
.callback.local = NULL,
.callback.nvKms = NULL,
.current_wait_value = 0,
};
spin_lock_init(&ctx->lock);
INIT_LIST_HEAD(&ctx->pending_fences);
INIT_LIST_HEAD(&ctx->pending_waits);
sprintf(worker_name, "nvidia-drm/timeline-%llx",
(long long unsigned)ctx->base.context);
if (!nv_drm_workthread_init(&ctx->worker, worker_name)) {
goto failed_alloc_worker;
}
nv_drm_workthread_work_init(&ctx->timeout_work,
__nv_drm_semsurf_ctx_timeout_work,
ctx);
nv_drm_timer_setup(&ctx->timer, __nv_drm_semsurf_ctx_timeout_callback);
return ctx;
failed_alloc_worker:
nv_drm_free(ctx);
failed_alloc_fence_context:
nvKms->freeSemaphoreSurface(nv_dev->pDevice, pSemSurface);
failed:
return NULL;
}
int nv_drm_semsurf_fence_ctx_create_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_semsurf_fence_ctx_create_params *p = data;
struct nv_drm_semsurf_fence_ctx *ctx;
int err;
if (p->__pad != 0) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Padding fields must be zeroed");
return -EINVAL;
}
ctx = __nv_drm_semsurf_fence_ctx_new(nv_dev, p);
if (!ctx) {
return -ENOMEM;
}
err = __nv_drm_fence_context_gem_init(dev, &ctx->base, &p->handle, filep);
if (err) {
__nv_drm_semsurf_fence_ctx_destroy(&ctx->base);
}
return err;
}
static inline struct nv_drm_semsurf_fence*
to_nv_drm_semsurf_fence(nv_dma_fence_t *fence)
{
return container_of(fence, struct nv_drm_semsurf_fence, base);
}
static const char*
__nv_drm_semsurf_fence_op_get_timeline_name(nv_dma_fence_t *fence)
{
return "nvidia.semaphore_surface";
}
static bool
__nv_drm_semsurf_fence_op_enable_signaling(nv_dma_fence_t *fence)
{
// DO NOTHING - Could defer RM callback registration until this point
return true;
}
static void
__nv_drm_semsurf_fence_op_release(nv_dma_fence_t *fence)
{
struct nv_drm_semsurf_fence *nv_fence =
to_nv_drm_semsurf_fence(fence);
nv_drm_free(nv_fence);
}
static const nv_dma_fence_ops_t nv_drm_semsurf_fence_ops = {
.get_driver_name = nv_drm_gem_fence_op_get_driver_name,
.get_timeline_name = __nv_drm_semsurf_fence_op_get_timeline_name,
.enable_signaling = __nv_drm_semsurf_fence_op_enable_signaling,
.release = __nv_drm_semsurf_fence_op_release,
.wait = nv_dma_fence_default_wait,
#if defined(NV_DMA_FENCE_OPS_HAS_USE_64BIT_SEQNO)
.use_64bit_seqno = true,
#endif
};
/*
* Completes fence initialization, places a new reference to the fence in the
* context's pending fence list, and updates/registers any RM callbacks and
* timeout timers if necessary.
*
* Can NOT be called from in an atomic context/interrupt handler.
*/
static void
__nv_drm_semsurf_ctx_add_pending(struct nv_drm_semsurf_fence_ctx *ctx,
struct nv_drm_semsurf_fence *nv_fence,
NvU64 timeoutMS)
{
struct list_head *pending;
unsigned long flags;
if (timeoutMS > NV_DRM_SEMAPHORE_SURFACE_FENCE_MAX_TIMEOUT_MS) {
timeoutMS = NV_DRM_SEMAPHORE_SURFACE_FENCE_MAX_TIMEOUT_MS;
}
/* Add a reference to the fence for the list */
nv_dma_fence_get(&nv_fence->base);
INIT_LIST_HEAD(&nv_fence->pending_node);
nv_fence->timeout = nv_drm_timeout_from_ms(timeoutMS);
spin_lock_irqsave(&ctx->lock, flags);
list_for_each(pending, &ctx->pending_fences) {
struct nv_drm_semsurf_fence *pending_fence =
list_entry(pending, typeof(*pending_fence), pending_node);
if (__nv_drm_get_semsurf_fence_seqno(pending_fence) >
__nv_drm_get_semsurf_fence_seqno(nv_fence)) {
/* Inserts 'nv_fence->pending_node' before 'pending' */
list_add_tail(&nv_fence->pending_node, pending);
break;
}
}
if (list_empty(&nv_fence->pending_node)) {
/*
* Inserts 'fence->pending_node' at the end of 'ctx->pending_fences',
* or as the head if the list is empty
*/
list_add_tail(&nv_fence->pending_node, &ctx->pending_fences);
}
/* Fence is live starting... now! */
spin_unlock_irqrestore(&ctx->lock, flags);
/* Register new wait and timeout callbacks, if necessary */
__nv_drm_semsurf_ctx_reg_callbacks(ctx);
}
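/*
 * Allocate and initialize a fence that signals when the context's semaphore
 * reaches @wait_value, clamping the timeout to the maximum allowed, and add
 * it to the context's pending list. Returns the fence with one reference held
 * for the caller, or an ERR_PTR on allocation failure.
 */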
static nv_dma_fence_t *__nv_drm_semsurf_fence_ctx_create_fence(
struct nv_drm_device *nv_dev,
struct nv_drm_semsurf_fence_ctx *ctx,
NvU64 wait_value,
NvU64 timeout_value_ms)
{
struct nv_drm_semsurf_fence *nv_fence;
nv_dma_fence_t *fence;
int ret = 0;
if (timeout_value_ms == 0 ||
timeout_value_ms > NV_DRM_SEMAPHORE_SURFACE_FENCE_MAX_TIMEOUT_MS) {
timeout_value_ms = NV_DRM_SEMAPHORE_SURFACE_FENCE_MAX_TIMEOUT_MS;
}
if ((nv_fence = nv_drm_calloc(1, sizeof(*nv_fence))) == NULL) {
ret = -ENOMEM;
goto out;
}
fence = &nv_fence->base;
spin_lock_init(&nv_fence->lock);
#if !defined(NV_DMA_FENCE_OPS_HAS_USE_64BIT_SEQNO)
nv_fence->wait_value = wait_value;
#endif
/* Initializes the fence with one reference (for the caller) */
nv_dma_fence_init(fence, &nv_drm_semsurf_fence_ops,
&nv_fence->lock,
ctx->base.context, wait_value);
__nv_drm_semsurf_ctx_add_pending(ctx, nv_fence, timeout_value_ms);
out:
/* Returned fence has one reference reserved for the caller. */
return ret != 0 ? ERR_PTR(ret) : &nv_fence->base;
}
int nv_drm_semsurf_fence_create_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_semsurf_fence_create_params *p = data;
struct nv_drm_fence_context *nv_fence_context;
nv_dma_fence_t *fence;
int ret = -EINVAL;
int fd;
if (p->__pad != 0) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Padding fields must be zeroed");
goto done;
}
if ((nv_fence_context = __nv_drm_fence_context_lookup(
nv_dev->dev,
filep,
p->fence_context_handle)) == NULL) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to lookup gem object for fence context: 0x%08x",
p->fence_context_handle);
goto done;
}
if (nv_fence_context->ops != &nv_drm_semsurf_fence_ctx_ops) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Wrong fence context type: 0x%08x",
p->fence_context_handle);
goto fence_context_create_fence_failed;
}
fence = __nv_drm_semsurf_fence_ctx_create_fence(
nv_dev,
to_semsurf_fence_ctx(nv_fence_context),
p->wait_value,
p->timeout_value_ms);
if (IS_ERR(fence)) {
ret = PTR_ERR(fence);
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to allocate fence: 0x%08x", p->fence_context_handle);
goto fence_context_create_fence_failed;
}
if ((fd = nv_drm_create_sync_file(fence)) < 0) {
ret = fd;
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to create sync file from fence on ctx 0x%08x",
p->fence_context_handle);
goto fence_context_create_sync_failed;
}
p->fd = fd;
ret = 0;
fence_context_create_sync_failed:
/*
* Release this function's reference to the fence. If successful, the sync
* FD will still hold a reference, and the pending list (if the fence hasn't
* already been signaled) will also retain a reference.
*/
nv_dma_fence_put(fence);
fence_context_create_fence_failed:
nv_drm_gem_object_unreference_unlocked(&nv_fence_context->base);
done:
return ret;
}
static void
__nv_drm_semsurf_free_wait_data(struct nv_drm_sync_fd_wait_data *wait_data)
{
struct nv_drm_semsurf_fence_ctx *ctx = wait_data->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->lock, flags);
list_del(&wait_data->pending_node);
spin_unlock_irqrestore(&ctx->lock, flags);
nv_drm_free(wait_data);
}
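/*
 * Worker-thread half of a sync-FD wait: once the waited-on fence has signaled,
 * ask NVKMS to write post_wait_value to the semaphore when it reaches
 * pre_wait_value (applied immediately if it already has), then free the wait
 * tracking data.
 */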
static void
__nv_drm_semsurf_wait_fence_work_cb
(
void *arg
)
{
struct nv_drm_sync_fd_wait_data *wait_data = arg;
struct nv_drm_semsurf_fence_ctx *ctx = wait_data->ctx;
struct nv_drm_device *nv_dev = ctx->base.nv_dev;
NvKmsKapiRegisterWaiterResult ret;
/*
* Note this command applies "newValue" immediately if the semaphore has
* already reached "waitValue." It only returns NVKMS_KAPI_REG_WAITER_ALREADY_SIGNALLED
* if a separate notification was requested as well.
*/
ret = nvKms->registerSemaphoreSurfaceCallback(nv_dev->pDevice,
ctx->pSemSurface,
NULL,
NULL,
ctx->base.fenceSemIndex,
wait_data->pre_wait_value,
wait_data->post_wait_value,
NULL);
if (ret != NVKMS_KAPI_REG_WAITER_SUCCESS) {
NV_DRM_DEV_LOG_ERR(nv_dev,
"Failed to register auto-value-update on pre-wait value for sync FD semaphore surface");
}
__nv_drm_semsurf_free_wait_data(wait_data);
}
static void
__nv_drm_semsurf_wait_fence_cb
(
nv_dma_fence_t *fence,
nv_dma_fence_cb_t *cb
)
{
struct nv_drm_sync_fd_wait_data *wait_data =
container_of(cb, typeof(*wait_data), dma_fence_cb);
struct nv_drm_semsurf_fence_ctx *ctx = wait_data->ctx;
/*
* Defer registering the wait with RM to a worker thread, since
* this function may be called in interrupt context, which
* could mean arriving here directly from RM's top/bottom half
* handler when the fence being waited on came from an RM-managed GPU.
*/
if (!nv_drm_workthread_add_work(&ctx->worker, &wait_data->work)) {
/*
* The context is shutting down. RM would likely just drop
* the wait anyway as part of that, so do nothing. Either the
* client is exiting uncleanly, or it is a bug in the client
* in that it didn't consume its wait before destroying the
* fence context used to instantiate it.
*/
__nv_drm_semsurf_free_wait_data(wait_data);
}
/* Don't need to reference the fence anymore, just the fence context. */
nv_dma_fence_put(fence);
}
int nv_drm_semsurf_fence_wait_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_semsurf_fence_wait_params *p = data;
struct nv_drm_fence_context *nv_fence_context;
struct nv_drm_semsurf_fence_ctx *ctx;
struct nv_drm_sync_fd_wait_data *wait_data = NULL;
nv_dma_fence_t *fence;
unsigned long flags;
int ret = -EINVAL;
if (p->pre_wait_value >= p->post_wait_value) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Non-monotonic wait values specified to fence wait: 0x%llu, 0x%llu",
p->pre_wait_value, p->post_wait_value);
goto done;
}
if ((nv_fence_context = __nv_drm_fence_context_lookup(
nv_dev->dev,
filep,
p->fence_context_handle)) == NULL) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to lookup gem object for fence context: 0x%08x",
p->fence_context_handle);
goto done;
}
if (nv_fence_context->ops != &nv_drm_semsurf_fence_ctx_ops) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Wrong fence context type: 0x%08x",
p->fence_context_handle);
goto fence_context_sync_lookup_failed;
}
ctx = to_semsurf_fence_ctx(nv_fence_context);
wait_data = nv_drm_calloc(1, sizeof(*wait_data));
if (!wait_data) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to allocate callback data for sync FD wait: %d", p->fd);
goto fence_context_sync_lookup_failed;
}
fence = nv_drm_sync_file_get_fence(p->fd);
if (!fence) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Attempt to wait on invalid sync FD: %d", p->fd);
goto fence_context_sync_lookup_failed;
}
wait_data->ctx = ctx;
wait_data->pre_wait_value = p->pre_wait_value;
wait_data->post_wait_value = p->post_wait_value;
nv_drm_workthread_work_init(&wait_data->work,
__nv_drm_semsurf_wait_fence_work_cb,
wait_data);
spin_lock_irqsave(&ctx->lock, flags);
list_add(&wait_data->pending_node, &ctx->pending_waits);
spin_unlock_irqrestore(&ctx->lock, flags);
ret = nv_dma_fence_add_callback(fence,
&wait_data->dma_fence_cb,
__nv_drm_semsurf_wait_fence_cb);
if (ret) {
if (ret == -ENOENT) {
/* The fence is already signaled */
} else {
NV_DRM_LOG_ERR(
"Failed to add dma_fence callback. Signaling early!");
/* Proceed as if the fence wait succeeded */
}
/* Execute second half of wait immediately, avoiding the worker thread */
nv_dma_fence_put(fence);
__nv_drm_semsurf_wait_fence_work_cb(wait_data);
}
ret = 0;
fence_context_sync_lookup_failed:
if (ret && wait_data) {
/*
* Do not use __nv_drm_semsurf_free_wait_data() here, as the wait_data
* has not been added to the pending list yet.
*/
nv_drm_free(wait_data);
}
nv_drm_gem_object_unreference_unlocked(&nv_fence_context->base);
done:
return ret;
}
int nv_drm_semsurf_fence_attach_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_semsurf_fence_attach_params *p = data;
struct nv_drm_gem_object *nv_gem = NULL;
struct nv_drm_fence_context *nv_fence_context = NULL;
nv_dma_fence_t *fence;
int ret = -EINVAL;
nv_gem = nv_drm_gem_object_lookup(nv_dev->dev, filep, p->handle);
if (!nv_gem) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to lookup gem object for fence attach: 0x%08x",
p->handle);
goto done;
}
nv_fence_context = __nv_drm_fence_context_lookup(
nv_dev->dev,
filep,
p->fence_context_handle);
if (!nv_fence_context) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to lookup gem object for fence context: 0x%08x",
p->fence_context_handle);
goto done;
}
if (nv_fence_context->ops != &nv_drm_semsurf_fence_ctx_ops) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Wrong fence context type: 0x%08x",
p->fence_context_handle);
goto done;
}
fence = __nv_drm_semsurf_fence_ctx_create_fence(
nv_dev,
to_semsurf_fence_ctx(nv_fence_context),
p->wait_value,
p->timeout_value_ms);
if (IS_ERR(fence)) {
ret = PTR_ERR(fence);
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Failed to allocate fence: 0x%08x", p->handle);
goto done;
}
ret = __nv_drm_gem_attach_fence(nv_gem, fence, p->shared);
nv_dma_fence_put(fence);
done:
if (nv_fence_context) {
nv_drm_gem_object_unreference_unlocked(&nv_fence_context->base);
}
if (nv_gem) {
nv_drm_gem_object_unreference_unlocked(nv_gem);
}
return ret;
}
#endif /* NV_DRM_FENCE_AVAILABLE */
#endif /* NV_DRM_AVAILABLE */
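The wait path above defers its RM registration to a worker precisely because the dma_fence callback can fire in interrupt context. The following is a minimal, generic sketch of that pattern using only stock kernel primitives; the example_* names, the use of the system workqueue, and the cleanup policy are illustrative only (the driver above uses its own nv_drm_workthread instead):

/* Illustrative sketch only: defer processing out of a dma_fence callback that
 * may run in atomic/interrupt context. All example_* names are hypothetical. */
#include <linux/dma-fence.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct example_wait {
    struct dma_fence_cb cb;    /* embedded callback node */
    struct work_struct work;   /* deferred, sleepable processing */
};

static void example_work_fn(struct work_struct *work)
{
    struct example_wait *w = container_of(work, struct example_wait, work);

    /* Safe to sleep here; this is where the deferred registration would go. */
    kfree(w);
}

static void example_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
    struct example_wait *w = container_of(cb, struct example_wait, cb);

    /* Possibly atomic context: only schedule, never sleep. */
    schedule_work(&w->work);
}

static int example_register_wait(struct dma_fence *fence)
{
    struct example_wait *w = kzalloc(sizeof(*w), GFP_KERNEL);
    int ret;

    if (!w)
        return -ENOMEM;

    INIT_WORK(&w->work, example_work_fn);

    ret = dma_fence_add_callback(fence, &w->cb, example_fence_cb);
    if (ret == -ENOENT) {
        /* Fence already signaled: run the deferred half immediately. */
        schedule_work(&w->work);
        ret = 0;
    } else if (ret) {
        kfree(w);
    }

    return ret;
}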

View File

@ -41,6 +41,22 @@ int nv_drm_prime_fence_context_create_ioctl(struct drm_device *dev,
int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep);
int nv_drm_semsurf_fence_ctx_create_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep);
int nv_drm_semsurf_fence_create_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep);
int nv_drm_semsurf_fence_wait_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep);
int nv_drm_semsurf_fence_attach_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep);
#endif /* NV_DRM_FENCE_AVAILABLE */
#endif /* NV_DRM_AVAILABLE */

View File

@ -465,7 +465,7 @@ int nv_drm_gem_alloc_nvkms_memory_ioctl(struct drm_device *dev,
goto failed;
}
if (p->__pad != 0) {
if ((p->__pad0 != 0) || (p->__pad1 != 0)) {
ret = -EINVAL;
NV_DRM_DEV_LOG_ERR(nv_dev, "non-zero value in padding field");
goto failed;

View File

@ -95,6 +95,16 @@ static inline struct nv_drm_gem_object *to_nv_gem_object(
* 3e70fd160cf0b1945225eaa08dd2cb8544f21cb8 (2018-11-15).
*/
static inline void
nv_drm_gem_object_reference(struct nv_drm_gem_object *nv_gem)
{
#if defined(NV_DRM_GEM_OBJECT_GET_PRESENT)
drm_gem_object_get(&nv_gem->base);
#else
drm_gem_object_reference(&nv_gem->base);
#endif
}
static inline void
nv_drm_gem_object_unreference_unlocked(struct nv_drm_gem_object *nv_gem)
{

View File

@ -306,6 +306,36 @@ int nv_drm_atomic_helper_disable_all(struct drm_device *dev,
for_each_plane_in_state(__state, plane, plane_state, __i)
#endif
/*
* for_each_new_plane_in_state() was added by kernel commit
* 581e49fe6b411f407102a7f2377648849e0fa37f which was Signed-off-by:
* Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
* Daniel Vetter <daniel.vetter@ffwll.ch>
*
* This commit also added the old_state and new_state pointers to
* __drm_planes_state. Because of this, the best that can be done on kernel
* versions without this macro is for_each_plane_in_state.
*/
/**
* nv_drm_for_each_new_plane_in_state - iterate over all planes in an atomic update
* @__state: &struct drm_atomic_state pointer
* @plane: &struct drm_plane iteration cursor
* @new_plane_state: &struct drm_plane_state iteration cursor for the new state
* @__i: int iteration cursor, for macro-internal use
*
* This iterates over all planes in an atomic update, tracking only the new
* state. This is useful in enable functions, where we need the new state the
* hardware should be in when the atomic commit operation has completed.
*/
#if !defined(for_each_new_plane_in_state)
#define nv_drm_for_each_new_plane_in_state(__state, plane, new_plane_state, __i) \
nv_drm_for_each_plane_in_state(__state, plane, new_plane_state, __i)
#else
#define nv_drm_for_each_new_plane_in_state(__state, plane, new_plane_state, __i) \
for_each_new_plane_in_state(__state, plane, new_plane_state, __i)
#endif
static inline struct drm_connector *
nv_drm_connector_lookup(struct drm_device *dev, struct drm_file *filep,
uint32_t id)
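As a usage illustration for the nv_drm_for_each_new_plane_in_state() wrapper defined above, a hypothetical helper that walks the new plane states of an atomic update might look like the sketch below; the function name and the framebuffer check are examples only, not code from this tree:

/* Illustrative only: iterate the new plane states of an atomic update. */
#include <drm/drm_atomic.h>
#include "nvidia-drm-helper.h"

static unsigned int example_count_enabled_planes(struct drm_atomic_state *state)
{
    struct drm_plane *plane;
    struct drm_plane_state *new_plane_state;
    unsigned int enabled = 0;
    int i;

    nv_drm_for_each_new_plane_in_state(state, plane, new_plane_state, i) {
        /* A plane with a framebuffer attached in the new state is enabled. */
        if (new_plane_state->fb != NULL)
            enabled++;
    }

    return enabled;
}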

View File

@ -48,6 +48,10 @@
#define DRM_NVIDIA_GET_CONNECTOR_ID_FOR_DPY_ID 0x11
#define DRM_NVIDIA_GRANT_PERMISSIONS 0x12
#define DRM_NVIDIA_REVOKE_PERMISSIONS 0x13
#define DRM_NVIDIA_SEMSURF_FENCE_CTX_CREATE 0x14
#define DRM_NVIDIA_SEMSURF_FENCE_CREATE 0x15
#define DRM_NVIDIA_SEMSURF_FENCE_WAIT 0x16
#define DRM_NVIDIA_SEMSURF_FENCE_ATTACH 0x17
#define DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_IMPORT_NVKMS_MEMORY), \
@ -133,6 +137,26 @@
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_REVOKE_PERMISSIONS), \
struct drm_nvidia_revoke_permissions_params)
#define DRM_IOCTL_NVIDIA_SEMSURF_FENCE_CTX_CREATE \
DRM_IOWR((DRM_COMMAND_BASE + \
DRM_NVIDIA_SEMSURF_FENCE_CTX_CREATE), \
struct drm_nvidia_semsurf_fence_ctx_create_params)
#define DRM_IOCTL_NVIDIA_SEMSURF_FENCE_CREATE \
DRM_IOWR((DRM_COMMAND_BASE + \
DRM_NVIDIA_SEMSURF_FENCE_CREATE), \
struct drm_nvidia_semsurf_fence_create_params)
#define DRM_IOCTL_NVIDIA_SEMSURF_FENCE_WAIT \
DRM_IOW((DRM_COMMAND_BASE + \
DRM_NVIDIA_SEMSURF_FENCE_WAIT), \
struct drm_nvidia_semsurf_fence_wait_params)
#define DRM_IOCTL_NVIDIA_SEMSURF_FENCE_ATTACH \
DRM_IOW((DRM_COMMAND_BASE + \
DRM_NVIDIA_SEMSURF_FENCE_ATTACH), \
struct drm_nvidia_semsurf_fence_attach_params)
struct drm_nvidia_gem_import_nvkms_memory_params {
uint64_t mem_size; /* IN */
@ -158,6 +182,8 @@ struct drm_nvidia_get_dev_info_params {
uint32_t generic_page_kind; /* OUT */
uint32_t page_kind_generation; /* OUT */
uint32_t sector_layout; /* OUT */
uint32_t supports_sync_fd; /* OUT */
uint32_t supports_semsurf; /* OUT */
};
struct drm_nvidia_prime_fence_context_create_params {
@ -179,6 +205,7 @@ struct drm_nvidia_gem_prime_fence_attach_params {
uint32_t handle; /* IN GEM handle to attach fence to */
uint32_t fence_context_handle; /* IN GEM handle to fence context on which fence is run on */
uint32_t sem_thresh; /* IN Semaphore value to reach before signal */
uint32_t __pad;
};
struct drm_nvidia_get_client_capability_params {
@ -190,6 +217,8 @@ struct drm_nvidia_get_client_capability_params {
struct drm_nvidia_crtc_crc32 {
uint32_t value; /* Read value, undefined if supported is false */
uint8_t supported; /* Supported boolean, true if readable by hardware */
uint8_t __pad0;
uint16_t __pad1;
};
struct drm_nvidia_crtc_crc32_v2_out {
@ -229,10 +258,11 @@ struct drm_nvidia_gem_alloc_nvkms_memory_params {
uint32_t handle; /* OUT */
uint8_t block_linear; /* IN */
uint8_t compressible; /* IN/OUT */
uint16_t __pad;
uint16_t __pad0;
uint64_t memory_size; /* IN */
uint32_t flags; /* IN */
uint32_t __pad1;
};
struct drm_nvidia_gem_export_dmabuf_memory_params {
@ -266,13 +296,90 @@ struct drm_nvidia_get_connector_id_for_dpy_id_params {
uint32_t connectorId; /* OUT */
};
enum drm_nvidia_permissions_type {
NV_DRM_PERMISSIONS_TYPE_MODESET = 2,
NV_DRM_PERMISSIONS_TYPE_SUB_OWNER = 3
};
struct drm_nvidia_grant_permissions_params {
int32_t fd; /* IN */
uint32_t dpyId; /* IN */
uint32_t type; /* IN */
};
struct drm_nvidia_revoke_permissions_params {
uint32_t dpyId; /* IN */
uint32_t type; /* IN */
};
struct drm_nvidia_semsurf_fence_ctx_create_params {
uint64_t index; /* IN Index of the desired semaphore in the
* fence context's semaphore surface */
/* Params for importing userspace semaphore surface */
uint64_t nvkms_params_ptr; /* IN */
uint64_t nvkms_params_size; /* IN */
uint32_t handle; /* OUT GEM handle to fence context */
uint32_t __pad;
};
struct drm_nvidia_semsurf_fence_create_params {
uint32_t fence_context_handle; /* IN GEM handle to fence context on which
* fence is run on */
uint32_t timeout_value_ms; /* IN Timeout value in ms for the fence
* after which the fence will be signaled
* with its error status set to -ETIMEDOUT.
* Default timeout value is 5000ms */
uint64_t wait_value; /* IN Semaphore value to reach before signal */
int32_t fd; /* OUT sync FD object representing the
* semaphore at the specified index reaching
* a value >= wait_value */
uint32_t __pad;
};
/*
* Note there is no provision for timeouts in this ioctl. The kernel
* documentation asserts timeouts should be handled by fence producers, and
* that waiters should not second-guess their logic, as it is producers rather
* than consumers that have better information when it comes to determining a
* reasonable timeout for a given workload.
*/
struct drm_nvidia_semsurf_fence_wait_params {
uint32_t fence_context_handle; /* IN GEM handle to fence context which will
* be used to wait on the sync FD. Need not
* be the fence context used to create the
* sync FD. */
int32_t fd; /* IN sync FD object to wait on */
uint64_t pre_wait_value; /* IN Wait for the semaphore represented by
* fence_context to reach this value before
* waiting for the sync file. */
uint64_t post_wait_value; /* IN Signal the semaphore represented by
* fence_context to this value after waiting
* for the sync file */
};
struct drm_nvidia_semsurf_fence_attach_params {
uint32_t handle; /* IN GEM handle of buffer */
uint32_t fence_context_handle; /* IN GEM handle of fence context */
uint32_t timeout_value_ms; /* IN Timeout value in ms for the fence
* after which the fence will be signaled
* with its error status set to -ETIMEDOUT.
* Default timeout value is 5000ms */
uint32_t shared; /* IN If true, fence will reserve shared
* access to the buffer, otherwise it will
* reserve exclusive access */
uint64_t wait_value; /* IN Semaphore value to reach before signal */
};
#endif /* _UAPI_NVIDIA_DRM_IOCTL_H_ */
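For illustration, a userspace client might drive the new semaphore-surface ioctls roughly as follows. This is a sketch only: it assumes a DRM file descriptor and a fence-context GEM handle obtained elsewhere (the context-create step needs driver-specific NVKMS parameters and is omitted), uses drmIoctl() from libdrm, and picks arbitrary semaphore values.

/* Hypothetical userspace sequence; drm_fd and ctx_handle come from elsewhere. */
#include <stdint.h>
#include <xf86drm.h>            /* drmIoctl() from libdrm */
#include "nvidia-drm-ioctl.h"   /* the uAPI header above */

static int example_fence_create_and_wait(int drm_fd, uint32_t ctx_handle)
{
    struct drm_nvidia_semsurf_fence_create_params create = {
        .fence_context_handle = ctx_handle,
        .timeout_value_ms = 5000,   /* matches the documented default */
        .wait_value = 100,          /* sync FD signals once semaphore >= 100 */
    };
    struct drm_nvidia_semsurf_fence_wait_params wait;

    if (drmIoctl(drm_fd, DRM_IOCTL_NVIDIA_SEMSURF_FENCE_CREATE, &create) != 0)
        return -1;

    /*
     * Ask the kernel to: wait for the context's semaphore to reach
     * pre_wait_value, then wait for the sync FD, then signal the semaphore
     * to post_wait_value. pre_wait_value must be < post_wait_value.
     */
    wait.fence_context_handle = ctx_handle;
    wait.fd = create.fd;            /* OUT parameter of the create ioctl */
    wait.pre_wait_value = 101;
    wait.post_wait_value = 102;

    return drmIoctl(drm_fd, DRM_IOCTL_NVIDIA_SEMSURF_FENCE_WAIT, &wait);
}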

View File

@ -35,7 +35,13 @@
#include <drm/drmP.h>
#endif
#if defined(NV_LINUX_SYNC_FILE_H_PRESENT)
#include <linux/file.h>
#include <linux/sync_file.h>
#endif
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include "nv-mm.h"
@ -45,6 +51,14 @@ MODULE_PARM_DESC(
bool nv_drm_modeset_module_param = false;
module_param_named(modeset, nv_drm_modeset_module_param, bool, 0400);
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
MODULE_PARM_DESC(
fbdev,
"Create a framebuffer device (1 = enable, 0 = disable (default)) (EXPERIMENTAL)");
bool nv_drm_fbdev_module_param = false;
module_param_named(fbdev, nv_drm_fbdev_module_param, bool, 0400);
#endif
void *nv_drm_calloc(size_t nmemb, size_t size)
{
size_t total_size = nmemb * size;
@ -81,14 +95,10 @@ char *nv_drm_asprintf(const char *fmt, ...)
#if defined(NVCPU_X86) || defined(NVCPU_X86_64)
#define WRITE_COMBINE_FLUSH() asm volatile("sfence":::"memory")
#elif defined(NVCPU_FAMILY_ARM)
#if defined(NVCPU_ARM)
#define WRITE_COMBINE_FLUSH() { dsb(); outer_sync(); }
#elif defined(NVCPU_AARCH64)
#define WRITE_COMBINE_FLUSH() mb()
#endif
#elif defined(NVCPU_PPC64LE)
#define WRITE_COMBINE_FLUSH() asm volatile("sync":::"memory")
#else
#define WRITE_COMBINE_FLUSH() mb()
#endif
void nv_drm_write_combine_flush(void)
@ -160,6 +170,122 @@ void nv_drm_vunmap(void *address)
vunmap(address);
}
bool nv_drm_workthread_init(nv_drm_workthread *worker, const char *name)
{
worker->shutting_down = false;
if (nv_kthread_q_init(&worker->q, name)) {
return false;
}
spin_lock_init(&worker->lock);
return true;
}
void nv_drm_workthread_shutdown(nv_drm_workthread *worker)
{
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
worker->shutting_down = true;
spin_unlock_irqrestore(&worker->lock, flags);
nv_kthread_q_stop(&worker->q);
}
void nv_drm_workthread_work_init(nv_drm_work *work,
void (*callback)(void *),
void *arg)
{
nv_kthread_q_item_init(work, callback, arg);
}
int nv_drm_workthread_add_work(nv_drm_workthread *worker, nv_drm_work *work)
{
unsigned long flags;
int ret = 0;
spin_lock_irqsave(&worker->lock, flags);
if (!worker->shutting_down) {
ret = nv_kthread_q_schedule_q_item(&worker->q, work);
}
spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
void nv_drm_timer_setup(nv_drm_timer *timer, void (*callback)(nv_drm_timer *nv_drm_timer))
{
nv_timer_setup(timer, callback);
}
void nv_drm_mod_timer(nv_drm_timer *timer, unsigned long timeout_native)
{
mod_timer(&timer->kernel_timer, timeout_native);
}
unsigned long nv_drm_timer_now(void)
{
return jiffies;
}
unsigned long nv_drm_timeout_from_ms(NvU64 relative_timeout_ms)
{
return jiffies + msecs_to_jiffies(relative_timeout_ms);
}
bool nv_drm_del_timer_sync(nv_drm_timer *timer)
{
if (del_timer_sync(&timer->kernel_timer)) {
return true;
} else {
return false;
}
}
#if defined(NV_DRM_FENCE_AVAILABLE)
int nv_drm_create_sync_file(nv_dma_fence_t *fence)
{
#if defined(NV_LINUX_SYNC_FILE_H_PRESENT)
struct sync_file *sync;
int fd = get_unused_fd_flags(O_CLOEXEC);
if (fd < 0) {
return fd;
}
/* sync_file_create() generates its own reference to the fence */
sync = sync_file_create(fence);
if (IS_ERR(sync)) {
put_unused_fd(fd);
return PTR_ERR(sync);
}
fd_install(fd, sync->file);
return fd;
#else /* defined(NV_LINUX_SYNC_FILE_H_PRESENT) */
return -EINVAL;
#endif /* defined(NV_LINUX_SYNC_FILE_H_PRESENT) */
}
nv_dma_fence_t *nv_drm_sync_file_get_fence(int fd)
{
#if defined(NV_SYNC_FILE_GET_FENCE_PRESENT)
return sync_file_get_fence(fd);
#else /* defined(NV_SYNC_FILE_GET_FENCE_PRESENT) */
return NULL;
#endif /* defined(NV_SYNC_FILE_GET_FENCE_PRESENT) */
}
#endif /* defined(NV_DRM_FENCE_AVAILABLE) */
void nv_drm_yield(void)
{
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(1);
}
#endif /* NV_DRM_AVAILABLE */
/*************************************************************************

View File

@ -237,6 +237,14 @@ nv_drm_atomic_apply_modeset_config(struct drm_device *dev,
int i;
int ret;
/*
* If sub-owner permission was granted to another NVKMS client, disallow
* modesets through the DRM interface.
*/
if (nv_dev->subOwnershipGranted) {
return -EINVAL;
}
memset(requested_config, 0, sizeof(*requested_config));
/* Loop over affected crtcs and construct NvKmsKapiRequestedModeSetConfig */
@ -274,9 +282,6 @@ nv_drm_atomic_apply_modeset_config(struct drm_device *dev,
nv_new_crtc_state->nv_flip = NULL;
}
#if defined(NV_DRM_CRTC_STATE_HAS_VRR_ENABLED)
requested_config->headRequestedConfig[nv_crtc->head].modeSetConfig.vrrEnabled = new_crtc_state->vrr_enabled;
#endif
}
}
@ -292,7 +297,9 @@ nv_drm_atomic_apply_modeset_config(struct drm_device *dev,
requested_config,
&reply_config,
commit)) {
return -EINVAL;
if (commit || reply_config.flipResult != NV_KMS_FLIP_RESULT_IN_PROGRESS) {
return -EINVAL;
}
}
if (commit && nv_dev->supportsSyncpts) {
@ -388,42 +395,56 @@ int nv_drm_atomic_commit(struct drm_device *dev,
struct nv_drm_device *nv_dev = to_nv_device(dev);
/*
* drm_mode_config_funcs::atomic_commit() mandates to return -EBUSY
* for nonblocking commit if previous updates (commit tasks/flip event) are
* pending. In case of blocking commits it mandates to wait for previous
* updates to complete.
* XXX: drm_mode_config_funcs::atomic_commit() mandates to return -EBUSY
* for nonblocking commit if the commit would need to wait for previous
* updates (commit tasks/flip event) to complete. In case of blocking
* commits it mandates to wait for previous updates to complete. However,
* the kernel DRM-KMS documentation does explicitly allow maintaining a
* queue of outstanding commits.
*
* Our system already implements such a queue, but due to
* bug 4054608, it is currently not used.
*/
if (nonblock) {
nv_drm_for_each_crtc_in_state(state, crtc, crtc_state, i) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
nv_drm_for_each_crtc_in_state(state, crtc, crtc_state, i) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
/*
* Here you aren't required to hold nv_drm_crtc::flip_list_lock
* because:
*
* The core DRM driver acquires lock for all affected crtcs before
* calling into ->commit() hook, therefore it is not possible for
* other threads to call into ->commit() hook affecting same crtcs
* and enqueue flip objects into flip_list -
*
* nv_drm_atomic_commit_internal()
* |-> nv_drm_atomic_apply_modeset_config(commit=true)
* |-> nv_drm_crtc_enqueue_flip()
*
* Only possibility is list_empty check races with code path
* dequeuing flip object -
*
* __nv_drm_handle_flip_event()
* |-> nv_drm_crtc_dequeue_flip()
*
* But this race condition can't lead list_empty() to return
* incorrect result. nv_drm_crtc_dequeue_flip() in the middle of
* updating the list could not trick us into thinking the list is
* empty when it isn't.
*/
/*
* Here you aren't required to hold nv_drm_crtc::flip_list_lock
* because:
*
* The core DRM driver acquires lock for all affected crtcs before
* calling into ->commit() hook, therefore it is not possible for
* other threads to call into ->commit() hook affecting same crtcs
* and enqueue flip objects into flip_list -
*
* nv_drm_atomic_commit_internal()
* |-> nv_drm_atomic_apply_modeset_config(commit=true)
* |-> nv_drm_crtc_enqueue_flip()
*
* Only possibility is list_empty check races with code path
* dequeuing flip object -
*
* __nv_drm_handle_flip_event()
* |-> nv_drm_crtc_dequeue_flip()
*
* But this race condition can't lead list_empty() to return
* incorrect result. nv_drm_crtc_dequeue_flip() in the middle of
* updating the list could not trick us into thinking the list is
* empty when it isn't.
*/
if (nonblock) {
if (!list_empty(&nv_crtc->flip_list)) {
return -EBUSY;
}
} else {
if (wait_event_timeout(
nv_dev->flip_event_wq,
list_empty(&nv_crtc->flip_list),
3 * HZ /* 3 second */) == 0) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Flip event timeout on head %u", nv_crtc->head);
}
}
}
@ -467,6 +488,7 @@ int nv_drm_atomic_commit(struct drm_device *dev,
goto done;
}
nv_dev->drmMasterChangedSinceLastAtomicCommit = NV_FALSE;
nv_drm_for_each_crtc_in_state(state, crtc, crtc_state, i) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);

View File

@ -29,10 +29,47 @@
#if defined(NV_DRM_AVAILABLE)
#if defined(NV_DRM_FENCE_AVAILABLE)
#include "nvidia-dma-fence-helper.h"
#endif
#if defined(NV_LINUX)
#include "nv-kthread-q.h"
#include "linux/spinlock.h"
typedef struct nv_drm_workthread {
spinlock_t lock;
struct nv_kthread_q q;
bool shutting_down;
} nv_drm_workthread;
typedef nv_kthread_q_item_t nv_drm_work;
#else /* defined(NV_LINUX) */
#error "Need to define deferred work primitives for this OS"
#endif /* else defined(NV_LINUX) */
#if defined(NV_LINUX)
#include "nv-timer.h"
typedef struct nv_timer nv_drm_timer;
#else /* defined(NV_LINUX) */
#error "Need to define kernel timer callback primitives for this OS"
#endif /* else defined(NV_LINUX) */
#if defined(NV_DRM_FBDEV_GENERIC_SETUP_PRESENT) && defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT)
#define NV_DRM_FBDEV_GENERIC_AVAILABLE
#endif
struct page;
/* Set to true when the atomic modeset feature is enabled. */
extern bool nv_drm_modeset_module_param;
#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
/* Set to true when the nvidia-drm driver should install a framebuffer device */
extern bool nv_drm_fbdev_module_param;
#endif
void *nv_drm_calloc(size_t nmemb, size_t size);
@ -51,6 +88,37 @@ void *nv_drm_vmap(struct page **pages, unsigned long pages_count);
void nv_drm_vunmap(void *address);
#endif
bool nv_drm_workthread_init(nv_drm_workthread *worker, const char *name);
/* Can be called concurrently with nv_drm_workthread_add_work() */
void nv_drm_workthread_shutdown(nv_drm_workthread *worker);
void nv_drm_workthread_work_init(nv_drm_work *work,
void (*callback)(void *),
void *arg);
/* Can be called concurrently with nv_drm_workthread_shutdown() */
int nv_drm_workthread_add_work(nv_drm_workthread *worker, nv_drm_work *work);
void nv_drm_timer_setup(nv_drm_timer *timer,
void (*callback)(nv_drm_timer *nv_drm_timer));
void nv_drm_mod_timer(nv_drm_timer *timer, unsigned long relative_timeout_ms);
bool nv_drm_del_timer_sync(nv_drm_timer *timer);
unsigned long nv_drm_timer_now(void);
unsigned long nv_drm_timeout_from_ms(NvU64 relative_timeout_ms);
#if defined(NV_DRM_FENCE_AVAILABLE)
int nv_drm_create_sync_file(nv_dma_fence_t *fence);
nv_dma_fence_t *nv_drm_sync_file_get_fence(int fd);
#endif /* defined(NV_DRM_FENCE_AVAILABLE) */
void nv_drm_yield(void);
#endif /* defined(NV_DRM_AVAILABLE) */
#endif /* __NVIDIA_DRM_OS_INTERFACE_H__ */
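Taken together, the deferred-work declarations above are intended to be used in an init/queue/shutdown life cycle. A hypothetical sketch follows; the example_* names are not from this tree and error handling is reduced to the essentials:

/* Illustrative life cycle of the nv_drm_workthread helpers declared above. */
#include "nvidia-drm-os-interface.h"

struct example_ctx {
    nv_drm_workthread worker;
    nv_drm_work work;
};

static void example_work_cb(void *arg)
{
    /* Runs on the dedicated kthread queue, outside interrupt context. */
}

static bool example_start(struct example_ctx *ctx)
{
    if (!nv_drm_workthread_init(&ctx->worker, "example-worker"))
        return false;

    nv_drm_workthread_work_init(&ctx->work, example_work_cb, ctx);

    /*
     * add_work returns nonzero if the item was queued, and zero if the worker
     * is already shutting down or the item is still pending; in that case the
     * caller must clean up on its own (compare the fence-wait callback above).
     */
    if (!nv_drm_workthread_add_work(&ctx->worker, &ctx->work)) {
        nv_drm_workthread_shutdown(&ctx->worker);
        return false;
    }

    return true;
}

static void example_stop(struct example_ctx *ctx)
{
    /* Stops the queue and prevents any further scheduling. */
    nv_drm_workthread_shutdown(&ctx->worker);
}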

View File

@ -46,12 +46,33 @@
#define NV_DRM_LOG_ERR(__fmt, ...) \
DRM_ERROR("[nvidia-drm] " __fmt "\n", ##__VA_ARGS__)
/*
* DRM_WARN() was added in v4.9 by kernel commit
* 30b0da8d556e65ff935a56cd82c05ba0516d3e4a
*
* Before this commit, only DRM_INFO and DRM_ERROR were defined and
* DRM_INFO(fmt, ...) was defined as
* printk(KERN_INFO "[" DRM_NAME "] " fmt, ##__VA_ARGS__). So, if
* DRM_WARN is undefined this defines NV_DRM_LOG_WARN following the
* same pattern as DRM_INFO.
*/
#ifdef DRM_WARN
#define NV_DRM_LOG_WARN(__fmt, ...) \
DRM_WARN("[nvidia-drm] " __fmt "\n", ##__VA_ARGS__)
#else
#define NV_DRM_LOG_WARN(__fmt, ...) \
printk(KERN_WARNING "[" DRM_NAME "] [nvidia-drm] " __fmt "\n", ##__VA_ARGS__)
#endif
#define NV_DRM_LOG_INFO(__fmt, ...) \
DRM_INFO("[nvidia-drm] " __fmt "\n", ##__VA_ARGS__)
#define NV_DRM_DEV_LOG_INFO(__dev, __fmt, ...) \
NV_DRM_LOG_INFO("[GPU ID 0x%08x] " __fmt, __dev->gpu_info.gpu_id, ##__VA_ARGS__)
#define NV_DRM_DEV_LOG_WARN(__dev, __fmt, ...) \
NV_DRM_LOG_WARN("[GPU ID 0x%08x] " __fmt, __dev->gpu_info.gpu_id, ##__VA_ARGS__)
#define NV_DRM_DEV_LOG_ERR(__dev, __fmt, ...) \
NV_DRM_LOG_ERR("[GPU ID 0x%08x] " __fmt, __dev->gpu_info.gpu_id, ##__VA_ARGS__)
@ -117,9 +138,26 @@ struct nv_drm_device {
#endif
#if defined(NV_DRM_FENCE_AVAILABLE)
NvU64 semsurf_stride;
NvU64 semsurf_max_submitted_offset;
#endif
NvBool hasVideoMemory;
NvBool supportsSyncpts;
NvBool subOwnershipGranted;
NvBool hasFramebufferConsole;
/**
* @drmMasterChangedSinceLastAtomicCommit:
*
* This flag is set in nv_drm_master_set and reset after a completed atomic
* commit. It is used to restore or recommit state that is lost by the
* NvKms modeset owner change, such as the CRTC color management
* properties.
*/
NvBool drmMasterChangedSinceLastAtomicCommit;
struct drm_property *nv_out_fence_property;
struct drm_property *nv_input_colorspace_property;

View File

@ -19,6 +19,7 @@ NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-modeset.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-fence.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-linux.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-helper.c
NVIDIA_DRM_SOURCES += nvidia-drm/nv-kthread-q.c
NVIDIA_DRM_SOURCES += nvidia-drm/nv-pci-table.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-gem-nvkms-memory.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-gem-user-memory.c
@ -79,6 +80,17 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_rotation_available
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_vma_offset_exact_lookup_locked
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_gem_object_put_unlocked
NV_CONFTEST_FUNCTION_COMPILE_TESTS += nvhost_dma_fence_unpack
NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += timer_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += dma_fence_set_error
NV_CONFTEST_FUNCTION_COMPILE_TESTS += fence_set_error
NV_CONFTEST_FUNCTION_COMPILE_TESTS += sync_file_get_fence
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_fbdev_generic_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_connector_attach_hdr_output_metadata_property
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_helper_crtc_enable_color_mgmt
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_crtc_enable_color_mgmt
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_atomic_helper_legacy_gamma_set
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_bus_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_bus_has_bus_type
@ -133,3 +145,6 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_lookup
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_put
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
NV_CONFTEST_TYPE_COMPILE_TESTS += fence_ops_use_64bit_seqno
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers_has_driver_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg

View File

@ -247,6 +247,11 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferr
return 0;
}
int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname)
{
return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE);
}
// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2015-21 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2015-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -65,9 +65,15 @@
static bool output_rounding_fix = true;
module_param_named(output_rounding_fix, output_rounding_fix, bool, 0400);
static bool disable_hdmi_frl = false;
module_param_named(disable_hdmi_frl, disable_hdmi_frl, bool, 0400);
static bool disable_vrr_memclk_switch = false;
module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);
static bool hdmi_deepcolor = false;
module_param_named(hdmi_deepcolor, hdmi_deepcolor, bool, 0400);
/* These parameters are used for fault injection tests. Normally the defaults
* should be used. */
MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");
@ -78,6 +84,7 @@ MODULE_PARM_DESC(malloc_verbose, "Report information about malloc calls on modul
static bool malloc_verbose = false;
module_param_named(malloc_verbose, malloc_verbose, bool, 0400);
#if NVKMS_CONFIG_FILE_SUPPORTED
/* This parameter is used to find the dpy override conf file */
#define NVKMS_CONF_FILE_SPECIFIED (nvkms_conf != NULL)
@ -86,6 +93,7 @@ MODULE_PARM_DESC(config_file,
"(default: disabled)");
static char *nvkms_conf = NULL;
module_param_named(config_file, nvkms_conf, charp, 0400);
#endif
static atomic_t nvkms_alloc_called_count;
@ -94,11 +102,21 @@ NvBool nvkms_output_rounding_fix(void)
return output_rounding_fix;
}
NvBool nvkms_disable_hdmi_frl(void)
{
return disable_hdmi_frl;
}
NvBool nvkms_disable_vrr_memclk_switch(void)
{
return disable_vrr_memclk_switch;
}
NvBool nvkms_hdmi_deepcolor(void)
{
return hdmi_deepcolor;
}
#define NVKMS_SYNCPT_STUBS_NEEDED
/*************************************************************************
@ -335,7 +353,7 @@ NvU64 nvkms_get_usec(void)
struct timespec64 ts;
NvU64 ns;
ktime_get_real_ts64(&ts);
ktime_get_raw_ts64(&ts);
ns = timespec64_to_ns(&ts);
return ns / 1000;
@ -1382,6 +1400,7 @@ static void nvkms_proc_exit(void)
/*************************************************************************
* NVKMS Config File Read
************************************************************************/
#if NVKMS_CONFIG_FILE_SUPPORTED
static NvBool nvkms_fs_mounted(void)
{
return current->fs != NULL;
@ -1489,6 +1508,11 @@ static void nvkms_read_config_file_locked(void)
nvkms_free(buffer, buf_size);
}
#else
static void nvkms_read_config_file_locked(void)
{
}
#endif
/*************************************************************************
* NVKMS KAPI functions

View File

@ -97,8 +97,9 @@ typedef struct {
} NvKmsSyncPtOpParams;
NvBool nvkms_output_rounding_fix(void);
NvBool nvkms_disable_hdmi_frl(void);
NvBool nvkms_disable_vrr_memclk_switch(void);
NvBool nvkms_hdmi_deepcolor(void);
void nvkms_call_rm (void *ops);
void* nvkms_alloc (size_t size,

View File

@ -58,6 +58,18 @@ nvidia-modeset-y += $(NVIDIA_MODESET_BINARY_OBJECT_O)
NVIDIA_MODESET_CFLAGS += -I$(src)/nvidia-modeset
NVIDIA_MODESET_CFLAGS += -UDEBUG -U_DEBUG -DNDEBUG -DNV_BUILD_MODULE_INSTANCES=0
# Some Android kernels prohibit driver use of filesystem functions like
# filp_open() and kernel_read(). Disable the NVKMS_CONFIG_FILE_SUPPORTED
# functionality that uses those functions when building for Android.
PLATFORM_IS_ANDROID ?= 0
ifeq ($(PLATFORM_IS_ANDROID),1)
NVIDIA_MODESET_CFLAGS += -DNVKMS_CONFIG_FILE_SUPPORTED=0
else
NVIDIA_MODESET_CFLAGS += -DNVKMS_CONFIG_FILE_SUPPORTED=1
endif
$(call ASSIGN_PER_OBJ_CFLAGS, $(NVIDIA_MODESET_OBJECTS), $(NVIDIA_MODESET_CFLAGS))

View File

@ -66,6 +66,8 @@ enum NvKmsClientType {
NVKMS_CLIENT_KERNEL_SPACE,
};
struct NvKmsPerOpenDev;
NvBool nvKmsIoctl(
void *pOpenVoid,
NvU32 cmd,
@ -104,4 +106,6 @@ NvBool nvKmsKapiGetFunctionsTableInternal
NvBool nvKmsGetBacklight(NvU32 display_id, void *drv_priv, NvU32 *brightness);
NvBool nvKmsSetBacklight(NvU32 display_id, void *drv_priv, NvU32 brightness);
NvBool nvKmsOpenDevHasSubOwnerPermissionOrBetter(const struct NvKmsPerOpenDev *pOpenDev);
#endif /* __NV_KMS_H__ */

View File

@ -249,8 +249,8 @@ static int nv_dma_map(struct sg_table *sg_head, void *context,
nv_mem_context->sg_allocated = 1;
for_each_sg(sg_head->sgl, sg, nv_mem_context->npages, i) {
sg_set_page(sg, NULL, nv_mem_context->page_size, 0);
sg->dma_address = dma_mapping->dma_addresses[i];
sg->dma_length = nv_mem_context->page_size;
sg_dma_address(sg) = dma_mapping->dma_addresses[i];
sg_dma_len(sg) = nv_mem_context->page_size;
}
nv_mem_context->sg_head = *sg_head;
*nmap = nv_mem_context->npages;
@ -304,8 +304,13 @@ static void nv_mem_put_pages_common(int nc,
return;
if (nc) {
#ifdef NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API
ret = nvidia_p2p_put_pages_persistent(nv_mem_context->page_virt_start,
nv_mem_context->page_table, 0);
#else
ret = nvidia_p2p_put_pages(0, 0, nv_mem_context->page_virt_start,
nv_mem_context->page_table);
#endif
} else {
ret = nvidia_p2p_put_pages(0, 0, nv_mem_context->page_virt_start,
nv_mem_context->page_table);
@ -412,9 +417,15 @@ static int nv_mem_get_pages_nc(unsigned long addr,
nv_mem_context->core_context = core_context;
nv_mem_context->page_size = GPU_PAGE_SIZE;
#ifdef NVIDIA_P2P_CAP_GET_PAGES_PERSISTENT_API
ret = nvidia_p2p_get_pages_persistent(nv_mem_context->page_virt_start,
nv_mem_context->mapped_size,
&nv_mem_context->page_table, 0);
#else
ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
&nv_mem_context->page_table, NULL, NULL);
#endif
if (ret < 0) {
peer_err("error %d while calling nvidia_p2p_get_pages() with NULL callback\n", ret);
return ret;
@ -459,8 +470,6 @@ static int __init nv_mem_client_init(void)
}
#if defined (NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
int status = 0;
// off by one, to leave space for the trailing '1' which is flagging
// the new client type
BUG_ON(strlen(DRV_NAME) > IB_PEER_MEMORY_NAME_MAX-1);
@ -489,7 +498,7 @@ static int __init nv_mem_client_init(void)
&mem_invalidate_callback);
if (!reg_handle) {
peer_err("nv_mem_client_init -- error while registering traditional client\n");
status = -EINVAL;
rc = -EINVAL;
goto out;
}
@ -499,12 +508,12 @@ static int __init nv_mem_client_init(void)
reg_handle_nc = ib_register_peer_memory_client(&nv_mem_client_nc, NULL);
if (!reg_handle_nc) {
peer_err("nv_mem_client_init -- error while registering nc client\n");
status = -EINVAL;
rc = -EINVAL;
goto out;
}
out:
if (status) {
if (rc) {
if (reg_handle) {
ib_unregister_peer_memory_client(reg_handle);
reg_handle = NULL;
@ -516,7 +525,7 @@ out:
}
}
return status;
return rc;
#else
return -EINVAL;
#endif

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2022 NVIDIA Corporation
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -247,6 +247,11 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferr
return 0;
}
int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname)
{
return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE);
}
// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)

View File

@ -27,6 +27,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rm_mem.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_channel.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_lock.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hal.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_processors.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rb_tree.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_allocator.c

View File

@ -82,10 +82,12 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vm_fault_to_errno
NV_CONFTEST_FUNCTION_COMPILE_TESTS += find_next_bit_wrap
NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
@ -99,6 +101,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
@ -113,4 +116,3 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup

View File

@ -24,11 +24,11 @@
#include "nvstatus.h"
#if !defined(NV_PRINTF_STRING_SECTION)
#if defined(NVRM) && NVCPU_IS_RISCV64
#if defined(NVRM) && NVOS_IS_LIBOS
#define NV_PRINTF_STRING_SECTION __attribute__ ((section (".logging")))
#else // defined(NVRM) && NVCPU_IS_RISCV64
#else // defined(NVRM) && NVOS_IS_LIBOS
#define NV_PRINTF_STRING_SECTION
#endif // defined(NVRM) && NVCPU_IS_RISCV64
#endif // defined(NVRM) && NVOS_IS_LIBOS
#endif // !defined(NV_PRINTF_STRING_SECTION)
/*

View File

@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
if (current->mm != NULL)

View File

@ -216,6 +216,10 @@ NV_STATUS UvmDeinitialize(void);
// Note that it is not required to release VA ranges that were reserved with
// UvmReserveVa().
//
// This is useful for per-process checkpoint and restore, where kernel-mode
// state needs to be reconfigured to match the expectations of a pre-existing
// user-mode process.
//
// UvmReopen() closes the open file returned by UvmGetFileDescriptor() and
// replaces it with a new open file with the same name.
//

View File

@ -114,6 +114,8 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
{
uvm_ats_fault_invalidate_t *ats_invalidate;
uvm_ats_smmu_invalidate_tlbs(gpu_va_space, addr, size);
if (client_type == UVM_FAULT_CLIENT_TYPE_GPC)
ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.replayable.ats_invalidate;
else
@ -588,4 +590,3 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
return status;
}

View File

@ -29,8 +29,12 @@
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include <asm/io.h>
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
// linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
// reference required for the iommu_sva_bind_device() call. This header is not
@ -46,17 +50,271 @@
#define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
#endif
// Base address of SMMU CMDQ-V for GSMMU0.
#define SMMU_CMDQV_BASE_ADDR(smmu_base) (smmu_base + 0x200000)
#define SMMU_CMDQV_BASE_LEN 0x00830000
// CMDQV configuration is done by firmware but we check status here.
#define SMMU_CMDQV_CONFIG 0x0
#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
// Used to map a particular VCMDQ to a VINTF.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
// Shift for the field containing the index of the virtual interface
// owning the VCMDQ.
#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
// Base address for the VINTF registers.
#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) (cmdqv_base_addr + 0x1000 + 0x100 * (vintf_id))
// Virtual interface (VINTF) configuration registers. The WAR only
// works on baremetal so we need to configure ourselves as the
// hypervisor owner.
#define SMMU_VINTF_CONFIG 0x0
#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
#define SMMU_VINTF_STATUS 0x0
#define SMMU_VINTF_STATUS_ENABLED BIT(0)
// Calculates the base address for a particular VCMDQ instance.
#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) (cmdqv_base_addr + 0x10000 + 0x80 * (vcmdq_id))
// SMMU command queue consumer index register. Updated by SMMU
// when commands are consumed.
#define SMMU_VCMDQ_CONS 0x0
// SMMU command queue producer index register. Updated by UVM when
// commands are added to the queue.
#define SMMU_VCMDQ_PROD 0x4
// Configuration register used to enable a VCMDQ.
#define SMMU_VCMDQ_CONFIG 0x8
#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
// Status register used to check the VCMDQ is enabled.
#define SMMU_VCMDQ_STATUS 0xc
#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
// Base address offset for the VCMDQ registers.
#define SMMU_VCMDQ_CMDQ_BASE 0x10000
// Size of the command queue. Each command is 8 bytes and we can't
// have a command queue greater than one page.
#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE 9
#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
// We always use VINTF63 for the WAR
#define VINTF 63
static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg);
}
// We always use VCMDQ127 for the WAR
#define VCMDQ 127
void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
{
iowrite32(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
{
return ioread32(SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
{
iowrite64(val, SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg);
}
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
{
uvm_spin_loop_t spin;
NV_STATUS status;
unsigned long cmdqv_config;
void __iomem *smmu_cmdqv_base;
struct acpi_iort_node *node;
struct acpi_iort_smmu_v3 *iort_smmu;
node = *(struct acpi_iort_node **) dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
if (!smmu_cmdqv_base)
return NV_ERR_NO_MEMORY;
parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
status = NV_ERR_OBJECT_NOT_FOUND;
goto out;
}
// Allocate SMMU CMDQ pages for WAR
parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
if (!parent_gpu->smmu_war.smmu_cmdq) {
status = NV_ERR_NO_MEMORY;
goto out;
}
// Initialise VINTF for the WAR
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
// Allocate VCMDQ to VINTF
iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
BUILD_BUG_ON((SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 3) > PAGE_SHIFT);
smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
parent_gpu->smmu_war.smmu_prod = 0;
parent_gpu->smmu_war.smmu_cons = 0;
return NV_OK;
out:
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
return status;
}
static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
{
void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
NvU32 cmdq_alloc_map;
if (parent_gpu->smmu_war.smmu_cmdqv_base) {
smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
iowrite32(cmdq_alloc_map & SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC, smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
}
if (parent_gpu->smmu_war.smmu_cmdq)
__free_page(parent_gpu->smmu_war.smmu_cmdq);
if (parent_gpu->smmu_war.smmu_cmdqv_base)
iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
}
// The SMMU on ARM64 can run under different translation regimes depending on
// what features the OS and CPU variant support. The CPU for GH180 supports
// virtualisation extensions and starts the kernel at EL2 meaning SMMU operates
// under the NS-EL2-E2H translation regime. Therefore we need to use the
// TLBI_EL2_* commands which invalidate TLB entries created under this
// translation regime.
#define CMDQ_OP_TLBI_EL2_ASID 0x21;
#define CMDQ_OP_TLBI_EL2_VA 0x22;
#define CMDQ_OP_CMD_SYNC 0x46
// Use the same maximum as used for MAX_TLBI_OPS in the upstream
// kernel.
#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
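// With 4KiB base pages (PAGE_SHIFT == 12) this evaluates to 512 operations;
// larger ranges fall back to the full-ASID invalidate below.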
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
struct {
NvU64 low;
NvU64 high;
} *vcmdq;
unsigned long vcmdq_prod;
NvU64 end;
uvm_spin_loop_t spin;
NvU16 asid;
if (!parent_gpu->smmu_war.smmu_cmdqv_base)
return;
asid = arm64_mm_context_get(mm);
vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
// Our queue management is very simple. The mutex prevents multiple
// producers writing to the queue and all our commands require waiting for
// the queue to drain so we know it's empty. If we can't fit enough commands
// in the queue we just invalidate the whole ASID.
//
// The command queue is a circular buffer with the MSB representing a wrap
// bit that must toggle on each wrap. See the SMMU architecture
// specification for more details.
//
// SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
// CMD_SYNC.
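//
// Concretely, with SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE == 9 the queue has 512
// slots, producer/consumer indices are kept modulo 1024, and bit 9 acts as
// the wrap bit that toggles on every pass over the 512-entry buffer.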
if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_ASID;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0;
vcmdq_prod++;
}
else {
for (end = addr + size; addr < end; addr += PAGE_SIZE) {
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_TLBI_EL2_VA;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low |= (NvU64) asid << 48;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = addr & ~((1UL << 12) - 1);
vcmdq_prod++;
}
}
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].low = CMDQ_OP_CMD_SYNC;
vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES].high = 0x0;
vcmdq_prod++;
// MSB is the wrap bit
vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
UVM_SPIN_WHILE(
(smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
&spin);
uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
kunmap(parent_gpu->smmu_war.smmu_cmdq);
arm64_mm_context_put(mm);
}
#endif
NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
int ret;
ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
if (ret)
return errno_to_nv_status(ret);
return errno_to_nv_status(ret);
if (UVM_ATS_SMMU_WAR_REQUIRED())
return uvm_ats_smmu_war_init(parent_gpu);
else
return NV_OK;
}
void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
{
if (UVM_ATS_SMMU_WAR_REQUIRED())
uvm_ats_smmu_war_deinit(parent_gpu);
iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
}

View File

@ -53,6 +53,17 @@
#define UVM_ATS_SVA_SUPPORTED() 0
#endif
// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined it means the upstream fix is
// in place so no need for the WAR from Bug 4130089: [GH180][r535] WAR for
// kernel not issuing SMMU TLB invalidates on read-only
#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#elif NVCPU_IS_AARCH64
#define UVM_ATS_SMMU_WAR_REQUIRED() 1
#else
#define UVM_ATS_SMMU_WAR_REQUIRED() 0
#endif
typedef struct
{
int placeholder;
@ -81,6 +92,17 @@ typedef struct
// LOCKING: None
void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
// TLB invalidates on read-only to read-write upgrades
#if UVM_ATS_SMMU_WAR_REQUIRED()
void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
#else
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif
#else
static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
{
@ -111,6 +133,11 @@ typedef struct
{
}
static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
{
}
#endif // UVM_ATS_SVA_SUPPORTED
#endif // __UVM_ATS_SVA_H__

View File

@ -2683,7 +2683,7 @@ static void init_channel_manager_conf(uvm_channel_manager_t *manager)
// caches vidmem (and sysmem), we place GPFIFO and GPPUT on sysmem to avoid
// cache thrash. The memory access latency is reduced, despite the required
// access through the bus, because no cache coherence message is exchanged.
if (uvm_gpu_is_coherent(gpu->parent)) {
if (uvm_parent_gpu_is_coherent(gpu->parent)) {
manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_SYS;
// On GPUs with limited ESCHED addressing range, e.g., Volta on P9, RM

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2023 NVIDIA Corporation
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@ -233,18 +233,6 @@ unsigned uvm_get_stale_thread_id(void)
return (unsigned)task_pid_vnr(current);
}
//
// A simple security rule for allowing access to UVM user space memory: if you
// are the same user as the owner of the memory, or if you are root, then you
// are granted access. The idea is to allow debuggers and profilers to work, but
// without opening up any security holes.
//
NvBool uvm_user_id_security_check(uid_t euidTarget)
{
return (NV_CURRENT_EUID() == euidTarget) ||
(UVM_ROOT_UID == euidTarget);
}
void on_uvm_test_fail(void)
{
(void)NULL;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -282,9 +282,6 @@ static inline void kmem_cache_destroy_safe(struct kmem_cache **ppCache)
}
}
static const uid_t UVM_ROOT_UID = 0;
typedef struct
{
NvU64 start_time_ns;
@ -335,7 +332,6 @@ NV_STATUS errno_to_nv_status(int errnoCode);
int nv_status_to_errno(NV_STATUS status);
unsigned uvm_get_stale_process_id(void);
unsigned uvm_get_stale_thread_id(void);
NvBool uvm_user_id_security_check(uid_t euidTarget);
extern int uvm_enable_builtin_tests;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021-2023 NVIDIA Corporation
Copyright (c) 2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -54,26 +54,23 @@ bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu)
return uvm_conf_computing_get_mode(gpu->parent) == UVM_GPU_CONF_COMPUTE_MODE_HCC;
}
void uvm_conf_computing_check_parent_gpu(const uvm_parent_gpu_t *parent)
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent)
{
uvm_gpu_t *first_gpu;
UvmGpuConfComputeMode cc, sys_cc;
uvm_gpu_t *first;
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
// The Confidential Computing state of the GPU should match that of the
// system.
UVM_ASSERT(uvm_conf_computing_mode_enabled_parent(parent) == g_uvm_global.conf_computing_enabled);
// TODO: Bug 2844714: since we have no routine to traverse parent GPUs,
// find first child GPU and get its parent.
first_gpu = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
if (first_gpu == NULL)
return;
first = uvm_global_processor_mask_find_first_gpu(&g_uvm_global.retained_gpus);
if (!first)
return NV_OK;
// All GPUs derive Confidential Computing status from their parent. By
// current policy all parent GPUs have identical Confidential Computing
// status.
UVM_ASSERT(uvm_conf_computing_get_mode(parent) == uvm_conf_computing_get_mode(first_gpu->parent));
sys_cc = uvm_conf_computing_get_mode(first->parent);
cc = uvm_conf_computing_get_mode(parent);
return cc == sys_cc ? NV_OK : NV_ERR_NOT_SUPPORTED;
}
static void dma_buffer_destroy_locked(uvm_conf_computing_dma_buffer_pool_t *dma_buffer_pool,

View File

@ -60,8 +60,10 @@
// UVM_METHOD_SIZE * 2 * 10 = 80.
#define UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE 80
void uvm_conf_computing_check_parent_gpu(const uvm_parent_gpu_t *parent);
// All GPUs derive confidential computing status from their parent.
// By current policy all parent GPUs have identical confidential
// computing status.
NV_STATUS uvm_conf_computing_init_parent_gpu(const uvm_parent_gpu_t *parent);
bool uvm_conf_computing_mode_enabled_parent(const uvm_parent_gpu_t *parent);
bool uvm_conf_computing_mode_enabled(const uvm_gpu_t *gpu);
bool uvm_conf_computing_mode_is_hcc(const uvm_gpu_t *gpu);

View File

@ -71,6 +71,11 @@ static void uvm_unregister_callbacks(void)
}
}
static void sev_init(const UvmPlatformInfo *platform_info)
{
g_uvm_global.sev_enabled = platform_info->sevEnabled;
}
NV_STATUS uvm_global_init(void)
{
NV_STATUS status;
@ -119,7 +124,8 @@ NV_STATUS uvm_global_init(void)
uvm_ats_init(&platform_info);
g_uvm_global.num_simulated_devices = 0;
g_uvm_global.conf_computing_enabled = platform_info.confComputingEnabled;
sev_init(&platform_info);
status = uvm_gpu_init();
if (status != NV_OK) {

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -143,16 +143,11 @@ struct uvm_global_struct
struct page *page;
} unload_state;
// True if the VM has AMD's SEV, or equivalent HW security extensions such
// as Intel's TDX, enabled. The flag is always false on the host.
//
// This value moves in tandem with that of Confidential Computing in the
// GPU(s) in all supported configurations, so it is used as a proxy for the
// Confidential Computing state.
//
// This field is set once during global initialization (uvm_global_init),
// and can be read afterwards without acquiring any locks.
bool conf_computing_enabled;
// AMD Secure Encrypted Virtualization (SEV) status. True if VM has SEV
// enabled. This field is set once during global initialization
// (uvm_global_init), and can be read afterwards without acquiring any
// locks.
bool sev_enabled;
};
// Initialize global uvm state
@ -238,10 +233,8 @@ static uvm_gpu_t *uvm_gpu_get_by_processor_id(uvm_processor_id_t id)
return gpu;
}
static uvmGpuSessionHandle uvm_gpu_session_handle(uvm_gpu_t *gpu)
static uvmGpuSessionHandle uvm_global_session_handle(void)
{
if (gpu->parent->smc.enabled)
return gpu->smc.rm_session_handle;
return g_uvm_global.rm_session_handle;
}

View File

@ -99,8 +99,8 @@ static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_in
parent_gpu->system_bus.link_rate_mbyte_per_s = gpu_info->sysmemLinkRateMBps;
if (gpu_info->systemMemoryWindowSize > 0) {
// memory_window_end is inclusive but uvm_gpu_is_coherent() checks
// memory_window_end > memory_window_start as its condition.
// memory_window_end is inclusive but uvm_parent_gpu_is_coherent()
// checks memory_window_end > memory_window_start as its condition.
UVM_ASSERT(gpu_info->systemMemoryWindowSize > 1);
parent_gpu->system_bus.memory_window_start = gpu_info->systemMemoryWindowStart;
parent_gpu->system_bus.memory_window_end = gpu_info->systemMemoryWindowStart +
@ -136,12 +136,12 @@ static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu)
return status;
if (gpu_caps.numaEnabled) {
UVM_ASSERT(uvm_gpu_is_coherent(gpu->parent));
UVM_ASSERT(uvm_parent_gpu_is_coherent(gpu->parent));
gpu->mem_info.numa.enabled = true;
gpu->mem_info.numa.node_id = gpu_caps.numaNodeId;
}
else {
UVM_ASSERT(!uvm_gpu_is_coherent(gpu->parent));
UVM_ASSERT(!uvm_parent_gpu_is_coherent(gpu->parent));
}
return NV_OK;
@ -1089,7 +1089,7 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
{
NV_STATUS status;
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(g_uvm_global.rm_session_handle,
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
gpu_uuid,
&parent_gpu->rm_device,
@ -1099,7 +1099,12 @@ static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu,
return status;
}
uvm_conf_computing_check_parent_gpu(parent_gpu);
status = uvm_conf_computing_init_parent_gpu(parent_gpu);
if (status != NV_OK) {
UVM_ERR_PRINT("Confidential computing: %s, GPU %s\n",
nvstatusToString(status), parent_gpu->name);
return status;
}
parent_gpu->pci_dev = gpu_platform_info->pci_dev;
parent_gpu->closest_cpu_numa_node = dev_to_node(&parent_gpu->pci_dev->dev);
@ -1161,19 +1166,8 @@ static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info)
{
NV_STATUS status;
// Presently, an RM client can only subscribe to a single partition per
// GPU. Therefore, UVM needs to create several RM clients. For simplicity,
// and since P2P is not supported when SMC partitions are created, we
// create a client (session) per GPU partition.
if (gpu->parent->smc.enabled) {
UvmPlatformInfo platform_info;
status = uvm_rm_locked_call(nvUvmInterfaceSessionCreate(&gpu->smc.rm_session_handle, &platform_info));
if (status != NV_OK) {
UVM_ERR_PRINT("Creating RM session failed: %s\n", nvstatusToString(status));
return status;
}
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_gpu_session_handle(gpu),
status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_global_session_handle(),
gpu_info,
uvm_gpu_uuid(gpu),
&gpu->smc.rm_device,
@ -1543,9 +1537,6 @@ static void deinit_gpu(uvm_gpu_t *gpu)
if (gpu->parent->smc.enabled) {
if (gpu->smc.rm_device != 0)
uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device));
if (gpu->smc.rm_session_handle != 0)
uvm_rm_locked_call_void(nvUvmInterfaceSessionDestroy(gpu->smc.rm_session_handle));
}
gpu->magic = 0;
@ -2575,7 +2566,7 @@ static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1);
uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0);
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_gpu_session_handle(gpu0), p2p_handle));
uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_global_session_handle(), p2p_handle));
UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0);
UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1);
@ -2701,9 +2692,9 @@ uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_p
return id;
}
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id1, const uvm_gpu_id_t gpu_id2)
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1)
{
NvU32 table_index = uvm_gpu_peer_table_index(gpu_id1, gpu_id2);
NvU32 table_index = uvm_gpu_peer_table_index(gpu_id0, gpu_id1);
return &g_uvm_global.peers[table_index];
}
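uvm_gpu_peer_table_index() itself is not shown in this hunk; what matters here is that it maps an unordered pair of GPU ids to a single slot of g_uvm_global.peers, so swapping the two arguments must yield the same entry. A hedged standalone illustration of one common way to index the strict upper triangle of an N x N pair matrix (plain C; the driver's actual formula may differ):

#include <assert.h>
#include <stdio.h>

#define MAX_GPUS 8

// Number of unordered pairs {a, b} with a != b.
#define PAIR_TABLE_SIZE (MAX_GPUS * (MAX_GPUS - 1) / 2)

// Maps an unordered pair of 0-based ids (a != b) to a unique index in
// [0, PAIR_TABLE_SIZE). Symmetric: pair_index(a, b) == pair_index(b, a).
static unsigned pair_index(unsigned a, unsigned b)
{
    unsigned lo = a < b ? a : b;
    unsigned hi = a < b ? b : a;

    assert(a != b && hi < MAX_GPUS);

    // Pairs contributed by rows above 'lo', plus the offset within row 'lo'.
    return lo * (2 * MAX_GPUS - lo - 1) / 2 + (hi - lo - 1);
}

int main(void)
{
    assert(pair_index(2, 5) == pair_index(5, 2));
    assert(pair_index(0, 1) == 0);
    assert(pair_index(MAX_GPUS - 2, MAX_GPUS - 1) == PAIR_TABLE_SIZE - 1);
    printf("index(2,5) = %u\n", pair_index(2, 5));
    return 0;
}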

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -167,7 +167,7 @@ struct uvm_service_block_context_struct
} per_processor_masks[UVM_ID_MAX_PROCESSORS];
// State used by the VA block routines called by the servicing routine
uvm_va_block_context_t block_context;
uvm_va_block_context_t *block_context;
// Prefetch state hint
uvm_perf_prefetch_hint_t prefetch_hint;
@ -263,7 +263,10 @@ struct uvm_fault_service_batch_context_struct
NvU32 num_coalesced_faults;
bool has_fatal_faults;
// One of the VA spaces in this batch which had fatal faults. If NULL, no
// faults were fatal. More than one VA space could have fatal faults, but we
// pick one to be the target of the cancel sequence.
uvm_va_space_t *fatal_va_space;
bool has_throttled_faults;
@ -825,8 +828,6 @@ struct uvm_gpu_struct
{
NvU32 swizz_id;
uvmGpuSessionHandle rm_session_handle;
// RM device handle used in many of the UVM/RM APIs.
//
// Do not read this field directly, use uvm_gpu_device_handle instead.
@ -1162,6 +1163,16 @@ struct uvm_parent_gpu_struct
NvU64 memory_window_start;
NvU64 memory_window_end;
} system_bus;
// WAR to issue ATS TLB invalidation commands ourselves.
struct
{
uvm_mutex_t smmu_lock;
struct page *smmu_cmdq;
void __iomem *smmu_cmdqv_base;
unsigned long smmu_prod;
unsigned long smmu_cons;
} smmu_war;
};
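The new smmu_war block gathers what is needed to drive an SMMU command queue directly: a lock, the page backing the queue, the MMIO base of the command-queue interface, and producer/consumer indices. As a rough model of how such a prod/cons ring is typically driven (plain C, single-threaded stand-in, not the driver's code: the real workaround would write the payload and then publish the producer index through MMIO while holding smmu_lock):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CMDQ_ENTRIES 16u              // ring slots, power of two
#define CMD_SIZE     16u              // bytes per command

static uint8_t  cmdq[CMDQ_ENTRIES][CMD_SIZE]; // stand-in for the smmu_cmdq page
static uint32_t prod;                         // next slot the producer fills
static uint32_t cons;                         // next slot the consumer drains

// Returns 0 on success, -1 if the ring is full.
static int cmdq_issue(const uint8_t cmd[CMD_SIZE])
{
    if (prod - cons == CMDQ_ENTRIES)
        return -1;                            // producer caught up to consumer

    memcpy(cmdq[prod % CMDQ_ENTRIES], cmd, CMD_SIZE);
    prod++;                                   // publish only after the payload is written
    return 0;
}

// Stand-in for the hardware consuming one command.
static void cmdq_consume_one(void)
{
    if (cons != prod)
        cons++;
}

int main(void)
{
    uint8_t invalidate_cmd[CMD_SIZE] = { 0x12 }; // dummy opcode

    if (cmdq_issue(invalidate_cmd) == 0)
        printf("queued: prod=%u cons=%u\n", prod, cons);
    cmdq_consume_one();
    printf("drained: prod=%u cons=%u\n", prod, cons);
    return 0;
}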
static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@ -1336,7 +1347,7 @@ static NvU64 uvm_gpu_retained_count(uvm_gpu_t *gpu)
void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *gpu);
// Calculates peer table index using GPU ids.
NvU32 uvm_gpu_peer_table_index(uvm_gpu_id_t gpu_id1, uvm_gpu_id_t gpu_id2);
NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Either retains an existing PCIe peer entry or creates a new one. In both
// cases the two GPUs are also each retained.
@ -1355,7 +1366,7 @@ uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);
// Get the P2P capabilities between the gpus with the given indexes
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(uvm_gpu_id_t gpu_id1, uvm_gpu_id_t gpu_id2);
uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1);
// Get the P2P capabilities between the given gpus
static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
@ -1363,10 +1374,10 @@ static uvm_gpu_peer_t *uvm_gpu_peer_caps(const uvm_gpu_t *gpu0, const uvm_gpu_t
return uvm_gpu_index_peer_caps(gpu0->id, gpu1->id);
}
static bool uvm_gpus_are_nvswitch_connected(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
static bool uvm_gpus_are_nvswitch_connected(const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
if (gpu1->parent->nvswitch_info.is_nvswitch_connected && gpu2->parent->nvswitch_info.is_nvswitch_connected) {
UVM_ASSERT(uvm_gpu_peer_caps(gpu1, gpu2)->link_type >= UVM_GPU_LINK_NVLINK_2);
if (gpu0->parent->nvswitch_info.is_nvswitch_connected && gpu1->parent->nvswitch_info.is_nvswitch_connected) {
UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type >= UVM_GPU_LINK_NVLINK_2);
return true;
}
@ -1511,7 +1522,7 @@ bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
static bool uvm_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
static bool uvm_parent_gpu_is_coherent(const uvm_parent_gpu_t *parent_gpu)
{
return parent_gpu->system_bus.memory_window_end > parent_gpu->system_bus.memory_window_start;
}
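This predicate ties back to fill_gpu_info() earlier in the diff: memory_window_end is stored as an inclusive address (presumably start + size - 1, given the "memory_window_end is inclusive" comment), so the window only counts as coherent when end is strictly greater than start, i.e. when the reported size is at least two bytes; hence the UVM_ASSERT(systemMemoryWindowSize > 1). A small worked sketch with illustrative values (plain C, not driver code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct system_bus_window {
    uint64_t memory_window_start;
    uint64_t memory_window_end;   // inclusive last address of the window
};

static bool window_is_coherent(const struct system_bus_window *w)
{
    // Mirrors the check above: a missing or degenerate window is "not coherent".
    return w->memory_window_end > w->memory_window_start;
}

int main(void)
{
    // size = 4 GiB: end = start + size - 1 is strictly greater than start.
    struct system_bus_window coherent = {
        .memory_window_start = 0x100000000ull,
        .memory_window_end   = 0x100000000ull + (4ull << 30) - 1,
    };
    // size = 1: end == start, the degenerate case the size > 1 assert rules out.
    struct system_bus_window degenerate = {
        .memory_window_start = 0x100000000ull,
        .memory_window_end   = 0x100000000ull,
    };

    printf("coherent: %d, degenerate: %d\n",
           window_is_coherent(&coherent),
           window_is_coherent(&degenerate));
    return 0;
}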

View File

@ -985,7 +985,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
return NV_OK;
if (uvm_processor_mask_test(&va_block->resident, processor))
residency_mask = uvm_va_block_resident_mask_get(va_block, processor);
residency_mask = uvm_va_block_resident_mask_get(va_block, processor, NUMA_NO_NODE);
else
residency_mask = NULL;
@ -1036,8 +1036,8 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
// If the underlying VMA is gone, skip HMM migrations.
if (uvm_va_block_is_hmm(va_block)) {
status = uvm_hmm_find_vma(service_context->block_context.mm,
&service_context->block_context.hmm.vma,
status = uvm_hmm_find_vma(service_context->block_context->mm,
&service_context->block_context->hmm.vma,
address);
if (status == NV_ERR_INVALID_ADDRESS)
continue;
@ -1048,7 +1048,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
policy = uvm_va_policy_get(va_block, address);
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
processor,
uvm_fault_access_type_mask_bit(UVM_FAULT_ACCESS_TYPE_PREFETCH),
@ -1083,7 +1083,7 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
// Remove pages that are already resident in the destination processors
for_each_id_in_mask(id, &update_processors) {
bool migrate_pages;
uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id);
uvm_page_mask_t *residency_mask = uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE);
UVM_ASSERT(residency_mask);
migrate_pages = uvm_page_mask_andnot(&service_context->per_processor_masks[uvm_id_value(id)].new_residency,
@ -1101,9 +1101,9 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
if (uvm_va_block_is_hmm(va_block)) {
status = NV_ERR_INVALID_ADDRESS;
if (service_context->block_context.mm) {
if (service_context->block_context->mm) {
status = uvm_hmm_find_policy_vma_and_outer(va_block,
&service_context->block_context.hmm.vma,
&service_context->block_context->hmm.vma,
first_page_index,
&policy,
&outer);
@ -1206,7 +1206,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
service_context->block_context->mm = mm;
if (uvm_va_block_is_hmm(va_block)) {
uvm_hmm_service_context_init(service_context);

View File

@ -292,6 +292,7 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
{
NV_STATUS status = NV_OK;
char kthread_name[TASK_COMM_LEN + 1];
uvm_va_block_context_t *block_context;
if (parent_gpu->replayable_faults_supported) {
status = uvm_gpu_fault_buffer_init(parent_gpu);
@ -311,6 +312,12 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
if (!parent_gpu->isr.replayable_faults.stats.cpu_exec_count)
return NV_ERR_NO_MEMORY;
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->fault_buffer_info.replayable.block_service_context.block_context = block_context;
parent_gpu->isr.replayable_faults.handling = true;
snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u BH", uvm_id_value(parent_gpu->id));
@ -333,6 +340,12 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
if (!parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count)
return NV_ERR_NO_MEMORY;
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context = block_context;
parent_gpu->isr.non_replayable_faults.handling = true;
snprintf(kthread_name, sizeof(kthread_name), "UVM GPU%u KC", uvm_id_value(parent_gpu->id));
@ -356,6 +369,13 @@ NV_STATUS uvm_gpu_init_isr(uvm_parent_gpu_t *parent_gpu)
return status;
}
block_context = uvm_va_block_context_alloc(NULL);
if (!block_context)
return NV_ERR_NO_MEMORY;
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context =
block_context;
nv_kthread_q_item_init(&parent_gpu->isr.access_counters.bottom_half_q_item,
access_counters_isr_bottom_half_entry,
parent_gpu);
@ -410,6 +430,8 @@ void uvm_gpu_disable_isr(uvm_parent_gpu_t *parent_gpu)
void uvm_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
{
uvm_va_block_context_t *block_context;
// Return ownership to RM:
if (parent_gpu->isr.replayable_faults.was_handling) {
// No user threads could have anything left on
@ -439,8 +461,18 @@ void uvm_gpu_deinit_isr(uvm_parent_gpu_t *parent_gpu)
// It is safe to deinitialize access counters even if they have not been
// successfully initialized.
uvm_gpu_deinit_access_counters(parent_gpu);
block_context =
parent_gpu->access_counter_buffer_info.batch_service_context.block_service_context.block_context;
uvm_va_block_context_free(block_context);
}
if (parent_gpu->non_replayable_faults_supported) {
block_context = parent_gpu->fault_buffer_info.non_replayable.block_service_context.block_context;
uvm_va_block_context_free(block_context);
}
block_context = parent_gpu->fault_buffer_info.replayable.block_service_context.block_context;
uvm_va_block_context_free(block_context);
uvm_kvfree(parent_gpu->isr.replayable_faults.stats.cpu_exec_count);
uvm_kvfree(parent_gpu->isr.non_replayable_faults.stats.cpu_exec_count);
uvm_kvfree(parent_gpu->isr.access_counters.stats.cpu_exec_count);
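Each of the three uvm_va_block_context_t allocations added to uvm_gpu_init_isr() above is released here in uvm_gpu_deinit_isr(). A compact sketch of that init/deinit ownership pattern in isolation (plain C with malloc/free standing in for the driver's allocator; error handling deliberately simplified, so this is a model rather than a transcription of the driver's cleanup rules):

#include <stdio.h>
#include <stdlib.h>

struct service_state {
    void *replayable_ctx;
    void *non_replayable_ctx;
    void *access_counters_ctx;
};

// Allocate every helper context up front; on failure the caller runs
// deinit(), which must tolerate partially-initialized state.
static int init(struct service_state *s)
{
    s->replayable_ctx = malloc(64);
    if (!s->replayable_ctx)
        return -1;

    s->non_replayable_ctx = malloc(64);
    if (!s->non_replayable_ctx)
        return -1;

    s->access_counters_ctx = malloc(64);
    if (!s->access_counters_ctx)
        return -1;

    return 0;
}

// free(NULL) is a no-op, so deinit() is safe after a partial init().
static void deinit(struct service_state *s)
{
    free(s->access_counters_ctx);
    free(s->non_replayable_ctx);
    free(s->replayable_ctx);
}

int main(void)
{
    struct service_state s = { 0 };

    if (init(&s) != 0)
        fprintf(stderr, "init failed, cleaning up partial state\n");
    deinit(&s);
    return 0;
}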

View File

@ -370,7 +370,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Check logical permissions
status = uvm_va_block_check_logical_permissions(va_block,
&service_context->block_context,
service_context->block_context,
gpu->id,
uvm_va_block_cpu_page_index(va_block,
fault_entry->fault_address),
@ -393,7 +393,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
gpu->id,
fault_entry->access_type_mask,
@ -629,7 +629,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
uvm_gpu_va_space_t *gpu_va_space;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
gpu->parent->fault_buffer_info.non_replayable.block_service_context.block_context;
status = uvm_gpu_fault_entry_to_va_space(gpu, fault_entry, &va_space);
if (status != NV_OK) {
@ -655,7 +655,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);

View File

@ -1180,7 +1180,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
fault_entry->replayable.cancel_va_mode = cancel_va_mode;
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space) {
UVM_ASSERT(fault_entry->va_space);
batch_context->fatal_va_space = fault_entry->va_space;
}
}
static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@ -1230,7 +1234,7 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
UvmEventFatalReason fatal_reason;
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_fault_access_type_t ret = UVM_FAULT_ACCESS_TYPE_COUNT;
uvm_va_block_context_t *va_block_context = &service_block_context->block_context;
uvm_va_block_context_t *va_block_context = service_block_context->block_context;
perm_status = uvm_va_block_check_logical_permissions(va_block,
va_block_context,
@ -1345,7 +1349,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
if (uvm_va_block_is_hmm(va_block)) {
policy = uvm_hmm_find_policy_end(va_block,
block_context->block_context.hmm.vma,
block_context->block_context->hmm.vma,
ordered_fault_cache[first_fault_index]->fault_address,
&end);
}
@ -1469,7 +1473,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&block_context->block_context,
block_context->block_context,
page_index,
gpu->id,
service_access_type_mask,
@ -1511,8 +1515,8 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
++block_context->num_retries;
if (status == NV_OK && batch_context->has_fatal_faults)
status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);
if (status == NV_OK && batch_context->fatal_va_space)
status = uvm_va_block_set_cancel(va_block, block_context->block_context, gpu);
return status;
}
@ -1860,7 +1864,7 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
uvm_va_block_t *va_block;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[fault_index];
struct mm_struct *mm = va_block_context->mm;
NvU64 fault_address = current_entry->fault_address;
@ -1937,14 +1941,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
return status;
}
// Called when a fault in the batch has been marked fatal. Flush the buffer
// under the VA and mmap locks to remove any potential stale fatal faults, then
// service all new faults for just that VA space and cancel those which are
// fatal. Faults in other VA spaces are replayed when done and will be processed
// when normal fault servicing resumes.
static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NvU32 i;
uvm_va_space_t *va_space = batch_context->fatal_va_space;
uvm_gpu_va_space_t *gpu_va_space = NULL;
struct mm_struct *mm;
uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
UVM_ASSERT(va_space);
// Perform the flush and re-fetch while holding the mmap_lock and the
// VA space lock. This avoids stale faults because it prevents any vma
// modifications (mmap, munmap, mprotect) from happening between the time HW
// takes the fault and we cancel it.
mm = uvm_va_space_mm_retain_lock(va_space);
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);
// We saw fatal faults in this VA space before. Flush while holding
// mmap_lock to make sure those faults come back (aren't stale).
//
// We need to wait until all old fault messages have arrived before
// flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status != NV_OK)
goto done;
// Wait for the flush's replay to finish to give the legitimate faults a
// chance to show up in the buffer again.
status = uvm_tracker_wait(&replayable_faults->replay_tracker);
if (status != NV_OK)
goto done;
// We expect all replayed faults to have arrived in the buffer so we can re-
// service them. The replay-and-wait sequence above will ensure they're all
// in the HW buffer. When GSP owns the HW buffer, we also have to wait for
// GSP to copy all available faults from the HW buffer into the shadow
// buffer.
//
// TODO: Bug 2533557: This flush does not actually guarantee that GSP will
// copy over all faults.
status = hw_fault_buffer_flush_locked(gpu->parent);
if (status != NV_OK)
goto done;
// If there is no GPU VA space for the GPU, ignore all faults in the VA
// space. This can happen if the GPU VA space has been destroyed since we
// unlocked the VA space in service_fault_batch. That means the fatal faults
// are stale, because unregistering the GPU VA space requires preempting the
// context and detaching all channels in that VA space. Restart fault
// servicing from the top.
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (!gpu_va_space)
goto done;
// Re-parse the new faults
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
if (status != NV_OK)
goto done;
// No more faults left. Either the previously-seen fatal entry was stale, or
// RM killed the context underneath us.
if (batch_context->num_cached_faults == 0)
goto done;
++batch_context->batch_id;
status = preprocess_fault_batch(gpu, batch_context);
if (status != NV_OK) {
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
// Another flush happened due to stale faults or a context-fatal
// error. The previously-seen fatal fault might not exist anymore,
// so restart fault servicing from the top.
status = NV_OK;
}
goto done;
}
// Search for the target VA space
for (i = 0; i < batch_context->num_coalesced_faults; i++) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space == va_space)
break;
}
while (i < batch_context->num_coalesced_faults) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->va_space != va_space)
break;
// service_fault_batch_dispatch() doesn't expect unserviceable faults.
// Just cancel them directly.
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
if (status != NV_OK)
break;
++i;
}
else {
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
NvU32 block_faults;
ats_invalidate->write_faults_in_batch = false;
uvm_hmm_service_context_init(service_context);
// Service all the faults that we can. We only really need to search
// for fatal faults, but attempting to service all is the easiest
// way to do that.
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
if (status != NV_OK) {
// TODO: Bug 3900733: clean up locking in service_fault_batch().
// We need to drop lock and retry. That means flushing and
// starting over.
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
status = NV_OK;
break;
}
// Invalidate TLBs before cancel to ensure that fatal faults don't
// get stuck in HW behind non-fatal faults to the same line.
status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (status != NV_OK)
break;
while (block_faults-- > 0) {
current_entry = batch_context->ordered_fault_cache[i];
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
++i;
}
}
}
done:
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
if (status == NV_OK) {
// There are two reasons to flush the fault buffer here.
//
// 1) Functional. We need to replay both the serviced non-fatal faults
// and the skipped faults in other VA spaces. The former need to be
// restarted and the latter need to be replayed so the normal fault
// service mechanism can fetch and process them.
//
        // 2) Performance. After cancelling the fatal faults, a flush removes
        // any potential duplicated fault that may have been added while
        // processing the faults in this batch. The flush also avoids doing
        // unnecessary processing after the fatal faults have been cancelled,
        // since the remaining faults are unlikely to survive a replay: the
        // context is probably in the process of dying.
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
// space
// Fatal faults are marked for later processing by the caller.
static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
fault_service_mode_t service_mode,
uvm_fault_service_batch_context_t *batch_context)
@ -1959,7 +2147,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
uvm_service_block_context_t *service_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context;
uvm_va_block_context_t *va_block_context = &service_context->block_context;
uvm_va_block_context_t *va_block_context = service_context->block_context;
UVM_ASSERT(gpu->parent->replayable_faults_supported);
@ -1995,41 +2183,28 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_block_context_init(va_block_context, mm);
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
break;
}
// The case where there is no valid GPU VA space for the GPU in this
// VA space is handled next
}
// Some faults could be already fatal if they cannot be handled by
// the UVM driver
if (current_entry->is_fatal) {
++i;
batch_context->has_fatal_faults = true;
if (!batch_context->fatal_va_space)
batch_context->fatal_va_space = va_space;
utlb->has_fatal_faults = true;
UVM_ASSERT(utlb->num_pending_faults > 0);
continue;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
if (!gpu_va_space) {
// If there is no GPU VA space for the GPU, ignore the fault. This
// can happen if a GPU VA space is destroyed without explicitly
// freeing all memory ranges (destroying the VA range triggers a
// flush of the fault buffer) and there are stale entries in the
// freeing all memory ranges and there are stale entries in the
// buffer that got fixed by the servicing in a previous batch.
++i;
continue;
@ -2057,7 +2232,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
if (replay_per_va_block && !batch_context->fatal_va_space) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;
@ -2069,8 +2244,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
}
}
// Only clobber status if invalidate_status != NV_OK, since status may also
// contain NV_WARN_MORE_PROCESSING_REQUIRED.
if (va_space != NULL) {
NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
if (invalidate_status != NV_OK)
@ -2278,64 +2451,6 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
return false;
}
// Cancel just the faults flagged as fatal in the given fault service batch
// context.
static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
UVM_ASSERT(current_entry->va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
va_space = current_entry->va_space;
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
// We don't need to check whether a buffer flush is required
// (due to VA range destruction). Once a fault is flagged as fatal
// we need to cancel it, even if its VA range no longer exists.
}
// See the comment for the same check in cancel_faults_all
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id))
continue;
if (current_entry->is_fatal) {
status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
if (status != NV_OK)
break;
}
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
// See the comment on flushing in cancel_faults_all
fault_status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
// We report the first encountered error.
if (status == NV_OK)
status = fault_status;
return status;
}
// Cancel all faults in the given fault service batch context, even those not
// marked as fatal.
static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
@ -2344,56 +2459,51 @@ static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
{
NV_STATUS status = NV_OK;
NV_STATUS fault_status;
uvm_va_space_t *va_space = NULL;
NvU32 i;
NvU32 i = 0;
UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
while (i < batch_context->num_coalesced_faults && status == NV_OK) {
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_fault_cancel_va_mode_t cancel_va_mode;
uvm_va_space_t *va_space = current_entry->va_space;
bool skip_va_space;
UVM_ASSERT(current_entry->va_space);
UVM_ASSERT(va_space);
if (current_entry->va_space != va_space) {
// Fault on a different va_space, drop the lock of the old one...
if (va_space != NULL)
uvm_va_space_up_read(va_space);
uvm_va_space_down_read(va_space);
va_space = current_entry->va_space;
// If there is no GPU VA space for the GPU, ignore all faults in
// that VA space. This can happen if the GPU VA space has been
// destroyed since we unlocked the VA space in service_fault_batch.
        // Ignoring the fault avoids targeting a PDB that might have been
// reused by another process.
skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// ... and take the lock of the new one
uvm_va_space_down_read(va_space);
for (;
i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
current_entry = batch_context->ordered_fault_cache[++i]) {
uvm_fault_cancel_va_mode_t cancel_va_mode;
if (skip_va_space)
continue;
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
// If there is no GPU VA space for the GPU, ignore the fault.
            // This can happen if the GPU VA space did not exist in
            // service_fault_batch(), or it was destroyed since then.
            // This is to avoid targeting a PDB that might have been reused
// by another process.
continue;
}
// If the fault was already marked fatal, use its reason and cancel
// mode. Otherwise use the provided reason.
if (current_entry->is_fatal) {
UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
cancel_va_mode = current_entry->replayable.cancel_va_mode;
}
else {
current_entry->fatal_reason = reason;
cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
status = cancel_fault_precise_va(gpu, current_entry, cancel_va_mode);
if (status != NV_OK)
break;
}
if (va_space != NULL)
uvm_va_space_up_read(va_space);
}
// Because each cancel itself triggers a replay, there may be a large number
// of new duplicated faults in the buffer after cancelling all the known
@ -2537,7 +2647,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
// 5) Fetch all faults from buffer
@ -2584,9 +2694,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
// 8) Service all non-fatal faults and mark all non-serviceable faults
// as fatal
status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
UVM_ASSERT(batch_context->num_replays == 0);
if (status == NV_ERR_NO_MEMORY)
continue;
@ -2594,7 +2701,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
break;
// No more fatal faults left, we are done
if (!batch_context->has_fatal_faults)
if (!batch_context->fatal_va_space)
break;
// 9) Search for uTLBs that contain fatal faults and meet the
@ -2616,9 +2723,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
{
UVM_ASSERT(batch_context->has_fatal_faults);
UVM_ASSERT(batch_context->fatal_va_space);
if (gpu->parent->fault_cancel_va_supported)
return cancel_faults_precise_va(gpu, batch_context);
return service_fault_batch_for_cancel(gpu, batch_context);
return cancel_faults_precise_tlb(gpu, batch_context);
}
@ -2674,7 +2781,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
batch_context->num_invalid_prefetch_faults = 0;
batch_context->num_duplicate_faults = 0;
batch_context->num_replays = 0;
batch_context->has_fatal_faults = false;
batch_context->fatal_va_space = NULL;
batch_context->has_throttled_faults = false;
status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@ -2702,9 +2809,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
// was flushed
num_replays += batch_context->num_replays;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
continue;
enable_disable_prefetch_faults(gpu->parent, batch_context);
if (status != NV_OK) {
@ -2718,10 +2822,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
break;
}
if (batch_context->has_fatal_faults) {
if (batch_context->fatal_va_space) {
status = uvm_tracker_wait(&batch_context->tracker);
if (status == NV_OK)
if (status == NV_OK) {
status = cancel_faults_precise(gpu, batch_context);
if (status == NV_OK) {
// Cancel handling should've issued at least one replay
UVM_ASSERT(batch_context->num_replays > 0);
++num_batches;
continue;
}
}
break;
}

View File

@ -794,7 +794,7 @@ uvm_membar_t uvm_hal_downgrade_membar_type(uvm_gpu_t *gpu, bool is_local_vidmem)
// memory, including those from other processors like the CPU or peer GPUs,
// must come through this GPU's L2. In all current architectures, MEMBAR_GPU
// is sufficient to resolve ordering at the L2 level.
if (is_local_vidmem && !uvm_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
if (is_local_vidmem && !uvm_parent_gpu_is_coherent(gpu->parent) && !uvm_downgrade_force_membar_sys)
return UVM_MEMBAR_GPU;
// If the mapped memory was remote, or if a coherence protocol can cache
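Only the MEMBAR_GPU early-out is visible in this hunk. As a hedged, self-contained model of the overall decision (plain C; it assumes, based on the surrounding comments, that every other case falls back to a system-wide membar, which may oversimplify the real uvm_hal_downgrade_membar_type()):

#include <stdbool.h>
#include <stdio.h>

typedef enum { MEMBAR_GPU, MEMBAR_SYS } membar_t;

// is_local_vidmem: the downgraded mapping targeted this GPU's own vidmem.
// gpu_is_coherent: the GPU exposes a coherent system-memory window, so other
//                  observers may not go through its L2.
// force_sys:       module-parameter override to always use a SYS membar.
static membar_t downgrade_membar_type(bool is_local_vidmem,
                                      bool gpu_is_coherent,
                                      bool force_sys)
{
    // All accesses to non-coherent local vidmem funnel through this GPU's L2,
    // so a GPU-scope membar is enough to resolve ordering.
    if (is_local_vidmem && !gpu_is_coherent && !force_sys)
        return MEMBAR_GPU;

    // Remote memory, coherent GPUs, or an explicit override: order against
    // the whole system.
    return MEMBAR_SYS;
}

int main(void)
{
    printf("%d %d %d\n",
           downgrade_membar_type(true, false, false),    // MEMBAR_GPU (0)
           downgrade_membar_type(true, true, false),     // MEMBAR_SYS (1)
           downgrade_membar_type(false, false, false));  // MEMBAR_SYS (1)
    return 0;
}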

View File

@ -60,6 +60,8 @@ module_param(uvm_disable_hmm, bool, 0444);
#include "uvm_gpu.h"
#include "uvm_pmm_gpu.h"
#include "uvm_hal_types.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_va_block_types.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_space.h"
@ -110,20 +112,7 @@ typedef struct
bool uvm_hmm_is_enabled_system_wide(void)
{
if (uvm_disable_hmm)
return false;
if (g_uvm_global.ats.enabled)
return false;
// Confidential Computing and HMM impose mutually exclusive constraints. In
// Confidential Computing the GPU can only access pages resident in vidmem,
// but in HMM pages may be required to be resident in sysmem: file backed
// VMAs, huge pages, etc.
if (g_uvm_global.conf_computing_enabled)
return false;
return uvm_va_space_mm_enabled_system();
return !uvm_disable_hmm && !g_uvm_global.ats.enabled && uvm_va_space_mm_enabled_system();
}
bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
@ -140,6 +129,100 @@ static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
return container_of(node, uvm_va_block_t, hmm.node);
}
// Copies the contents of the source device-private page to the
// destination CPU page. This will invalidate mappings, so cannot be
// called while holding any va_block locks.
static NV_STATUS uvm_hmm_copy_devmem_page(struct page *dst_page, struct page *src_page, uvm_tracker_t *tracker)
{
uvm_gpu_phys_address_t src_addr;
uvm_gpu_phys_address_t dst_addr;
uvm_gpu_chunk_t *gpu_chunk;
NvU64 dma_addr;
uvm_push_t push;
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu;
    // Holding a reference on the device-private page ensures the GPU is
    // already retained: when a GPU is unregistered, all device-private pages
    // are migrated back to the CPU and freed before the GPU is released.
    // Therefore, if we were able to take a reference on the page, the GPU
    // must still be retained.
UVM_ASSERT(is_device_private_page(src_page) && page_count(src_page));
gpu_chunk = uvm_pmm_devmem_page_to_chunk(src_page);
gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
status = uvm_mmu_chunk_map(gpu_chunk);
if (status != NV_OK)
return status;
status = uvm_gpu_map_cpu_pages(gpu->parent, dst_page, PAGE_SIZE, &dma_addr);
if (status != NV_OK)
goto out_unmap_gpu;
dst_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
src_addr = uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_chunk->address);
status = uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
tracker,
&push,
"Copy for remote process fault");
if (status != NV_OK)
goto out_unmap_cpu;
gpu->parent->ce_hal->memcopy(&push,
uvm_gpu_address_copy(gpu, dst_addr),
uvm_gpu_address_copy(gpu, src_addr),
PAGE_SIZE);
uvm_push_end(&push);
status = uvm_tracker_add_push_safe(tracker, &push);
out_unmap_cpu:
uvm_gpu_unmap_cpu_pages(gpu->parent, dma_addr, PAGE_SIZE);
out_unmap_gpu:
uvm_mmu_chunk_unmap(gpu_chunk, NULL);
return status;
}
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
uvm_tracker_t tracker = UVM_TRACKER_INIT();
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
}
lock_page(dst_page);
if (WARN_ON(uvm_hmm_copy_devmem_page(dst_page, migrate_pfn_to_page(src_pfn), &tracker) != NV_OK))
memzero_page(dst_page, 0, PAGE_SIZE);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
uvm_tracker_wait_deinit(&tracker);
}
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
status = NV_ERR_BUSY_RETRY;
return status;
}
void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
{
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
@ -199,6 +282,9 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
@ -207,6 +293,29 @@ void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
    // There could be pages with page->zone_device_data pointing to the va_space
    // which may be about to be freed. Migrate those back to the CPU so we don't
    // fault on them. Normally infinite retries are bad, but we don't have any
    // other option here. Device-private pages can't be pinned, so migration
    // should eventually succeed. Even if we did bail out of the loop, we'd
    // just stall in memunmap_pages() anyway.
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
            // This check is racy because nothing stops the page from being freed
            // or even reused. That doesn't matter though: in the worst case the
            // migration fails, we retry and find the va_space no longer matches.
if (page->zone_device_data == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
} while (retry);
uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
va_block = hmm_va_block_from_node(node);
@ -568,7 +677,7 @@ bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
{
// TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
service_context->block_context.hmm.swap_cached = false;
service_context->block_context->hmm.swap_cached = false;
}
NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
@ -631,47 +740,6 @@ static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
return status;
}
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
{
// We can't use uvm_va_space_mm_retain(), because the va_space_mm
// should already be dead by now.
struct mm_struct *mm = va_space->va_space_mm.mm;
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
uvm_va_block_context_t *block_context;
uvm_down_read_mmap_lock(mm);
uvm_va_space_down_write(va_space);
uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
uvm_va_block_region_t region;
struct vm_area_struct *vma;
va_block = hmm_va_block_from_node(node);
block_context = uvm_va_space_block_context(va_space, mm);
uvm_hmm_migrate_begin_wait(va_block);
uvm_mutex_lock(&va_block->lock);
for_each_va_block_vma_region(va_block, mm, vma, &region) {
if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
continue;
block_context->hmm.vma = vma;
uvm_hmm_va_block_migrate_locked(va_block,
NULL,
block_context,
UVM_ID_CPU,
region,
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
}
uvm_mutex_unlock(&va_block->lock);
uvm_hmm_migrate_finish(va_block);
}
uvm_va_space_up_write(va_space);
uvm_up_read_mmap_lock(mm);
}
NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
uvm_va_block_test_t *block_test;
@ -1476,40 +1544,59 @@ static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
return status;
}
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index);
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index);
uvm_cpu_chunk_free(chunk);
}
return status;
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
int chunk_nid,
uvm_page_index_t page_index)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (!chunk)
return;
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_cpu_chunk_free(chunk);
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page)
{
uvm_cpu_chunk_t *chunk;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (page) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(page), page_index);
}
else {
int nid;
for_each_possible_uvm_node(nid) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, nid, page_index);
}
}
}
static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index);
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index)));
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index)));
return old_page == page;
}
@ -1522,7 +1609,7 @@ static void clear_service_context_masks(uvm_service_block_context_t *service_con
uvm_processor_id_t new_residency,
uvm_page_index_t page_index)
{
uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->block_context->caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
page_index);
@ -1549,7 +1636,6 @@ static void cpu_mapping_set(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
{
uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index);
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
if (is_write)
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
@ -1699,7 +1785,7 @@ static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
// migrate_vma_finalize() will release the reference so we should
// clear our pointer to it.
// TODO: Bug 3660922: Need to handle read duplication at some point.
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, page);
}
}
@ -1725,7 +1811,7 @@ static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
else {
UVM_ASSERT(page_ref_count(dst_page) == 1);
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page);
}
unlock_page(dst_page);
@ -1760,7 +1846,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
unsigned long *dst_pfns,
uvm_page_mask_t *same_devmem_page_mask)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index);
uvm_va_block_region_t chunk_region;
struct page *dst_page;
@ -1786,7 +1872,7 @@ static void lock_block_cpu_page(uvm_va_block_t *va_block,
// hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
// does not migrate, it will be freed though.
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
UVM_ASSERT(page_ref_count(dst_page) == 1);
uvm_cpu_chunk_make_hmm(chunk);
@ -1934,7 +2020,7 @@ static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
}
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_test(&va_block->cpu.resident, page_index));
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
// Allocate a user system memory page for the destination.
// This is the typical case since Linux will free the source page when
@ -2012,8 +2098,8 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex
service_context = devmem_fault_context->service_context;
va_block_retry = devmem_fault_context->va_block_retry;
va_block = devmem_fault_context->va_block;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
// Build the migration page mask.
// Note that thrashing pinned pages and prefetch pages are already
@ -2022,7 +2108,7 @@ static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_contex
uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
status = alloc_and_copy_to_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context->hmm.vma,
src_pfns,
dst_pfns,
service_context->region,
@ -2057,8 +2143,8 @@ static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_cont
prefetch_hint = &service_context->prefetch_hint;
va_block = devmem_fault_context->va_block;
va_block_retry = devmem_fault_context->va_block_retry;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
region = service_context->region;
page_mask = &devmem_fault_context->page_mask;
@ -2165,8 +2251,7 @@ static NV_STATUS populate_region(uvm_va_block_t *va_block,
// Since we have a stable snapshot of the CPU pages, we can
// update the residency and protection information.
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
}
@ -2253,7 +2338,7 @@ static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
uvm_page_index_t page_index;
for_each_va_block_page_in_region(page_index, region) {
struct page *page = service_context->block_context.hmm.pages[page_index];
struct page *page = service_context->block_context->hmm.pages[page_index];
if (!page)
continue;
@ -2269,14 +2354,14 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
struct page **pages = service_context->block_context.hmm.pages;
struct page **pages = service_context->block_context->hmm.pages;
int npages;
uvm_page_index_t page_index;
uvm_make_resident_cause_t cause;
NV_STATUS status;
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
!uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
// There is an atomic GPU fault. We need to make sure no pages are
// GPU resident so that make_device_exclusive_range() doesn't call
// migrate_to_ram() and cause a va_space lock recursion problem.
@ -2289,7 +2374,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
status = uvm_hmm_va_block_migrate_locked(va_block,
va_block_retry,
&service_context->block_context,
service_context->block_context,
UVM_ID_CPU,
region,
cause);
@ -2299,7 +2384,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
// make_device_exclusive_range() will try to call migrate_to_ram()
    // and deadlock with ourselves if the data isn't CPU resident.
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
!uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
status = NV_WARN_MORE_PROCESSING_REQUIRED;
goto done;
}
@ -2309,7 +2394,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
// mmap() files so we check for that here and report a fatal fault.
    // Otherwise, with the current Linux 6.1 make_device_exclusive_range(), the
    // page is not made exclusive and we end up in an endless loop.
if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) {
if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) {
status = NV_ERR_NOT_SUPPORTED;
goto done;
}
@ -2318,7 +2403,7 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_mutex_unlock(&va_block->lock);
npages = make_device_exclusive_range(service_context->block_context.mm,
npages = make_device_exclusive_range(service_context->block_context->mm,
uvm_va_block_cpu_page_address(va_block, region.first),
uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
pages + region.first,
@ -2356,15 +2441,13 @@ static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
if (s == NV_OK) {
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
}
if (s == NV_OK)
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
}
cpu_mapping_clear(va_block, page_index);
@ -2419,7 +2502,7 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
NV_STATUS status;
int ret;
uvm_hmm_devmem_fault_context_t fault_context = {
@ -2453,8 +2536,8 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
}
status = hmm_make_resident_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context.hmm.src_pfns,
service_context->block_context->hmm.vma,
service_context->block_context->hmm.src_pfns,
region,
service_context->access_type,
&fault_context.same_devmem_page_mask);
@ -2476,9 +2559,9 @@ static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
}
}
args->vma = service_context->block_context.hmm.vma;
args->src = service_context->block_context.hmm.src_pfns + region.first;
args->dst = service_context->block_context.hmm.dst_pfns + region.first;
args->vma = service_context->block_context->hmm.vma;
args->src = service_context->block_context->hmm.src_pfns + region.first;
args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
@ -2558,7 +2641,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// TODO: Bug 4050579: Remove this when swap cached pages can be
// migrated.
if (service_context) {
service_context->block_context.hmm.swap_cached = true;
service_context->block_context->hmm.swap_cached = true;
break;
}
@ -2574,7 +2657,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
@ -2588,8 +2671,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
// migrate_vma_setup() was able to isolate and lock the page;
// therefore, it is CPU resident and not mapped.
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index);
}
// The call to migrate_vma_setup() will have inserted a migration
@ -2604,7 +2686,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
hmm_va_block_cpu_page_unpopulate(va_block, page_index);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL);
}
}
@ -2618,7 +2700,7 @@ static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
}
if (uvm_page_mask_empty(page_mask) ||
(service_context && service_context->block_context.hmm.swap_cached))
(service_context && service_context->block_context->hmm.swap_cached))
status = NV_WARN_MORE_PROCESSING_REQUIRED;
if (status != NV_OK)
@ -2649,8 +2731,8 @@ static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
service_context = uvm_hmm_gpu_fault_event->service_context;
region = service_context->region;
prefetch_hint = &service_context->prefetch_hint;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
// Build the migration mask.
// Note that thrashing pinned pages are already accounted for in
@ -2708,8 +2790,8 @@ static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *u
va_block = uvm_hmm_gpu_fault_event->va_block;
va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
service_context = uvm_hmm_gpu_fault_event->service_context;
src_pfns = service_context->block_context.hmm.src_pfns;
dst_pfns = service_context->block_context.hmm.dst_pfns;
src_pfns = service_context->block_context->hmm.src_pfns;
dst_pfns = service_context->block_context->hmm.dst_pfns;
region = service_context->region;
page_mask = &uvm_hmm_gpu_fault_event->page_mask;
@ -2752,11 +2834,11 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
struct mm_struct *mm = service_context->block_context.mm;
struct vm_area_struct *vma = service_context->block_context.hmm.vma;
struct mm_struct *mm = service_context->block_context->mm;
struct vm_area_struct *vma = service_context->block_context->hmm.vma;
uvm_va_block_region_t region = service_context->region;
uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
int ret;
NV_STATUS status = NV_ERR_INVALID_ADDRESS;
@ -2780,8 +2862,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_hmm_gpu_fault_event.service_context = service_context;
args->vma = vma;
args->src = service_context->block_context.hmm.src_pfns + region.first;
args->dst = service_context->block_context.hmm.dst_pfns + region.first;
args->src = service_context->block_context->hmm.src_pfns + region.first;
args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
@ -2815,8 +2897,8 @@ NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
// since migrate_vma_setup() would have reported that information.
// Try to make it resident in system memory and retry the migration.
status = hmm_make_resident_cpu(va_block,
service_context->block_context.hmm.vma,
service_context->block_context.hmm.src_pfns,
service_context->block_context->hmm.vma,
service_context->block_context->hmm.src_pfns,
region,
service_context->access_type,
NULL);
@ -2962,16 +3044,6 @@ static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migra
&uvm_hmm_migrate_event->same_devmem_page_mask);
}
static bool is_resident(uvm_va_block_t *va_block,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region)
{
if (!uvm_processor_mask_test(&va_block->resident, dest_id))
return false;
return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region);
}
// Note that migrate_vma_*() doesn't handle asynchronous migrations so the
// migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect.
// TODO: Bug 3900785: investigate ways to implement async migration.
@ -3063,9 +3135,7 @@ NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_page_mask_init_from_region(page_mask, region, NULL);
for_each_id_in_mask(id, &va_block->resident) {
if (!uvm_page_mask_andnot(page_mask,
page_mask,
uvm_va_block_resident_mask_get(va_block, id)))
if (!uvm_page_mask_andnot(page_mask, page_mask, uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE)))
return NV_OK;
}
@ -3193,6 +3263,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
const uvm_va_policy_t *policy;
uvm_va_policy_node_t *node;
uvm_page_mask_t *cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
unsigned long npages;
NV_STATUS status;
@ -3215,7 +3286,7 @@ static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
// Pages resident on the GPU should not have a resident page in system
// memory.
// TODO: Bug 3660922: Need to handle read duplication at some point.
UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region));
UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region));
status = alloc_and_copy_to_cpu(va_block,
NULL,
@ -3314,35 +3385,34 @@ NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
NULL);
}
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
unsigned long src_pfn;
unsigned long dst_pfn;
struct migrate_vma args;
struct page *src_page = vmf->page;
uvm_tracker_t tracker = UVM_TRACKER_INIT();
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
args.vma = vmf->vma;
args.src = &src_pfn;
args.dst = &dst_pfn;
args.start = nv_page_fault_va(vmf);
args.end = args.start + PAGE_SIZE;
args.pgmap_owner = &g_uvm_global;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args.fault_page = src_page;
// We don't call migrate_vma_setup_locked() here because we don't
// have a va_block and don't want to ignore invalidations.
ret = migrate_vma_setup(&args);
UVM_ASSERT(!ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
// All the code for copying a vidmem page to sysmem relies on
// having a va_block. However certain combinations of mremap()
// and fork() can result in device-private pages being mapped
// in a child process without a va_block.
//
// We don't expect the above to be a common occurrence so for
// now we allocate a fresh zero page when evicting without a
// va_block. However this results in child processes losing
// data so make sure we warn about it. Ideally we would just
// not migrate and SIGBUS the child if it tries to access the
// page. However that would prevent unloading of the driver so
// we're stuck with this until we fix the problem.
// TODO: Bug 3902536: add code to migrate GPU memory without having a
// va_block.
WARN_ON(1);
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
struct page *dst_page;
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
@ -3351,11 +3421,15 @@ NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
lock_page(dst_page);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
status = uvm_hmm_copy_devmem_page(dst_page, src_page, &tracker);
if (status == NV_OK)
status = uvm_tracker_wait_deinit(&tracker);
}
migrate_vma_pages(&args);
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
migrate_vma_finalize(&args);
return status;
}
@ -3606,4 +3680,3 @@ bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
}
#endif // UVM_IS_CONFIG_HMM()

View File

@ -307,10 +307,10 @@ typedef struct
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker);
// Evicts all va_blocks in the va_space to the CPU. Unlike the
// other va_block eviction functions this is based on virtual
// address and therefore takes mmap_lock for read.
void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space);
// Handle a fault to a device-private page from a process other than the
// process which created the va_space that originally allocated the
// device-private page.
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf);
// This sets the va_block_context->hmm.src_pfns[] to the ZONE_DEVICE private
// PFN for the GPU chunk memory.
@ -343,14 +343,6 @@ typedef struct
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region);
// Migrate a GPU device-private page to system memory. This is
// called to remove CPU page table references to device private
// struct pages for the given GPU after all other references in
// va_blocks have been released and the GPU is in the process of
// being removed/torn down. Note that there is no mm, VMA,
// va_block or any user channel activity on this GPU.
NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn);
// This returns what would be the intersection of va_block start/end and
// VMA start/end-1 for the given 'lookup_address' if
// uvm_hmm_va_block_find_create() was called.
@ -592,8 +584,10 @@ typedef struct
return NV_ERR_INVALID_ADDRESS;
}
static void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
static NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
UVM_ASSERT(0);
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
@ -622,11 +616,6 @@ typedef struct
return NV_OK;
}
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 lookup_address,

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2022 NVIDIA Corporation
Copyright (c) 2020-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -59,12 +59,12 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
// Physical CE writes to vidmem are non-coherent with respect to the CPU on
// GH180.
parent_gpu->ce_phys_vidmem_write_supported = !uvm_gpu_is_coherent(parent_gpu);
parent_gpu->ce_phys_vidmem_write_supported = !uvm_parent_gpu_is_coherent(parent_gpu);
// TODO: Bug 4174553: [HGX-SkinnyJoe][GH180] channel errors discussion/debug
// portion for the uvm tests became nonresponsive after
// some time and then failed even after reboot
parent_gpu->peer_copy_mode = uvm_gpu_is_coherent(parent_gpu) ?
parent_gpu->peer_copy_mode = uvm_parent_gpu_is_coherent(parent_gpu) ?
UVM_GPU_PEER_COPY_MODE_VIRTUAL : g_uvm_global.peer_copy_mode;
// All GR context buffers may be mapped to 57b wide VAs. All "compute" units

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020-2023 NVIDIA Corporation
Copyright (c) 2020-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -368,10 +368,7 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_hopper(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_hopper(depth);
NvU64 *entry_bits = (NvU64 *)entry;

View File

@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13).
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
#define UVM_CAN_USE_MMU_NOTIFIERS() 1
#else
#define UVM_CAN_USE_MMU_NOTIFIERS() 0
@ -348,6 +349,47 @@ static inline NvU64 NV_GETTIME(void)
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
#endif
#if !defined(NV_FIND_NEXT_BIT_WRAP_PRESENT)
static inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset)
{
unsigned long bit = find_next_bit(addr, size, offset);
if (bit < size)
return bit;
bit = find_first_bit(addr, offset);
return bit < offset ? bit : size;
}
#endif
// for_each_set_bit_wrap and __for_each_wrap were introduced in v6.1-rc1
// by commit 4fe49b3b97c2640147c46519c2a6fdb06df34f5f
#if !defined(for_each_set_bit_wrap)
static inline unsigned long __for_each_wrap(const unsigned long *bitmap,
unsigned long size,
unsigned long start,
unsigned long n)
{
unsigned long bit;
if (n > start) {
bit = find_next_bit(bitmap, size, n);
if (bit < size)
return bit;
n = 0;
}
bit = find_next_bit(bitmap, start, n);
return bit < start ? bit : size;
}
#define for_each_set_bit_wrap(bit, addr, size, start) \
for ((bit) = find_next_bit_wrap((addr), (size), (start)); \
(bit) < (size); \
(bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
#endif
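For reference, a minimal standalone sketch (not driver code) of the wrap-around search these compat helpers provide; next_bit() below is a naive userspace stand-in for the kernel's find_next_bit()/find_first_bit(), and the bitmap contents are invented purely for the demonstration.

/* Userspace illustration of the wrap-around bit search above. */
#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* First set bit in [offset, size), or size if none. */
static unsigned long next_bit(const unsigned long *map, unsigned long size, unsigned long offset)
{
    for (; offset < size; offset++) {
        if ((map[offset / BITS_PER_LONG] >> (offset % BITS_PER_LONG)) & 1)
            return offset;
    }
    return size;
}

/* Same behaviour as find_next_bit_wrap(): scan [offset, size), then wrap to [0, offset). */
static unsigned long next_bit_wrap(const unsigned long *map, unsigned long size, unsigned long offset)
{
    unsigned long bit = next_bit(map, size, offset);

    if (bit < size)
        return bit;

    bit = next_bit(map, offset, 0);
    return bit < offset ? bit : size;
}

int main(void)
{
    unsigned long map = (1UL << 1) | (1UL << 5); /* bits 1 and 5 set */

    printf("%lu\n", next_bit_wrap(&map, 8, 3)); /* 5: found without wrapping */
    printf("%lu\n", next_bit_wrap(&map, 8, 6)); /* 1: wrapped past the end   */
    return 0;
}

The for_each_set_bit_wrap() macro above builds on the same primitive, using __for_each_wrap() to keep iterating until the scan returns to the starting offset.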
// Added in 2.6.24
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
@ -579,4 +621,5 @@ static inline pgprot_t uvm_pgprot_decrypted(pgprot_t prot)
#include <asm/page.h>
#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x)))
#endif
#endif // _UVM_LINUX_H

View File

@ -355,6 +355,7 @@ static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_
if (!ext_gpu_map->mem_handle)
return UVM_MEMBAR_GPU;
// EGM uses the same barriers as sysmem.
return uvm_hal_downgrade_membar_type(ext_gpu_map->gpu,
!ext_gpu_map->is_sysmem && ext_gpu_map->gpu == ext_gpu_map->owning_gpu);
}
@ -633,6 +634,8 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
const UvmGpuMemoryInfo *mem_info)
{
uvm_gpu_t *owning_gpu;
if (mem_info->egm)
UVM_ASSERT(mem_info->sysmem);
if (!mem_info->deviceDescendant && !mem_info->sysmem) {
ext_gpu_map->owning_gpu = NULL;
@ -641,6 +644,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
}
// This is a local or peer allocation, so the owning GPU must have been
// registered.
// This also checks whether the EGM owning GPU is registered.
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
if (!owning_gpu)
return NV_ERR_INVALID_DEVICE;
@ -651,13 +655,10 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
// crashes when it's eventually freed.
// TODO: Bug 1811006: Bug tracking the RM issue, its fix might change the
// semantics of sysmem allocations.
if (mem_info->sysmem) {
ext_gpu_map->owning_gpu = owning_gpu;
ext_gpu_map->is_sysmem = true;
return NV_OK;
}
if (owning_gpu != mapping_gpu) {
// Check if peer access for peer memory is enabled.
// This path also handles EGM allocations.
if (owning_gpu != mapping_gpu && (!mem_info->sysmem || mem_info->egm)) {
// TODO: Bug 1757136: In SLI, the returned UUID may be different but a
// local mapping must be used. We need to query SLI groups to know
// that.
@ -666,7 +667,9 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
}
ext_gpu_map->owning_gpu = owning_gpu;
ext_gpu_map->is_sysmem = false;
ext_gpu_map->is_sysmem = mem_info->sysmem;
ext_gpu_map->is_egm = mem_info->egm;
return NV_OK;
}
@ -719,6 +722,7 @@ static NV_STATUS uvm_ext_gpu_map_split(uvm_range_tree_t *tree,
new->gpu = existing_map->gpu;
new->owning_gpu = existing_map->owning_gpu;
new->is_sysmem = existing_map->is_sysmem;
new->is_egm = existing_map->is_egm;
// Initialize the new ext_gpu_map tracker as a copy of the existing_map tracker.
// This way, any operations on any of the two ext_gpu_maps will be able to

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -106,10 +106,7 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_maxwell(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU64 pde_bits = 0;
UVM_ASSERT(depth == 0);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -93,9 +93,8 @@ static bool sysmem_can_be_mapped_on_gpu(uvm_mem_t *sysmem)
{
UVM_ASSERT(uvm_mem_is_sysmem(sysmem));
// In Confidential Computing, only unprotected memory can be mapped on the
// GPU
if (g_uvm_global.conf_computing_enabled)
// If SEV is enabled, only unprotected memory can be mapped
if (g_uvm_global.sev_enabled)
return uvm_mem_is_sysmem_dma(sysmem);
return true;
@ -738,7 +737,7 @@ static NV_STATUS mem_map_cpu_to_sysmem_kernel(uvm_mem_t *mem)
pages[page_index] = mem_cpu_page(mem, page_index * PAGE_SIZE);
}
if (g_uvm_global.conf_computing_enabled && uvm_mem_is_sysmem_dma(mem))
if (g_uvm_global.sev_enabled && uvm_mem_is_sysmem_dma(mem))
prot = uvm_pgprot_decrypted(PAGE_KERNEL_NOENC);
mem->kernel.cpu_addr = vmap(pages, num_pages, VM_MAP, prot);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Copyright (c) 2016-2021 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -44,10 +44,10 @@ static NvU32 first_page_size(NvU32 page_sizes)
static inline NV_STATUS __alloc_map_sysmem(NvU64 size, uvm_gpu_t *gpu, uvm_mem_t **sys_mem)
{
if (g_uvm_global.conf_computing_enabled)
if (g_uvm_global.sev_enabled)
return uvm_mem_alloc_sysmem_dma_and_map_cpu_kernel(size, gpu, current->mm, sys_mem);
return uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, sys_mem);
else
return uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, sys_mem);
}
static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
@ -335,6 +335,9 @@ error:
static bool should_test_page_size(size_t alloc_size, NvU32 page_size)
{
if (g_uvm_global.sev_enabled)
return false;
if (g_uvm_global.num_simulated_devices == 0)
return true;

View File

@ -130,9 +130,9 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
// Save the mask of unmapped pages because it will change after the
// Get the mask of unmapped pages because it will change after the
// first map operation
uvm_page_mask_complement(&va_block_context->caller_page_mask, &va_block->maybe_mapped_pages);
uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
// Do not map pages that are already resident on the CPU. This is in
@ -147,7 +147,7 @@ static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
// such pages at all, when migrating.
uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&va_block_context->caller_page_mask,
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU));
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
}
// Only map those pages that are not mapped anywhere else (likely due
@ -377,7 +377,7 @@ static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id);
const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;
// TODO: Bug 1877578

View File

@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
.finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
};
// WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// This code path isn't used on GH180 but we need to maintain consistent
// behaviour on systems that do.
if (!vma_is_anonymous(args->vma))
return NV_WARN_NOTHING_TO_DO;
ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
if (ret < 0)
return errno_to_nv_status(ret);
@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
if (ret < 0)
return errno_to_nv_status(ret);
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
//
// A side-effect of migrate_vma_setup() is it calls mmu notifiers even if a
// page can't be migrated (eg. because it's a non-anonymous mapping). We
// need this side-effect for SMMU on GH180 to ensure any cached read-only
// entries are flushed from SMMU on permission upgrade.
//
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
//
// The above WAR doesn't work for HugeTLBfs mappings because
// migrate_vma_setup() will fail in that case.
if (!vma_is_anonymous(args->vma)) {
migrate_vma_finalize(args);
return NV_WARN_NOTHING_TO_DO;
}
uvm_migrate_vma_alloc_and_copy(args, state);
if (state->status == NV_OK) {
migrate_vma_pages(args);
@ -884,13 +858,9 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
start = max(start, vma->vm_start);
outer = min(outer, vma->vm_end);
// migrate_vma only supports anonymous VMAs. We check for those after
// calling migrate_vma_setup() to workaround Bug 4130089. We need to check
// for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
// error for those.
// TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
// invalidates on read-only to read-write upgrades
if (is_vm_hugetlb_page(vma))
// TODO: Bug 2419180: support file-backed pages in migrate_vma, when
// support for it is added to the Linux kernel
if (!vma_is_anonymous(vma))
return NV_WARN_NOTHING_TO_DO;
if (uvm_processor_mask_empty(&va_space->registered_gpus))

View File

@ -51,7 +51,7 @@ typedef struct
#if defined(CONFIG_MIGRATE_VMA_HELPER)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#else
#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
#define UVM_MIGRATE_VMA_SUPPORTED 1
#endif
#endif

View File

@ -323,153 +323,37 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
NvU32 i;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
for (i = 0; i < pde_count; i++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i]);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
}
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
NvU32 max_inline_entries;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
NvU32 entry_count, i, j;
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(directory->depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
pde_entry_addr.address += start_index * entry_size;
for (i = 0; i < pde_count;) {
// All but the first memory operation can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
entry_count = min(pde_count - i, max_inline_entries);
// No membar is needed until the last memory operation. Otherwise,
// use caller's membar flag.
if ((i + entry_count) < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++) {
tree->hal->make_pde(pde_data, phys_addr, directory->depth, directory->entries[start_index + i + j]);
uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
}
inline_data_addr = uvm_push_inline_data_end(&inline_data);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
i += entry_count;
pde_entry_addr.address += entry_size * entry_count;
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
// pde_fill() is optimized for pde_count == 1, which is the common case. The
// map_remap() function is the only case where pde_count > 1, only used on GA100
// GPUs for 512MB page size mappings.
static void pde_fill(uvm_page_tree_t *tree,
uvm_page_directory_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU64 clear_bits[2];
uvm_mmu_mode_hal_t *hal = tree->hal;
// Passing in NULL for the phys_allocs will mark the child entries as
// invalid.
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
// Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
// than 512 entries. We initialize them all with the same clean PDE.
// Additionally, only ATS systems may require clean PDE bit settings based
// on the mapping VA.
if (dir->depth == tree->hal->page_table_depth(page_size) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
NvU64 clear_bits[2];
// If it is not a PTE, make a clean PDE.
if (dir->depth != tree->hal->page_table_depth(page_size)) {
tree->hal->make_pde(clear_bits, phys_allocs, dir->depth, dir->entries[0]);
// Make sure that using only clear_bits[0] will work.
UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
else {
*clear_bits = 0;
}
// Initialize the memory to a reasonable value.
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
}
if (dir->depth == tree->hal->page_table_depth(page_size)) {
*clear_bits = 0; // Invalid PTE
}
else {
pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
// passing in NULL for the phys_allocs will mark the child entries as invalid
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
hal->make_pde(clear_bits, phys_allocs, dir->depth);
// Make sure that using only clear_bits[0] will work
UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
}
// initialize the memory to a reasonable value
if (push) {
tree->gpu->parent->ce_hal->memset_8(push,
uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
*clear_bits,
dir->phys_alloc.size);
}
else {
uvm_mmu_page_table_cpu_memset_8(tree->gpu,
&dir->phys_alloc,
0,
*clear_bits,
dir->phys_alloc.size / sizeof(*clear_bits));
}
}
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@ -483,10 +367,8 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
uvm_page_directory_t *dir;
// The page tree doesn't cache PTEs so space is not allocated for entries
// that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
// page_size.
// The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
entry_count = 0;
else
@ -527,6 +409,108 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
static void pde_fill_cpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr)
{
NvU64 pde_data[2], entry_size;
UVM_ASSERT(uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
if (entry_size == sizeof(pde_data[0]))
uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
else
uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
}
static void pde_fill_gpu(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
NvU64 pde_data[2], entry_size;
uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
UVM_ASSERT(!uvm_mmu_use_cpu(tree));
entry_size = tree->hal->entry_size(depth);
UVM_ASSERT(sizeof(pde_data) >= entry_size);
tree->hal->make_pde(pde_data, phys_addr, depth);
pde_entry_addr.address += start_index * entry_size;
if (entry_size == sizeof(pde_data[0])) {
tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
}
else {
NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
uvm_gpu_address_t inline_data_addr;
uvm_push_inline_data_t inline_data;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
NvU32 i;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
for (i = 0; i < pde_count;) {
NvU32 j;
NvU32 entry_count = min(pde_count - i, max_inline_entries);
uvm_push_inline_data_begin(push, &inline_data);
for (j = 0; j < entry_count; j++)
uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
inline_data_addr = uvm_push_inline_data_end(&inline_data);
// All but the first memcopy can be pipelined. We respect the
// caller's pipelining settings for the first push.
if (i != 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
// No membar is needed until the last copy. Otherwise, use
// caller's membar flag.
if (i + entry_count < pde_count)
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
i += entry_count;
pde_entry_addr.address += sizeof(pde_data) * entry_count;
}
}
}
// pde_fill() populates pde_count PDE entries (starting at start_index) with
// the same mapping, i.e., with the same physical address (phys_addr).
static void pde_fill(uvm_page_tree_t *tree,
NvU32 depth,
uvm_mmu_page_table_alloc_t *directory,
NvU32 start_index,
NvU32 pde_count,
uvm_mmu_page_table_alloc_t **phys_addr,
uvm_push_t *push)
{
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
if (push)
pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
else
pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
}
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
uvm_page_directory_t *parent,
NvU32 index_in_parent)
@ -556,7 +540,7 @@ static void pde_write(uvm_page_tree_t *tree,
phys_allocs[i] = &entry->phys_alloc;
}
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
}
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@ -829,11 +813,8 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
static void map_remap_deinit(uvm_page_tree_t *tree)
{
if (tree->map_remap.pde0) {
phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
uvm_kvfree(tree->map_remap.pde0);
tree->map_remap.pde0 = NULL;
}
if (tree->map_remap.pde0.size)
phys_mem_deallocate(tree, &tree->map_remap.pde0);
if (tree->map_remap.ptes_invalid_4k.size)
phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@ -858,16 +839,10 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
// PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
// return the PTE for the get_ptes()'s caller.
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
tree->map_remap.pde0 = allocate_directory(tree,
UVM_PAGE_SIZE_2M,
tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
UVM_PMM_ALLOC_FLAGS_EVICT);
if (tree->map_remap.pde0 == NULL) {
status = NV_ERR_NO_MEMORY;
status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
if (status != NV_OK)
goto error;
}
}
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
if (status != NV_OK)
goto error;
@ -889,23 +864,22 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);
// pde0 depth equals UVM_PAGE_SIZE_2M.
NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
// The big-page entry is NULL which makes it an invalid entry.
phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
// sufficient when pde0 is allocated in VIDMEM.
if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
// This is an orphan directory; make_pde() requires a directory to
// compute the VA. The UVM depth map_remap() operates on is not in the
// range make_pde() must operate on. We only need to supply the fields used
// by make_pde() so it does not access invalid memory addresses.
pde_fill(tree,
tree->map_remap.pde0,
pde0_depth,
&tree->map_remap.pde0,
0,
pde0_entries,
(uvm_mmu_page_table_alloc_t **)&phys_allocs,
@ -932,10 +906,11 @@ error:
// --------------|-------------------------||----------------|----------------
// vidmem | - || vidmem | false
// sysmem | - || sysmem | false
// default | <not set> || vidmem | true
// default | <not set> || vidmem | true (1)
// default | vidmem || vidmem | false
// default | sysmem || sysmem | false
//
// (1) When SEV mode is enabled, the fallback path is disabled.
//
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
// from updating GPU page tables without hypervisor knowledge.
@ -951,27 +926,28 @@ error:
//
static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t location)
{
bool should_location_be_vidmem;
UVM_ASSERT(tree->gpu != NULL);
UVM_ASSERT_MSG((location == UVM_APERTURE_VID) ||
(location == UVM_APERTURE_SYS) ||
(location == UVM_APERTURE_DEFAULT),
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem in scenarios where a "real" GPU must be in vidmem. Fake GPUs can
// be identified by having no channel manager.
if (tree->gpu->channel_manager != NULL) {
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu)
|| uvm_conf_computing_mode_enabled(tree->gpu);
if (uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu))
UVM_ASSERT(location == UVM_APERTURE_VID);
else if (uvm_conf_computing_mode_enabled(tree->gpu))
UVM_ASSERT(location == UVM_APERTURE_VID);
}
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
// identified by having no channel manager.
if ((tree->gpu->channel_manager != NULL) && should_location_be_vidmem)
UVM_ASSERT(location == UVM_APERTURE_VID);
if (location == UVM_APERTURE_DEFAULT) {
if (page_table_aperture == UVM_APERTURE_DEFAULT) {
tree->location = UVM_APERTURE_VID;
tree->location_sys_fallback = true;
// See the comment (1) above.
tree->location_sys_fallback = !g_uvm_global.sev_enabled;
}
else {
tree->location = page_table_aperture;
@ -1358,9 +1334,10 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
phys_alloc[0] = &tree->map_remap.pde0;
pde_fill(tree,
range->table,
range->table->depth,
&range->table->phys_alloc,
range->start_index,
range->entry_count,
(uvm_mmu_page_table_alloc_t **)&phys_alloc,

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -219,7 +219,7 @@ struct uvm_mmu_mode_hal_struct
// point to two items for dual PDEs).
// any of allocs are allowed to be NULL, in which case they are to be
// treated as empty.
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth, uvm_page_directory_t *child_dir);
void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
// size of an entry in a directory/table. Generally either 8 or 16 bytes.
// (in the case of Pascal dual PDEs)
@ -229,7 +229,7 @@ struct uvm_mmu_mode_hal_struct
NvU32 (*entries_per_index)(NvU32 depth);
// For dual PDEs, this is either 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
@ -295,8 +295,9 @@ struct uvm_page_tree_struct
// PDE0 where all big-page entries are invalid, and small-page entries
// point to ptes_invalid_4k.
// pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
uvm_page_directory_t *pde0;
// pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
// format.
uvm_mmu_page_table_alloc_t pde0;
} map_remap;
// Tracker for all GPU operations on the tree
@ -364,32 +365,21 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// the same page size without an intervening put_ptes. To duplicate a subset of
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
// Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *range);
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
// Returns a single-entry page table range for the addresses passed.
// The size parameter must be a page size supported by this tree.
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
// For a single-entry page table range, write the PDE (which could be a dual
// PDE) to the GPU.
@ -488,8 +478,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
// new_range_vec will contain the upper portion of range_vec, starting at
// new_end + 1.
//
// new_end + 1 is required to be within the address range of range_vec and be
// aligned to range_vec's page_size.
// new_end + 1 is required to be within the address range of range_vec and be aligned to
// range_vec's page_size.
//
// On failure, the original range vector is left unmodified.
NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@ -511,22 +501,18 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
// for each offset.
// The caller_data pointer is what the caller passed in as caller_data to
// uvm_page_table_range_vec_write_ptes().
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
NvU64 offset,
void *caller_data);
typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
void *caller_data);
// Write all PTEs covered by the range vector using the given PTE making
// function.
// Write all PTEs covered by the range vector using the given PTE making function.
//
// After writing all the PTEs a TLB invalidate operation is performed including
// the passed in tlb_membar.
//
// See comments about uvm_page_table_range_pte_maker_t for details about the
// PTE making callback.
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker,
void *caller_data);
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
// Set all PTEs covered by the range vector to an empty PTE
//
@ -650,9 +636,8 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)
// Get the physical address of the entry at entry_index within the range
// (counted from range->start_index).
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
uvm_page_table_range_t *range,
size_t entry_index)
static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
size_t entry_index)
{
NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -146,15 +146,9 @@ static void fake_tlb_invals_disable(void)
g_fake_tlb_invals_tracking_enabled = false;
}
// Fake TLB invalidate VA that just saves off the parameters so that they can be
// verified later.
static void fake_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
uvm_membar_t membar)
// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
return;
@ -216,8 +210,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
}
if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
result = false;
}
@ -236,8 +230,7 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}
if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
g_last_fake_inval->base,
g_last_fake_inval->base + g_last_fake_inval->size);
g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
@ -254,16 +247,15 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
base, base + size);
return false;
}
if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
base,
base + size,
inval->base,
inval->base + inval->size);
base, base + size,
inval->base, inval->base + inval->size);
return false;
}
if (inval->depth != expected_depth) {
@ -278,13 +270,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return true;
}
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
bool expected_membar)
static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
{
NvU32 i;
@ -502,6 +488,7 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
return NV_OK;
}
static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
@ -855,7 +842,6 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
TEST_CHECK_RET(range2.entry_count == 256);
TEST_CHECK_RET(range2.table->ref_count == 512);
TEST_CHECK_RET(range1.table == range2.table);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range1.start_index == 256);
@ -885,7 +871,6 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
TEST_CHECK_RET(range64k.entry_count == 16);
TEST_CHECK_RET(range64k.table->ref_count == 16);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range64k.start_index == 16);
@ -1045,13 +1030,10 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
// Depth 4
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 3
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 2
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
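As a quick sanity check of the extents above, here is a standalone sketch (not driver code, and it assumes UVM_PAGE_SIZE_2M is 2MiB): each PDE0 entry covers 256 leaf extents and each higher level multiplies the coverage by 512.

#include <stdio.h>

int main(void)
{
    unsigned long long extent_pte  = 2ULL << 20;       /* 2MiB leaf extent       */
    unsigned long long extent_pde0 = extent_pte  << 8; /* 256 entries -> 512MiB  */
    unsigned long long extent_pde1 = extent_pde0 << 9; /* 512 entries -> 256GiB  */
    unsigned long long extent_pde2 = extent_pde1 << 9; /* 512 entries -> 128TiB  */

    printf("%lluM %lluG %lluT\n", extent_pde0 >> 20, extent_pde1 >> 30, extent_pde2 >> 40);
    return 0;
}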
@ -1099,11 +1081,7 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
return status;
}
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
NvU64 base,
NvU64 size,
NvU32 min_page_size,
NvU32 max_page_size)
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
@ -1227,11 +1205,7 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
if (*pte != expected_pte) {
UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
*pte,
expected_pte,
offset,
range_vec->start,
range_vec->size);
*pte, expected_pte, offset, range_vec->start, range_vec->size);
return false;
}
offset += range_vec->page_size;
@ -1252,11 +1226,7 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
TEST_CHECK_RET(data.status == NV_OK);
TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
range_vec->start,
range_vec->size,
range_vec->page_size,
page_table_depth,
membar != UVM_MEMBAR_NONE));
range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));
fake_tlb_invals_disable();
@ -1279,11 +1249,7 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
return NV_OK;
}
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
uvm_page_table_range_vec_t **range_vec_out)
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@ -1586,17 +1552,17 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, 0, NULL);
hal->make_pde(&pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
@ -1666,7 +1632,6 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
@ -1674,31 +1639,31 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
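The sysmem and vidmem constants checked above are consistent with the 4K-aligned table address being shifted into the entry's address field at bit 8, with the low bits carrying aperture/attribute flags. The sketch below is an illustration of that arithmetic only, not the HAL implementation; the 0xC and 0xA flag values are simply taken from the expected values in these checks.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t alloc_sys = 0x399999999999000ULL; /* sysmem table address used by the test */
    uint64_t alloc_vid = 0x1BBBBBB000ULL;      /* vidmem table address used by the test */

    /* (address >> 12) placed at bit 8, low bits = aperture/attribute flags */
    printf("0x%llX\n", (unsigned long long)(((alloc_sys >> 12) << 8) | 0xC)); /* 0x3999999999990C */
    printf("0x%llX\n", (unsigned long long)(((alloc_vid >> 12) << 8) | 0xA)); /* 0x1BBBBBB0A */
    return 0;
}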
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@ -1762,36 +1727,36 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 3, NULL);
hal->make_pde(pde_bits, phys_allocs, 3);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 2, NULL);
hal->make_pde(pde_bits, phys_allocs, 2);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
@ -1840,32 +1805,32 @@ static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func ent
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 0, NULL);
hal->make_pde(pde_bits, phys_allocs, 0);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
// Dual PDEs, uncached.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, 4, NULL);
hal->make_pde(pde_bits, phys_allocs, 4);
TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
// uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
@ -2338,8 +2303,7 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
gpu->parent = parent_gpu;
// At least test_tlb_invalidates() relies on global state
// (g_tlb_invalidate_*) so make sure only one test instance can run at a
// time.
// (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
uvm_mutex_lock(&g_uvm_global.global_lock);
// Allocate the fake TLB tracking state. Notably tests still need to enable

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2020 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -140,10 +140,7 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
return pde_bits;
}
static void make_pde_pascal(void *entry,
uvm_mmu_page_table_alloc_t **phys_allocs,
NvU32 depth,
uvm_page_directory_t *child_dir)
static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
{
NvU32 entry_count = entries_per_index_pascal(depth);
NvU64 *entry_bits = (NvU64 *)entry;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -22,10 +22,7 @@
*******************************************************************************/
#include "uvm_perf_events.h"
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_va_space.h"
#include "uvm_kvmalloc.h"
#include "uvm_test.h"
// Global variable used to check that callbacks are correctly executed
@ -46,10 +43,7 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
NV_STATUS status;
uvm_perf_event_data_t event_data;
uvm_va_block_t block;
test_data = 0;
memset(&event_data, 0, sizeof(event_data));
// Use CPU id to avoid triggering the GPU stats update code
@ -58,6 +52,7 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
// Register a callback for page fault
status = uvm_perf_register_event_callback(&va_space->perf_events, UVM_PERF_EVENT_FAULT, callback_inc_1);
TEST_CHECK_GOTO(status == NV_OK, done);
// Register a callback for page fault
status = uvm_perf_register_event_callback(&va_space->perf_events, UVM_PERF_EVENT_FAULT, callback_inc_2);
TEST_CHECK_GOTO(status == NV_OK, done);
@ -65,13 +60,14 @@ static NV_STATUS test_events(uvm_va_space_t *va_space)
// va_space read lock is required for page fault event notification
uvm_va_space_down_read(va_space);
// Notify (fake) page fault. The two registered callbacks for this event increment the value of test_value
event_data.fault.block = &block;
// Notify (fake) page fault. The two registered callbacks for this event
// increment the value of test_data
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_FAULT, &event_data);
uvm_va_space_up_read(va_space);
// test_data was initialized to zero. It should have been incremented by 1 and 2, respectively in the callbacks
// test_data was initialized to zero. It should have been incremented by 1
// and 2, respectively in the callbacks
TEST_CHECK_GOTO(test_data == 3, done);
done:
@ -96,4 +92,3 @@ NV_STATUS uvm_test_perf_events_sanity(UVM_TEST_PERF_EVENTS_SANITY_PARAMS *params
done:
return status;
}

View File

@ -355,7 +355,7 @@ static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_blo
uvm_page_mask_zero(prefetch_pages);
if (UVM_ID_IS_CPU(new_residency) || va_block->gpus[uvm_id_gpu_index(new_residency)] != NULL)
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency);
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency, NUMA_NO_NODE);
// If this is a first-touch fault and the destination processor is the
// preferred location, populate the whole max_prefetch_region.

View File

@ -164,7 +164,7 @@ typedef struct
uvm_spinlock_t lock;
uvm_va_block_context_t va_block_context;
uvm_va_block_context_t *va_block_context;
// Flag used to avoid scheduling delayed unpinning operations after
// uvm_perf_thrashing_stop has been called.
@ -601,6 +601,14 @@ static va_space_thrashing_info_t *va_space_thrashing_info_create(uvm_va_space_t
va_space_thrashing = uvm_kvmalloc_zero(sizeof(*va_space_thrashing));
if (va_space_thrashing) {
uvm_va_block_context_t *block_context = uvm_va_block_context_alloc(NULL);
if (!block_context) {
uvm_kvfree(va_space_thrashing);
return NULL;
}
va_space_thrashing->pinned_pages.va_block_context = block_context;
va_space_thrashing->va_space = va_space;
va_space_thrashing_info_init_params(va_space_thrashing);
@ -621,6 +629,7 @@ static void va_space_thrashing_info_destroy(uvm_va_space_t *va_space)
if (va_space_thrashing) {
uvm_perf_module_type_unset_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
uvm_va_block_context_free(va_space_thrashing->pinned_pages.va_block_context);
uvm_kvfree(va_space_thrashing);
}
}
@ -1104,7 +1113,7 @@ static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
!uvm_processor_mask_test(&policy->accessed_by, processor_id));
if (uvm_processor_mask_test(&va_block->resident, processor_id)) {
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask,
&block_thrashing->pinned_pages.mask,
@ -1312,9 +1321,8 @@ void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_
if (block_thrashing->last_time_stamp == 0 ||
uvm_id_equal(block_thrashing->last_processor, processor_id) ||
time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns) {
time_stamp - block_thrashing->last_time_stamp > va_space_thrashing->params.lapse_ns)
goto done;
}
num_block_pages = uvm_va_block_size(va_block) / PAGE_SIZE;
@ -1803,7 +1811,7 @@ static void thrashing_unpin_pages(struct work_struct *work)
struct delayed_work *dwork = to_delayed_work(work);
va_space_thrashing_info_t *va_space_thrashing = container_of(dwork, va_space_thrashing_info_t, pinned_pages.dwork);
uvm_va_space_t *va_space = va_space_thrashing->va_space;
uvm_va_block_context_t *va_block_context = &va_space_thrashing->pinned_pages.va_block_context;
uvm_va_block_context_t *va_block_context = va_space_thrashing->pinned_pages.va_block_context;
// Take the VA space lock so that VA blocks don't go away during this
// operation.
@ -1937,7 +1945,6 @@ void uvm_perf_thrashing_unload(uvm_va_space_t *va_space)
// Make sure that there are not pending work items
if (va_space_thrashing) {
UVM_ASSERT(va_space_thrashing->pinned_pages.in_va_space_teardown);
UVM_ASSERT(list_empty(&va_space_thrashing->pinned_pages.list));
va_space_thrashing_info_destroy(va_space);

View File

@ -3377,76 +3377,47 @@ uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
return gpu->id;
}
static void evict_orphan_pages(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
NvU32 i;
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(chunk->suballoc);
for (i = 0; i < num_subchunks(chunk); i++) {
uvm_gpu_chunk_t *subchunk = chunk->suballoc->subchunks[i];
uvm_spin_lock(&pmm->list_lock);
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) {
uvm_spin_unlock(&pmm->list_lock);
evict_orphan_pages(pmm, subchunk);
continue;
}
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED && subchunk->is_referenced) {
unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(pmm, subchunk);
// TODO: Bug 3368756: add support for large GPU pages.
UVM_ASSERT(uvm_gpu_chunk_get_size(subchunk) == PAGE_SIZE);
uvm_spin_unlock(&pmm->list_lock);
// The above check for subchunk state is racy because the
// chunk may be freed after the lock is dropped. It is
// still safe to proceed in that case because the struct
// page reference will have dropped to zero and cannot
// have been re-allocated as this is only called during
// GPU teardown. Therefore migrate_device_range() will
// simply fail.
uvm_hmm_pmm_gpu_evict_pfn(pfn);
continue;
}
uvm_spin_unlock(&pmm->list_lock);
}
}
// Free any orphan pages.
// This should be called as part of removing a GPU: after all work is stopped
// and all va_blocks have been destroyed. There normally won't be any
// device private struct page references left but there can be cases after
// fork() where a child process still holds a reference. This function searches
// for pages that still have a reference and migrates the page to the GPU in
// order to release the reference in the CPU page table.
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
// Check there are no orphan pages. This should be only called as part of
// removing a GPU: after all work is stopped and all va_blocks have been
// destroyed. By now there should be no device-private page references left as
// there are no va_space's left on this GPU and orphan pages should be removed
// by va_space destruction or unregistration from the GPU.
static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
bool ret = true;
unsigned long pfn;
struct range range = pmm->devmem.pagemap.range;
if (!pmm->initialized)
return;
// This is only safe to call during GPU teardown where chunks
// cannot be re-allocated.
UVM_ASSERT(uvm_gpu_retained_count(uvm_pmm_to_gpu(pmm)) == 0);
if (!pmm->initialized || !uvm_hmm_is_enabled_system_wide())
return ret;
// Scan all the root chunks looking for subchunks which are still
// referenced. This is slow, but we only do this when unregistering a GPU
// and is not critical for performance.
// referenced.
for (i = 0; i < pmm->root_chunks.count; i++) {
uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
root_chunk_lock(pmm, root_chunk);
if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
evict_orphan_pages(pmm, &root_chunk->chunk);
ret = false;
root_chunk_unlock(pmm, root_chunk);
}
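// Walk every pfn in the device-private range and fail the check if any page
// is not device-private or still holds outstanding references.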
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
if (!is_device_private_page(page)) {
ret = false;
break;
}
if (page_count(page)) {
ret = false;
break;
}
}
return ret;
}
static void devmem_page_free(struct page *page)
@ -3479,7 +3450,7 @@ static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
if (!va_space || va_space->va_space_mm.mm != vmf->vma->vm_mm)
if (!va_space)
return VM_FAULT_SIGBUS;
return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
@ -3568,8 +3539,9 @@ static void devmem_deinit(uvm_pmm_gpu_t *pmm)
{
}
static void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
static bool uvm_pmm_gpu_check_orphan_pages(uvm_pmm_gpu_t *pmm)
{
return true;
}
#endif // UVM_IS_CONFIG_HMM()
@ -3744,7 +3716,7 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
gpu = uvm_pmm_to_gpu(pmm);
uvm_pmm_gpu_free_orphan_pages(pmm);
UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm));
nv_kthread_q_flush(&gpu->parent->lazy_free_q);
UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
release_free_root_chunks(pmm);

View File

@ -749,6 +749,7 @@ NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
}
static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
int nid,
uvm_cpu_chunk_alloc_flags_t alloc_flags)
{
gfp_t kernel_alloc_flags;
@ -764,18 +765,27 @@ static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
kernel_alloc_flags |= GFP_HIGHUSER;
// For allocation sizes higher than PAGE_SIZE, use __GFP_NORETRY in
// order to avoid higher allocation latency from the kernel compacting
// memory to satisfy the request.
// For allocation sizes higher than PAGE_SIZE, use __GFP_NORETRY in order
// to avoid higher allocation latency from the kernel compacting memory to
// satisfy the request.
// Use __GFP_NOWARN to avoid printing allocation failure to the kernel log.
// High order allocation failures are handled gracefully by the caller.
if (alloc_size > PAGE_SIZE)
kernel_alloc_flags |= __GFP_COMP | __GFP_NORETRY;
kernel_alloc_flags |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
if (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
kernel_alloc_flags |= __GFP_ZERO;
page = alloc_pages(kernel_alloc_flags, get_order(alloc_size));
if (page && (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO))
SetPageDirty(page);
UVM_ASSERT(nid < num_online_nodes());
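// Without __GFP_THISNODE, nid below is only a preference: the kernel may fall
// back to other allowed nodes, matching the contract documented for
// uvm_cpu_chunk_alloc().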
if (nid == NUMA_NO_NODE)
page = alloc_pages(kernel_alloc_flags, get_order(alloc_size));
else
page = alloc_pages_node(nid, kernel_alloc_flags, get_order(alloc_size));
if (page) {
if (alloc_flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
SetPageDirty(page);
}
return page;
}
@ -805,6 +815,7 @@ static uvm_cpu_physical_chunk_t *uvm_cpu_chunk_create(uvm_chunk_size_t alloc_siz
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t alloc_flags,
int nid,
uvm_cpu_chunk_t **new_chunk)
{
uvm_cpu_physical_chunk_t *chunk;
@ -812,7 +823,7 @@ NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
UVM_ASSERT(new_chunk);
page = uvm_cpu_chunk_alloc_page(alloc_size, alloc_flags);
page = uvm_cpu_chunk_alloc_page(alloc_size, nid, alloc_flags);
if (!page)
return NV_ERR_NO_MEMORY;
@ -847,6 +858,13 @@ NV_STATUS uvm_cpu_chunk_alloc_hmm(struct page *page,
return NV_OK;
}
int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);
return page_to_nid(chunk->page);
}
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks)
{
NV_STATUS status = NV_OK;

View File

@ -304,11 +304,24 @@ uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void);
// Allocate a physical CPU chunk of the specified size.
//
// The nid argument is used to indicate a memory node preference. If the
// value is a memory node ID, the chunk allocation will be attempted on
// that memory node. If the chunk cannot be allocated on that memory node,
// it will be allocated on any memory node allowed by the process's policy.
//
// If the value of nid is a memory node ID that is not in the set of the
// current process's allowed memory nodes, the chunk will be allocated on one
// of the nodes in the allowed set.
//
// If the value of nid is NUMA_NO_NODE, the chunk will be allocated from any
// of the memory nodes allowed by the process's policy.
//
// If a CPU chunk allocation succeeds, NV_OK is returned. new_chunk will be set
// to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **new_chunk);
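// A minimal caller-side sketch, not part of this change: allocate a zeroed
// chunk with a node preference and check where it actually landed. The helper
// name and the strict-placement policy (freeing on a mismatch) are
// illustrative assumptions, not driver behavior.
static NV_STATUS alloc_zeroed_chunk_on_node(uvm_chunk_size_t size,
                                            int nid,
                                            uvm_cpu_chunk_t **out)
{
    NV_STATUS status = uvm_cpu_chunk_alloc(size,
                                           UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO,
                                           nid,
                                           out);

    if (status != NV_OK)
        return status;

    // nid is only a preference, so a caller that requires exact placement
    // must check the resulting node itself.
    if (nid != NUMA_NO_NODE && uvm_cpu_chunk_get_numa_node(*out) != nid) {
        uvm_cpu_chunk_free(*out);
        *out = NULL;
        return NV_ERR_NO_MEMORY;
    }

    return NV_OK;
}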
// Allocate a HMM CPU chunk.
@ -375,6 +388,9 @@ static uvm_cpu_logical_chunk_t *uvm_cpu_chunk_to_logical(uvm_cpu_chunk_t *chunk)
return container_of((chunk), uvm_cpu_logical_chunk_t, common);
}
// Return the NUMA node ID of the physical page backing the chunk.
int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk);
// Free a CPU chunk.
// This may not result in the immediate freeing of the physical pages of the
// chunk if this is a logical chunk and there are other logical chunks holding

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2019 NVIDIA Corporation
Copyright (c) 2017-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -664,6 +664,7 @@ done:
static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **out_chunk)
{
uvm_cpu_chunk_t *chunk;
@ -675,7 +676,7 @@ static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
// It is possible that the allocation fails due to lack of large pages
// rather than an API issue, which will result in a false negative.
// However, that should be very rare.
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, &chunk));
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, nid, &chunk));
// Check general state of the chunk:
// - chunk should be a physical chunk,
@ -685,6 +686,12 @@ static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(chunk) == size, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_num_pages(chunk) == size / PAGE_SIZE, done);
// It is possible for the kernel to allocate a chunk on a NUMA node other
// than the one requested. However, that should not be an issue with
// sufficient memory on each NUMA node.
if (nid != NUMA_NO_NODE)
TEST_CHECK_GOTO(uvm_cpu_chunk_get_numa_node(chunk) == nid, done);
if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO) {
NvU64 *cpu_addr;
@ -719,7 +726,7 @@ static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
NvU64 dma_addr;
NV_STATUS status = NV_OK;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
// Check state of the physical chunk:
@ -763,27 +770,27 @@ static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_allo
return NV_OK;
}
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2, uvm_gpu_t *gpu3)
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
NvU64 dma_addr_gpu2;
NvU64 dma_addr_gpu1;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu3), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu3), done);
dma_addr_gpu2 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu3->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
dma_addr_gpu1 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu2->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
// DMA mapping addresses for different GPUs live in different IOMMU spaces,
// so it would be perfectly legal for them to have the same IOVA, and even
@ -793,7 +800,7 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2,
// GPU1. It's true that we may get a false negative if both addresses
// happened to alias and we had a bug in how the addresses are shifted in
// the dense array, but that's better than intermittent failure.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent) == dma_addr_gpu2, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1->parent) == dma_addr_gpu1, done);
done:
uvm_cpu_chunk_free(chunk);
@ -911,7 +918,7 @@ static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
uvm_cpu_chunk_t *chunk;
NV_STATUS status;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
status = do_test_cpu_chunk_split_and_merge(chunk, gpu);
uvm_cpu_chunk_free(chunk);
@ -993,7 +1000,7 @@ static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
uvm_cpu_physical_chunk_t *phys_chunk;
size_t num_pages;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
@ -1005,7 +1012,7 @@ static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
uvm_cpu_chunk_free(chunk);
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, NUMA_NO_NODE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
@ -1170,13 +1177,35 @@ NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *te
size_t size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
for_each_chunk_size_from(size, alloc_sizes) {
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
TEST_NV_CHECK_RET(do_test_cpu_chunk_free(chunk, va_space, test_gpus));
}
return NV_OK;
}
static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space)
{
uvm_cpu_chunk_t *chunk;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t size;
for_each_chunk_size(size, alloc_sizes) {
int nid;
for_each_possible_uvm_node(nid) {
// Do not test CPU allocation on nodes that have no memory or CPU
if (!node_state(nid, N_MEMORY) || !node_state(nid, N_CPU))
continue;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, nid, &chunk));
uvm_cpu_chunk_free(chunk);
}
}
return NV_OK;
}
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
@ -1197,6 +1226,7 @@ NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct f
}
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, &test_gpus), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done);
if (uvm_processor_mask_get_gpu_count(&test_gpus) >= 3) {
uvm_gpu_t *gpu2, *gpu3;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -324,7 +324,7 @@ static NV_STATUS gpu_mem_check(uvm_gpu_t *gpu,
// TODO: Bug 3839176: [UVM][HCC][uvm_test] Update tests that assume GPU
// engines can directly access sysmem
// Skip this test for now. To enable this test in Confidential Computing,
// Skip this test for now. To enable this test under SEV,
// The GPU->CPU CE copy needs to be updated so it uses encryption when
// CC is enabled.
if (uvm_conf_computing_mode_enabled(gpu))
@ -1068,7 +1068,7 @@ static NV_STATUS test_pmm_reverse_map_single(uvm_gpu_t *gpu, uvm_va_space_t *va_
uvm_mutex_lock(&va_block->lock);
is_resident = uvm_processor_mask_test(&va_block->resident, gpu->id) &&
uvm_page_mask_full(uvm_va_block_resident_mask_get(va_block, gpu->id));
uvm_page_mask_full(uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
if (is_resident)
phys_addr = uvm_va_block_gpu_phys_page_address(va_block, 0, gpu);
@ -1154,7 +1154,7 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t
uvm_mutex_lock(&va_block->lock);
// Verify that all pages are populated on the GPU
is_resident = uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, gpu->id),
is_resident = uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE),
reverse_mapping->region);
uvm_mutex_unlock(&va_block->lock);
@ -1223,6 +1223,8 @@ static NV_STATUS test_indirect_peers(uvm_gpu_t *owning_gpu, uvm_gpu_t *accessing
if (!chunks)
return NV_ERR_NO_MEMORY;
UVM_ASSERT(!g_uvm_global.sev_enabled);
TEST_NV_CHECK_GOTO(uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_CHUNK_SIZE_MAX, current->mm, &verif_mem), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, owning_gpu), out);
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(verif_mem, accessing_gpu), out);

View File

@ -176,7 +176,9 @@ static NV_STATUS preferred_location_unmap_remote_pages(uvm_va_block_t *va_block,
mapped_mask = uvm_va_block_map_mask_get(va_block, preferred_location);
if (uvm_processor_mask_test(&va_block->resident, preferred_location)) {
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block,
preferred_location,
NUMA_NO_NODE);
if (!uvm_page_mask_andnot(&va_block_context->caller_page_mask, mapped_mask, resident_mask))
goto done;
@ -638,7 +640,7 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
for_each_id_in_mask(src_id, &va_block->resident) {
NV_STATUS status;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
// Calling uvm_va_block_make_resident_read_duplicate will break all
// SetAccessedBy and remote mappings
@ -695,7 +697,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
// If preferred_location is set and has resident copies, give it preference
if (UVM_ID_IS_VALID(preferred_location) &&
uvm_processor_mask_test(&va_block->resident, preferred_location)) {
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, preferred_location, NUMA_NO_NODE);
bool is_mask_empty = !uvm_page_mask_and(break_read_duplication_pages,
&va_block->read_duplicated_pages,
resident_mask);
@ -723,7 +725,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
if (uvm_id_equal(processor_id, preferred_location))
continue;
resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id);
resident_mask = uvm_va_block_resident_mask_get(va_block, processor_id, NUMA_NO_NODE);
is_mask_empty = !uvm_page_mask_and(break_read_duplication_pages,
&va_block->read_duplicated_pages,
resident_mask);

View File

@ -0,0 +1,40 @@
/*******************************************************************************
Copyright (c) 2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_processors.h"
int uvm_find_closest_node_mask(int src, const nodemask_t *mask)
{
int nid;
int closest_nid = NUMA_NO_NODE;
if (node_isset(src, *mask))
return src;
for_each_set_bit(nid, mask->bits, MAX_NUMNODES) {
if (closest_nid == NUMA_NO_NODE || node_distance(src, nid) < node_distance(src, closest_nid))
closest_nid = nid;
}
return closest_nid;
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -26,6 +26,7 @@
#include "uvm_linux.h"
#include "uvm_common.h"
#include <linux/numa.h>
#define UVM_MAX_UNIQUE_GPU_PAIRS SUM_FROM_0_TO_N(UVM_MAX_GPUS - 1)
@ -37,11 +38,11 @@
// provide type safety, they are wrapped within the uvm_processor_id_t struct.
// The range of valid identifiers needs to cover the maximum number of
// supported GPUs on a system plus the CPU. CPU is assigned value 0, and GPUs
// range: [1, UVM_ID_MAX_GPUS].
// range: [1, UVM_PARENT_ID_MAX_GPUS].
//
// There are some functions that only expect GPU identifiers and, in order to
// make it clearer, the uvm_gpu_id_t alias type is provided. However, as this
// type is just a typedef of uvm_processor_id_t, there is no type checking
// make it clearer, the uvm_parent_gpu_id_t alias type is provided. However, as
// this type is just a typedef of uvm_processor_id_t, there is no type checking
// performed by the compiler.
//
// Identifier value vs index
@ -60,22 +61,25 @@
// the GPU within the GPU id space (basically id - 1).
//
// In the diagram below, MAX_SUB is used to abbreviate
// UVM_ID_MAX_SUB_PROCESSORS.
// UVM_PARENT_ID_MAX_SUB_PROCESSORS.
//
// |-------------------------- uvm_processor_id_t ----------------------|
// | |
// | |----------------------- uvm_gpu_id_t ------------------------||
// | | ||
// Proc type | CPU | GPU ... GPU ... GPU ||
// | | ||
// ID values | 0 | 1 ... i+1 ... UVM_ID_MAX_PROCESSORS-1 ||
// TODO: Bug 4195538: uvm_parent_processor_id_t is currently but temporarily the
// same as uvm_processor_id_t.
//
// GPU index 0 ... i ... UVM_ID_MAX_GPUS-1
// |-------------------------- uvm_parent_processor_id_t ----------------------|
// | |
// | |----------------------- uvm_parent_gpu_id_t ------------------------||
// | | ||
// Proc type | CPU | GPU ... GPU ... GPU ||
// | | ||
// ID values | 0 | 1 ... i+1 ... UVM_PARENT_ID_MAX_PROCESSORS-1 ||
//
// GPU index 0 ... i ... UVM_PARENT_ID_MAX_GPUS-1
// | | | |
// | | | |
// | |-------------| | |-----------------------------|
// | | | |
// | | | |
// | |-------------| | |------------------------------------|
// | | | |
// | | | |
// GPU index 0 ... MAX_SUB-1 ... i*MAX_SUB ... (i+1)*MAX_SUB-1 ... UVM_GLOBAL_ID_MAX_GPUS-1
//
// ID values | 0 | 1 ... MAX_SUB ... (i*MAX_SUB)+1 ... (i+1)*MAX_SUB ... UVM_GLOBAL_ID_MAX_PROCESSORS-1 ||
@ -210,7 +214,7 @@ static proc_id_t prefix_fn_mask##_find_first_id(const mask_t *mask)
\
static proc_id_t prefix_fn_mask##_find_first_gpu_id(const mask_t *mask) \
{ \
return proc_id_ctor(find_next_bit(mask->bitmap, (maxval), UVM_ID_GPU0_VALUE)); \
return proc_id_ctor(find_next_bit(mask->bitmap, (maxval), UVM_PARENT_ID_GPU0_VALUE)); \
} \
\
static proc_id_t prefix_fn_mask##_find_next_id(const mask_t *mask, proc_id_t min_id) \
@ -252,7 +256,7 @@ static NvU32 prefix_fn_mask##_get_gpu_count(const mask_t *mask)
{ \
NvU32 gpu_count = prefix_fn_mask##_get_count(mask); \
\
if (prefix_fn_mask##_test(mask, proc_id_ctor(UVM_ID_CPU_VALUE))) \
if (prefix_fn_mask##_test(mask, proc_id_ctor(UVM_PARENT_ID_CPU_VALUE))) \
--gpu_count; \
\
return gpu_count; \
@ -261,55 +265,55 @@ static NvU32 prefix_fn_mask##_get_gpu_count(const mask_t *mask)
typedef struct
{
NvU32 val;
} uvm_processor_id_t;
} uvm_parent_processor_id_t;
typedef struct
{
NvU32 val;
} uvm_global_processor_id_t;
typedef uvm_processor_id_t uvm_gpu_id_t;
typedef uvm_parent_processor_id_t uvm_parent_gpu_id_t;
typedef uvm_global_processor_id_t uvm_global_gpu_id_t;
// Static value assigned to the CPU
#define UVM_ID_CPU_VALUE 0
#define UVM_ID_GPU0_VALUE (UVM_ID_CPU_VALUE + 1)
#define UVM_PARENT_ID_CPU_VALUE 0
#define UVM_PARENT_ID_GPU0_VALUE (UVM_PARENT_ID_CPU_VALUE + 1)
// ID values for the CPU and first GPU, respectively; the values for both types
// of IDs must match to enable sharing of UVM_PROCESSOR_MASK().
#define UVM_GLOBAL_ID_CPU_VALUE UVM_ID_CPU_VALUE
#define UVM_GLOBAL_ID_GPU0_VALUE UVM_ID_GPU0_VALUE
#define UVM_GLOBAL_ID_CPU_VALUE UVM_PARENT_ID_CPU_VALUE
#define UVM_GLOBAL_ID_GPU0_VALUE UVM_PARENT_ID_GPU0_VALUE
// Maximum number of GPUs/processors that can be represented with the id types
#define UVM_ID_MAX_GPUS UVM_MAX_GPUS
#define UVM_ID_MAX_PROCESSORS UVM_MAX_PROCESSORS
#define UVM_PARENT_ID_MAX_GPUS UVM_MAX_GPUS
#define UVM_PARENT_ID_MAX_PROCESSORS UVM_MAX_PROCESSORS
#define UVM_ID_MAX_SUB_PROCESSORS 8
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
#define UVM_GLOBAL_ID_MAX_GPUS (UVM_MAX_GPUS * UVM_ID_MAX_SUB_PROCESSORS)
#define UVM_GLOBAL_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_GLOBAL_ID_MAX_PROCESSORS (UVM_GLOBAL_ID_MAX_GPUS + 1)
#define UVM_ID_CPU ((uvm_processor_id_t) { .val = UVM_ID_CPU_VALUE })
#define UVM_ID_INVALID ((uvm_processor_id_t) { .val = UVM_ID_MAX_PROCESSORS })
#define UVM_PARENT_ID_CPU ((uvm_parent_processor_id_t) { .val = UVM_PARENT_ID_CPU_VALUE })
#define UVM_PARENT_ID_INVALID ((uvm_parent_processor_id_t) { .val = UVM_PARENT_ID_MAX_PROCESSORS })
#define UVM_GLOBAL_ID_CPU ((uvm_global_processor_id_t) { .val = UVM_GLOBAL_ID_CPU_VALUE })
#define UVM_GLOBAL_ID_INVALID ((uvm_global_processor_id_t) { .val = UVM_GLOBAL_ID_MAX_PROCESSORS })
#define UVM_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_PARENT_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_PARENT_ID_MAX_PROCESSORS, "id %u\n", id.val)
#define UVM_GLOBAL_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_GLOBAL_ID_MAX_PROCESSORS, "id %u\n", id.val)
static int uvm_id_cmp(uvm_processor_id_t id1, uvm_processor_id_t id2)
static int uvm_parent_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_ID_CHECK_BOUNDS(id1);
UVM_ID_CHECK_BOUNDS(id2);
UVM_PARENT_ID_CHECK_BOUNDS(id1);
UVM_PARENT_ID_CHECK_BOUNDS(id2);
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_id_equal(uvm_processor_id_t id1, uvm_processor_id_t id2)
static bool uvm_parent_id_equal(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
UVM_ID_CHECK_BOUNDS(id1);
UVM_ID_CHECK_BOUNDS(id2);
UVM_PARENT_ID_CHECK_BOUNDS(id1);
UVM_PARENT_ID_CHECK_BOUNDS(id2);
return id1.val == id2.val;
}
@ -330,30 +334,30 @@ static bool uvm_global_id_equal(uvm_global_processor_id_t id1, uvm_global_proces
return id1.val == id2.val;
}
#define UVM_ID_IS_CPU(id) uvm_id_equal(id, UVM_ID_CPU)
#define UVM_ID_IS_INVALID(id) uvm_id_equal(id, UVM_ID_INVALID)
#define UVM_ID_IS_VALID(id) (!UVM_ID_IS_INVALID(id))
#define UVM_ID_IS_GPU(id) (!UVM_ID_IS_CPU(id) && !UVM_ID_IS_INVALID(id))
#define UVM_PARENT_ID_IS_CPU(id) uvm_parent_id_equal(id, UVM_PARENT_ID_CPU)
#define UVM_PARENT_ID_IS_INVALID(id) uvm_parent_id_equal(id, UVM_PARENT_ID_INVALID)
#define UVM_PARENT_ID_IS_VALID(id) (!UVM_PARENT_ID_IS_INVALID(id))
#define UVM_PARENT_ID_IS_GPU(id) (!UVM_PARENT_ID_IS_CPU(id) && !UVM_PARENT_ID_IS_INVALID(id))
#define UVM_GLOBAL_ID_IS_CPU(id) uvm_global_id_equal(id, UVM_GLOBAL_ID_CPU)
#define UVM_GLOBAL_ID_IS_INVALID(id) uvm_global_id_equal(id, UVM_GLOBAL_ID_INVALID)
#define UVM_GLOBAL_ID_IS_VALID(id) (!UVM_GLOBAL_ID_IS_INVALID(id))
#define UVM_GLOBAL_ID_IS_GPU(id) (!UVM_GLOBAL_ID_IS_CPU(id) && !UVM_GLOBAL_ID_IS_INVALID(id))
static uvm_processor_id_t uvm_id_from_value(NvU32 val)
static uvm_parent_processor_id_t uvm_parent_id_from_value(NvU32 val)
{
uvm_processor_id_t ret = { .val = val };
uvm_parent_processor_id_t ret = { .val = val };
UVM_ID_CHECK_BOUNDS(ret);
UVM_PARENT_ID_CHECK_BOUNDS(ret);
return ret;
}
static uvm_gpu_id_t uvm_gpu_id_from_value(NvU32 val)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_value(NvU32 val)
{
uvm_gpu_id_t ret = uvm_id_from_value(val);
uvm_parent_gpu_id_t ret = uvm_parent_id_from_value(val);
UVM_ASSERT(!UVM_ID_IS_CPU(ret));
UVM_ASSERT(!UVM_PARENT_ID_IS_CPU(ret));
return ret;
}
@ -376,34 +380,34 @@ static uvm_global_gpu_id_t uvm_global_gpu_id_from_value(NvU32 val)
return ret;
}
// Create a GPU id from the given GPU id index (previously obtained via
// uvm_id_gpu_index)
static uvm_gpu_id_t uvm_gpu_id_from_index(NvU32 index)
// Create a parent GPU id from the given parent GPU id index (previously
// obtained via uvm_parent_id_gpu_index)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_index(NvU32 index)
{
return uvm_gpu_id_from_value(index + UVM_ID_GPU0_VALUE);
return uvm_parent_gpu_id_from_value(index + UVM_PARENT_ID_GPU0_VALUE);
}
static uvm_processor_id_t uvm_id_next(uvm_processor_id_t id)
static uvm_parent_processor_id_t uvm_parent_id_next(uvm_parent_processor_id_t id)
{
++id.val;
UVM_ID_CHECK_BOUNDS(id);
UVM_PARENT_ID_CHECK_BOUNDS(id);
return id;
}
static uvm_gpu_id_t uvm_gpu_id_next(uvm_gpu_id_t id)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_next(uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
++id.val;
UVM_ID_CHECK_BOUNDS(id);
UVM_PARENT_ID_CHECK_BOUNDS(id);
return id;
}
// Same as uvm_gpu_id_from_index but for uvm_global_processor_id_t
// Same as uvm_parent_gpu_id_from_index but for uvm_global_processor_id_t
static uvm_global_gpu_id_t uvm_global_gpu_id_from_index(NvU32 index)
{
return uvm_global_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE);
@ -429,11 +433,11 @@ static uvm_global_gpu_id_t uvm_global_gpu_id_next(uvm_global_gpu_id_t id)
return id;
}
// This function returns the numerical value within [0, UVM_ID_MAX_PROCESSORS)
// of the given processor id
static NvU32 uvm_id_value(uvm_processor_id_t id)
// This function returns the numerical value within
// [0, UVM_PARENT_ID_MAX_PROCESSORS) of the given parent processor id.
static NvU32 uvm_parent_id_value(uvm_parent_processor_id_t id)
{
UVM_ASSERT(UVM_ID_IS_VALID(id));
UVM_ASSERT(UVM_PARENT_ID_IS_VALID(id));
return id.val;
}
@ -448,12 +452,12 @@ static NvU32 uvm_global_id_value(uvm_global_processor_id_t id)
}
// This function returns the index of the given GPU id within the GPU id space
// [0, UVM_ID_MAX_GPUS)
static NvU32 uvm_id_gpu_index(uvm_gpu_id_t id)
// [0, UVM_PARENT_ID_MAX_GPUS)
static NvU32 uvm_parent_id_gpu_index(uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return id.val - UVM_ID_GPU0_VALUE;
return id.val - UVM_PARENT_ID_GPU0_VALUE;
}
// This function returns the index of the given GPU id within the GPU id space
@ -465,61 +469,61 @@ static NvU32 uvm_global_id_gpu_index(const uvm_global_gpu_id_t id)
return id.val - UVM_GLOBAL_ID_GPU0_VALUE;
}
static NvU32 uvm_global_id_gpu_index_from_gpu_id(const uvm_gpu_id_t id)
static NvU32 uvm_global_id_gpu_index_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return uvm_id_gpu_index(id) * UVM_ID_MAX_SUB_PROCESSORS;
return uvm_parent_id_gpu_index(id) * UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
static NvU32 uvm_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
static NvU32 uvm_parent_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
UVM_ASSERT(UVM_GLOBAL_ID_IS_GPU(id));
return uvm_global_id_gpu_index(id) / UVM_ID_MAX_SUB_PROCESSORS;
return uvm_global_id_gpu_index(id) / UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_gpu_id(const uvm_gpu_id_t id)
static uvm_global_gpu_id_t uvm_global_gpu_id_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
{
UVM_ASSERT(UVM_ID_IS_GPU(id));
UVM_ASSERT(UVM_PARENT_ID_IS_GPU(id));
return uvm_global_gpu_id_from_index(uvm_global_id_gpu_index_from_gpu_id(id));
return uvm_global_gpu_id_from_index(uvm_global_id_gpu_index_from_parent_gpu_id(id));
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_parent_index(NvU32 index)
{
UVM_ASSERT(index < UVM_MAX_GPUS);
UVM_ASSERT(index < UVM_PARENT_ID_MAX_GPUS);
return uvm_global_gpu_id_from_gpu_id(uvm_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE));
return uvm_global_gpu_id_from_parent_gpu_id(uvm_parent_gpu_id_from_value(index + UVM_GLOBAL_ID_GPU0_VALUE));
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_sub_processor_index(const uvm_gpu_id_t id, NvU32 sub_index)
static uvm_global_gpu_id_t uvm_global_gpu_id_from_sub_processor_index(const uvm_parent_gpu_id_t id, NvU32 sub_index)
{
NvU32 index;
UVM_ASSERT(sub_index < UVM_ID_MAX_SUB_PROCESSORS);
UVM_ASSERT(sub_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
index = uvm_global_id_gpu_index_from_gpu_id(id) + sub_index;
index = uvm_global_id_gpu_index_from_parent_gpu_id(id) + sub_index;
return uvm_global_gpu_id_from_index(index);
}
static uvm_gpu_id_t uvm_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
UVM_ASSERT(UVM_GLOBAL_ID_IS_GPU(id));
return uvm_gpu_id_from_index(uvm_id_gpu_index_from_global_gpu_id(id));
return uvm_parent_gpu_id_from_index(uvm_parent_id_gpu_index_from_global_gpu_id(id));
}
static NvU32 uvm_global_id_sub_processor_index(const uvm_global_gpu_id_t id)
{
return uvm_global_id_gpu_index(id) % UVM_ID_MAX_SUB_PROCESSORS;
return uvm_global_id_gpu_index(id) % UVM_PARENT_ID_MAX_SUB_PROCESSORS;
}
UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
uvm_processor_mask, \
UVM_ID_MAX_PROCESSORS, \
uvm_processor_id_t, \
uvm_id_from_value)
UVM_PARENT_ID_MAX_PROCESSORS, \
uvm_parent_processor_id_t, \
uvm_parent_id_from_value)
UVM_PROCESSOR_MASK(uvm_global_processor_mask_t, \
uvm_global_processor_mask, \
@ -533,19 +537,19 @@ static bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset, co
{
uvm_processor_mask_t subset_gpus;
uvm_processor_mask_copy(&subset_gpus, subset);
uvm_processor_mask_clear(&subset_gpus, UVM_ID_CPU);
uvm_processor_mask_clear(&subset_gpus, UVM_PARENT_ID_CPU);
return uvm_processor_mask_subset(&subset_gpus, mask);
}
#define for_each_id_in_mask(id, mask) \
for ((id) = uvm_processor_mask_find_first_id(mask); \
UVM_ID_IS_VALID(id); \
(id) = uvm_processor_mask_find_next_id((mask), uvm_id_next(id)))
UVM_PARENT_ID_IS_VALID(id); \
(id) = uvm_processor_mask_find_next_id((mask), uvm_parent_id_next(id)))
#define for_each_gpu_id_in_mask(gpu_id, mask) \
for ((gpu_id) = uvm_processor_mask_find_first_gpu_id((mask)); \
UVM_ID_IS_VALID(gpu_id); \
(gpu_id) = uvm_processor_mask_find_next_id((mask), uvm_gpu_id_next(gpu_id)))
UVM_PARENT_ID_IS_VALID(gpu_id); \
(gpu_id) = uvm_processor_mask_find_next_id((mask), uvm_parent_gpu_id_next(gpu_id)))
#define for_each_global_id_in_mask(id, mask) \
for ((id) = uvm_global_processor_mask_find_first_id(mask); \
@ -559,21 +563,36 @@ static bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset, co
// Helper to iterate over all valid gpu ids
#define for_each_gpu_id(i) \
for (i = uvm_gpu_id_from_value(UVM_ID_GPU0_VALUE); UVM_ID_IS_VALID(i); i = uvm_gpu_id_next(i))
for (i = uvm_parent_gpu_id_from_value(UVM_PARENT_ID_GPU0_VALUE); UVM_PARENT_ID_IS_VALID(i); i = uvm_parent_gpu_id_next(i))
#define for_each_global_gpu_id(i) \
for (i = uvm_global_gpu_id_from_value(UVM_GLOBAL_ID_GPU0_VALUE); UVM_GLOBAL_ID_IS_VALID(i); i = uvm_global_gpu_id_next(i))
#define for_each_global_sub_processor_id_in_gpu(id, i) \
for (i = uvm_global_gpu_id_from_gpu_id(id); \
for (i = uvm_global_gpu_id_from_parent_gpu_id(id); \
UVM_GLOBAL_ID_IS_VALID(i) && \
(uvm_global_id_value(i) < uvm_global_id_value(uvm_global_gpu_id_from_gpu_id(id)) + UVM_ID_MAX_SUB_PROCESSORS); \
(uvm_global_id_value(i) < uvm_global_id_value(uvm_global_gpu_id_from_parent_gpu_id(id)) + UVM_PARENT_ID_MAX_SUB_PROCESSORS); \
i = uvm_global_gpu_id_next(i))
// Helper to iterate over all valid gpu ids
#define for_each_processor_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))
#define for_each_processor_id(i) for (i = UVM_PARENT_ID_CPU; UVM_PARENT_ID_IS_VALID(i); i = uvm_parent_id_next(i))
#define for_each_global_id(i) for (i = UVM_GLOBAL_ID_CPU; UVM_GLOBAL_ID_IS_VALID(i); i = uvm_global_id_next(i))
// Find the node in mask with the shortest distance from src (as returned by
// node_distance).
// Note that the search is inclusive of src.
// If mask has no bits set, NUMA_NO_NODE is returned.
int uvm_find_closest_node_mask(int src, const nodemask_t *mask);
// Iterate over all nodes in mask with increasing distance from src.
// Note that this iterator is destructive of the mask.
#define for_each_closest_uvm_node(nid, src, mask) \
for ((nid) = uvm_find_closest_node_mask((src), &(mask)); \
(nid) != NUMA_NO_NODE; \
node_clear((nid), (mask)), (nid) = uvm_find_closest_node_mask((src), &(mask)))
#define for_each_possible_uvm_node(nid) for_each_node_mask((nid), node_possible_map)
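// Illustrative sketch, not part of this change: because
// for_each_closest_uvm_node() clears bits as it walks, iterate over a scratch
// copy so the caller's mask is preserved. The helper name and the N_MEMORY
// filter are assumptions for the example.
static int uvm_example_pick_fallback_node(int src_nid, const nodemask_t *candidates)
{
    nodemask_t scratch = *candidates;  // nodemask_t is a plain struct, so assignment copies it
    int nid;

    // Nodes are visited in order of increasing node_distance() from src_nid,
    // starting with src_nid itself when it is set in the mask.
    for_each_closest_uvm_node(nid, src_nid, scratch) {
        if (node_state(nid, N_MEMORY))
            return nid;
    }

    return NUMA_NO_NODE;
}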
static bool uvm_processor_uuid_eq(const NvProcessorUuid *uuid1, const NvProcessorUuid *uuid2)
{
return memcmp(uuid1, uuid2, sizeof(*uuid1)) == 0;
@ -585,4 +604,78 @@ static void uvm_processor_uuid_copy(NvProcessorUuid *dst, const NvProcessorUuid
memcpy(dst, src, sizeof(*dst));
}
// TODO: Bug 4195538: [uvm][multi-SMC] Get UVM internal data structures ready to
// meet multi-SMC requirements. Temporary aliases, they must be removed once
// the data structures are converted.
typedef uvm_parent_processor_id_t uvm_processor_id_t;
typedef uvm_parent_gpu_id_t uvm_gpu_id_t;
#define UVM_ID_CPU_VALUE UVM_PARENT_ID_CPU_VALUE
#define UVM_ID_GPU0_VALUE UVM_PARENT_ID_GPU0_VALUE
#define UVM_ID_MAX_GPUS UVM_PARENT_ID_MAX_GPUS
#define UVM_ID_MAX_PROCESSORS UVM_PARENT_ID_MAX_PROCESSORS
#define UVM_ID_MAX_SUB_PROCESSORS UVM_PARENT_ID_MAX_SUB_PROCESSORS
#define UVM_ID_CPU UVM_PARENT_ID_CPU
#define UVM_ID_INVALID UVM_PARENT_ID_INVALID
static int uvm_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_id_equal(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
{
return uvm_parent_id_equal(id1, id2);
}
#define UVM_ID_IS_CPU(id) uvm_id_equal(id, UVM_ID_CPU)
#define UVM_ID_IS_INVALID(id) uvm_id_equal(id, UVM_ID_INVALID)
#define UVM_ID_IS_VALID(id) (!UVM_ID_IS_INVALID(id))
#define UVM_ID_IS_GPU(id) (!UVM_ID_IS_CPU(id) && !UVM_ID_IS_INVALID(id))
static uvm_parent_gpu_id_t uvm_gpu_id_from_value(NvU32 val)
{
return uvm_parent_gpu_id_from_value(val);
}
static NvU32 uvm_id_value(uvm_parent_processor_id_t id)
{
return uvm_parent_id_value(id);
}
static NvU32 uvm_id_gpu_index(uvm_parent_gpu_id_t id)
{
return uvm_parent_id_gpu_index(id);
}
static NvU32 uvm_id_gpu_index_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
return uvm_parent_id_gpu_index_from_global_gpu_id(id);
}
static uvm_parent_gpu_id_t uvm_gpu_id_from_index(NvU32 index)
{
return uvm_parent_gpu_id_from_index(index);
}
static uvm_parent_gpu_id_t uvm_gpu_id_next(uvm_parent_gpu_id_t id)
{
return uvm_parent_gpu_id_next(id);
}
static uvm_parent_gpu_id_t uvm_gpu_id_from_global_gpu_id(const uvm_global_gpu_id_t id)
{
return uvm_parent_gpu_id_from_global_gpu_id(id);
}
static NvU32 uvm_global_id_gpu_index_from_gpu_id(const uvm_parent_gpu_id_t id)
{
return uvm_global_id_gpu_index_from_parent_gpu_id(id);
}
static uvm_global_gpu_id_t uvm_global_gpu_id_from_gpu_id(const uvm_parent_gpu_id_t id)
{
return uvm_global_gpu_id_from_parent_gpu_id(id);
}
#endif

View File

@ -106,26 +106,6 @@ static NV_STATUS uvm_test_nv_kthread_q(UVM_TEST_NV_KTHREAD_Q_PARAMS *params, str
return NV_ERR_INVALID_STATE;
}
static NV_STATUS uvm_test_numa_get_closest_cpu_node_to_gpu(UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU_PARAMS *params,
struct file *filp)
{
uvm_gpu_t *gpu;
NV_STATUS status;
uvm_rm_user_object_t user_rm_va_space = {
.rm_control_fd = -1,
.user_client = params->client,
.user_object = params->smc_part_ref
};
status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
if (status != NV_OK)
return status;
params->node_id = gpu->parent->closest_cpu_numa_node;
uvm_gpu_release(gpu);
return NV_OK;
}
// Callers of this function should ensure that node is not NUMA_NO_NODE in order
// to avoid overrunning the kernel's node to cpumask map.
static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
@ -307,8 +287,6 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_DRAIN_REPLAYABLE_FAULTS, uvm_test_drain_replayable_faults);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMA_GET_BATCH_SIZE, uvm_test_pma_get_batch_size);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PMM_QUERY_PMA_STATS, uvm_test_pmm_query_pma_stats);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU,
uvm_test_numa_get_closest_cpu_node_to_gpu);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_NUMA_CHECK_AFFINITY, uvm_test_numa_check_affinity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_ADD_DUMMY_THREAD_CONTEXTS,
uvm_test_va_space_add_dummy_thread_contexts);

View File

@ -561,6 +561,22 @@ typedef struct
// user_pages_allocation_retry_force_count, but the injection point simulates
// driver metadata allocation failure.
//
// cpu_chunk_allocation_target_id and cpu_chunk_allocation_actual_id are used
// to control the NUMA node IDs for CPU chunk allocations, specifically for
// testing overlapping CPU chunk allocations.
//
// Currently, uvm_api_migrate() does not pass the preferred CPU NUMA node for
// managed memory, so it is not possible to request a specific node.
// cpu_chunk_allocation_target_id is used to request that the allocation be
// made on a specific node. On the other hand, cpu_chunk_allocation_actual_id
// is the node
// on which the allocation will actually be made.
//
// The two parameters can be used to force a CPU chunk allocation to overlap a
// previously allocated chunk.
//
// Please note that even when specifying cpu_chunk_allocation_actual_id, the
// kernel may end up allocating on a different node.
//
// Error returns:
// NV_ERR_INVALID_ADDRESS
// - lookup_address doesn't match a UVM range
@ -571,6 +587,8 @@ typedef struct
NvU32 page_table_allocation_retry_force_count; // In
NvU32 user_pages_allocation_retry_force_count; // In
NvU32 cpu_chunk_allocation_size_mask; // In
NvS32 cpu_chunk_allocation_target_id; // In
NvS32 cpu_chunk_allocation_actual_id; // In
NvU32 cpu_pages_allocation_error_count; // In
NvBool eviction_error; // In
NvBool populate_error; // In
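// Hedged illustration (the params struct's typedef name falls outside this
// hunk, so "params" below is a hypothetical instance of it): per the comment
// above, target_id is the node the allocation is requested on and actual_id
// is the node the chunk is actually placed on, e.g.
//
//     params->cpu_chunk_allocation_target_id = 0;
//     params->cpu_chunk_allocation_actual_id = 1;
//
// Node IDs 0 and 1 are arbitrary examples; even then the kernel may place the
// chunk on a different node, so tests should verify the resulting residency.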
@ -604,6 +622,10 @@ typedef struct
NvProcessorUuid resident_on[UVM_MAX_PROCESSORS]; // Out
NvU32 resident_on_count; // Out
// If the memory is resident on the CPU, the NUMA node on which the page
// is resident. Otherwise, -1.
NvS32 resident_nid; // Out
// The size of the physical allocation backing lookup_address. Only the
// system-page-sized portion of this allocation which contains
// lookup_address is guaranteed to be resident on the corresponding
@ -1168,19 +1190,6 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_PMM_QUERY_PMA_STATS_PARAMS;
#define UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU UVM_TEST_IOCTL_BASE(77)
typedef struct
{
NvProcessorUuid gpu_uuid; // In
NvHandle client; // In
NvHandle smc_part_ref; // In
// On kernels with NUMA support, this entry contains the closest CPU NUMA
// node to this GPU. Otherwise, the value will be -1.
NvS32 node_id; // Out
NV_STATUS rmStatus; // Out
} UVM_TEST_NUMA_GET_CLOSEST_CPU_NODE_TO_GPU_PARAMS;
// Test whether the bottom halves have run on the correct CPUs based on the
// NUMA node locality of the GPU.
//

View File

@ -54,6 +54,7 @@ static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_cpu_node_state_cache __read_mostly;
static int uvm_fault_force_sysmem __read_mostly = 0;
module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
@ -179,6 +180,20 @@ void uvm_va_block_retry_init(uvm_va_block_retry_t *retry)
INIT_LIST_HEAD(&retry->free_chunks);
}
static size_t node_to_index(int nid)
{
UVM_ASSERT(nid != NUMA_NO_NODE);
UVM_ASSERT(nid < MAX_NUMNODES);
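// __nodes_weight() counts the set bits below nid in node_possible_map, i.e.
// the number of possible nodes with a smaller ID, which serves as a dense
// index into the per-node state array.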
return __nodes_weight(&node_possible_map, nid);
}
static uvm_va_block_cpu_node_state_t *block_node_state_get(uvm_va_block_t *block, int nid)
{
size_t index = node_to_index(nid);
UVM_ASSERT(block->cpu.node_state[index]);
return block->cpu.node_state[index];
}
// The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks
// are stored.
//
@ -227,14 +242,26 @@ static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block
return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
}
static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block)
static void *uvm_cpu_storage_get_ptr(uvm_va_block_cpu_node_state_t *node_state)
{
return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
return (void *)(node_state->chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
}
static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block)
static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_cpu_node_state_t *node_state)
{
return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
return node_state->chunks & UVM_CPU_CHUNK_STORAGE_MASK;
}
static int block_get_page_node_residency(uvm_va_block_t *block, uvm_page_index_t page_index)
{
int nid;
for_each_possible_uvm_node(nid) {
if (uvm_va_block_cpu_is_page_resident_on(block, nid, page_index))
return nid;
}
return NUMA_NO_NODE;
}
static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
@ -270,12 +297,12 @@ static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t pag
return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
}
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index)
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
int nid = uvm_cpu_chunk_get_numa_node(chunk);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
size_t slot_index;
uvm_cpu_chunk_storage_mixed_t *mixed;
uvm_cpu_chunk_t **chunks = NULL;
@ -291,20 +318,20 @@ NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
if (chunk_size == UVM_CHUNK_SIZE_2M) {
UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
UVM_ASSERT(!va_block->cpu.chunks);
va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
UVM_ASSERT(!node_state->chunks);
node_state->chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
}
else {
if (!va_block->cpu.chunks) {
if (!node_state->chunks) {
mixed = uvm_kvmalloc_zero(sizeof(*mixed));
if (!mixed)
return NV_ERR_NO_MEMORY;
va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
node_state->chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
}
UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED);
mixed = uvm_cpu_storage_get_ptr(va_block);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
mixed = uvm_cpu_storage_get_ptr(node_state);
slot_index = compute_slot_index(va_block, page_index);
UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
@ -331,28 +358,32 @@ NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
}
}
uvm_page_mask_region_fill(&node_state->allocated, chunk_region);
uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
return NV_OK;
}
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
uvm_va_block_cpu_node_state_t *node_state;
uvm_cpu_chunk_storage_mixed_t *mixed;
uvm_cpu_chunk_t *chunk;
uvm_cpu_chunk_t **chunks;
size_t slot_index;
UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index))
UVM_ASSERT(nid != NUMA_NO_NODE);
node_state = block_node_state_get(va_block, nid);
if (!uvm_page_mask_test(&node_state->allocated, page_index))
return NULL;
UVM_ASSERT(va_block->cpu.chunks);
UVM_ASSERT(node_state->chunks);
if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
return uvm_cpu_storage_get_ptr(va_block);
if (uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
return uvm_cpu_storage_get_ptr(node_state);
}
else {
mixed = uvm_cpu_storage_get_ptr(va_block);
mixed = uvm_cpu_storage_get_ptr(node_state);
slot_index = compute_slot_index(va_block, page_index);
UVM_ASSERT(mixed->slots[slot_index] != NULL);
if (test_bit(slot_index, mixed->big_chunks))
@ -366,31 +397,43 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_
return chunk;
}
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_page_index_t page_index)
static uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page_resident(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
uvm_cpu_chunk_t *chunk = NULL;
int nid = block_get_page_node_residency(va_block, page_index);
if (nid != NUMA_NO_NODE)
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
return chunk;
}
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
uvm_cpu_chunk_storage_mixed_t *mixed;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
size_t slot_index;
uvm_cpu_chunk_t **chunks;
int nid_iter;
// We want to protect against two threads manipulating the VA block's CPU
// chunks at the same time. However, when a block is split, the new block's
// lock is locked without tracking. So, we can't use
// uvm_assert_mutex_locked().
UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
UVM_ASSERT(va_block->cpu.chunks);
UVM_ASSERT(node_state->chunks);
UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));
if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
if (uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk);
va_block->cpu.chunks = 0;
UVM_ASSERT(uvm_cpu_storage_get_ptr(node_state) == chunk);
node_state->chunks = 0;
}
else {
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
mixed = uvm_cpu_storage_get_ptr(va_block);
mixed = uvm_cpu_storage_get_ptr(node_state);
slot_index = compute_slot_index(va_block, page_index);
UVM_ASSERT(mixed->slots[slot_index] != NULL);
@ -421,18 +464,22 @@ void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
}
}
uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region);
uvm_page_mask_region_clear(&node_state->allocated, chunk_region);
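// Rebuild the block-wide allocated mask as the union of every per-NUMA-node
// allocated mask now that this chunk's pages have been cleared from its
// node's mask.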
uvm_page_mask_zero(&va_block->cpu.allocated);
for_each_possible_uvm_node(nid_iter) {
uvm_va_block_cpu_node_state_t *iter_node_state = block_node_state_get(va_block, nid_iter);
uvm_page_mask_or(&va_block->cpu.allocated, &va_block->cpu.allocated, &iter_node_state->allocated);
}
if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) {
uvm_kvfree(uvm_cpu_storage_get_ptr(va_block));
va_block->cpu.chunks = 0;
if (uvm_page_mask_empty(&node_state->allocated) && node_state->chunks) {
uvm_kvfree(uvm_cpu_storage_get_ptr(node_state));
node_state->chunks = 0;
}
}
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
uvm_va_block_region_t chunk_region;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);
@ -440,16 +487,28 @@ struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index
return chunk->page + (page_index - chunk_region.first);
}
struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(va_block, page_index);
return uvm_cpu_chunk_get_cpu_page(va_block, chunk, page_index);
}
static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
int nid,
uvm_page_index_t *first_chunk_page)
{
uvm_cpu_chunk_t *chunk = NULL;
uvm_page_index_t page_index;
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated);
if (!node_state)
return NULL;
page_index = uvm_va_block_first_page_in_mask(region, &node_state->allocated);
if (page_index < region.outer)
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
if (first_chunk_page && chunk) {
uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
@ -459,33 +518,156 @@ static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
return chunk;
}
#define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region) \
for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)); \
(chunk) != NULL; \
(chunk) = uvm_cpu_chunk_first_in_region((va_block), \
uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \
(region).outer), \
&(page_index)))
static uvm_cpu_chunk_t *uvm_cpu_chunk_next_in_region(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
int nid,
uvm_page_index_t prev_page_index,
uvm_page_index_t *next_chunk_page)
{
if (prev_page_index >= region.outer)
return NULL;
#define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) \
for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)), \
(next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0); \
(chunk) != NULL; \
(chunk) = uvm_cpu_chunk_first_in_region((va_block), \
uvm_va_block_region((next_page_index), (region).outer), \
&(page_index)), \
(next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))
return uvm_cpu_chunk_first_in_region(va_block,
uvm_va_block_region(prev_page_index, region.outer),
nid, next_chunk_page);
}
#define for_each_cpu_chunk_in_block(chunk, page_index, va_block) \
for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block)))
#define for_each_cpu_chunk_in_block_region(chunk, chunk_start, va_block, nid, region) \
for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), (nid), &(chunk_start)); \
(chunk) != NULL; \
(chunk) = uvm_cpu_chunk_next_in_region((va_block), \
(region), \
(nid), \
(chunk_start) + uvm_cpu_chunk_num_pages((chunk)), \
&(chunk_start)))
#define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block) \
for_each_cpu_chunk_in_block_region_safe((chunk), \
(page_index), \
(next_page_index), \
(va_block), \
#define for_each_cpu_chunk_in_block_region_safe(chunk, chunk_start, next_chunk_start, va_block, nid, region) \
for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), (nid), &(chunk_start)), \
(next_chunk_start) = (chunk_start) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0); \
(chunk) != NULL; \
(chunk) = uvm_cpu_chunk_next_in_region((va_block), (region), (nid), (next_chunk_start), &(chunk_start)), \
(next_chunk_start) = (chunk_start) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))
#define for_each_cpu_chunk_in_block(chunk, chunk_start, va_block, nid) \
for_each_cpu_chunk_in_block_region((chunk), \
(chunk_start), \
(va_block), \
(nid), \
uvm_va_block_region_from_block((va_block)))
#define for_each_cpu_chunk_in_block_safe(chunk, chunk_start, next_chunk_start, va_block, nid) \
for_each_cpu_chunk_in_block_region_safe((chunk), \
(chunk_start), \
(next_chunk_start), \
(va_block), \
(nid), \
uvm_va_block_region_from_block((va_block)))
static void block_update_cpu_resident_mask(uvm_va_block_t *va_block)
{
int nid;
uvm_page_mask_zero(&va_block->cpu.resident);
for_each_possible_uvm_node(nid) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
uvm_page_mask_or(&va_block->cpu.resident, &va_block->cpu.resident, &node_state->resident);
}
}
void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
uvm_va_block_cpu_node_state_t *node_state;
node_state = block_node_state_get(va_block, nid);
UVM_ASSERT(node_state);
UVM_ASSERT(uvm_page_mask_test(&node_state->allocated, page_index));
uvm_page_mask_set(&node_state->resident, page_index);
uvm_page_mask_set(&va_block->cpu.resident, page_index);
uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
}
// Set all CPU pages in the mask as resident on NUMA node nid.
// nid cannot be NUMA_NO_NODE.
static void uvm_va_block_cpu_set_resident_mask(uvm_va_block_t *va_block, int nid, const uvm_page_mask_t *mask)
{
uvm_va_block_cpu_node_state_t *node_state;
node_state = block_node_state_get(va_block, nid);
UVM_ASSERT(node_state);
UVM_ASSERT(uvm_page_mask_subset(mask, &node_state->allocated));
uvm_page_mask_or(&node_state->resident, &node_state->resident, mask);
uvm_page_mask_or(&va_block->cpu.resident, &va_block->cpu.resident, mask);
}
static void uvm_va_block_cpu_set_resident_all_chunks(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *page_mask)
{
uvm_make_resident_page_tracking_t *tracking = &va_block_context->make_resident.cpu_pages_used;
uvm_page_mask_t *node_pages_mask = &va_block_context->make_resident.node_pages_mask;
uvm_page_mask_t *page_mask_copy = &va_block_context->scratch_page_mask;
int nid;
if (uvm_page_mask_empty(page_mask))
return;
uvm_page_mask_copy(page_mask_copy, page_mask);
for_each_node_mask(nid, tracking->nodes) {
size_t index = node_to_index(nid);
if (uvm_page_mask_and(node_pages_mask, page_mask_copy, tracking->node_masks[index])) {
uvm_va_block_cpu_set_resident_mask(va_block, nid, node_pages_mask);
uvm_page_mask_andnot(page_mask_copy, page_mask_copy, node_pages_mask);
}
}
UVM_ASSERT(uvm_page_mask_empty(page_mask_copy));
}
// Clear residency for all CPU pages in the mask.
// nid cannot be NUMA_NO_NODE.
static void uvm_va_block_cpu_clear_resident_mask(uvm_va_block_t *va_block, int nid, const uvm_page_mask_t *mask)
{
uvm_va_block_cpu_node_state_t *node_state;
node_state = block_node_state_get(va_block, nid);
UVM_ASSERT(node_state);
uvm_page_mask_andnot(&node_state->resident, &node_state->resident, mask);
block_update_cpu_resident_mask(va_block);
}
static void uvm_va_block_cpu_clear_resident_region(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region)
{
uvm_va_block_cpu_node_state_t *node_state;
node_state = block_node_state_get(va_block, nid);
UVM_ASSERT(node_state);
uvm_page_mask_region_clear(&node_state->resident, region);
block_update_cpu_resident_mask(va_block);
}
bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);
return uvm_page_mask_test(resident_mask, page_index);
}
bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region)
{
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);
return uvm_page_mask_region_full(resident_mask, region);
}
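// A minimal, self-contained sketch of the residency bookkeeping above,
// assuming the same scheme on toy 64-bit masks: each NUMA node keeps its own
// resident mask and the block-wide CPU resident mask is maintained as the
// union of the per-node masks. All names here are hypothetical stand-ins for
// the uvm_page_mask_t helpers.
#include <assert.h>
#include <stdint.h>

#define TOY_NUM_NODES 4

typedef struct
{
    uint64_t node_resident[TOY_NUM_NODES]; // per-NUMA-node resident pages
    uint64_t cpu_resident;                 // union of all per-node masks
} toy_block_t;

static void toy_update_cpu_resident_mask(toy_block_t *block)
{
    int nid;

    block->cpu_resident = 0;
    for (nid = 0; nid < TOY_NUM_NODES; nid++)
        block->cpu_resident |= block->node_resident[nid];
}

static void toy_set_resident_page(toy_block_t *block, int nid, unsigned page)
{
    // Setting a page can update both masks directly.
    block->node_resident[nid] |= 1ull << page;
    block->cpu_resident |= 1ull << page;
}

static void toy_clear_resident_pages(toy_block_t *block, int nid, uint64_t mask)
{
    block->node_resident[nid] &= ~mask;

    // Clearing cannot be applied to the aggregate directly because another
    // node may still hold the same pages, so the union is recomputed.
    toy_update_cpu_resident_mask(block);
}

int main(void)
{
    toy_block_t block = {0};

    toy_set_resident_page(&block, 0, 3);
    toy_set_resident_page(&block, 1, 3);
    toy_clear_resident_pages(&block, 0, 1ull << 3);

    // Page 3 is still resident on node 1, so the aggregate keeps it set.
    assert(block.cpu_resident == (1ull << 3));
    return 0;
}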
int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
{
if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
return va_block_context->make_resident.dest_nid;
return numa_mem_id();
}
struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
struct mm_struct *mm,
NvU64 start,
@ -515,29 +697,48 @@ struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
static bool block_check_cpu_chunks(uvm_va_block_t *block)
{
uvm_cpu_chunk_t *chunk;
size_t alloced_pages = 0;
uvm_va_block_region_t prev_region = { 0 };
uvm_page_index_t page_index;
int nid;
uvm_page_mask_t *temp_resident_mask;
for_each_cpu_chunk_in_block(chunk, page_index, block) {
uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
uvm_page_index_t chunk_page;
temp_resident_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS | __GFP_ZERO);
UVM_ASSERT(prev_region.outer <= chunk_region.first);
UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));
for_each_possible_uvm_node(nid) {
uvm_cpu_chunk_t *chunk;
uvm_page_index_t page_index;
uvm_va_block_region_t prev_region = {0};
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
size_t alloced_pages = 0;
alloced_pages += uvm_cpu_chunk_num_pages(chunk);
UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region));
prev_region = chunk_region;
for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
uvm_page_index_t chunk_page;
for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk);
UVM_ASSERT(prev_region.outer <= chunk_region.first);
UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));
alloced_pages += uvm_cpu_chunk_num_pages(chunk);
UVM_ASSERT(uvm_page_mask_region_full(&node_state->allocated, chunk_region));
prev_region = chunk_region;
for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, nid, chunk_page) == chunk);
}
UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&node_state->allocated));
UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &node_state->allocated));
UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &block->cpu.resident));
if (temp_resident_mask && !uvm_page_mask_empty(&node_state->resident)) {
UVM_ASSERT(!uvm_page_mask_intersects(&node_state->resident, temp_resident_mask));
uvm_page_mask_or(temp_resident_mask, temp_resident_mask, &node_state->resident);
}
}
UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated));
if (temp_resident_mask) {
UVM_ASSERT(uvm_page_mask_equal(temp_resident_mask, &block->cpu.resident));
kmem_cache_free(g_uvm_page_mask_cache, temp_resident_mask);
}
return true;
}
@ -607,11 +808,17 @@ typedef struct
// The page index
uvm_page_index_t page_index;
// If processor is the CPU, the NUMA node of the page.
int nid;
} block_phys_page_t;
static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index)
static block_phys_page_t block_phys_page(uvm_processor_id_t processor, int nid, uvm_page_index_t page_index)
{
return (block_phys_page_t){ processor, page_index };
if (UVM_ID_IS_CPU(processor))
UVM_ASSERT(nid != NUMA_NO_NODE);
return (block_phys_page_t){ processor, page_index, nid };
}
NV_STATUS uvm_va_block_init(void)
@ -636,30 +843,102 @@ NV_STATUS uvm_va_block_init(void)
if (!g_uvm_va_block_context_cache)
return NV_ERR_NO_MEMORY;
g_uvm_va_block_cpu_node_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_cpu_node_state_t",
uvm_va_block_cpu_node_state_t);
if (!g_uvm_va_block_cpu_node_state_cache)
return NV_ERR_NO_MEMORY;
return NV_OK;
}
void uvm_va_block_exit(void)
{
kmem_cache_destroy_safe(&g_uvm_va_block_cpu_node_state_cache);
kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
kmem_cache_destroy_safe(&g_uvm_va_block_cache);
}
static void block_context_free_tracking(uvm_make_resident_page_tracking_t *tracking)
{
size_t index;
for (index = 0; index < num_possible_nodes(); index++) {
if (tracking->node_masks[index])
kmem_cache_free(g_uvm_page_mask_cache, tracking->node_masks[index]);
}
uvm_kvfree(tracking->node_masks);
}
static NV_STATUS block_context_alloc_tracking(uvm_make_resident_page_tracking_t *tracking)
{
size_t index;
tracking->node_masks = uvm_kvmalloc_zero(num_possible_nodes() * sizeof(*tracking->node_masks));
if (!tracking->node_masks)
return NV_ERR_NO_MEMORY;
for (index = 0; index < num_possible_nodes(); index++) {
tracking->node_masks[index] = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
if (!tracking->node_masks[index])
goto error;
}
return NV_OK;
error:
block_context_free_tracking(tracking);
return NV_ERR_NO_MEMORY;
}
uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
{
uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
if (block_context)
uvm_va_block_context_init(block_context, mm);
NV_STATUS status;
if (!block_context)
return NULL;
status = block_context_alloc_tracking(&block_context->make_resident.cpu_pages_used);
if (status != NV_OK) {
kmem_cache_free(g_uvm_va_block_context_cache, block_context);
return NULL;
}
uvm_va_block_context_init(block_context, mm);
return block_context;
}
void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
{
UVM_ASSERT(va_block_context);
// Write garbage into the VA Block context to ensure that the UVM code
// clears masks appropriately
if (UVM_IS_DEBUG()) {
uvm_page_mask_t **mask_array = va_block_context->make_resident.cpu_pages_used.node_masks;
int nid;
memset(va_block_context, 0xff, sizeof(*va_block_context));
for_each_possible_uvm_node(nid)
uvm_page_mask_fill(mask_array[node_to_index(nid)]);
va_block_context->make_resident.cpu_pages_used.node_masks = mask_array;
}
va_block_context->mm = mm;
va_block_context->make_resident.dest_nid = NUMA_NO_NODE;
}
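// A minimal, self-contained sketch of the debug "poison, then restore what
// must survive" pattern used above: the whole context is overwritten with a
// garbage byte so stale state is never silently reused, while pointers that
// own separately allocated memory are saved and put back afterwards. All
// names here are hypothetical.
#include <assert.h>
#include <stdlib.h>
#include <string.h>

typedef struct
{
    unsigned long *node_mask; // separately allocated; must survive poisoning
    int dest_nid;
    void *mm;
} toy_context_t;

static void toy_context_init(toy_context_t *ctx, void *mm, int debug)
{
    if (debug) {
        // Save the pointer, poison the whole context, then restore the
        // pointer and poison the memory it refers to instead of losing it.
        unsigned long *saved_mask = ctx->node_mask;

        memset(ctx, 0xff, sizeof(*ctx));
        *saved_mask = ~0ul;
        ctx->node_mask = saved_mask;
    }

    ctx->mm = mm;
    ctx->dest_nid = -1; // stands in for NUMA_NO_NODE
}

int main(void)
{
    toy_context_t ctx = { .node_mask = malloc(sizeof(unsigned long)) };

    toy_context_init(&ctx, NULL, 1);
    assert(ctx.dest_nid == -1 && ctx.node_mask != NULL);
    free(ctx.node_mask);
    return 0;
}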
void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
{
if (va_block_context)
if (va_block_context) {
block_context_free_tracking(&va_block_context->make_resident.cpu_pages_used);
kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
}
}
// Convert from page_index to chunk_index. The goal is for each system page in
@ -884,6 +1163,18 @@ uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu
return gpu_state->chunks[chunk_index];
}
static void uvm_va_block_free(uvm_va_block_t *block)
{
if (uvm_enable_builtin_tests) {
uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
}
else {
kmem_cache_free(g_uvm_va_block_cache, block);
}
}
NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
NvU64 start,
NvU64 end,
@ -891,6 +1182,7 @@ NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
{
uvm_va_block_t *block = NULL;
NvU64 size = end - start + 1;
int nid;
UVM_ASSERT(PAGE_ALIGNED(start));
UVM_ASSERT(PAGE_ALIGNED(end + 1));
@ -911,8 +1203,11 @@ NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
if (uvm_enable_builtin_tests) {
uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
if (block_wrapper)
if (block_wrapper) {
block = &block_wrapper->block;
block_wrapper->test.cpu_chunk_allocation_target_id = NUMA_NO_NODE;
block_wrapper->test.cpu_chunk_allocation_actual_id = NUMA_NO_NODE;
}
}
else {
block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
@ -921,6 +1216,18 @@ NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
if (!block)
return NV_ERR_NO_MEMORY;
block->cpu.node_state = uvm_kvmalloc_zero(sizeof(*block->cpu.node_state) * num_possible_nodes());
if (!block->cpu.node_state)
goto error_block_free;
for_each_possible_uvm_node(nid) {
size_t index = node_to_index(nid);
block->cpu.node_state[index] = nv_kmem_cache_zalloc(g_uvm_va_block_cpu_node_state_cache, NV_UVM_GFP_FLAGS);
if (!block->cpu.node_state[index])
goto error;
}
nv_kref_init(&block->kref);
uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
block->start = start;
@ -933,6 +1240,20 @@ NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
*out_block = block;
return NV_OK;
error:
for_each_possible_uvm_node(nid) {
size_t index = node_to_index(nid);
if (block->cpu.node_state[index])
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
}
uvm_kvfree(block->cpu.node_state);
error_block_free:
uvm_va_block_free(block);
return NV_ERR_NO_MEMORY;
}
static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
@ -984,9 +1305,12 @@ static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t
{
uvm_cpu_chunk_t *chunk;
uvm_page_index_t page_index;
int nid;
for_each_cpu_chunk_in_block(chunk, page_index, block)
cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
for_each_possible_uvm_node(nid) {
for_each_cpu_chunk_in_block(chunk, page_index, block, nid)
cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
}
}
static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
@ -995,18 +1319,21 @@ static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu
uvm_cpu_chunk_t *chunk;
NvU64 block_mapping_size = uvm_va_block_size(block);
uvm_page_index_t page_index;
int nid;
UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));
for_each_cpu_chunk_in_block(chunk, page_index, block) {
UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
"GPU%u DMA address 0x%llx\n",
uvm_id_value(gpu->id),
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));
for_each_possible_uvm_node(nid) {
for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
"GPU%u DMA address 0x%llx\n",
uvm_id_value(gpu->id),
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));
status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
if (status != NV_OK)
goto error;
status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
if (status != NV_OK)
goto error;
}
}
return NV_OK;
@ -1176,11 +1503,11 @@ void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
}
NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index)
{
NV_STATUS status;
uvm_gpu_id_t id;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
@ -1213,21 +1540,25 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
uvm_cpu_chunk_t *chunk;
uvm_page_index_t page_index, next_page_index;
uvm_va_block_region_t chunk_region;
int nid;
for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) {
chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
for_each_possible_uvm_node(nid) {
for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, nid, region) {
chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region);
uvm_cpu_chunk_remove_from_block(va_block, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_cpu_chunk_free(chunk);
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_cpu_chunk_free(chunk);
}
}
if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
if (uvm_page_mask_empty(&va_block->cpu.resident))
if (uvm_page_mask_empty(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE)))
uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
}
@ -1291,25 +1622,25 @@ static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm
}
// Mark a CPU page as dirty.
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
}
// Mark a CPU page as clean.
static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index)
static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
}
// Check if a CPU page is dirty.
static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
}
@ -1317,21 +1648,177 @@ static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page
static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t flags,
int nid,
uvm_cpu_chunk_t **chunk)
{
uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
// Return out of memory error if the tests have requested it. As opposed to
// other error injection settings, this one fails N times and then succeeds.
// TODO: Bug 3701182: This will print a warning in Linux kernels newer than
// 5.16.0-rc1+.
if (block_test && block_test->inject_cpu_pages_allocation_error_count) {
if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
block_test->inject_cpu_pages_allocation_error_count--;
if (block_test) {
// Return out of memory error if the tests have requested it. As opposed
// to other error injection settings, this one fails N times and then
// succeeds.
// TODO: Bug 3701182: This will print a warning in Linux kernels newer
// than 5.16.0-rc1+.
if (block_test->inject_cpu_pages_allocation_error_count) {
if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
block_test->inject_cpu_pages_allocation_error_count--;
return NV_ERR_NO_MEMORY;
}
if (block_test->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
nid = block_test->cpu_chunk_allocation_actual_id;
}
return uvm_cpu_chunk_alloc(alloc_size, flags, nid, chunk);
}
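// A small, self-contained sketch of the error-injection counter semantics
// described above: a nonzero count makes the next allocations fail, the
// count is decremented on each injected failure, and the special value ~0
// keeps failing indefinitely. Names and return codes are hypothetical.
#include <assert.h>
#include <stdint.h>

#define TOY_ERR_NO_MEMORY (-1)
#define TOY_OK 0

static int toy_alloc_with_injection(uint32_t *inject_count)
{
    if (*inject_count) {
        if (*inject_count != ~(uint32_t)0)
            (*inject_count)--;
        return TOY_ERR_NO_MEMORY;
    }

    return TOY_OK; // a real allocation would happen here
}

int main(void)
{
    uint32_t count = 2;

    // Fails exactly twice, then succeeds.
    assert(toy_alloc_with_injection(&count) == TOY_ERR_NO_MEMORY);
    assert(toy_alloc_with_injection(&count) == TOY_ERR_NO_MEMORY);
    assert(toy_alloc_with_injection(&count) == TOY_OK);

    // ~0 means "always fail" and is never decremented.
    count = ~(uint32_t)0;
    assert(toy_alloc_with_injection(&count) == TOY_ERR_NO_MEMORY);
    assert(count == ~(uint32_t)0);
    return 0;
}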
// Handle insertion of overlapping CPU chunks.
// In cases where the kernel allocates CPU chunks on NUMA nodes that already
// have existing chunks, it's possible that the newly allocated chunk overlaps
// existing chunks.
// In such cases, the newly allocated chunk has to be appropriately split and
// only the non-overlapping subchunks inserted into the block.
// The subchunks that are not inserted are freed.
// If there is an error during split, insertion, or mapping, any sub-chunks that
// have already been successfully inserted will remain in the block. The rest of
// the sub-chunks will be freed in order to maintain proper refcounts on the
// parent chunk.
static NV_STATUS block_populate_overlapping_cpu_chunks(uvm_va_block_t *block,
uvm_va_block_context_t *block_context,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index)
{
int nid = uvm_cpu_chunk_get_numa_node(chunk);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
uvm_page_index_t running_page_index;
uvm_cpu_chunk_t **split_chunks;
uvm_cpu_chunk_t **small_chunks = NULL;
uvm_cpu_chunk_t *chunk_ptr;
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
uvm_chunk_size_t split_size;
size_t i;
NV_STATUS status;
UVM_ASSERT(IS_ALIGNED(uvm_va_block_cpu_page_address(block, page_index), chunk_size));
// Get a mask of all the chunk pages that are not overlapping existing
// chunks.
uvm_page_mask_init_from_region(node_pages_mask, chunk_region, NULL);
uvm_page_mask_andnot(node_pages_mask, node_pages_mask, &node_state->allocated);
split_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), chunk_size);
split_chunks = uvm_kvmalloc_zero((chunk_size / split_size) * sizeof(*split_chunks));
if (!split_chunks) {
uvm_cpu_chunk_free(chunk);
return NV_ERR_NO_MEMORY;
}
return uvm_cpu_chunk_alloc(alloc_size, flags, chunk);
if (split_size > UVM_PAGE_SIZE_4K) {
small_chunks = uvm_kvmalloc_zero(MAX_SMALL_CHUNKS_PER_BIG_SLOT * sizeof(*small_chunks));
if (!small_chunks) {
uvm_kvfree(split_chunks);
uvm_cpu_chunk_free(chunk);
return NV_ERR_NO_MEMORY;
}
}
// If we are here, we have to do at least one split.
// We can't call any of the block_split_cpu_chunk_to_* functions since they
// insert all of the split chunks into the block.
// We only want to insert the sub-chunks that don't overlap. So, we have to
// handle that by calling uvm_cpu_chunk_split() directly.
status = uvm_cpu_chunk_split(chunk, split_chunks);
if (status != NV_OK)
goto done;
// Insert all split chunks that don't overlap existing allocations.
// Note that this handles both splitting to 64K and 4K.
running_page_index = page_index;
for (i = 0; i < chunk_size / split_size; i++) {
uvm_va_block_region_t subchunk_region = uvm_va_block_chunk_region(block, split_size, running_page_index);
// - If all the pages covered by the split chunk are missing, insert the
// chunk into the block.
// - If none of the pages are missing, free the chunk.
// - Otherwise, some of the pages covered by the chunk are missing and a
// second split will be needed.
if (uvm_page_mask_region_full(node_pages_mask, subchunk_region)) {
status = uvm_cpu_chunk_insert_in_block(block, split_chunks[i], running_page_index);
if (status != NV_OK)
goto done;
// To prevent double chunk freeing on error, clear the array pointer
// before mapping.
chunk_ptr = split_chunks[i];
split_chunks[i] = NULL;
status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk_ptr, running_page_index);
if (status != NV_OK)
goto done;
}
else if (uvm_page_mask_region_empty(node_pages_mask, subchunk_region)) {
uvm_cpu_chunk_free(split_chunks[i]);
split_chunks[i] = NULL;
}
running_page_index = subchunk_region.outer;
}
if (split_size > UVM_PAGE_SIZE_4K) {
// Split any 64K chunks that overlap 4K chunks.
for (i = 0; i < chunk_size / split_size; i++) {
size_t j;
if (!split_chunks[i])
continue;
running_page_index = page_index + ((split_size * i) / PAGE_SIZE);
status = uvm_cpu_chunk_split(split_chunks[i], small_chunks);
if (status != NV_OK)
goto done;
for (j = 0; j < MAX_SMALL_CHUNKS_PER_BIG_SLOT; j++) {
size_t chunk_num_pages = uvm_cpu_chunk_num_pages(small_chunks[j]);
if (uvm_page_mask_test(node_pages_mask, running_page_index)) {
status = uvm_cpu_chunk_insert_in_block(block, small_chunks[j], running_page_index);
if (status != NV_OK)
goto done;
// To prevent double chunk freeing on error, clear the array pointer
// before mapping.
chunk_ptr = small_chunks[j];
small_chunks[j] = NULL;
status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk_ptr, running_page_index);
if (status != NV_OK)
goto done;
}
else {
uvm_cpu_chunk_free(small_chunks[j]);
}
running_page_index += chunk_num_pages;
}
}
}
done:
if (status != NV_OK) {
// First, free any small chunks that have not been inserted.
if (small_chunks) {
for (i = 0; i < MAX_SMALL_CHUNKS_PER_BIG_SLOT; i++)
uvm_cpu_chunk_free(small_chunks[i]);
}
// Next, free any large chunks that have not been inserted.
for (i = 0; i < chunk_size / split_size; i++)
uvm_cpu_chunk_free(split_chunks[i]);
}
uvm_kvfree(small_chunks);
uvm_kvfree(split_chunks);
return status;
}
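// A self-contained worked example of the overlap computation above, using a
// toy 16-page chunk and plain bitmasks: pages of the new chunk that are not
// already allocated on the node are computed with an and-not, and each
// equally sized sub-chunk is then either inserted (fully missing), freed
// (fully present), or marked for a further split (partially present). All
// sizes and names here are hypothetical.
#include <stdint.h>
#include <stdio.h>

#define CHUNK_PAGES    16u
#define SUBCHUNK_PAGES 4u

int main(void)
{
    // Pages of this chunk already allocated on the node (bit i == page i).
    uint16_t allocated = 0x00F3;               // pages 0, 1, 4-7 exist
    uint16_t missing = (uint16_t)(~allocated); // pages the new chunk may fill
    unsigned i;

    for (i = 0; i < CHUNK_PAGES / SUBCHUNK_PAGES; i++) {
        uint16_t sub_mask = (uint16_t)(0xF << (i * SUBCHUNK_PAGES));

        if ((missing & sub_mask) == sub_mask)
            printf("sub-chunk %u: insert whole sub-chunk\n", i);
        else if ((missing & sub_mask) == 0)
            printf("sub-chunk %u: already allocated, free it\n", i);
        else
            printf("sub-chunk %u: partial overlap, split further\n", i);
    }

    // Sub-chunk 0 covers pages 0-3: pages 0,1 exist, 2,3 do not -> split.
    // Sub-chunk 1 covers pages 4-7: all exist -> free.
    // Sub-chunks 2,3 cover pages 8-15: none exist -> insert.
    return 0;
}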
// Allocates the input page in the block, if it doesn't already exist
@ -1350,17 +1837,31 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t alloc_size;
uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
uvm_page_mask_t *allocated_mask;
uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_processor_mask_t uvm_lite_gpus;
uvm_page_index_t page_index;
uvm_gpu_id_t id;
int preferred_nid = block_context->make_resident.dest_nid;
if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
preferred_nid = block_test->cpu_chunk_allocation_target_id;
// TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
if (preferred_nid != NUMA_NO_NODE) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
allocated_mask = &node_state->allocated;
}
else {
allocated_mask = &block->cpu.allocated;
}
// Check whether all requested pages have already been allocated.
uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
&block_context->scratch_page_mask,
&block->cpu.allocated))
allocated_mask))
return NV_OK;
if (block_test) {
@ -1369,8 +1870,8 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
}
uvm_page_mask_zero(resident_mask);
for_each_id_in_mask (id, &block->resident)
uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id));
for_each_id_in_mask(id, &block->resident)
uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE));
// If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
// should be used in order to avoid extra copies due to dirty compound
@ -1390,13 +1891,15 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags;
uvm_va_block_region_t region = populate_region;
uvm_va_block_cpu_node_state_t *node_state;
int alloced_nid;
if (uvm_page_mask_test(&block->cpu.allocated, page_index)) {
page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1;
if (uvm_page_mask_test(allocated_mask, page_index)) {
page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
continue;
}
UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index));
UVM_ASSERT(!uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index));
chunk_alloc_flags = alloc_flags;
@ -1419,7 +1922,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1);
if (!uvm_page_mask_region_empty(&block->cpu.allocated, region))
if (!uvm_page_mask_region_empty(allocated_mask, region))
continue;
// If not all pages in the allocation region are resident somewhere,
@ -1430,7 +1933,7 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
if (!uvm_page_mask_region_full(resident_mask, region))
chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk);
status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, preferred_nid, &chunk);
if (status == NV_OK) {
page_index = region.first;
break;
@ -1442,22 +1945,39 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
if (status != NV_OK)
break;
status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_free(chunk);
return status;
alloced_nid = uvm_cpu_chunk_get_numa_node(chunk);
node_state = block_node_state_get(block, alloced_nid);
if (!uvm_page_mask_region_empty(&node_state->allocated, region)) {
UVM_ASSERT(preferred_nid != NUMA_NO_NODE);
if (uvm_page_mask_region_full(&node_state->allocated, region)) {
uvm_cpu_chunk_free(chunk);
goto skip;
}
status = block_populate_overlapping_cpu_chunks(block, block_context, chunk, page_index);
if (status != NV_OK)
return status;
}
else {
status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_free(chunk);
return status;
}
status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk, page_index);
if (status != NV_OK)
break;
}
status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index);
if (status != NV_OK)
break;
skip:
// Skip iterating over all pages covered by the allocated chunk.
page_index = region.outer - 1;
}
if (status != NV_OK && chunk) {
uvm_cpu_chunk_remove_from_block(block, page_index);
uvm_cpu_chunk_remove_from_block(block, uvm_cpu_chunk_get_numa_node(chunk), page_index);
uvm_cpu_chunk_free(chunk);
}
@ -1742,9 +2262,10 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
uvm_chunk_size_t chunk_size;
if (UVM_ID_IS_CPU(page.processor)) {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.nid, page.page_index);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, page.processor, NUMA_NO_NODE);
if (!uvm_page_mask_test(&block->cpu.resident, page.page_index))
if (!uvm_page_mask_test(resident_mask, page.page_index))
return 0;
UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
@ -1791,29 +2312,41 @@ static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot)
return pte_bit_index;
}
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid)
{
uvm_va_block_gpu_state_t *gpu_state;
uvm_page_mask_t *resident_mask;
if (UVM_ID_IS_CPU(processor))
return &block->cpu.resident;
if (UVM_ID_IS_CPU(processor)) {
uvm_va_block_cpu_node_state_t *node_state;
gpu_state = uvm_va_block_gpu_state_get(block, processor);
if (nid == NUMA_NO_NODE) {
resident_mask = &block->cpu.resident;
}
else {
node_state = block_node_state_get(block, nid);
resident_mask = &node_state->resident;
}
}
else {
gpu_state = uvm_va_block_gpu_state_get(block, processor);
UVM_ASSERT(gpu_state);
resident_mask = &gpu_state->resident;
}
UVM_ASSERT(gpu_state);
return &gpu_state->resident;
return resident_mask;
}
// Get the page residency mask for a processor
//
// Notably, this will allocate GPU state if not yet present; if that fails,
// NULL is returned.
static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor)
static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor, int nid)
{
uvm_va_block_gpu_state_t *gpu_state;
if (UVM_ID_IS_CPU(processor))
return &block->cpu.resident;
return uvm_va_block_resident_mask_get(block, processor, nid);
gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor));
if (!gpu_state)
@ -1842,6 +2375,28 @@ const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_proc
return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY);
}
void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_page_mask_t *out_mask)
{
uvm_processor_mask_t non_uvm_lite_gpus;
uvm_processor_id_t id;
uvm_assert_mutex_locked(&va_block->lock);
if (!uvm_va_block_is_hmm(va_block)) {
uvm_page_mask_complement(out_mask, &va_block->maybe_mapped_pages);
return;
}
uvm_page_mask_region_fill(out_mask, region);
uvm_processor_mask_andnot(&non_uvm_lite_gpus, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
for_each_id_in_mask(id, &non_uvm_lite_gpus) {
uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_map_mask_get(va_block, id));
}
}
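// A self-contained sketch of the two cases above, on toy bitmasks: for a
// managed (non-HMM) block the unmapped set is just the complement of the
// "maybe mapped" mask, while for an HMM block it starts as the whole region
// and each relevant map mask (the code above restricts this to mapped,
// non-UVM-Lite GPUs) is subtracted from it. Mask width and names are
// hypothetical.
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define TOY_NUM_PROCS 2

static uint32_t toy_unmapped_pages_get(bool is_hmm,
                                       uint32_t maybe_mapped,
                                       uint32_t region_mask,
                                       const uint32_t map_masks[TOY_NUM_PROCS])
{
    uint32_t out;
    int p;

    if (!is_hmm)
        return ~maybe_mapped;

    out = region_mask;
    for (p = 0; p < TOY_NUM_PROCS; p++)
        out &= ~map_masks[p];

    return out;
}

int main(void)
{
    const uint32_t map_masks[TOY_NUM_PROCS] = { 0x0F, 0x30 }; // pages 0-3, 4-5

    // HMM: pages 6-7 of an 8-page region are mapped nowhere.
    assert(toy_unmapped_pages_get(true, 0, 0xFF, map_masks) == 0xC0);

    // Non-HMM: anything outside maybe_mapped counts as unmapped.
    assert(toy_unmapped_pages_get(false, 0x3F, 0xFF, map_masks) == ~0x3Fu);
    return 0;
}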
static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
{
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
@ -1854,7 +2409,7 @@ static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_inde
{
uvm_processor_id_t id;
for_each_id_in_mask(id, &block->resident) {
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index))
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE), page_index))
return true;
}
@ -1877,24 +2432,6 @@ static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_process
return gpu_state->chunks[chunk_index] != NULL;
}
static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
{
const uvm_page_mask_t *resident_mask;
if (UVM_ID_IS_CPU(proc)) {
resident_mask = &block->cpu.resident;
}
else {
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc);
if (!gpu_state)
return false;
resident_mask = &gpu_state->resident;
}
return uvm_page_mask_test(resident_mask, page_index);
}
// Compute the gpus that have at least the given access permissions for the
// range described by region and page_mask. The function sets the bit if any
// page in the region has the permissions.
@ -2007,7 +2544,7 @@ static void block_page_resident_gpus(uvm_va_block_t *va_block,
uvm_processor_mask_zero(resident_gpus);
for_each_gpu_id_in_mask(id, &va_block->resident) {
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) {
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE), page_index)) {
UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index));
uvm_processor_mask_set(resident_gpus, id);
}
@ -2020,7 +2557,7 @@ void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
{
block_page_resident_gpus(va_block, page_index, resident_processors);
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) {
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE), page_index)) {
UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index));
uvm_processor_mask_set(resident_processors, UVM_ID_CPU);
}
@ -2049,7 +2586,7 @@ static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t
uvm_processor_mask_copy(&search_mask, &va_block->resident);
for_each_closest_id(id, &search_mask, processor, va_space) {
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index))
if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE), page_index))
return id;
}
@ -2245,7 +2782,7 @@ static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block,
// Roll up all pages in chunk_region which are resident somewhere
uvm_page_mask_zero(zero_mask);
for_each_id_in_mask(id, &block->resident)
uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id));
uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE));
// If all pages in the chunk are resident somewhere, we don't need to clear
// anything. Just make sure the chunk is tracked properly.
@ -2434,6 +2971,13 @@ static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
return NV_OK;
}
static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
return &va_space->can_copy_from[uvm_id_value(from)];
}
static NV_STATUS block_populate_pages(uvm_va_block_t *block,
uvm_va_block_retry_t *retry,
uvm_va_block_context_t *block_context,
@ -2442,8 +2986,10 @@ static NV_STATUS block_populate_pages(uvm_va_block_t *block,
const uvm_page_mask_t *page_mask)
{
NV_STATUS status;
const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id);
const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id, NUMA_NO_NODE);
uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask;
uvm_page_mask_t *pages_staged = &block_context->make_resident.pages_staged;
uvm_page_mask_t *cpu_populate_mask;
uvm_memcg_context_t memcg_context;
if (!resident_mask)
@ -2454,22 +3000,58 @@ static NV_STATUS block_populate_pages(uvm_va_block_t *block,
else
uvm_page_mask_complement(populate_page_mask, resident_mask);
if (UVM_ID_IS_GPU(dest_id))
return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
if (UVM_ID_IS_GPU(dest_id)) {
uvm_processor_mask_t staged_processors;
uvm_processor_mask_t accessible_resident_processors;
const uvm_processor_mask_t *can_copy_from_processors;
uvm_page_mask_t *scratch_page_mask = &block_context->scratch_page_mask;
uvm_page_mask_t *id_resident_mask;
uvm_processor_id_t id;
status = block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
if (status != NV_OK)
return status;
uvm_page_mask_zero(pages_staged);
// Get the mask of all processors that have resident pages from which
// the destination cannot copy directly.
can_copy_from_processors = block_get_can_copy_from_mask(block, dest_id);
if (!uvm_processor_mask_andnot(&staged_processors, &block->resident, can_copy_from_processors))
return status;
// Compute the pages that will be staged through the CPU by:
// 1. Computing all of the pages resident on the processors from which
// dest_id cannot directly copy.
for_each_id_in_mask(id, &staged_processors) {
id_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
uvm_page_mask_and(scratch_page_mask, populate_page_mask, id_resident_mask);
uvm_page_mask_or(pages_staged, pages_staged, scratch_page_mask);
}
// 2. Removing any pages in pages_staged that are resident on any processor
// dest_id can copy from.
if (uvm_processor_mask_and(&accessible_resident_processors, can_copy_from_processors, &block->resident)) {
for_each_id_in_mask(id, &accessible_resident_processors) {
id_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
uvm_page_mask_andnot(pages_staged, pages_staged, id_resident_mask);
}
}
// 3. Removing any pages not in the populate mask.
uvm_page_mask_region_clear_outside(pages_staged, region);
cpu_populate_mask = pages_staged;
}
else {
cpu_populate_mask = populate_page_mask;
}
uvm_memcg_context_start(&memcg_context, block_context->mm);
status = block_populate_pages_cpu(block, populate_page_mask, region, block_context);
status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context);
uvm_memcg_context_end(&memcg_context);
return status;
}
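// A self-contained sketch of the pages_staged computation above, on toy
// bitmasks: pages that are resident only on processors the destination GPU
// cannot copy from directly are staged through the CPU, while pages that
// also have a copy on a directly accessible processor are not. Processor
// count, mask width, and names are hypothetical.
#include <assert.h>
#include <stdint.h>

#define TOY_NUM_PROCS 3

int main(void)
{
    // Per-processor resident pages (bit i == page i).
    uint32_t resident[TOY_NUM_PROCS] = {
        0x0000000F, // proc 0: pages 0-3
        0x000000F0, // proc 1: pages 4-7
        0x000000C3, // proc 2: pages 0, 1, 6, 7
    };
    uint32_t can_copy_from = 0x4;        // destination can copy from proc 2 only
    uint32_t populate_mask = 0x000000FF; // pages the caller wants populated
    uint32_t region_mask = 0x0000FFFF;   // pages inside the requested region
    uint32_t pages_staged = 0;
    int p;

    // 1. Pages resident on processors the destination cannot copy from.
    for (p = 0; p < TOY_NUM_PROCS; p++) {
        if (!(can_copy_from & (1u << p)))
            pages_staged |= populate_mask & resident[p];
    }

    // 2. Remove pages that also have a copy on a directly accessible
    //    processor; those can be copied without CPU staging.
    for (p = 0; p < TOY_NUM_PROCS; p++) {
        if (can_copy_from & (1u << p))
            pages_staged &= ~resident[p];
    }

    // 3. Keep only pages inside the populate region.
    pages_staged &= region_mask;

    // Pages 2-5 are resident only on procs 0/1, so they must be staged.
    // Pages 0, 1, 6, 7 also live on proc 2, which is directly accessible.
    assert(pages_staged == 0x0000003C);
    return 0;
}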
static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
return &va_space->can_copy_from[uvm_id_value(from)];
}
static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to)
{
return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from);
@ -2513,7 +3095,7 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
UVM_ASSERT(accessing_gpu_state);
if (UVM_ID_IS_CPU(block_page.processor)) {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.nid, block_page.page_index);
NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
uvm_cpu_chunk_get_size(chunk),
@ -2588,9 +3170,15 @@ uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_blo
uvm_processor_id_t residency,
uvm_gpu_t *gpu)
{
uvm_assert_mutex_locked(&va_block->lock);
int nid = NUMA_NO_NODE;
return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu);
uvm_assert_mutex_locked(&va_block->lock);
if (UVM_ID_IS_CPU(residency)) {
nid = block_get_page_node_residency(va_block, page_index);
UVM_ASSERT(nid != NUMA_NO_NODE);
}
return block_phys_page_address(va_block, block_phys_page(residency, nid, page_index), gpu);
}
uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
@ -2605,6 +3193,9 @@ typedef struct
// Location of the memory
uvm_processor_id_t id;
// NUMA node ID if the processor is the CPU. Ignored otherwise.
int nid;
// Whether the whole block has a single physically-contiguous chunk of
// storage on the processor.
bool is_block_contig;
@ -2734,13 +3325,14 @@ error:
static bool block_page_is_clean(uvm_va_block_t *block,
uvm_processor_id_t dst_id,
uvm_processor_id_t src_id,
uvm_page_index_t page_index)
uvm_page_index_t page_index,
int nid)
{
return !uvm_va_block_is_hmm(block) &&
uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) &&
UVM_ID_IS_CPU(src_id) &&
!block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
!block_cpu_page_is_dirty(block, page_index);
!block_cpu_page_is_dirty(block, page_index, nid);
}
// When the destination is the CPU...
@ -2749,15 +3341,16 @@ static bool block_page_is_clean(uvm_va_block_t *block,
static void block_update_page_dirty_state(uvm_va_block_t *block,
uvm_processor_id_t dst_id,
uvm_processor_id_t src_id,
int nid,
uvm_page_index_t page_index)
{
if (UVM_ID_IS_GPU(dst_id))
return;
if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location))
block_mark_cpu_page_clean(block, page_index);
block_mark_cpu_page_clean(block, page_index, nid);
else
block_mark_cpu_page_dirty(block, page_index);
block_mark_cpu_page_dirty(block, page_index, nid);
}
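// A minimal sketch of the dirty-bit policy above: the flag only tracks the
// CPU copy of a page, so copies into a GPU leave it untouched, a copy whose
// source is the preferred location marks the CPU copy clean, and any other
// source marks it dirty. The enum and helper here are hypothetical
// stand-ins.
#include <assert.h>
#include <stdbool.h>

typedef enum { TOY_PROC_CPU, TOY_PROC_GPU0, TOY_PROC_GPU1 } toy_proc_t;

static void toy_update_page_dirty_state(bool *cpu_page_dirty,
                                        toy_proc_t dst,
                                        toy_proc_t src,
                                        toy_proc_t preferred_location)
{
    // Dirty tracking only applies to the CPU copy of the page.
    if (dst != TOY_PROC_CPU)
        return;

    // A copy from the preferred location leaves the CPU copy matching that
    // authoritative copy; anything else may diverge from it.
    *cpu_page_dirty = (src != preferred_location);
}

int main(void)
{
    bool dirty = true;

    toy_update_page_dirty_state(&dirty, TOY_PROC_GPU0, TOY_PROC_CPU, TOY_PROC_GPU1);
    assert(dirty); // GPU destination: unchanged

    toy_update_page_dirty_state(&dirty, TOY_PROC_CPU, TOY_PROC_GPU1, TOY_PROC_GPU1);
    assert(!dirty); // copied from the preferred location: clean

    toy_update_page_dirty_state(&dirty, TOY_PROC_CPU, TOY_PROC_GPU0, TOY_PROC_GPU1);
    assert(dirty); // copied from elsewhere: dirty
    return 0;
}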
static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
@ -2783,7 +3376,7 @@ static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
{
UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE)));
if (uvm_processor_mask_test_and_set(&block->resident, id))
return;
@ -2795,7 +3388,7 @@ static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_
{
uvm_gpu_t *gpu;
UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE)));
if (!uvm_processor_mask_test_and_clear(&block->resident, id))
return;
@ -2821,37 +3414,41 @@ static bool block_phys_copy_contig_check(uvm_va_block_t *block,
uvm_page_index_t page_index,
const uvm_gpu_address_t *base_address,
uvm_processor_id_t proc_id,
int nid,
uvm_gpu_t *copying_gpu)
{
uvm_gpu_address_t page_address;
uvm_gpu_address_t contig_address = *base_address;
contig_address.address += page_index * PAGE_SIZE;
page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu);
page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, nid, page_index), copying_gpu);
return uvm_gpu_addr_cmp(page_address, contig_address) == 0;
}
// Check if the VA block has a single physically-contiguous chunk of storage
// on the processor.
static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id)
static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id, int nid)
{
uvm_cpu_chunk_t *chunk;
if (UVM_ID_IS_GPU(id))
return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0);
chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL);
UVM_ASSERT(nid != NUMA_NO_NODE);
chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), nid, NULL);
return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk));
}
static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block,
uvm_page_index_t page_index,
uvm_processor_id_t resident_id)
uvm_processor_id_t resident_id,
int nid)
{
if (UVM_ID_IS_CPU(resident_id)) {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk;
UVM_ASSERT(nid != NUMA_NO_NODE);
chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
return uvm_cpu_chunk_block_region(block, chunk, page_index);
}
else {
@ -2871,11 +3468,11 @@ static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
if (bca->is_block_contig) {
uvm_gpu_address_t addr = bca->gpu_address;
addr.address += page_index * PAGE_SIZE;
UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu));
UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, bca->nid, copying_gpu));
return addr;
}
return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu);
return block_phys_page_copy_address(block, block_phys_page(bca->id, bca->nid, page_index), copying_gpu);
}
// When the Confidential Computing feature is enabled, the function performs
@ -2886,17 +3483,19 @@ static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
uvm_va_block_region_t region,
uvm_push_t *push)
{
uvm_push_flag_t membar_flag = 0;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
uvm_page_index_t page_index = region.first;
uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
struct page *src_page;
uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
(page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
uvm_cpu_chunk_t *chunk;
uvm_va_block_region_t chunk_region;
UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
@ -2906,23 +3505,27 @@ static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
// See comment in block_copy_begin_push.
UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
chunk = uvm_cpu_chunk_get_chunk_for_page(block, copy_state->src.nid, page_index);
UVM_ASSERT(chunk);
// The caller guarantees that all pages in region are contiguous,
// meaning they're guaranteed to be part of the same compound page.
chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
UVM_ASSERT(uvm_va_block_region_contains_region(region, chunk_region));
src_page = uvm_cpu_chunk_get_cpu_page(block, chunk, page_index);
staging_buffer.address += page_index * PAGE_SIZE;
auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
// kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
// decryption must happen on a PAGE_SIZE basis.
for_each_va_block_page_in_region(page_index, region) {
void *src_cpu_virt_addr;
// The caller guarantees that all pages in region are contiguous,
// meaning they're guaranteed to be part of the same compound page.
UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index));
src_cpu_virt_addr = kmap(src_page);
uvm_conf_computing_cpu_encrypt(push->channel,
cpu_va_staging_buffer,
@ -2942,8 +3545,8 @@ static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
if (page_index < (region.outer - 1))
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (membar_flag)
uvm_push_set_flag(push, membar_flag);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
@ -2964,7 +3567,7 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
uvm_va_block_region_t region,
uvm_push_t *push)
{
uvm_push_flag_t membar_flag = 0;
uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
uvm_page_index_t page_index = region.first;
uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
@ -2981,9 +3584,9 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
// Because we use kmap() for mapping pages for CPU side
// crypto-operations and it only guarantees PAGE_SIZE contiguity, all
@ -3001,8 +3604,8 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
if (page_index < (region.outer - 1))
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
else if (membar_flag)
uvm_push_set_flag(push, membar_flag);
else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
uvm_push_set_flag(push, push_membar_flag);
gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
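// A small, self-contained sketch of the sentinel change used in both
// encrypted copy paths above: when 0 may itself be a valid enumerator,
// initializing a "no flag captured" variable to 0 and truth-testing it
// cannot distinguish "unset" from "set to the first flag", so the
// out-of-range COUNT enumerator is used as the sentinel instead. The enum
// here is a hypothetical stand-in, assuming the first flag value is 0.
#include <assert.h>
#include <stdbool.h>

typedef enum {
    TOY_FLAG_MEMBAR_NONE = 0, // a legitimate flag whose value happens to be 0
    TOY_FLAG_MEMBAR_GPU,
    TOY_FLAG_COUNT            // one past the last valid flag: a safe sentinel
} toy_flag_t;

static bool flag_was_captured_buggy(toy_flag_t captured)
{
    // Truth-testing loses TOY_FLAG_MEMBAR_NONE: this is false even when the
    // caller explicitly requested it.
    return captured != 0;
}

static bool flag_was_captured_fixed(toy_flag_t captured)
{
    return captured != TOY_FLAG_COUNT;
}

int main(void)
{
    toy_flag_t captured = TOY_FLAG_MEMBAR_NONE; // caller asked for "none"

    assert(!flag_was_captured_buggy(captured)); // request silently dropped
    assert(flag_was_captured_fixed(captured));  // request honored
    return 0;
}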
@ -3039,7 +3642,7 @@ static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
// kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
// decryption must happen on a PAGE_SIZE basis.
for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
struct page *dst_page = uvm_va_block_get_cpu_page(block, page_index);
void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
void *cpu_page_address = kmap(dst_page);
@ -3135,7 +3738,9 @@ static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
uvm_va_block_context_t *block_context,
uvm_processor_id_t dst_id,
int dst_nid,
uvm_processor_id_t src_id,
int src_nid,
uvm_va_block_region_t region,
uvm_page_mask_t *copy_mask,
const uvm_page_mask_t *prefetch_page_mask,
@ -3145,7 +3750,7 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
uvm_tracker_t *copy_tracker)
{
NV_STATUS status = NV_OK;
uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id, dst_nid);
uvm_gpu_t *copying_gpu = NULL;
uvm_push_t push;
uvm_page_index_t page_index;
@ -3162,18 +3767,31 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
uvm_va_range_t *va_range = block->va_range;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
copy_state.src.id = src_id;
copy_state.dst.id = dst_id;
copy_state.src.is_block_contig = is_block_phys_contig(block, src_id);
copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id);
*copied_pages = 0;
UVM_ASSERT(UVM_ID_IS_GPU(src_id) || UVM_ID_IS_GPU(dst_id));
if (UVM_ID_IS_CPU(src_id))
UVM_ASSERT(src_nid != NUMA_NO_NODE);
if (UVM_ID_IS_CPU(dst_id))
UVM_ASSERT(dst_nid != NUMA_NO_NODE);
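// CPU residency is tracked per NUMA node, so a CPU endpoint must always come
// with a concrete node ID, while GPU endpoints pass NUMA_NO_NODE.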
// If there are no pages to be copied, exit early
if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) ||
!uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask))
return NV_OK;
if (migrated_pages && !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
return NV_OK;
copy_state.src.id = src_id;
copy_state.dst.id = dst_id;
copy_state.src.nid = src_nid;
copy_state.dst.nid = dst_nid;
copy_state.src.is_block_contig = is_block_phys_contig(block, src_id, copy_state.src.nid);
copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id, copy_state.dst.nid);
// uvm_range_group_range_iter_first should only be called when the va_space
// lock is held, which is always the case unless an eviction is taking
// place.
@ -3184,19 +3802,6 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
rgr_has_changed = true;
}
if (UVM_ID_IS_CPU(dst_id)) {
uvm_memcg_context_t memcg_context;
// To support staging through CPU, populate CPU pages on demand.
// GPU destinations should have their pages populated already, but
// that might change if we add staging through GPUs.
uvm_memcg_context_start(&memcg_context, block_context->mm);
status = block_populate_pages_cpu(block, copy_mask, region, block_context);
uvm_memcg_context_end(&memcg_context);
if (status != NV_OK)
return status;
}
// TODO: Bug 3745051: This function is complicated and needs refactoring
for_each_va_block_page_in_region_mask(page_index, copy_mask, region) {
NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index);
@ -3244,7 +3849,7 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
// No need to copy pages that haven't changed. Just clear residency
// information
if (block_page_is_clean(block, dst_id, src_id, page_index))
if (block_page_is_clean(block, dst_id, src_id, page_index, copy_state.src.nid))
continue;
if (!copying_gpu) {
@ -3270,7 +3875,7 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
}
if (!uvm_va_block_is_hmm(block))
block_update_page_dirty_state(block, dst_id, src_id, page_index);
block_update_page_dirty_state(block, dst_id, src_id, copy_state.dst.nid, page_index);
if (last_index == region.outer) {
bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
@ -3291,12 +3896,16 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
// using the page index.
if (can_cache_src_phys_addr) {
copy_state.src.gpu_address = block_phys_page_copy_address(block,
block_phys_page(src_id, 0),
block_phys_page(src_id,
copy_state.src.nid,
0),
copying_gpu);
}
if (can_cache_dst_phys_addr) {
copy_state.dst.gpu_address = block_phys_page_copy_address(block,
block_phys_page(dst_id, 0),
block_phys_page(dst_id,
copy_state.dst.nid,
0),
copying_gpu);
}
}
@ -3359,12 +3968,87 @@ static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
*copied_pages = uvm_page_mask_weight(copy_mask);
if (*copied_pages)
if (*copied_pages && migrated_pages)
uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
return status;
}
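// Copy pages resident on src_id (on NUMA node src_nid when src_id is the CPU)
// to dst_id. When the destination is the CPU, a separate copy is issued per
// destination NUMA node using the pages selected in
// block_context->make_resident.cpu_pages_used.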
static NV_STATUS block_copy_resident_pages_from(uvm_va_block_t *block,
uvm_va_block_context_t *block_context,
uvm_processor_id_t dst_id,
uvm_processor_id_t src_id,
int src_nid,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask,
const uvm_page_mask_t *prefetch_page_mask,
uvm_va_block_transfer_mode_t transfer_mode,
uvm_page_mask_t *migrated_pages,
NvU32 *copied_pages_out,
uvm_tracker_t *copy_tracker)
{
uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
uvm_page_mask_t *src_resident_mask;
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
uvm_make_resident_page_tracking_t *page_tracking = &block_context->make_resident.cpu_pages_used;
NvU32 copied_pages_from_src;
NV_STATUS status = NV_OK;
int dst_nid;
src_resident_mask = uvm_va_block_resident_mask_get(block, src_id, src_nid);
uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
if (page_mask)
uvm_page_mask_and(copy_mask, copy_mask, page_mask);
if (UVM_ID_IS_CPU(dst_id)) {
for_each_node_mask(dst_nid, page_tracking->nodes) {
if (!uvm_page_mask_and(node_pages_mask, copy_mask, page_tracking->node_masks[node_to_index(dst_nid)]))
continue;
status = block_copy_resident_pages_between(block,
block_context,
dst_id,
dst_nid,
src_id,
src_nid,
region,
node_pages_mask,
prefetch_page_mask,
transfer_mode,
migrated_pages,
&copied_pages_from_src,
copy_tracker);
*copied_pages_out += copied_pages_from_src;
if (status != NV_OK)
break;
if (!uvm_page_mask_andnot(copy_mask, copy_mask, node_pages_mask))
break;
}
}
else {
status = block_copy_resident_pages_between(block,
block_context,
dst_id,
NUMA_NO_NODE,
src_id,
src_nid,
region,
copy_mask,
prefetch_page_mask,
transfer_mode,
migrated_pages,
&copied_pages_from_src,
copy_tracker);
*copied_pages_out += copied_pages_from_src;
}
return status;
}
// Copy resident pages to the destination from all source processors in the
// src_processor_mask
//
@ -3387,36 +4071,53 @@ static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block,
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_processor_id_t src_id;
uvm_processor_mask_t search_mask;
uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
uvm_processor_mask_copy(&search_mask, src_processor_mask);
*copied_pages_out = 0;
for_each_closest_id(src_id, &search_mask, dst_id, va_space) {
uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id);
NV_STATUS status;
NvU32 copied_pages_from_src;
UVM_ASSERT(!uvm_id_equal(src_id, dst_id));
uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
if (UVM_ID_IS_CPU(src_id)) {
int nid;
if (page_mask)
uvm_page_mask_and(copy_mask, copy_mask, page_mask);
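// CPU residency is tracked per NUMA node, so iterate over all possible nodes
// and copy from each node that holds resident pages for this region.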
for_each_possible_uvm_node(nid) {
status = block_copy_resident_pages_from(block,
block_context,
dst_id,
src_id,
nid,
region,
page_mask,
prefetch_page_mask,
transfer_mode,
migrated_pages,
copied_pages_out,
tracker_out);
if (status != NV_OK)
break;
}
}
else {
status = block_copy_resident_pages_from(block,
block_context,
dst_id,
src_id,
NUMA_NO_NODE,
region,
page_mask,
prefetch_page_mask,
transfer_mode,
migrated_pages,
copied_pages_out,
tracker_out);
}
status = block_copy_resident_pages_between(block,
block_context,
dst_id,
src_id,
region,
copy_mask,
prefetch_page_mask,
transfer_mode,
migrated_pages,
&copied_pages_from_src,
tracker_out);
*copied_pages_out += copied_pages_from_src;
UVM_ASSERT(*copied_pages_out <= max_pages_to_copy);
if (status != NV_OK)
@ -3441,7 +4142,8 @@ static void break_read_duplication_in_region(uvm_va_block_t *block,
uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask);
UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id)));
UVM_ASSERT(
uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE)));
// Clear read_duplicated bit for all pages in region
uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region);
@ -3453,9 +4155,20 @@ static void break_read_duplication_in_region(uvm_va_block_t *block,
if (uvm_id_equal(id, dst_id))
continue;
other_resident_mask = uvm_va_block_resident_mask_get(block, id);
if (UVM_ID_IS_CPU(id)) {
int nid;
if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region))
for_each_possible_uvm_node(nid)
uvm_va_block_cpu_clear_resident_mask(block, nid, break_pages_in_region);
other_resident_mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
}
else {
other_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region);
}
if (uvm_page_mask_empty(other_resident_mask))
block_clear_resident_processor(block, id);
}
}
@ -3467,7 +4180,7 @@ static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
const uvm_page_mask_t *page_mask)
{
uvm_page_index_t page_index;
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE);
uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask;
if (page_mask)
@ -3483,7 +4196,14 @@ static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
}
uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
if (UVM_ID_IS_CPU(dst_id)) {
uvm_va_block_cpu_set_resident_all_chunks(block, block_context, first_touch_mask);
resident_mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
}
else {
uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
}
if (!uvm_page_mask_empty(resident_mask))
block_set_resident_processor(block, dst_id);
@ -3493,6 +4213,41 @@ static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
first_touch_mask);
}
// Select the set of CPU pages to be used for the migration. The pages selected
// could be used for either CPU destination pages (when the destination of the
// migration is the CPU) or staging pages (when the migration to the destination
// processor requires staging through the CPU).
static void block_select_cpu_node_pages(uvm_va_block_t *block,
uvm_va_block_context_t *block_context,
const uvm_page_mask_t *page_mask,
uvm_va_block_region_t region)
{
uvm_va_block_cpu_node_state_t *node_state;
uvm_make_resident_page_tracking_t *tracking = &block_context->make_resident.cpu_pages_used;
uvm_page_mask_t **node_masks = tracking->node_masks;
uvm_page_mask_t *scratch_page_mask = &block_context->scratch_page_mask;
size_t index;
int nid;
nodes_clear(tracking->nodes);
if (uvm_page_mask_empty(page_mask))
return;
block_context->scratch_node_mask = node_possible_map;
uvm_page_mask_init_from_region(scratch_page_mask, region, page_mask);
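// Walk NUMA nodes from closest to farthest relative to the node returned by
// uvm_va_block_context_get_node(), claiming already-allocated pages from each
// node until the whole mask has been covered.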
for_each_closest_uvm_node(nid, uvm_va_block_context_get_node(block_context), block_context->scratch_node_mask) {
node_state = block_node_state_get(block, nid);
index = node_to_index(nid);
if (uvm_page_mask_and(node_masks[index], scratch_page_mask, &node_state->allocated)) {
node_set(nid, tracking->nodes);
if (!uvm_page_mask_andnot(scratch_page_mask, scratch_page_mask, node_masks[index]))
return;
}
}
}
// Copy resident pages from other processors to the destination.
// All the pages on the destination need to be populated by the caller first.
// Pages not resident anywhere else need to be zeroed out as well.
@ -3509,17 +4264,18 @@ static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE);
NvU32 missing_pages_count;
NvU32 pages_copied;
NvU32 pages_copied_to_cpu;
NvU32 pages_copied_to_cpu = 0;
uvm_processor_mask_t src_processor_mask;
uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask;
uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated;
uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged;
uvm_page_mask_t *pages_staged = &block_context->make_resident.pages_staged;
uvm_page_mask_t *cpu_page_mask;
int nid;
uvm_page_mask_zero(migrated_pages);
uvm_page_mask_zero(staged_pages);
if (page_mask)
uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask);
@ -3538,7 +4294,7 @@ static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
uvm_processor_mask_zero(&src_processor_mask);
if (!uvm_id_equal(dst_id, UVM_ID_CPU)) {
if (UVM_ID_IS_GPU(dst_id)) {
// If the destination is a GPU, first copy everything from processors
// with copy access supported. Notably this will copy pages from the CPU
// as well even if later some extra copies from CPU are required for
@ -3546,6 +4302,15 @@ static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident);
uvm_processor_mask_clear(&src_processor_mask, dst_id);
cpu_page_mask = pages_staged;
}
else {
cpu_page_mask = copy_page_mask;
}
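// The CPU pages used here (either as the final destination or as staging for
// a GPU destination) may be spread across multiple NUMA nodes, so record which
// node supplies which pages before issuing any copies.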
block_select_cpu_node_pages(block, block_context, cpu_page_mask, region);
if (UVM_ID_IS_GPU(dst_id)) {
status = block_copy_resident_pages_mask(block,
block_context,
dst_id,
@ -3565,8 +4330,10 @@ static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
if (status != NV_OK)
goto out;
if (missing_pages_count == 0)
if (missing_pages_count == 0) {
UVM_ASSERT(uvm_page_mask_empty(pages_staged));
goto out;
}
if (pages_copied)
uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages);
@ -3579,28 +4346,27 @@ static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
uvm_processor_mask_clear(&src_processor_mask, dst_id);
uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU);
status = block_copy_resident_pages_mask(block,
block_context,
UVM_ID_CPU,
&src_processor_mask,
region,
copy_page_mask,
prefetch_page_mask,
transfer_mode,
missing_pages_count,
staged_pages,
&pages_copied_to_cpu,
&local_tracker);
if (status != NV_OK)
goto out;
if (!uvm_page_mask_empty(cpu_page_mask)) {
status = block_copy_resident_pages_mask(block,
block_context,
UVM_ID_CPU,
&src_processor_mask,
region,
cpu_page_mask,
prefetch_page_mask,
transfer_mode,
missing_pages_count,
UVM_ID_IS_CPU(dst_id) ? migrated_pages : NULL,
&pages_copied_to_cpu,
&local_tracker);
if (status != NV_OK)
goto out;
}
// If destination is the CPU then we copied everything there above
if (UVM_ID_IS_CPU(dst_id)) {
uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages);
missing_pages_count -= pages_copied_to_cpu;
if (!UVM_ID_IS_GPU(dst_id))
goto out;
}
// Add everything to the block's tracker so that the
// block_copy_resident_pages_between() call below will acquire it.
@ -3610,20 +4376,37 @@ static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
uvm_tracker_clear(&local_tracker);
// Now copy staged pages from the CPU to the destination.
status = block_copy_resident_pages_between(block,
block_context,
dst_id,
UVM_ID_CPU,
region,
staged_pages,
prefetch_page_mask,
transfer_mode,
migrated_pages,
&pages_copied,
&local_tracker);
// The staging copy above could have allocated pages on any NUMA node.
// Loop over all nodes where pages were allocated and copy from those
// nodes.
pages_copied = 0;
for_each_node_mask(nid, block_context->make_resident.cpu_pages_used.nodes) {
NvU32 pages_copied_from_node;
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
uvm_page_mask_t *node_alloc_mask = block_context->make_resident.cpu_pages_used.node_masks[node_to_index(nid)];
UVM_ASSERT(missing_pages_count >= pages_copied);
missing_pages_count -= pages_copied;
if (uvm_page_mask_and(node_pages_mask, pages_staged, node_alloc_mask)) {
status = block_copy_resident_pages_between(block,
block_context,
dst_id,
NUMA_NO_NODE,
UVM_ID_CPU,
nid,
region,
node_pages_mask,
prefetch_page_mask,
transfer_mode,
migrated_pages,
&pages_copied_from_node,
&local_tracker);
UVM_ASSERT(missing_pages_count >= pages_copied_from_node);
missing_pages_count -= pages_copied_from_node;
pages_copied += pages_copied_from_node;
}
if (status != NV_OK)
break;
}
if (status != NV_OK)
goto out;
@ -3668,7 +4451,7 @@ NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
resident_mask = block_resident_mask_get_alloc(va_block, dest_id);
resident_mask = block_resident_mask_get_alloc(va_block, dest_id, NUMA_NO_NODE);
if (!resident_mask)
return NV_ERR_NO_MEMORY;
@ -3740,9 +4523,17 @@ static void block_make_resident_update_state(uvm_va_block_t *va_block,
uvm_page_mask_t *copy_mask,
uvm_make_resident_cause_t cause)
{
uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id);
if (UVM_ID_IS_CPU(dst_id)) {
// CPU chunks may not have been allocated on the preferred NUMA node, so
// the residency has to be updated based on the chunk's NUMA ID.
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, copy_mask);
}
else {
uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id, NUMA_NO_NODE);
uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
}
uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
block_set_resident_processor(va_block, dst_id);
// Accumulate the pages that migrated into the output mask.
@ -3752,7 +4543,8 @@ static void block_make_resident_update_state(uvm_va_block_t *va_block,
// Any move operation implies that mappings have been removed from all
// non-UVM-Lite GPUs.
uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
if (!uvm_va_block_is_hmm(va_block))
uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
// If we are migrating due to an eviction, set the GPU as evicted and
// mark the evicted pages. If we are migrating away from the CPU this
@ -3814,6 +4606,10 @@ void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
// empty).
if (uvm_processor_mask_test(&va_block->resident, dst_id))
block_mark_memory_used(va_block, dst_id);
// Check state of all chunks after residency change.
// TODO: Bug 4207783: Check both CPU and GPU chunks.
UVM_ASSERT(block_check_cpu_chunks(va_block));
}
NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
@ -3906,7 +4702,6 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
NV_STATUS status = NV_OK;
uvm_processor_id_t src_id;
uvm_page_mask_t *dst_resident_mask;
uvm_page_mask_t *cpu_resident_mask;
uvm_page_mask_t *migrated_pages;
uvm_page_mask_t *staged_pages;
uvm_page_mask_t *first_touch_mask;
@ -3938,7 +4733,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
// block_copy_resident_pages also use
// va_block_context->make_resident.page_mask.
uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask;
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
UVM_ASSERT(!uvm_page_mask_empty(resident_mask));
if (page_mask)
@ -3987,16 +4782,21 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
staged_pages = &va_block_context->make_resident.pages_staged;
if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU);
uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages);
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, staged_pages);
block_set_resident_processor(va_block, UVM_ID_CPU);
uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages);
uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages);
}
if (!uvm_page_mask_empty(migrated_pages)) {
dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id);
uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
if (UVM_ID_IS_CPU(dest_id)) {
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
}
else {
dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
}
block_set_resident_processor(va_block, dest_id);
uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages);
uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages);
@ -4015,6 +4815,9 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
if (uvm_processor_mask_test(&va_block->resident, dest_id))
block_mark_memory_used(va_block, dest_id);
// Check state of all chunks after residency change.
// TODO: Bug 4207783: Check both CPU and GPU chunks.
UVM_ASSERT(block_check_cpu_chunks(va_block));
return NV_OK;
}
@ -4301,7 +5104,7 @@ static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t pa
// Pages set to zero in maybe_mapped_pages must not be mapped on any
// non-UVM-Lite GPU
if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
if (!uvm_va_block_is_hmm(block) && !uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0,
"Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n",
*resident_processors.bitmap,
@ -4346,7 +5149,9 @@ static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t pa
if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL);
uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block,
block_phys_page(residency, NUMA_NO_NODE, page_index),
NULL);
// This function will assert if no mapping exists
(void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
@ -4519,7 +5324,7 @@ static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu)
// The mapped processor should be fully resident and physically-
// contiguous.
UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id)));
UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE)));
if (UVM_ID_IS_GPU(resident_id)) {
resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id);
@ -4527,13 +5332,15 @@ static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu)
UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M);
}
else {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block,
uvm_va_block_region_from_block(block),
NULL);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, 0);
int chunk_nid = uvm_cpu_chunk_get_numa_node(chunk);
UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated));
UVM_ASSERT(chunk);
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
UVM_ASSERT(uvm_va_block_cpu_is_region_resident_on(block,
chunk_nid,
uvm_va_block_region_from_block(block)));
}
}
}
@ -4572,19 +5379,28 @@ static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu)
// location even if the memory is resident elsewhere. Skip the
// residency check but still verify contiguity.
if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id),
big_region));
UVM_ASSERT(
uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE),
big_region));
}
if (UVM_ID_IS_CPU(resident_id)) {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first);
int resident_nid = block_get_page_node_residency(block, big_region.first);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, resident_nid);
uvm_cpu_chunk_t *chunk;
UVM_ASSERT(resident_nid != NUMA_NO_NODE);
UVM_ASSERT(uvm_page_mask_region_full(&node_state->allocated, big_region));
chunk = uvm_cpu_chunk_get_chunk_for_page(block, resident_nid, big_region.first);
UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages);
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region));
UVM_ASSERT(uvm_page_mask_region_full(&node_state->resident, big_region));
}
else {
// Check GPU chunks
chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL);
chunk = block_phys_page_chunk(block,
block_phys_page(resident_id, NUMA_NO_NODE, big_region.first),
NULL);
chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first);
UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region));
}
@ -4611,7 +5427,7 @@ static bool block_check_mappings(uvm_va_block_t *block)
continue;
}
resident_mask = uvm_va_block_resident_mask_get(block, id);
resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask));
map_mask = uvm_va_block_map_mask_get(block, id);
@ -4689,7 +5505,7 @@ static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region,
// If the CPU is the only processor with mappings we can safely mark
// the pages as fully unmapped
if (num_mapped_processors == 1)
if (num_mapped_processors == 1 && !uvm_va_block_is_hmm(block))
uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion);
unmapped_something = true;
@ -4799,6 +5615,7 @@ static void block_gpu_pte_write_4k(uvm_va_block_t *block,
uvm_gpu_phys_address_t page_addr = {0};
uvm_page_index_t page_index;
NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
int contig_nid = NUMA_NO_NODE;
UVM_ASSERT(new_prot != UVM_PROT_NONE);
UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
@ -4806,15 +5623,22 @@ static void block_gpu_pte_write_4k(uvm_va_block_t *block,
for_each_va_block_page_in_mask(page_index, write_page_mask, block) {
uvm_gpu_phys_address_t pte_addr;
size_t i;
int nid = NUMA_NO_NODE;
// Assume that this mapping will be used to write to the page
if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
block_mark_cpu_page_dirty(block, page_index);
if (UVM_ID_IS_CPU(resident_id)) {
nid = block_get_page_node_residency(block, page_index);
UVM_ASSERT(nid != NUMA_NO_NODE);
if (page_index >= contig_region.outer) {
contig_region = block_phys_contig_region(block, page_index, resident_id);
contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
// Assume that this mapping will be used to write to the page
if (new_prot > UVM_PROT_READ_ONLY && !uvm_va_block_is_hmm(block))
block_mark_cpu_page_dirty(block, page_index, nid);
}
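// Physical contiguity only holds within a single CPU chunk on one NUMA node,
// so refresh the cached contiguous region whenever the residency node changes.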
if (page_index >= contig_region.outer || nid != contig_nid) {
contig_region = block_phys_contig_region(block, page_index, resident_id, nid);
contig_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, contig_region.first), gpu);
page_addr = contig_addr;
contig_nid = nid;
}
page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE;
@ -4998,6 +5822,7 @@ static void block_gpu_pte_write_big(uvm_va_block_t *block,
uvm_gpu_phys_address_t contig_addr = {0};
uvm_gpu_phys_address_t page_addr = {0};
NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
int contig_nid = NUMA_NO_NODE;
UVM_ASSERT(new_prot != UVM_PROT_NONE);
UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
@ -5014,19 +5839,26 @@ static void block_gpu_pte_write_big(uvm_va_block_t *block,
NvU64 pte_val;
uvm_gpu_phys_address_t pte_addr;
uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
int nid = NUMA_NO_NODE;
// Assume that this mapping will be used to write to the page
if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) {
uvm_page_index_t page_index;
if (UVM_ID_IS_CPU(resident_id)) {
nid = block_get_page_node_residency(block, big_region.first);
UVM_ASSERT(nid != NUMA_NO_NODE);
for_each_va_block_page_in_region(page_index, big_region)
block_mark_cpu_page_dirty(block, page_index);
// Assume that this mapping will be used to write to the page
if (new_prot > UVM_PROT_READ_ONLY && !uvm_va_block_is_hmm(block)) {
uvm_page_index_t page_index;
for_each_va_block_page_in_region(page_index, big_region)
block_mark_cpu_page_dirty(block, page_index, nid);
}
}
if (big_region.first >= contig_region.outer) {
contig_region = block_phys_contig_region(block, big_region.first, resident_id);
contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
if (big_region.first >= contig_region.outer || nid != contig_nid) {
contig_region = block_phys_contig_region(block, big_region.first, resident_id, nid);
contig_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, contig_region.first), gpu);
page_addr = contig_addr;
contig_nid = nid;
}
page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE;
@ -5164,14 +5996,19 @@ static void block_gpu_pte_write_2m(uvm_va_block_t *block,
NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
NvU64 pte_val;
NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
int nid = NUMA_NO_NODE;
UVM_ASSERT(new_prot != UVM_PROT_NONE);
UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
block_mark_cpu_page_dirty(block, 0);
if (UVM_ID_IS_CPU(resident_id)) {
nid = block_get_page_node_residency(block, 0);
UVM_ASSERT(nid != NUMA_NO_NODE);
if (!uvm_va_block_is_hmm(block))
block_mark_cpu_page_dirty(block, 0, nid);
}
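// A 2M PTE maps the whole block as one physically contiguous allocation, so
// the residency node of page 0 stands for the entire mapping.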
page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu);
page_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, 0), gpu);
pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
@ -6195,9 +7032,9 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
// If all pages in the 2M mask have the same attributes after the
// operation is applied, we can use a 2M PTE.
if (block_gpu_supports_2m(block, gpu) &&
uvm_page_mask_full(page_mask_after) &&
(UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) {
if (block_gpu_supports_2m(block, gpu) && uvm_page_mask_full(page_mask_after) &&
(UVM_ID_IS_INVALID(resident_id) ||
is_block_phys_contig(block, resident_id, block_get_page_node_residency(block, 0)))) {
new_pte_state->pte_is_2m = true;
new_pte_state->needs_4k = false;
return;
@ -6233,14 +7070,18 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
can_make_new_big_ptes = false;
for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) {
uvm_va_block_region_t contig_region = {0};
uvm_cpu_chunk_t *chunk = NULL;
int nid;
if (UVM_ID_IS_CPU(resident_id)) {
nid = block_get_page_node_residency(block, page_index);
UVM_ASSERT(nid != NUMA_NO_NODE);
chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
}
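// For CPU-resident pages, big-PTE eligibility now also depends on the backing
// chunk: it must be at least big_page_size and the whole big page region must
// be resident on that node (checked below).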
big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size);
big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
if (!UVM_ID_IS_INVALID(resident_id))
contig_region = block_phys_contig_region(block, page_index, resident_id);
__set_bit(big_page_index, new_pte_state->big_ptes_covered);
// When mapping sysmem, we can use big pages only if we are mapping all
@ -6249,9 +7090,9 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
if (can_make_new_big_ptes &&
uvm_page_mask_region_full(page_mask_after, big_page_region) &&
(!UVM_ID_IS_CPU(resident_id) ||
(contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) {
(uvm_cpu_chunk_get_size(chunk) >= big_page_size &&
uvm_va_block_cpu_is_region_resident_on(block, nid, big_page_region))))
__set_bit(big_page_index, new_pte_state->big_ptes);
}
if (!test_bit(big_page_index, new_pte_state->big_ptes))
new_pte_state->needs_4k = true;
@ -6667,7 +7508,7 @@ static NV_STATUS block_unmap_gpu(uvm_va_block_t *block,
// If the GPU is the only non-UVM-Lite processor with mappings, we can
// safely mark pages as fully unmapped
if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1)
if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1 && !uvm_va_block_is_hmm(block))
uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap);
}
@ -6803,7 +7644,7 @@ static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t bloc
struct page *page;
if (UVM_ID_IS_CPU(block_page.processor)) {
page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index);
page = uvm_va_block_get_cpu_page(block, block_page.page_index);
}
else {
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
@ -6853,6 +7694,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
NV_STATUS status;
NvU64 addr;
struct page *page;
int nid = NUMA_NO_NODE;
UVM_ASSERT((uvm_va_block_is_hmm(block) && hmm_vma) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
UVM_ASSERT(new_prot != UVM_PROT_NONE);
@ -6885,16 +7727,21 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
UVM_ASSERT(va_range);
if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
// Add the page's range group range to the range group's migrated list.
uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
uvm_va_block_cpu_page_address(block, page_index));
if (rgr != NULL) {
uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
if (list_empty(&rgr->range_group_migrated_list_node))
list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
if (UVM_ID_IS_CPU(resident_id)) {
if (UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
// Add the page's range group range to the range group's migrated list.
uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
uvm_va_block_cpu_page_address(block, page_index));
if (rgr != NULL) {
uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
if (list_empty(&rgr->range_group_migrated_list_node))
list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
}
}
nid = block_get_page_node_residency(block, page_index);
UVM_ASSERT(nid != NUMA_NO_NODE);
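// The residency node is needed below so block_page_get() can look up the CPU
// chunk backing this page on that node.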
}
// It's possible here that current->mm != vma->vm_mm. That can happen for
@ -6923,7 +7770,7 @@ static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
if (status != NV_OK)
return status;
page = block_page_get(block, block_phys_page(resident_id, page_index));
page = block_page_get(block, block_phys_page(resident_id, nid, page_index));
return uvm_cpu_insert_page(vma, addr, page, new_prot);
}
@ -6946,7 +7793,7 @@ static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
uvm_page_index_t page_index;
uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE);
uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot);
uvm_pte_bits_cpu_t pte_bit;
@ -7012,7 +7859,8 @@ static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map);
uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
if (!uvm_va_block_is_hmm(block))
uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
UVM_ASSERT(block_check_mappings(block));
@ -7042,7 +7890,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
uvm_push_t push;
NV_STATUS status;
uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
uvm_pte_bits_gpu_t pte_bit;
uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
@ -7139,8 +7987,10 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
uvm_processor_mask_set(&va_block->mapped, gpu->id);
// If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages
if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
// If we are mapping a UVM-Lite GPU or HMM va_block, do not update
// maybe_mapped_pages.
if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id) &&
!uvm_va_block_is_hmm(va_block))
uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map);
// Remove all pages resident on this processor from the input mask, which
@ -7389,7 +8239,7 @@ static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
uvm_prot_t new_prot = prot_to_revoke - 1;
uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
block_pte_op_t pte_op;
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
UVM_ASSERT(revoke_page_mask);
@ -7663,7 +8513,7 @@ static void update_read_duplicated_pages_mask(uvm_va_block_t *block,
if (uvm_id_equal(running_id, id))
continue;
running_residency_mask = uvm_va_block_resident_mask_get(block, running_id);
running_residency_mask = uvm_va_block_resident_mask_get(block, running_id, NUMA_NO_NODE);
if (first) {
uvm_page_mask_copy(running_page_mask, running_residency_mask);
@ -7920,8 +8770,8 @@ void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gp
if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
return;
resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id);
resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id);
resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id, NUMA_NO_NODE);
resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id, NUMA_NO_NODE);
// Unmap all pages resident on gpu1, but not on gpu0, from gpu0
if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
@ -7998,7 +8848,7 @@ void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uv
static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
{
NV_STATUS status = NV_OK;
const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id);
const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE);
uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
uvm_va_block_region_t subregion;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
@ -8083,11 +8933,15 @@ void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struc
static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
{
uvm_page_index_t page_index;
uvm_page_mask_t *resident_mask;
uvm_assert_mutex_locked(&va_block->lock);
for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region)
block_mark_cpu_page_dirty(va_block, page_index);
resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
for_each_va_block_page_in_region_mask(page_index, resident_mask, region) {
int nid = block_get_page_node_residency(va_block, page_index);
UVM_ASSERT(nid != NUMA_NO_NODE);
block_mark_cpu_page_dirty(va_block, page_index, nid);
}
}
// Tears down everything within the block, but doesn't free the block itself.
@ -8105,6 +8959,7 @@ static void block_kill(uvm_va_block_t *block)
uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
uvm_page_index_t page_index;
uvm_page_index_t next_page_index;
int nid;
if (uvm_va_block_is_dead(block))
return;
@ -8147,18 +9002,28 @@ static void block_kill(uvm_va_block_t *block)
UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU));
// Free CPU pages
for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) {
// Be conservative and tell the OS we wrote to the page, because we
// sometimes clear the dirty bit after writing to it. HMM dirty flags are
// managed by the kernel.
if (!uvm_va_block_is_hmm(block))
uvm_cpu_chunk_mark_dirty(chunk, 0);
uvm_cpu_chunk_remove_from_block(block, page_index);
uvm_cpu_chunk_free(chunk);
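// CPU chunks are now tracked per NUMA node, so walk every possible node, free
// that node's chunks, and release the per-node chunk storage and state.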
for_each_possible_uvm_node(nid) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
size_t index = node_to_index(nid);
for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block, nid) {
// Be conservative and tell the OS we wrote to the page, because we
// sometimes clear the dirty bit after writing to it. HMM dirty flags are
// managed by the kernel.
if (!uvm_va_block_is_hmm(block))
uvm_cpu_chunk_mark_dirty(chunk, 0);
uvm_cpu_chunk_remove_from_block(block, nid, page_index);
uvm_cpu_chunk_free(chunk);
}
UVM_ASSERT(uvm_page_mask_empty(&node_state->allocated));
UVM_ASSERT(node_state->chunks == 0);
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
}
uvm_kvfree((void *)block->cpu.chunks);
block->cpu.chunks = 0;
uvm_kvfree((void *)block->cpu.node_state);
block->cpu.node_state = NULL;
// Clearing the resident bit isn't strictly necessary since this block
// is getting destroyed, but it keeps state consistent for assertions.
@ -8185,15 +9050,7 @@ void uvm_va_block_destroy(nv_kref_t *nv_kref)
uvm_mutex_lock(&block->lock);
block_kill(block);
uvm_mutex_unlock(&block->lock);
if (uvm_enable_builtin_tests) {
uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
}
else {
kmem_cache_free(g_uvm_va_block_cache, block);
}
uvm_va_block_free(block);
}
void uvm_va_block_kill(uvm_va_block_t *va_block)
@ -8248,14 +9105,6 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
event_data.block_munmap.region = region;
uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
// Set a flag so that GPU fault events are flushed since they might refer
// to the region being unmapped.
// Note that holding the va_block lock prevents GPU VA spaces from
// being removed so the registered_gpu_va_spaces mask is stable.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
// Release any remaining vidmem chunks in the given region.
for_each_gpu_id(gpu_id) {
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
@ -8545,14 +9394,15 @@ error:
return status;
}
static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block)
static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block, int nid)
{
uvm_cpu_chunk_storage_mixed_t *mixed;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, 0);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
NV_STATUS status;
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK);
mixed = uvm_kvmalloc_zero(sizeof(*mixed));
if (!mixed)
@ -8565,23 +9415,25 @@ static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block)
}
bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
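// The low bits of node_state->chunks encode the storage format
// (UVM_CPU_CHUNK_STORAGE_MIXED here) alongside the pointer to the mixed
// storage structure.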
node_state->chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
return status;
}
static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index)
static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
uvm_cpu_chunk_storage_mixed_t *mixed;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
uvm_cpu_chunk_t **small_chunks;
size_t slot_index;
NV_STATUS status;
UVM_ASSERT(chunk);
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
mixed = uvm_cpu_storage_get_ptr(block);
mixed = uvm_cpu_storage_get_ptr(node_state);
slot_index = compute_slot_index(block, page_index);
small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
if (!small_chunks)
@ -8595,12 +9447,13 @@ static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_ind
mixed->slots[slot_index] = small_chunks;
clear_bit(slot_index, mixed->big_chunks);
return status;
}
static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index)
static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
uvm_chunk_size_t new_size;
uvm_gpu_t *gpu;
@ -8638,9 +9491,9 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
}
if (new_size == UVM_CHUNK_SIZE_64K)
status = block_split_cpu_chunk_to_64k(block);
status = block_split_cpu_chunk_to_64k(block, nid);
else
status = block_split_cpu_chunk_to_4k(block, page_index);
status = block_split_cpu_chunk_to_4k(block, page_index, nid);
if (status != NV_OK) {
merge:
@ -8656,16 +9509,18 @@ merge:
return status;
}
static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new)
static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new, int nid)
{
uvm_cpu_chunk_storage_mixed_t *existing_mixed;
uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL;
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(existing, nid);
uvm_va_block_cpu_node_state_t *new_node_state = block_node_state_get(new, nid);
size_t slot_offset;
size_t existing_slot;
NV_STATUS status = NV_OK;
UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED);
existing_mixed = uvm_cpu_storage_get_ptr(existing);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
existing_mixed = uvm_cpu_storage_get_ptr(node_state);
// Pre-allocate chunk storage for the new block. By definition, the new block
// will contain 64K chunks, 4K chunks, or both.
@ -8692,7 +9547,7 @@ static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_
}
}
new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
new_node_state->chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
UVM_ASSERT(status == NV_OK);
done:
@ -8706,19 +9561,21 @@ done:
return status;
}
static void block_free_cpu_chunk_storage(uvm_va_block_t *block)
static void block_free_cpu_chunk_storage(uvm_va_block_t *block, int nid)
{
if (block->cpu.chunks) {
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
if (node_state->chunks) {
uvm_cpu_chunk_storage_mixed_t *mixed;
size_t slot_index;
UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
mixed = uvm_cpu_storage_get_ptr(block);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
mixed = uvm_cpu_storage_get_ptr(node_state);
for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++)
uvm_kvfree(mixed->slots[slot_index]);
uvm_kvfree(mixed);
block->cpu.chunks = 0;
node_state->chunks = 0;
}
}
@ -8731,42 +9588,51 @@ static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_bloc
uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t subchunk_size;
NV_STATUS status = NV_OK;
int nid;
UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE));
splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
// If the page covering the split point has not been populated, there is no
// need to split.
if (!splitting_chunk)
return NV_OK;
for_each_possible_uvm_node(nid) {
splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
// If the split point is aligned on the chunk size, there is no need to
// split.
if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
return NV_OK;
// If the page covering the split point has not been populated, there is no
// need to split.
if (!splitting_chunk)
continue;
// Remove all sizes above the chunk's current size.
split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
// Remove all sizes below the alignment of the new block's start.
split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
// If the split point is aligned on the chunk size, there is no need to
// split.
if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
continue;
for_each_chunk_size_rev(subchunk_size, split_sizes) {
status = block_split_cpu_chunk_one(existing, page_index);
// Remove all sizes above the chunk's current size.
split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
// Remove all sizes below the alignment of the new block's start.
split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
for_each_chunk_size_rev(subchunk_size, split_sizes) {
status = block_split_cpu_chunk_one(existing, page_index, nid);
if (status != NV_OK)
return status;
}
status = block_prealloc_cpu_chunk_storage(existing, new, nid);
if (status != NV_OK)
return status;
break;
}
return block_prealloc_cpu_chunk_storage(existing, new);
return status;
}
static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index)
static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(node_state);
size_t slot_index = compute_slot_index(block, page_index);
uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index];
uvm_cpu_chunk_t *merged_chunk;
UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
UVM_ASSERT(small_chunks);
UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
@ -8776,34 +9642,38 @@ static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_
uvm_kvfree(small_chunks);
}
static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index)
static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(node_state);
uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots;
uvm_cpu_chunk_t *merged_chunk;
UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK));
merged_chunk = uvm_cpu_chunk_merge(big_chunks);
block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
node_state->chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
uvm_kvfree(mixed);
}
static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index)
static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
{
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
uvm_gpu_id_t id;
if (!chunk)
return;
if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) {
block_merge_cpu_chunks_to_64k(block, page_index);
block_merge_cpu_chunks_to_64k(block, page_index, nid);
}
else {
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
block_merge_cpu_chunks_to_2m(block, page_index);
block_merge_cpu_chunks_to_2m(block, page_index, nid);
}
chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
for_each_gpu_id(id) {
NvU64 gpu_mapping_addr;
@ -8826,50 +9696,56 @@ static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t p
static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
{
uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t largest_size;
uvm_chunk_size_t chunk_size;
uvm_chunk_size_t merge_size;
size_t block_size = uvm_va_block_size(existing);
int nid;
if (!chunk || uvm_cpu_chunk_is_physical(chunk))
return;
chunk_size = uvm_cpu_chunk_get_size(chunk);
// Remove all CPU chunk sizes above the size of the existing VA block.
// Since block sizes are not always powers of 2, use the largest power of 2
// less than or equal to the block size, as we can't merge to a size
// larger than the block's size.
largest_size = rounddown_pow_of_two(block_size);
merge_sizes &= (largest_size | (largest_size - 1));
// Remove all CPU chunk sizes smaller than the size of the chunk being merged up.
merge_sizes &= ~(chunk_size | (chunk_size - 1));
for_each_possible_uvm_node(nid) {
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(existing, nid);
uvm_chunk_size_t chunk_size;
uvm_chunk_size_t merge_size;
for_each_chunk_size(merge_size, merge_sizes) {
uvm_va_block_region_t chunk_region;
if (!chunk || uvm_cpu_chunk_is_physical(chunk))
continue;
// The block has to fully contain the VA range after the merge.
if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
break;
chunk_size = uvm_cpu_chunk_get_size(chunk);
chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
// Remove all CPU chunk sizes above the size of the existing VA block.
merge_sizes &= (largest_size | (largest_size - 1));
// If not all pages in the region covered by the chunk are allocated,
// we can't merge.
if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region))
break;
// Remove all CPU chunk sizes smaller than the size of the chunk being merged up.
merge_sizes &= ~(chunk_size | (chunk_size - 1));
block_merge_cpu_chunks_one(existing, chunk_region.first);
chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
if (uvm_cpu_chunk_is_physical(chunk))
break;
for_each_chunk_size(merge_size, merge_sizes) {
uvm_va_block_region_t chunk_region;
// The block has to fully contain the VA range after the merge.
if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
break;
chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
// If not all pages in the region covered by the chunk are allocated,
// we can't merge.
if (!uvm_page_mask_region_full(&node_state->allocated, chunk_region))
break;
block_merge_cpu_chunks_one(existing, chunk_region.first, nid);
chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
if (uvm_cpu_chunk_is_physical(chunk))
break;
}
block_free_cpu_chunk_storage(new, nid);
}
block_free_cpu_chunk_storage(new);
}
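As a side note on the mask arithmetic used in the merge path above: merge candidates are restricted with two bit tricks. ANDing with largest_size | (largest_size - 1) keeps only chunk sizes no larger than the largest power of two that fits in the block, and ANDing with ~(chunk_size | (chunk_size - 1)) drops every size at or below the chunk being merged up. The following standalone sketch is illustration only (it is not driver code, and rounddown_pow2() is a local stand-in for the kernel's rounddown_pow_of_two()); it walks through the arithmetic with hypothetical sizes.

#include <stdio.h>
#include <stdint.h>

/* Illustration only: local stand-in for the kernel's rounddown_pow_of_two(). */
static uint64_t rounddown_pow2(uint64_t x)
{
    uint64_t p = 1;

    while ((p << 1) != 0 && (p << 1) <= x)
        p <<= 1;
    return p;
}

int main(void)
{
    /* Hypothetical inputs: a 2MB block with a 64K chunk being merged up, and
     * supported chunk sizes of 4K, 64K and 2M encoded as a size bitmask. */
    uint64_t block_size  = 2ULL * 1024 * 1024;
    uint64_t chunk_size  = 64ULL * 1024;
    uint64_t merge_sizes = (4ULL * 1024) | (64ULL * 1024) | (2ULL * 1024 * 1024);

    uint64_t largest_size = rounddown_pow2(block_size);

    /* Keep only sizes that fit within the block. */
    merge_sizes &= (largest_size | (largest_size - 1));

    /* Drop sizes at or below the chunk being merged up. */
    merge_sizes &= ~(chunk_size | (chunk_size - 1));

    /* Prints 0x200000: only the 2M size remains as a merge target. */
    printf("remaining merge sizes: 0x%llx\n", (unsigned long long)merge_sizes);
    return 0;
}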
// Pre-allocate everything which doesn't require retry on both existing and new
@ -8977,7 +9853,7 @@ static void block_set_processor_masks(uvm_va_block_t *block)
uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
}
if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) {
if (uvm_page_mask_region_empty(uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE), block_region)) {
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0)
@ -9046,6 +9922,7 @@ static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
uvm_page_index_t next_page_index;
uvm_cpu_chunk_t *chunk;
uvm_va_range_t *existing_va_range = existing->va_range;
int nid;
if (existing_va_range) {
UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
@ -9063,30 +9940,35 @@ static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
// We don't have to unmap the CPU since its virtual -> physical mappings
// don't change.
page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1);
for_each_possible_uvm_node(nid) {
uvm_page_mask_t *existing_resident_mask = uvm_va_block_resident_mask_get(existing, UVM_ID_CPU, nid);
uvm_page_mask_t *new_resident_mask = uvm_va_block_resident_mask_get(new, UVM_ID_CPU, nid);
for_each_cpu_chunk_in_block_region_safe(chunk,
page_index,
next_page_index,
existing,
uvm_va_block_region(split_page_index, block_region.outer)) {
uvm_page_index_t new_chunk_page_index;
NV_STATUS status;
for_each_cpu_chunk_in_block_region_safe(chunk,
page_index,
next_page_index,
existing,
nid,
uvm_va_block_region(split_page_index, block_region.outer)) {
uvm_page_index_t new_chunk_page_index;
NV_STATUS status;
uvm_cpu_chunk_remove_from_block(existing, page_index);
uvm_cpu_chunk_remove_from_block(existing, nid, page_index);
// The chunk has to be adjusted for the new block before inserting it.
new_chunk_page_index = page_index - split_page_index;
// The chunk has to be adjusted for the new block before inserting it.
new_chunk_page_index = page_index - split_page_index;
// This should never fail because all necessary storage was allocated
// in block_presplit_cpu_chunks().
status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
UVM_ASSERT(status == NV_OK);
// This should never fail because all necessary storage was allocated
// in block_presplit_cpu_chunks().
status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
UVM_ASSERT(status == NV_OK);
}
block_split_page_mask(existing_resident_mask, existing_pages, new_resident_mask, new_pages);
}
new->cpu.ever_mapped = existing->cpu.ever_mapped;
block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages);
new->cpu.ever_mapped = existing->cpu.ever_mapped;
for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages);
@ -9236,6 +10118,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
size_t num_chunks, i;
uvm_cpu_chunk_t *cpu_chunk;
uvm_page_index_t page_index;
int nid;
if (!existing_gpu_state)
return;
@ -9249,10 +10132,12 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
UVM_ASSERT(PAGE_ALIGNED(existing->start));
existing_pages = (new->start - existing->start) / PAGE_SIZE;
for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) {
uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent),
new);
for_each_possible_uvm_node(nid) {
for_each_cpu_chunk_in_block(cpu_chunk, page_index, new, nid) {
uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent),
new);
}
}
block_copy_split_gpu_chunks(existing, new, gpu);
@ -9483,10 +10368,12 @@ NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
&new_block->read_duplicated_pages,
uvm_va_block_num_cpu_pages(new_block));
block_split_page_mask(&existing_va_block->maybe_mapped_pages,
uvm_va_block_num_cpu_pages(existing_va_block),
&new_block->maybe_mapped_pages,
uvm_va_block_num_cpu_pages(new_block));
if (!uvm_va_block_is_hmm(existing_va_block)) {
block_split_page_mask(&existing_va_block->maybe_mapped_pages,
uvm_va_block_num_cpu_pages(existing_va_block),
&new_block->maybe_mapped_pages,
uvm_va_block_num_cpu_pages(new_block));
}
block_set_processor_masks(existing_va_block);
block_set_processor_masks(new_block);
@ -9506,7 +10393,10 @@ out:
UVM_ASSERT(block_check_mappings(new_block));
}
else {
block_free_cpu_chunk_storage(new_block);
int nid;
for_each_possible_uvm_node(nid)
block_free_cpu_chunk_storage(new_block, nid);
}
uvm_mutex_unlock_no_tracking(&new_block->lock);
@ -9868,7 +10758,8 @@ uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block
// Fast path: if the page is not mapped anywhere else, it can be safely
// mapped with RWA permission
if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index))
if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index) &&
!uvm_va_block_is_hmm(va_block))
return UVM_PROT_READ_WRITE_ATOMIC;
block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
@ -10155,30 +11046,6 @@ static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
return preferred_location;
// Check if we should map the closest resident processor remotely on remote CPU fault
//
// When faulting on the CPU, there's a Linux process acting on its behalf, which is
// associated with a unique VM pointed to by current->mm. A block of memory residing
// on a GPU is also associated with a VM, pointed to by va_block_context->mm. If they
// match, it's a regular (local) fault, and we may want to migrate a page from the GPU
// to the CPU. If it's a 'remote' fault, i.e. the Linux process differs from the one
// associated with the block's VM, we might preserve residency.
//
// Establishing a remote fault without access counters means the memory could stay in
// the wrong spot for a long time, which is why we prefer to avoid creating remote
// mappings. However, when a NIC accesses memory residing on a GPU, it's worth keeping
// it in place for those accesses.
//
// The logic that's used to detect remote faulting also keeps memory in place for
// ptrace accesses. We would prefer to control those policies separately, but the
// NIC case takes priority.
if (UVM_ID_IS_CPU(processor_id) &&
uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
va_block_context->mm != current->mm) {
UVM_ASSERT(va_block_context->mm != NULL);
return closest_resident_processor;
}
// If the page is resident on a processor other than the preferred location,
// or the faulting processor can't access the preferred location, we select
// the faulting processor as the new residency.
@ -10214,9 +11081,14 @@ uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
// If the intended residency doesn't have memory, fall back to the CPU.
if (!block_processor_has_memory(va_block, id)) {
*read_duplicate = false;
return UVM_ID_CPU;
id = UVM_ID_CPU;
}
// Set the destination NUMA node unconditionally since it could be used
// for CPU allocations (if staging pages are needed) even if the new
// residency is not the CPU.
va_block_context->make_resident.dest_nid = policy->preferred_nid;
return id;
}
@ -10260,7 +11132,7 @@ static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block,
// Update prefetch tracking structure with the pages that will migrate
// due to faults
uvm_perf_prefetch_get_hint_va_block(va_block,
&service_context->block_context,
service_context->block_context,
new_residency,
new_residency_mask,
service_context->region,
@ -10304,11 +11176,11 @@ NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_processor_mask_t *all_involved_processors =
&service_context->block_context.make_resident.all_involved_processors;
&service_context->block_context->make_resident.all_involved_processors;
uvm_page_mask_t *new_residency_mask =
&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
uvm_page_mask_t *did_migrate_mask = &service_context->block_context->make_resident.pages_changed_residency;
uvm_page_mask_t *caller_page_mask = &service_context->block_context->caller_page_mask;
uvm_make_resident_cause_t cause;
NV_STATUS status;
@ -10343,7 +11215,7 @@ NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
&service_context->read_duplicate_mask)) {
status = uvm_va_block_make_resident_read_duplicate(va_block,
block_retry,
&service_context->block_context,
service_context->block_context,
new_residency,
service_context->region,
caller_page_mask,
@ -10359,7 +11231,7 @@ NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
uvm_page_mask_copy(caller_page_mask, new_residency_mask);
status = uvm_va_block_make_resident_copy(va_block,
block_retry,
&service_context->block_context,
service_context->block_context,
new_residency,
service_context->region,
caller_page_mask,
@ -10417,11 +11289,11 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
uvm_va_block_t *va_block,
uvm_service_block_context_t *service_context)
{
uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id;
uvm_processor_id_t new_residency = service_context->block_context->make_resident.dest_id;
uvm_page_mask_t *new_residency_mask =
&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
uvm_page_mask_t *did_migrate_mask = &service_context->block_context->make_resident.pages_changed_residency;
uvm_page_mask_t *caller_page_mask = &service_context->block_context->caller_page_mask;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_prot_t new_prot;
uvm_page_index_t page_index;
@ -10430,7 +11302,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
// Update residency.
if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask))
uvm_va_block_make_resident_finish(va_block,
&service_context->block_context,
service_context->block_context,
service_context->region,
caller_page_mask);
@ -10450,7 +11322,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) {
new_prot = compute_new_permission(va_block,
service_context->block_context.hmm.vma,
service_context->block_context->hmm.vma,
page_index,
processor_id,
new_residency,
@ -10523,7 +11395,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
// uvm_va_block_make_resident_read_duplicate, above.
if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
UVM_ASSERT(check_access_counters_dont_revoke(va_block,
&service_context->block_context,
service_context->block_context,
service_context->region,
&revoke_processors,
&service_context->revocation_mask,
@ -10532,7 +11404,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
// Downgrade other processors' mappings
status = uvm_va_block_revoke_prot_mask(va_block,
&service_context->block_context,
service_context->block_context,
&revoke_processors,
service_context->region,
&service_context->revocation_mask,
@ -10562,7 +11434,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
// A CPU fault is unexpected if:
// curr_prot == RW || (!is_write && curr_prot == RO)
status = uvm_va_block_unmap(va_block,
&service_context->block_context,
service_context->block_context,
UVM_ID_CPU,
service_context->region,
map_prot_mask,
@ -10579,13 +11451,13 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
// Map pages that are thrashing first
if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) {
uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask;
uvm_page_mask_t *helper_page_mask = &service_context->block_context->caller_page_mask;
bool pages_need_mapping = uvm_page_mask_and(helper_page_mask,
map_prot_mask,
&service_context->thrashing_pin_mask);
if (pages_need_mapping) {
status = uvm_va_block_map(va_block,
&service_context->block_context,
service_context->block_context,
processor_id,
service_context->region,
helper_page_mask,
@ -10607,7 +11479,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
}
status = uvm_va_block_map(va_block,
&service_context->block_context,
service_context->block_context,
processor_id,
service_context->region,
map_prot_mask,
@ -10649,7 +11521,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr);
status = uvm_va_block_add_mappings_after_migration(va_block,
&service_context->block_context,
service_context->block_context,
new_residency,
processor_id,
uvm_va_block_region_for_page(page_index),
@ -10669,7 +11541,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
// Map the rest of pages in a single shot
status = uvm_va_block_add_mappings_after_migration(va_block,
&service_context->block_context,
service_context->block_context,
new_residency,
processor_id,
service_context->region,
@ -10694,7 +11566,7 @@ NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
service_context->block_context.hmm.vma,
service_context->block_context->hmm.vma,
service_context->region));
// GPU fault servicing must be done under the VA space read lock. GPU fault
@ -10777,7 +11649,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
return NV_ERR_INVALID_OPERATION;
}
else {
uvm_va_space_t *va_space = va_range->va_space;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
return uvm_processor_mask_test(
&va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)],
@ -10856,7 +11728,7 @@ static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
UVM_ASSERT(fault_addr >= va_block->start);
UVM_ASSERT(fault_addr <= va_block->end);
uvm_assert_mmap_lock_locked(service_context->block_context.mm);
uvm_assert_mmap_lock_locked(service_context->block_context->mm);
policy = uvm_va_policy_get(va_block, fault_addr);
@ -10873,7 +11745,7 @@ static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
// Check logical permissions
page_index = uvm_va_block_cpu_page_index(va_block, fault_addr);
status = uvm_va_block_check_logical_permissions(va_block,
&service_context->block_context,
service_context->block_context,
UVM_ID_CPU,
page_index,
fault_access_type,
@ -10905,7 +11777,7 @@ static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
service_context->block_context,
page_index,
UVM_ID_CPU,
uvm_fault_access_type_mask_bit(fault_access_type),
@ -11251,7 +12123,7 @@ NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
if (UVM_ID_IS_CPU(proc)) {
char *mapped_page;
struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
struct page *page = uvm_va_block_get_cpu_page(va_block, page_index);
void *src = uvm_mem_get_cpu_addr_kernel(src_mem);
status = uvm_tracker_wait(&va_block->tracker);
@ -11272,7 +12144,9 @@ NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
dst_gpu = block_get_gpu(va_block, proc);
dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu);
dst_gpu_address = block_phys_page_copy_address(va_block,
block_phys_page(proc, NUMA_NO_NODE, page_index),
dst_gpu);
dst_gpu_address.address += page_offset;
return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size);
@ -11333,7 +12207,7 @@ NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem,
else if (UVM_ID_IS_CPU(proc)) {
NV_STATUS status;
char *mapped_page;
struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
struct page *page = uvm_va_block_get_cpu_page(va_block, page_index);
status = uvm_tracker_wait(&va_block->tracker);
if (status != NV_OK)
@ -11349,7 +12223,9 @@ NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem,
uvm_gpu_address_t src_gpu_address;
uvm_gpu_t *gpu = block_get_gpu(va_block, proc);
src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu);
src_gpu_address = block_phys_page_copy_address(va_block,
block_phys_page(proc, NUMA_NO_NODE, page_index),
gpu);
src_gpu_address.address += page_offset;
return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size);
@ -11539,7 +12415,7 @@ NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
goto out;
// Only move pages resident on the GPU
uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id));
uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors);
if (uvm_va_block_is_hmm(va_block)) {
@ -11776,6 +12652,12 @@ NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *
va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
}
if (params->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
va_block_test->cpu_chunk_allocation_target_id = params->cpu_chunk_allocation_target_id;
if (params->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
va_block_test->cpu_chunk_allocation_actual_id = params->cpu_chunk_allocation_actual_id;
if (params->eviction_error)
va_block_test->inject_eviction_error = params->eviction_error;
@ -12028,6 +12910,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
params->resident_on_count = 0;
params->populated_on_count = 0;
params->mapped_on_count = 0;
params->resident_nid = -1;
status = NV_OK;
@ -12040,19 +12923,34 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
page_index = uvm_va_block_cpu_page_index(block, addr);
uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);
params->resident_nid = -1;
for_each_id_in_mask(id, &resident_on_mask) {
block_phys_page_t block_page = block_phys_page(id, page_index);
block_phys_page_t block_page;
int nid = block_get_page_node_residency(block, page_index);
block_page = block_phys_page(id, nid, page_index);
uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
params->resident_physical_size[count] = block_phys_page_size(block, block_page);
if (UVM_ID_IS_CPU(id)) {
params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
params->resident_physical_address[count] = page_to_phys(uvm_va_block_get_cpu_page(block, page_index));
params->resident_nid = nid;
// Check that the page is only resident on a single CPU NUMA node.
for_each_possible_uvm_node(nid) {
if (uvm_va_block_cpu_is_page_resident_on(block, nid, page_index) && nid != params->resident_nid) {
status = NV_ERR_INVALID_STATE;
goto out;
}
}
}
else {
params->resident_physical_address[count] =
block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
}
++count;
}
params->resident_on_count = count;
count = 0;
@ -12060,6 +12958,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
uvm_processor_id_t processor_to_map;
block_phys_page_t block_page;
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
int nid = NUMA_NO_NODE;
if (page_size == 0)
continue;
@ -12069,7 +12968,10 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
processor_to_map = block_get_processor_to_map(block, id, page_index);
block_page = block_phys_page(processor_to_map, page_index);
if (UVM_ID_IS_CPU(processor_to_map))
nid = block_get_page_node_residency(block, page_index);
block_page = block_phys_page(processor_to_map, nid, page_index);
if (!UVM_ID_IS_CPU(id)) {
uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
@ -12093,7 +12995,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
for_each_gpu_id(id) {
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
uvm_reverse_map_t sysmem_page;
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
size_t num_pages;
uvm_gpu_t *gpu;

View File

@ -44,6 +44,7 @@
#include <linux/mmu_notifier.h>
#include <linux/wait.h>
#include <linux/nodemask.h>
// VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
// (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
@ -229,6 +230,42 @@ typedef struct
} uvm_va_block_gpu_state_t;
typedef struct
{
// Per-page residency bit vector, used for fast traversal of resident
// pages.
//
// A set bit means the CPU has a coherent copy of the physical page
// resident in the NUMA node's memory, and that a CPU chunk for the
// corresponding page index has been allocated. This does not mean that
// the coherent copy is currently mapped anywhere, however. A page may be
// resident on multiple processors (but not multiple CPU NUMA nodes) when in
// read-duplicate mode.
//
// A cleared bit means the CPU NUMA node does not have a coherent copy of
// that page resident. A CPU chunk for the corresponding page index may or
// may not have been allocated. If the chunk is present, it's a cached chunk
// which can be reused in the future.
//
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
// overhead on the whole.
uvm_page_mask_t resident;
// Per-page allocation bit vector.
//
// A set bit means that a CPU chunk has been allocated for the
// corresponding page index on this NUMA node.
uvm_page_mask_t allocated;
// CPU memory chunks represent physically contiguous CPU memory
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
} uvm_va_block_cpu_node_state_t;
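To make the relationship between these per-node masks and the block-wide cumulative masks (described further below in uvm_va_block_struct) concrete, here is a simplified, self-contained sketch. The type and field names are stand-ins rather than the driver's, and the driver maintains the cumulative mask incrementally through helper functions instead of recomputing it as done here.

#include <stddef.h>
#include <string.h>

#define EXAMPLE_PAGES_PER_BLOCK 512
#define EXAMPLE_MASK_WORDS (EXAMPLE_PAGES_PER_BLOCK / (8 * sizeof(unsigned long)))
#define EXAMPLE_MAX_NODES 4

/* Stand-in for uvm_page_mask_t: one bit per page in the block. */
typedef struct { unsigned long bits[EXAMPLE_MASK_WORDS]; } example_page_mask_t;

/* Stand-in for uvm_va_block_cpu_node_state_t: per-NUMA-node CPU state. */
typedef struct {
    example_page_mask_t resident;   /* pages with a coherent copy on this node */
    example_page_mask_t allocated;  /* pages with a CPU chunk allocated on this node */
} example_node_state_t;

/* The block-level CPU resident mask is the logical OR of all per-node
 * resident masks; a page may be resident on at most one CPU NUMA node. */
static void example_recompute_cpu_resident(const example_node_state_t nodes[EXAMPLE_MAX_NODES],
                                           example_page_mask_t *cpu_resident)
{
    size_t n, w;

    memset(cpu_resident, 0, sizeof(*cpu_resident));
    for (n = 0; n < EXAMPLE_MAX_NODES; n++)
        for (w = 0; w < EXAMPLE_MASK_WORDS; w++)
            cpu_resident->bits[w] |= nodes[n].resident.bits[w];
}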
// TODO: Bug 1766180: Worst-case we could have one of these per system page.
// Options:
// 1) Rely on the OOM killer to prevent the user from trying to do that
@ -306,38 +343,30 @@ struct uvm_va_block_struct
struct
{
// Per-page residency bit vector, used for fast traversal of resident
// pages.
//
// A set bit means the CPU has a coherent copy of the physical page
// resident in its memory, and that the corresponding entry in the pages
// array is present. This does not mean that the coherent copy is
// currently mapped anywhere, however. A page may be resident on
// multiple processors when in read-duplicate mode.
//
// A cleared bit means the CPU does not have a coherent copy of that
// page resident. The corresponding entry in the pages array may or may
// not be present. If the entry is present, it's a cached page which can be
// reused in the future.
//
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
// overhead on the whole.
uvm_page_mask_t resident;
// CPU memory chunks represent physically contiguous CPU memory
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
// Per-NUMA node tracking of CPU allocations.
// This is a dense array with one entry per possible NUMA node.
uvm_va_block_cpu_node_state_t **node_state;
// Per-page allocation bit vector.
//
// A set bit means that a CPU page has been allocated for the
// corresponding page index.
// corresponding page index on at least one CPU NUMA node.
uvm_page_mask_t allocated;
// Per-page residency bit vector. See
// uvm_va_block_cpu_node_state_t::resident for a detailed description.
// This mask is a cumulative mask (logical OR) of all
// uvm_va_block_cpu_node_state_t::resident masks. It is meant to be used
// only for fast testing of page residency when it matters only if the
// page is resident on the CPU.
//
// Note that this mask cannot be set directly as this will cause
// inconsistencies between this mask and the per-NUMA residency masks.
// In order to properly maintain consistency between the per-NUMA masks
// and this one, uvm_va_block_cpu_[set|clear]_residency_*() helpers
// should be used.
uvm_page_mask_t resident;
// Per-page mapping bit vectors, one per bit we need to track. These are
// used for fast traversal of valid mappings in the block. These contain
// all non-address bits needed to establish a virtual mapping on this
@ -418,7 +447,8 @@ struct uvm_va_block_struct
uvm_page_mask_t read_duplicated_pages;
// Mask to keep track of the pages that are not mapped on any non-UVM-Lite
// processor.
// processor. This mask is not used for HMM because the CPU can map pages
// at any time without notifying the driver.
// 0: Page is definitely not mapped by any processors
// 1: Page may or may not be mapped by a processor
//
@ -525,6 +555,13 @@ struct uvm_va_block_wrapper_struct
// a successful migration if this error flag is cleared.
NvU32 inject_cpu_pages_allocation_error_count;
// The NUMA node ID from which any CPU chunks will be allocated.
// This will override any other setting and/or policy.
// Note that the kernel is still free to allocate from any of the
// nodes in the thread's policy.
int cpu_chunk_allocation_target_id;
int cpu_chunk_allocation_actual_id;
// Force the next eviction attempt on this block to fail. Used for
// testing only.
bool inject_eviction_error;
@ -668,17 +705,12 @@ void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
// Initialization of an already-allocated uvm_va_block_context_t.
//
// mm is used to initialize the value of va_block_context->mm. NULL is allowed.
static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
{
UVM_ASSERT(va_block_context);
void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm);
// Write garbage into the VA Block context to ensure that the UVM code
// clears masks appropriately
if (UVM_IS_DEBUG())
memset(va_block_context, 0xff, sizeof(*va_block_context));
va_block_context->mm = mm;
}
// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
// is returned.
int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context);
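A hedged sketch of the contract documented above and nothing more: where the preferred node actually comes from inside the context is not shown in this diff, so it is passed in explicitly here, and numa_node_id() is the standard kernel accessor for the current CPU's NUMA node.

/* Illustrative only: resolve a policy's preferred node to a concrete node ID. */
static int example_resolve_preferred_node(int preferred_nid)
{
    if (preferred_nid != NUMA_NO_NODE)
        return preferred_nid;

    return numa_node_id();
}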
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
// and page masks could simplify the below APIs and their implementations
@ -734,6 +766,9 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
// those masks. It is the caller's responsibility to zero the masks (or not)
// beforehand.
//
// va_block_context->make_resident.dest_nid is used to guide the NUMA node for
// CPU allocations.
//
// Notably any status other than NV_OK indicates that the block's lock might
// have been unlocked and relocked.
//
@ -1377,8 +1412,14 @@ static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block)
// Get the page residency mask for a processor if it's known to be there.
//
// If the processor is the CPU, the residency mask for the NUMA node ID
// specified by nid will be returned (see
// uvm_va_block_cpu_node_state_t::resident). If nid is NUMA_NO_NODE,
// the cumulative CPU residency mask will be returned (see
// uvm_va_block_t::cpu::resident).
//
// If the processor is a GPU, this will assert that GPU state is indeed present.
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid);
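A brief usage sketch of the nid parameter, assuming the driver's usual context (block lock held); it is illustrative and overlaps with the uvm_va_block_cpu_is_page_resident_on() helper declared further below.

/* Illustrative only. Passing NUMA_NO_NODE returns the cumulative CPU resident
 * mask; passing a specific node ID returns that node's mask. */
static bool example_cpu_page_is_resident(uvm_va_block_t *block,
                                         uvm_page_index_t page_index,
                                         int nid)
{
    uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, nid);

    return uvm_page_mask_test(resident, page_index);
}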
// Get the page mapped mask for a processor. The returned mask cannot be
// directly modified by the caller
@ -1386,6 +1427,13 @@ uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_proce
// If the processor is a GPU, this will assert that GPU state is indeed present.
const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
// Return a mask of non-UVM-Lite pages that are unmapped within the given
// region.
// Locking: The block lock must be held.
void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_page_mask_t *out_mask);
// VA block lookup functions. There are a number of permutations which might be
// useful, such as looking up the block from {va_space, va_range} x {addr,
// block index}. The ones implemented here and in uvm_va_range.h support the
@ -1756,17 +1804,28 @@ static bool uvm_page_mask_full(const uvm_page_mask_t *mask)
return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static void uvm_page_mask_fill(uvm_page_mask_t *mask)
{
bitmap_fill(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
return bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
return bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
static void uvm_page_mask_or(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
static void uvm_page_mask_or(uvm_page_mask_t *mask_out,
const uvm_page_mask_t *mask_in1,
const uvm_page_mask_t *mask_in2)
{
bitmap_or(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
@ -2036,30 +2095,49 @@ uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_blo
uvm_page_index_t page_index,
uvm_processor_id_t processor);
// Mark CPU page page_index as resident on NUMA node specified by nid.
// nid cannot be NUMA_NO_NODE.
void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Test if a CPU page is resident on NUMA node nid. If nid is NUMA_NO_NODE,
// the function will return True if the page is resident on any CPU NUMA node.
bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Test if all pages in region are resident on NUMA node nid. If nid is
// NUMA_NO_NODE, the function will test if the pages in the region are
// resident on any CPU NUMA node.
bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region);
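As a hedged sketch of how these helpers compose (mirroring the consistency check in uvm_test_va_residency_info() earlier in this diff), the following counts the CPU NUMA nodes claiming residency for one page; under the documented invariant the result should never exceed one. It assumes the va_block lock is held.

static int example_count_resident_cpu_nodes(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    int nid;
    int count = 0;

    for_each_possible_uvm_node(nid) {
        if (uvm_va_block_cpu_is_page_resident_on(va_block, nid, page_index))
            count++;
    }

    return count;
}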
// Insert a CPU chunk at the given page_index into the va_block.
// Locking: The va_block lock must be held.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Remove a CPU chunk at the given page_index from the va_block.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Return the CPU chunk at the given page_index on the given NUMA node from the
// va_block. nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
int nid,
uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Return the struct page * from the chunk corresponding to the given page_index
// Locking: The va_block lock must be held.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Return the struct page * of the resident chunk at the given page_index from
// the va_block. The given page_index must be resident on the CPU.
// Locking: The va_block lock must be held.
struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index);
// Physically map a CPU chunk so it is DMA'able from all registered GPUs.
// nid cannot be NUMA_NO_NODE.
// Locking: The va_block lock must be held.
NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
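A minimal sketch of the insert-then-map sequence these declarations imply; error handling is simplified, and the real call sites (such as block_split_cpu() earlier in this diff) also maintain the per-node allocated/resident masks.

/* Illustrative only; assumes the va_block lock is held. */
static NV_STATUS example_add_cpu_chunk(uvm_va_block_t *va_block,
                                       uvm_cpu_chunk_t *chunk,
                                       uvm_page_index_t page_index)
{
    NV_STATUS status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);

    if (status != NV_OK)
        return status;

    /* Make the chunk DMA-addressable from every registered GPU. */
    return uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index);
}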
// Physically unmap a CPU chunk from all registered GPUs.

View File

@ -30,6 +30,7 @@
#include "uvm_forward_decl.h"
#include <linux/migrate.h>
#include <linux/nodemask.h>
// UVM_VA_BLOCK_BITS is 21, meaning the maximum block size is 2MB. Rationale:
// - 2MB matches the largest Pascal GPU page size so it's a natural fit
@ -145,6 +146,18 @@ typedef struct
unsigned count;
} uvm_prot_page_mask_array_t[UVM_PROT_MAX - 1];
typedef struct
{
// A per-NUMA-node array of page masks (size num_possible_nodes()) that hold
// the set of CPU pages used by the migration operation.
uvm_page_mask_t **node_masks;
// Node mask used to iterate over the page masks above.
// If a node's bit is set, it means that the page mask given by
// node_to_index() in node_masks has set pages.
nodemask_t nodes;
} uvm_make_resident_page_tracking_t;
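A short sketch of how this tracking structure might be walked, assuming the driver's context: for_each_node_mask() is the standard kernel nodemask iterator, and node_to_index() is the mapping the comment above refers to (its definition is not part of this excerpt).

/* Illustrative only: visit the page mask recorded for each tracked NUMA node. */
static void example_visit_tracked_nodes(uvm_make_resident_page_tracking_t *tracking)
{
    int nid;

    for_each_node_mask(nid, tracking->nodes) {
        uvm_page_mask_t *mask = tracking->node_masks[node_to_index(nid)];

        /* ... operate on the CPU pages tracked for node nid ... */
        (void)mask;
    }
}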
// In the worst case some VA block operations require more state than we should
// reasonably store on the stack. Instead, we dynamically allocate VA block
// contexts. These are used for almost all operations on VA blocks.
@ -159,6 +172,9 @@ typedef struct
// this block_context.
uvm_page_mask_t scratch_page_mask;
// Scratch node mask. This follows the same rules as scratch_page_mask.
nodemask_t scratch_node_mask;
// State used by uvm_va_block_make_resident
struct uvm_make_resident_context_struct
{
@ -181,10 +197,24 @@ typedef struct
// Used to perform ECC checks after the migration is done.
uvm_processor_mask_t all_involved_processors;
// Page mask used to compute the set of CPU pages for each CPU node.
uvm_page_mask_t node_pages_mask;
// Final residency for the data. This is useful for callees to know if
// a migration is part of a staging copy
uvm_processor_id_t dest_id;
// Final residency NUMA node if the migration destination is the CPU.
int dest_nid;
// This structure is used to track CPU pages used for migrations on
// a per-NUMA node basis.
//
// The pages could be used for either migrations to the CPU (used to
// track the destination CPU pages) or staging copies (used to track
// the CPU pages used for the staging).
uvm_make_resident_page_tracking_t cpu_pages_used;
// Event that triggered the call
uvm_make_resident_cause_t cause;
} make_resident;
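For context, dest_nid is what uvm_va_block_select_residency() sets from the policy's preferred_nid earlier in this diff; a caller-side sketch, illustrative only:

/* Illustrative only: direct CPU allocations for an upcoming make-resident
 * operation to a specific NUMA node (NUMA_NO_NODE leaves the choice to policy). */
static void example_set_cpu_destination_node(uvm_va_block_context_t *va_block_context,
                                             int preferred_nid)
{
    va_block_context->make_resident.dest_nid = preferred_nid;
}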

View File

@ -31,6 +31,7 @@
const uvm_va_policy_t uvm_va_policy_default = {
.preferred_location = UVM_ID_INVALID,
.preferred_nid = NUMA_NO_NODE,
.read_duplication = UVM_READ_DUPLICATION_UNSET,
};

Some files were not shown because too many files have changed in this diff.