535.43.24

2025-02-27 09:54:14 +01:00 · 2024-01-31 14:02:06 -08:00 · 2024-01-31 14:02:06 -08:00 · e558660fc2
commit e558660fc2
parent 2a3b58b8c8
267 changed files with 89045 additions and 82824 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,7 +2,7 @@

 ## Release 535 Entries

-### [535.54.03] 2023-06-14
+### [535.43.24] 2024-01-31

 ### [535.43.23] 2024-01-24

@ -30,6 +30,7 @@

 #### Fixed

+- Fixed a failure to build main against current CentOS Stream 8, [#550](https://github.com/NVIDIA/open-gpu-kernel-modules/issues/550) by @airlied
 - Fixed console restore with traditional VGA consoles.

 #### Added
@ -58,6 +59,14 @@

 ## Release 525 Entries

+### [525.147.05] 2023-10-31
+
+#### Fixed
+
+- Fixed a double-free in the register-callback error path of nvidia_p2p_get_pages(), [#557](https://github.com/NVIDIA/open-gpu-kernel-modules/pull/557) by @BrendanCunningham
+
+### [525.125.06] 2023-06-26
+
 ### [525.116.04] 2023-05-09

 ### [525.116.03] 2023-04-25
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 # NVIDIA Linux Open GPU Kernel Module Source

 This is the source release of the NVIDIA Linux open GPU kernel modules,
-version 535.43.23.
+version 535.43.24.


 ## How to Build
@ -17,7 +17,7 @@ as root:

 Note that the kernel modules built here must be used with GSP
 firmware and user-space NVIDIA GPU driver components from a corresponding
-535.43.23 driver release.  This can be achieved by installing
+535.43.24 driver release.  This can be achieved by installing
 the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
 option.  E.g.,

@ -180,7 +180,7 @@ software applications.
 ## Compatible GPUs

 The open-gpu-kernel-modules can be used on any Turing or later GPU
-(see the table below). However, in the 535.43.23 release,
+(see the table below). However, in the 535.43.24 release,
 GeForce and Workstation support is still considered alpha-quality.

 To enable use of the open kernel modules on GeForce and Workstation GPUs,
@ -188,7 +188,7 @@ set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
 parameter to 1. For more details, see the NVIDIA GPU driver end user
 README here:

-https://us.download.nvidia.com/XFree86/Linux-x86_64/535.43.23/README/kernel_open.html
+https://us.download.nvidia.com/XFree86/Linux-x86_64/535.43.24/README/kernel_open.html

 In the below table, if three IDs are listed, the first is the PCI Device 
 ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.23\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.43.24\"

 ifneq ($(SYSSRCHOST1X),)
 EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@ -123,6 +123,9 @@ ifneq ($(wildcard /proc/sgi_uv),)
 EXTRA_CFLAGS += -DNV_CONFIG_X86_UV
 endif

+ifdef VGX_FORCE_VFIO_PCI_CORE
+ EXTRA_CFLAGS += -DNV_VGPU_FORCE_VFIO_PCI_CORE
+endif

 #
 # The conftest.sh script tests various aspects of the target kernel.
--- a/kernel-open/common/inc/nv-linux.h
+++ b/kernel-open/common/inc/nv-linux.h
@ -2067,4 +2067,7 @@ typedef enum
 #include <linux/clk-provider.h>
 #endif

+#define NV_EXPORT_SYMBOL(symbol)        EXPORT_SYMBOL_GPL(symbol)
+#define NV_CHECK_EXPORT_SYMBOL(symbol)  NV_IS_EXPORT_SYMBOL_PRESENT_##symbol
+
 #endif  /* _NV_LINUX_H_ */
--- a/kernel-open/common/inc/nv.h
+++ b/kernel-open/common/inc/nv.h
@ -924,6 +924,7 @@ NV_STATUS  NV_API_CALL  rm_ioctl                 (nvidia_stack_t *, nv_state_t *
 NvBool     NV_API_CALL  rm_isr                   (nvidia_stack_t *, nv_state_t *, NvU32 *);
 void       NV_API_CALL  rm_isr_bh                (nvidia_stack_t *, nv_state_t *);
 void       NV_API_CALL  rm_isr_bh_unlocked       (nvidia_stack_t *, nv_state_t *);
+NvBool     NV_API_CALL  rm_is_msix_allowed       (nvidia_stack_t *, nv_state_t *);
 NV_STATUS  NV_API_CALL  rm_power_management      (nvidia_stack_t *, nv_state_t *, nv_pm_action_t);
 NV_STATUS  NV_API_CALL  rm_stop_user_channels    (nvidia_stack_t *, nv_state_t *);
 NV_STATUS  NV_API_CALL  rm_restart_user_channels (nvidia_stack_t *, nv_state_t *);
--- a/kernel-open/common/inc/os-interface.h
+++ b/kernel-open/common/inc/os-interface.h
@ -207,9 +207,13 @@ enum os_pci_req_atomics_type {
    OS_INTF_PCIE_REQ_ATOMICS_128BIT
 };
 NV_STATUS   NV_API_CALL  os_enable_pci_req_atomics   (void *, enum os_pci_req_atomics_type);
+NV_STATUS   NV_API_CALL  os_get_numa_node_memory_usage (NvS32, NvU64 *, NvU64 *);
 NV_STATUS   NV_API_CALL  os_numa_add_gpu_memory      (void *, NvU64, NvU64, NvU32 *);
 NV_STATUS   NV_API_CALL  os_numa_remove_gpu_memory   (void *, NvU64, NvU64, NvU32); 
 NV_STATUS   NV_API_CALL  os_offline_page_at_address(NvU64 address);
+void*       NV_API_CALL  os_get_pid_info(void);
+void        NV_API_CALL  os_put_pid_info(void *pid_info);
+NV_STATUS   NV_API_CALL  os_find_ns_pid(void *pid_info, NvU32 *ns_pid);

 extern NvU32 os_page_size;
 extern NvU64 os_page_mask;
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@ -316,7 +316,7 @@ export_symbol_present_conftest() {
    SYMBOL="$1"
    TAB='	'

-    if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL.*\$" \
+    if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_SYMBOL\(_GPL\)\?\s*\$" \
               "$OUTPUT/Module.symvers" >/dev/null 2>&1; then
        echo "#define NV_IS_EXPORT_SYMBOL_PRESENT_$SYMBOL 1" |
            append_conftest "symbols"
@ -337,7 +337,7 @@ export_symbol_gpl_conftest() {
    SYMBOL="$1"
    TAB='	'

-    if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\$" \
+    if grep -e "${TAB}${SYMBOL}${TAB}.*${TAB}EXPORT_\(UNUSED_\)*SYMBOL_GPL\s*\$" \
               "$OUTPUT/Module.symvers" >/dev/null 2>&1; then
        echo "#define NV_IS_EXPORT_SYMBOL_GPL_$SYMBOL 1" |
            append_conftest "symbols"
@ -4468,6 +4468,24 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE" "" "types"
        ;;

+        mmu_notifier_ops_arch_invalidate_secondary_tlbs)
+            #
+            # Determine if the mmu_notifier_ops struct has the
+            # 'arch_invalidate_secondary_tlbs' member.
+            #
+            # struct mmu_notifier_ops.invalidate_range was renamed to
+            # arch_invalidate_secondary_tlbs by commit 1af5a8109904
+            # ("mmu_notifiers: rename invalidate_range notifier") in v6.6.
+            #
+           CODE="
+            #include <linux/mmu_notifier.h>
+            int conftest_mmu_notifier_ops_arch_invalidate_secondary_tlbs(void) {
+                return offsetof(struct mmu_notifier_ops, arch_invalidate_secondary_tlbs);
+            }"
+
+            compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS" "" "types"
+        ;;
+
        drm_format_num_planes)
            #
            # Determine if drm_format_num_planes() function is present.
@ -5636,23 +5654,6 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_GPIO_TO_IRQ_PRESENT" "" "functions"
        ;;

-        migrate_vma_setup)
-            #
-            # Determine if migrate_vma_setup() function is present
-            #
-            # migrate_vma_setup() function was added by commit
-            # a7d1f22bb74f32cf3cd93f52776007e161f1a738 ("mm: turn migrate_vma
-            # upside down) in v5.4.
-            # (2019-08-20).
-            CODE="
-            #include <linux/migrate.h>
-            int conftest_migrate_vma_setup(void) {
-                migrate_vma_setup();
-            }"
-
-            compile_check_conftest "$CODE" "NV_MIGRATE_VMA_SETUP_PRESENT" "" "functions"
-        ;;
-
        migrate_vma_added_flags)
            #
            # Determine if migrate_vma structure has flags
@ -5743,23 +5744,25 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
        ;;

-        mm_pasid_set)
+        mm_pasid_drop)
            #
-            # Determine if mm_pasid_set() function is present
+            # Determine if mm_pasid_drop() function is present
+            #
+            # Added by commit 701fac40384f ("iommu/sva: Assign a PASID to mm
+            # on PASID allocation and free it on mm exit") in v5.18.
+            # Moved to linux/iommu.h in commit cd3891158a77 ("iommu/sva: Move
+            # PASID helpers to sva code") in v6.4.
            #
-            # mm_pasid_set() function was added by commit
-            # 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
-            # PASID to mm on PASID allocation and free it on mm exit) in v5.18.
-            # (2022-02-15).
            CODE="
            #if defined(NV_LINUX_SCHED_MM_H_PRESENT)
            #include <linux/sched/mm.h>
            #endif
-            void conftest_mm_pasid_set(void) {
-                mm_pasid_set();
+            #include <linux/iommu.h>
+            void conftest_mm_pasid_drop(void) {
+                mm_pasid_drop();
            }"

-            compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
+            compile_check_conftest "$CODE" "NV_MM_PASID_DROP_PRESENT" "" "functions"
        ;;

        drm_crtc_state_has_no_vblank)
@ -6279,6 +6282,21 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_MEMORY_FAILURE_MF_SW_SIMULATED_DEFINED" "" "types"
        ;;

+        crypto_tfm_ctx_aligned)
+            # Determine if 'crypto_tfm_ctx_aligned' is defined.
+            #
+            # Removed by commit 25c74a39e0f6 ("crypto: hmac - remove unnecessary
+            # alignment logic") in v6.7.
+            #
+            CODE="
+            #include <crypto/algapi.h>
+            void conftest_crypto_tfm_ctx_aligned(void) {
+                  (void)crypto_tfm_ctx_aligned();
+            }"
+
+            compile_check_conftest "$CODE" "NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT" "" "functions"
+        ;;
+
        crypto)
            #
            # Determine if we support various crypto functions.
@ -6341,6 +6359,22 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_MEMPOLICY_HAS_HOME_NODE" "" "types"
        ;;

+        mpol_preferred_many_present)
+            #
+            # Determine if MPOL_PREFERRED_MANY enum is present or not
+            #
+            # Added by commit b27abaccf8e8b ("mm/mempolicy: add
+            # MPOL_PREFERRED_MANY for multiple preferred nodes") in
+            # v5.15
+            #
+            CODE="
+            #include <linux/mempolicy.h>
+            int mpol_preferred_many = MPOL_PREFERRED_MANY;
+            "
+
+            compile_check_conftest "$CODE" "NV_MPOL_PREFERRED_MANY_PRESENT" "" "types"
+        ;;
+
        mmu_interval_notifier)
            #
            # Determine if mmu_interval_notifier struct is present or not
@ -6356,6 +6390,21 @@ compile_test() {
            compile_check_conftest "$CODE" "NV_MMU_INTERVAL_NOTIFIER" "" "types"
        ;;

+        drm_unlocked_ioctl_flag_present)
+            # Determine if DRM_UNLOCKED IOCTL flag is present.
+            #
+            # DRM_UNLOCKED was removed by commit 2798ffcc1d6a ("drm: Remove
+            # locking for legacy ioctls and DRM_UNLOCKED") in Linux
+            # next-20231208.
+            CODE="
+            #if defined(NV_DRM_DRM_IOCTL_H_PRESENT)
+            #include <drm/drm_ioctl.h>
+            #endif
+            int flags = DRM_UNLOCKED;"
+
+            compile_check_conftest "$CODE" "NV_DRM_UNLOCKED_IOCTL_FLAG_PRESENT" "" "types"
+        ;;
+
        # When adding a new conftest entry, please use the correct format for
        # specifying the relevant upstream Linux kernel commit.
        #
@ -6680,18 +6729,9 @@ case "$5" in
                VFIO_PCI_CORE_PRESENT=1
            fi

-            # When this sanity check is run via nvidia-installer, it sets ARCH as aarch64.
-            # But, when it is run via Kbuild, ARCH is set as arm64
-            if [ "$ARCH" = "aarch64" ]; then
-                ARCH="arm64"
-            fi
-
            if [ "$VFIO_IOMMU_PRESENT" != "0" ] && [ "$KVM_PRESENT" != "0" ] ; then
-
-                # On x86_64, vGPU requires MDEV framework to be present.
-                # On aarch64, vGPU requires MDEV or vfio-pci-core framework to be present.
-                if ([ "$ARCH" = "arm64" ] && ([ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ])) ||
-                   ([ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" != "0" ];) then
+                # vGPU requires either MDEV or vfio-pci-core framework to be present.
+                if [ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ]; then
                    exit 0
                fi
            fi
@ -6702,14 +6742,10 @@ case "$5" in
                echo "CONFIG_VFIO_IOMMU_TYPE1";
            fi

-            if [ "$ARCH" = "arm64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
+            if [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
                echo "either CONFIG_VFIO_MDEV or CONFIG_VFIO_PCI_CORE";
            fi

-            if [ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ]; then
-                echo "CONFIG_VFIO_MDEV";
-            fi
-
            if [ "$KVM_PRESENT" = "0" ]; then
                echo "CONFIG_KVM";
            fi
--- a/kernel-open/nvidia-drm/nvidia-drm-drv.c
+++ b/kernel-open/nvidia-drm/nvidia-drm-drv.c
@ -1312,9 +1312,21 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
                      DRM_RENDER_ALLOW|DRM_UNLOCKED),
 #endif

+    /*
+     * DRM_UNLOCKED is implicit for all non-legacy DRM driver IOCTLs since Linux
+     * v4.10 commit fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions"
+     * (Linux v4.4 commit ea487835e887 "drm: Enforce unlocked ioctl operation
+     * for kms driver ioctls" previously did it only for drivers that set the
+     * DRM_MODESET flag), so this will race with SET_CLIENT_CAP. Linux v4.11
+     * commit dcf727ab5d17 "drm: setclientcap doesn't need the drm BKL" also
+     * removed locking from SET_CLIENT_CAP so there is no use attempting to lock
+     * manually. The latter commit acknowledges that this can expose userspace
+     * to inconsistent behavior when racing with itself, but accepts that risk.
+     */
    DRM_IOCTL_DEF_DRV(NVIDIA_GET_CLIENT_CAPABILITY,
                      nv_drm_get_client_capability_ioctl,
                      0),
+
 #if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
    DRM_IOCTL_DEF_DRV(NVIDIA_GET_CRTC_CRC32,
                      nv_drm_get_crtc_crc32_ioctl,
--- a/kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c
+++ b/kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c
@ -243,6 +243,15 @@ static int __nv_drm_nvkms_gem_obj_init(
    NvU64 *pages = NULL;
    NvU32 numPages = 0;

+    if ((size % PAGE_SIZE) != 0) {
+        NV_DRM_DEV_LOG_ERR(
+            nv_dev,
+            "NvKmsKapiMemory 0x%p size should be in a multiple of page size to "
+            "create a gem object",
+            pMemory);
+        return -EINVAL;
+    }
+
    nv_nvkms_memory->pPhysicalAddress = NULL;
    nv_nvkms_memory->pWriteCombinedIORemapAddress = NULL;
    nv_nvkms_memory->physically_mapped = false;
--- a/kernel-open/nvidia-drm/nvidia-drm-helper.h
+++ b/kernel-open/nvidia-drm/nvidia-drm-helper.h
@ -582,6 +582,19 @@ static inline int nv_drm_format_num_planes(uint32_t format)

 #endif /* defined(NV_DRM_FORMAT_MODIFIERS_PRESENT) */

+/*
+ * DRM_UNLOCKED was removed with linux-next commit 2798ffcc1d6a ("drm: Remove
+ * locking for legacy ioctls and DRM_UNLOCKED"), but it was previously made
+ * implicit for all non-legacy DRM driver IOCTLs since Linux v4.10 commit
+ * fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions" (Linux v4.4
+ * commit ea487835e887 "drm: Enforce unlocked ioctl operation for kms driver
+ * ioctls" previously did it only for drivers that set the DRM_MODESET flag), so
+ * it was effectively a no-op anyway.
+ */
+#if !defined(NV_DRM_UNLOCKED_IOCTL_FLAG_PRESENT)
+#define DRM_UNLOCKED 0
+#endif
+
 /*
 * drm_vma_offset_exact_lookup_locked() were added
 * by kernel commit 2225cfe46bcc which was Signed-off-by:
--- a/kernel-open/nvidia-drm/nvidia-drm.Kbuild
+++ b/kernel-open/nvidia-drm/nvidia-drm.Kbuild
@ -133,3 +133,4 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_lookup
 NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_put
 NV_CONFTEST_TYPE_COMPILE_TESTS += vm_area_struct_has_const_vm_flags
 NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
+NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present
--- a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
+++ b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
@ -68,6 +68,9 @@ module_param_named(output_rounding_fix, output_rounding_fix, bool, 0400);
 static bool disable_vrr_memclk_switch = false;
 module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);

+static bool opportunistic_display_sync = true;
+module_param_named(opportunistic_display_sync, opportunistic_display_sync, bool, 0400);
+
 /* These parameters are used for fault injection tests.  Normally the defaults
 * should be used. */
 MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");
@ -99,6 +102,11 @@ NvBool nvkms_disable_vrr_memclk_switch(void)
    return disable_vrr_memclk_switch;
 }

+NvBool nvkms_opportunistic_display_sync(void)
+{
+    return opportunistic_display_sync;
+}
+
 #define NVKMS_SYNCPT_STUBS_NEEDED

 /*************************************************************************
@ -200,9 +208,23 @@ static inline int nvkms_read_trylock_pm_lock(void)

 static inline void nvkms_read_lock_pm_lock(void)
 {
-    while (!down_read_trylock(&nvkms_pm_lock)) {
-        try_to_freeze();
-        cond_resched();
+    if ((current->flags & PF_NOFREEZE)) {
+        /*
+         * Non-freezable tasks (i.e. kthreads in this case) don't have to worry
+         * about being frozen during system suspend, but do need to block so
+         * that the CPU can go idle during s2idle. Do a normal uninterruptible
+         * blocking wait for the PM lock.
+         */
+        down_read(&nvkms_pm_lock);
+    } else {
+        /*
+         * For freezable tasks, make sure we give the kernel an opportunity to
+         * freeze if taking the PM lock fails.
+         */
+        while (!down_read_trylock(&nvkms_pm_lock)) {
+            try_to_freeze();
+            cond_resched();
+        }
    }
 }

--- a/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
+++ b/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
@ -99,6 +99,7 @@ typedef struct {
 NvBool nvkms_output_rounding_fix(void);

 NvBool nvkms_disable_vrr_memclk_switch(void);
+NvBool nvkms_opportunistic_display_sync(void);

 void   nvkms_call_rm    (void *ops);
 void*  nvkms_alloc      (size_t size,
--- a/kernel-open/nvidia-peermem/nvidia-peermem.c
+++ b/kernel-open/nvidia-peermem/nvidia-peermem.c
@ -1,20 +1,25 @@
-/* SPDX-License-Identifier: Linux-OpenIB */
 /*
 * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
 *
- *  - Redistributions of source code must retain the above
- *    copyright notice, this list of conditions and the following
- *    disclaimer.
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
 *
- *  - Redistributions in binary form must reproduce the above
- *    copyright notice, this list of conditions and the following
- *    disclaimer in the documentation and/or other materials
- *    provided with the distribution.
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
@ -43,7 +48,9 @@

 MODULE_AUTHOR("Yishai Hadas");
 MODULE_DESCRIPTION("NVIDIA GPU memory plug-in");
-MODULE_LICENSE("Linux-OpenIB");
+
+MODULE_LICENSE("Dual BSD/GPL");
+
 MODULE_VERSION(DRV_VERSION);
 enum {
        NV_MEM_PEERDIRECT_SUPPORT_DEFAULT = 0,
@ -53,7 +60,13 @@ static int peerdirect_support = NV_MEM_PEERDIRECT_SUPPORT_DEFAULT;
 module_param(peerdirect_support, int, S_IRUGO);
 MODULE_PARM_DESC(peerdirect_support, "Set level of support for Peer-direct, 0 [default] or 1 [legacy, for example MLNX_OFED 4.9 LTS]");

-#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d " FMT, __FUNCTION__, __LINE__, ## ARGS)
+
+#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d ERROR " FMT, __FUNCTION__, __LINE__, ## ARGS)
+#ifdef NV_MEM_DEBUG
+#define peer_trace(FMT, ARGS...) printk(KERN_DEBUG "nvidia-peermem" " %s:%d TRACE " FMT, __FUNCTION__, __LINE__, ## ARGS)
+#else
+#define peer_trace(FMT, ARGS...) do {} while (0)
+#endif

 #if defined(NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)

@ -74,7 +87,10 @@ invalidate_peer_memory mem_invalidate_callback;
 static void *reg_handle = NULL;
 static void *reg_handle_nc = NULL;

+#define NV_MEM_CONTEXT_MAGIC ((u64)0xF1F4F1D0FEF0DAD0ULL)
+
 struct nv_mem_context {
+    u64 pad1;
    struct nvidia_p2p_page_table *page_table;
    struct nvidia_p2p_dma_mapping *dma_mapping;
    u64 core_context;
@ -86,8 +102,22 @@ struct nv_mem_context {
    struct task_struct *callback_task;
    int sg_allocated;
    struct sg_table sg_head;
+    u64 pad2;
 };

+#define NV_MEM_CONTEXT_CHECK_OK(MC) ({                                  \
+    struct nv_mem_context *mc = (MC);                                   \
+    int rc = ((0 != mc) &&                                              \
+              (READ_ONCE(mc->pad1) == NV_MEM_CONTEXT_MAGIC) &&          \
+              (READ_ONCE(mc->pad2) == NV_MEM_CONTEXT_MAGIC));           \
+    if (!rc) {                                                          \
+        peer_trace("invalid nv_mem_context=%px pad1=%016llx pad2=%016llx\n", \
+                   mc,                                                  \
+                   mc?mc->pad1:0,                                       \
+                   mc?mc->pad2:0);                                      \
+    }                                                                   \
+    rc;                                                                 \
+})

 static void nv_get_p2p_free_callback(void *data)
 {
@ -97,8 +127,9 @@ static void nv_get_p2p_free_callback(void *data)
    struct nvidia_p2p_dma_mapping *dma_mapping = NULL;

    __module_get(THIS_MODULE);
-    if (!nv_mem_context) {
-        peer_err("nv_get_p2p_free_callback -- invalid nv_mem_context\n");
+
+    if (!NV_MEM_CONTEXT_CHECK_OK(nv_mem_context)) {
+        peer_err("detected invalid context, skipping further processing\n");
        goto out;
    }

@ -169,9 +200,11 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
        /* Error case handled as not mine */
        return 0;

+    nv_mem_context->pad1 = NV_MEM_CONTEXT_MAGIC;
    nv_mem_context->page_virt_start = addr & GPU_PAGE_MASK;
    nv_mem_context->page_virt_end   = (addr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
    nv_mem_context->mapped_size  = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
+    nv_mem_context->pad2 = NV_MEM_CONTEXT_MAGIC;

    ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
                               &nv_mem_context->page_table, nv_mem_dummy_callback, nv_mem_context);
@ -195,6 +228,7 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
    return 1;

 err:
+    memset(nv_mem_context, 0, sizeof(*nv_mem_context));
    kfree(nv_mem_context);

    /* Error case handled as not mine */
@ -342,6 +376,7 @@ static void nv_mem_release(void *context)
        sg_free_table(&nv_mem_context->sg_head);
        nv_mem_context->sg_allocated = 0;
    }
+    memset(nv_mem_context, 0, sizeof(*nv_mem_context));
    kfree(nv_mem_context);
    module_put(THIS_MODULE);
    return;
--- a/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
+++ b/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
@ -81,8 +81,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
+NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
@ -100,6 +99,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
 NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
 NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
 NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
+NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
 NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
 NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
 NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
@ -110,6 +110,8 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_mm_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += handle_mm_fault_has_pt_regs_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_unified_nodes
 NV_CONFTEST_TYPE_COMPILE_TESTS += mempolicy_has_home_node
+NV_CONFTEST_TYPE_COMPILE_TESTS += mpol_preferred_many_present
 NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_interval_notifier

 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg
+NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_migrate_vma_setup
--- a/kernel-open/nvidia-uvm/uvm.c
+++ b/kernel-open/nvidia-uvm/uvm.c
@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
 static void uvm_vm_close_managed(struct vm_area_struct *vma)
 {
    uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
-    uvm_processor_id_t gpu_id;
    bool make_zombie = false;

    if (current->mm != NULL)
@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)

    uvm_destroy_vma_managed(vma, make_zombie);

-    // Notify GPU address spaces that the fault buffer needs to be flushed to
-    // avoid finding stale entries that can be attributed to new VA ranges
-    // reallocated at the same address.
-    for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
-        uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
-    }
    uvm_va_space_up_write(va_space);

    if (current->mm != NULL)
--- a/kernel-open/nvidia-uvm/uvm_ada.c
+++ b/kernel-open/nvidia-uvm/uvm_ada.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2021 NVIDIA Corporation
+    Copyright (c) 2021-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -94,4 +94,6 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
    parent_gpu->map_remap_larger_page_promotion = false;

    parent_gpu->plc_supported = true;
+
+    parent_gpu->no_ats_range_required = false;
 }
--- a/kernel-open/nvidia-uvm/uvm_ampere.c
+++ b/kernel-open/nvidia-uvm/uvm_ampere.c
@ -101,4 +101,6 @@ void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
        parent_gpu->map_remap_larger_page_promotion = false;

    parent_gpu->plc_supported = true;
+
+    parent_gpu->no_ats_range_required = false;
 }
--- a/kernel-open/nvidia-uvm/uvm_ats_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_ats_faults.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2018 NVIDIA Corporation
+    Copyright (c) 2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -107,10 +107,10 @@ static NV_STATUS service_ats_faults(uvm_gpu_va_space_t *gpu_va_space,
    return status;
 }

-static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
-                                   NvU64 addr,
-                                   size_t size,
-                                   uvm_fault_client_type_t client_type)
+static void flush_tlb_va_region(uvm_gpu_va_space_t *gpu_va_space,
+                                NvU64 addr,
+                                size_t size,
+                                uvm_fault_client_type_t client_type)
 {
    uvm_ats_fault_invalidate_t *ats_invalidate;

@ -119,12 +119,12 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
    else
        ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.non_replayable.ats_invalidate;

-    if (!ats_invalidate->write_faults_in_batch) {
-        uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
-        ats_invalidate->write_faults_in_batch = true;
+    if (!ats_invalidate->tlb_batch_pending) {
+        uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->tlb_batch);
+        ats_invalidate->tlb_batch_pending = true;
    }

-    uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
+    uvm_tlb_batch_invalidate(&ats_invalidate->tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
 }

 static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
@ -149,7 +149,11 @@ static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,

    mode = vma_policy->mode;

-    if ((mode == MPOL_BIND) || (mode == MPOL_PREFERRED_MANY) || (mode == MPOL_PREFERRED)) {
+    if ((mode == MPOL_BIND)
+#if defined(NV_MPOL_PREFERRED_MANY_PRESENT)
+         || (mode == MPOL_PREFERRED_MANY)
+#endif
+         || (mode == MPOL_PREFERRED)) {
        int home_node = NUMA_NO_NODE;

 #if defined(NV_MEMPOLICY_HAS_HOME_NODE)
@ -467,6 +471,10 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
            uvm_page_mask_and(write_fault_mask, write_fault_mask, read_fault_mask);
        else
            uvm_page_mask_zero(write_fault_mask);
+
+        // There are no pending faults beyond write faults to RO region.
+        if (uvm_page_mask_empty(read_fault_mask))
+            return status;
    }

    ats_batch_select_residency(gpu_va_space, vma, ats_context);
@ -489,6 +497,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,

        if (vma->vm_flags & VM_WRITE) {
            uvm_page_mask_region_fill(faults_serviced_mask, subregion);
+            uvm_ats_smmu_invalidate_tlbs(gpu_va_space, start, length);

            // The Linux kernel never invalidates TLB entries on mapping
            // permission upgrade. This is a problem if the GPU has cached
@ -499,7 +508,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
            // infinite loop because we just forward the fault to the Linux
            // kernel and it will see that the permissions in the page table are
            // correct. Therefore, we flush TLB entries on ATS write faults.
-            flush_tlb_write_faults(gpu_va_space, start, length, client_type);
+            flush_tlb_va_region(gpu_va_space, start, length, client_type);
        }
        else {
            uvm_page_mask_region_fill(reads_serviced_mask, subregion);
@ -522,6 +531,15 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
            return status;

        uvm_page_mask_region_fill(faults_serviced_mask, subregion);
+
+        // Similarly to permission upgrade scenario, discussed above, GPU
+        // will not re-fetch the entry if the PTE is invalid and page size
+        // is 4K. To avoid infinite faulting loop, invalidate TLB for every
+        // new translation written explicitly like in the case of permission
+        // upgrade.
+        if (PAGE_SIZE == UVM_PAGE_SIZE_4K)
+            flush_tlb_va_region(gpu_va_space, start, length, client_type);
+
    }

    return status;
@ -556,7 +574,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
    NV_STATUS status;
    uvm_push_t push;

-    if (!ats_invalidate->write_faults_in_batch)
+    if (!ats_invalidate->tlb_batch_pending)
        return NV_OK;

    UVM_ASSERT(gpu_va_space);
@ -568,7 +586,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
                            "Invalidate ATS entries");

    if (status == NV_OK) {
-        uvm_tlb_batch_end(&ats_invalidate->write_faults_tlb_batch, &push, UVM_MEMBAR_NONE);
+        uvm_tlb_batch_end(&ats_invalidate->tlb_batch, &push, UVM_MEMBAR_NONE);
        uvm_push_end(&push);

        // Add this push to the GPU's tracker so that fault replays/clears can
@ -576,8 +594,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
        status = uvm_tracker_add_push_safe(out_tracker, &push);
    }

-    ats_invalidate->write_faults_in_batch = false;
+    ats_invalidate->tlb_batch_pending = false;

    return status;
 }
-
--- a/kernel-open/nvidia-uvm/uvm_ats_faults.h
+++ b/kernel-open/nvidia-uvm/uvm_ats_faults.h
@ -52,7 +52,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
 bool uvm_ats_check_in_gmmu_region(uvm_va_space_t *va_space, NvU64 address, uvm_va_range_t *next);

 // This function performs pending TLB invalidations for ATS and clears the
-// ats_invalidate->write_faults_in_batch flag
+// ats_invalidate->tlb_batch_pending flag
 NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
                                  uvm_ats_fault_invalidate_t *ats_invalidate,
                                  uvm_tracker_t *out_tracker);
--- a/kernel-open/nvidia-uvm/uvm_ats_sva.c
+++ b/kernel-open/nvidia-uvm/uvm_ats_sva.c
@ -29,8 +29,13 @@
 #include "uvm_va_space.h"
 #include "uvm_va_space_mm.h"

+#include <asm/io.h>
+#include <linux/log2.h>
 #include <linux/iommu.h>
 #include <linux/mm_types.h>
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/mmu_context.h>

 // linux/sched/mm.h is needed for mmget_not_zero and mmput to get the mm
 // reference required for the iommu_sva_bind_device() call. This header is not
@ -46,17 +51,276 @@
 #define UVM_IOMMU_SVA_BIND_DEVICE(dev, mm) iommu_sva_bind_device(dev, mm)
 #endif

+// Type to represent a 128-bit SMMU command queue command, stored as two
+// 64-bit words. sizeof(struct smmu_cmd) is used below to derive the number of
+// commands that fit in the one-page command queue.
+struct smmu_cmd {
+    // Low 64-bit word of the command.
+    NvU64 low;
+    // High 64-bit word of the command.
+    NvU64 high;
+};
+
+// Base address of SMMU CMDQ-V for GSMMU0, given the base address of the SMMU
+// itself. The argument is parenthesized so that any expression can be passed.
+#define SMMU_CMDQV_BASE_ADDR(smmu_base) ((smmu_base) + 0x200000)
+#define SMMU_CMDQV_BASE_LEN 0x00830000
+
+// CMDQV configuration is done by firmware but we check status here.
+#define SMMU_CMDQV_CONFIG 0x0
+#define SMMU_CMDQV_CONFIG_CMDQV_EN BIT(0)
+
+// Used to map a particular VCMDQ to a VINTF.
+#define SMMU_CMDQV_CMDQ_ALLOC_MAP(vcmdq_id) (0x200 + 0x4 * (vcmdq_id))
+#define SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC BIT(0)
+
+// Shift for the field containing the index of the virtual interface
+// owning the VCMDQ.
+#define SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT 15
+
+// Base address for the VINTF registers of virtual interface vintf_id, given
+// the CMDQ-V base address. Each VINTF occupies 0x100 bytes starting at offset
+// 0x1000. Both arguments are parenthesized so any expression can be passed.
+#define SMMU_VINTF_BASE_ADDR(cmdqv_base_addr, vintf_id) ((cmdqv_base_addr) + 0x1000 + 0x100 * (vintf_id))
+
+// Virtual interface (VINTF) configuration registers. The WAR only
+// works on baremetal so we need to configure ourselves as the
+// hypervisor owner.
+#define SMMU_VINTF_CONFIG 0x0
+#define SMMU_VINTF_CONFIG_ENABLE BIT(0)
+#define SMMU_VINTF_CONFIG_HYP_OWN BIT(17)
+
+#define SMMU_VINTF_STATUS 0x0
+#define SMMU_VINTF_STATUS_ENABLED BIT(0)
+
+// Calculates the base address for a particular VCMDQ instance, given the
+// CMDQ-V base address. Each VCMDQ occupies 0x80 bytes starting at offset
+// 0x10000. Both arguments are parenthesized so any expression can be passed.
+#define SMMU_VCMDQ_BASE_ADDR(cmdqv_base_addr, vcmdq_id) ((cmdqv_base_addr) + 0x10000 + 0x80 * (vcmdq_id))
+
+// SMMU command queue consumer index register. Updated by SMMU
+// when commands are consumed.
+#define SMMU_VCMDQ_CONS 0x0
+
+// SMMU command queue producer index register. Updated by UVM when
+// commands are added to the queue.
+#define SMMU_VCMDQ_PROD 0x4
+
+// Configuration register used to enable a VCMDQ.
+#define SMMU_VCMDQ_CONFIG 0x8
+#define SMMU_VCMDQ_CONFIG_ENABLE BIT(0)
+
+// Status register used to check the VCMDQ is enabled.
+#define SMMU_VCMDQ_STATUS 0xc
+#define SMMU_VCMDQ_STATUS_ENABLED BIT(0)
+
+// Base address offset for the VCMDQ registers.
+#define SMMU_VCMDQ_CMDQ_BASE 0x10000
+
+// Size of the command queue. Each command is 16 bytes and we can't
+// have a command queue greater than one page in size.
+#define SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE (PAGE_SHIFT - ilog2(sizeof(struct smmu_cmd)))
+#define SMMU_VCMDQ_CMDQ_ENTRIES (1UL << SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE)
+
+// We always use VINTF63 for the WAR
+#define VINTF 63
+// Write the 32-bit value val to the register at byte offset reg inside the
+// register block of the virtual interface (VINTF) used by the WAR.
+static void smmu_vintf_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
+{
+    void __iomem *vintf_reg = SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg;
+
+    iowrite32(val, vintf_reg);
+}
+
+// Read and return the 32-bit register at byte offset reg inside the register
+// block of the virtual interface (VINTF) used by the WAR.
+static NvU32 smmu_vintf_read32(void __iomem *smmu_cmdqv_base, int reg)
+{
+    void __iomem *vintf_reg = SMMU_VINTF_BASE_ADDR(smmu_cmdqv_base, VINTF) + reg;
+
+    return ioread32(vintf_reg);
+}
+
+// We always use VCMDQ127 for the WAR
+#define VCMDQ 127
+// Write the 32-bit value val to the register at byte offset reg inside the
+// register block of the virtual command queue (VCMDQ) used by the WAR.
+void smmu_vcmdq_write32(void __iomem *smmu_cmdqv_base, int reg, NvU32 val)
+{
+    void __iomem *vcmdq_reg = SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg;
+
+    iowrite32(val, vcmdq_reg);
+}
+
+// Read and return the 32-bit register at byte offset reg inside the register
+// block of the virtual command queue (VCMDQ) used by the WAR.
+NvU32 smmu_vcmdq_read32(void __iomem *smmu_cmdqv_base, int reg)
+{
+    void __iomem *vcmdq_reg = SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg;
+
+    return ioread32(vcmdq_reg);
+}
+
+// Write the 64-bit value val to the register at byte offset reg inside the
+// register block of the virtual command queue (VCMDQ) used by the WAR.
+static void smmu_vcmdq_write64(void __iomem *smmu_cmdqv_base, int reg, NvU64 val)
+{
+    void __iomem *vcmdq_reg = SMMU_VCMDQ_BASE_ADDR(smmu_cmdqv_base, VCMDQ) + reg;
+
+    iowrite64(val, vcmdq_reg);
+}
+
+// Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
+// TLB invalidates on read-only to read-write upgrades
+//
+// Sets up the private SMMU command queue used by the WAR for parent_gpu:
+//  - locates the SMMU through the ACPI IORT node stored as platform data of
+//    the IOMMU device's parent and maps its CMDQ-V register aperture,
+//  - allocates one zeroed page to back the command queue,
+//  - enables the VINTF, assigns the VCMDQ to it and enables the VCMDQ.
+//
+// Returns NV_OK on success. Error returns:
+//  - NV_ERR_OBJECT_NOT_FOUND if no IORT node is attached to the SMMU device
+//    or if CMDQ-V is not enabled (its configuration is done by firmware),
+//  - NV_ERR_NO_MEMORY if the register mapping or the page allocation fails.
+// Failures that happen after the registers were mapped unmap them again and
+// reset parent_gpu->smmu_war.smmu_cmdqv_base to NULL.
+static NV_STATUS uvm_ats_smmu_war_init(uvm_parent_gpu_t *parent_gpu)
+{
+    uvm_spin_loop_t spin;
+    NV_STATUS status;
+    unsigned long cmdqv_config;
+    void __iomem *smmu_cmdqv_base;
+    struct acpi_iort_node **platdata;
+    struct acpi_iort_node *node;
+    struct acpi_iort_smmu_v3 *iort_smmu;
+
+    // The platform data is expected to hold a pointer to the IORT node. Check
+    // it before dereferencing so that a device without this data results in
+    // an error instead of a NULL pointer dereference.
+    platdata = dev_get_platdata(parent_gpu->pci_dev->dev.iommu->iommu_dev->dev->parent);
+    if (!platdata || !*platdata)
+        return NV_ERR_OBJECT_NOT_FOUND;
+
+    node = *platdata;
+    iort_smmu = (struct acpi_iort_smmu_v3 *) node->node_data;
+
+    smmu_cmdqv_base = ioremap(SMMU_CMDQV_BASE_ADDR(iort_smmu->base_address), SMMU_CMDQV_BASE_LEN);
+    if (!smmu_cmdqv_base)
+        return NV_ERR_NO_MEMORY;
+
+    parent_gpu->smmu_war.smmu_cmdqv_base = smmu_cmdqv_base;
+    cmdqv_config = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CONFIG);
+    if (!(cmdqv_config & SMMU_CMDQV_CONFIG_CMDQV_EN)) {
+        status = NV_ERR_OBJECT_NOT_FOUND;
+        goto out;
+    }
+
+    // Allocate SMMU CMDQ pages for WAR
+    parent_gpu->smmu_war.smmu_cmdq = alloc_page(NV_UVM_GFP_FLAGS | __GFP_ZERO);
+    if (!parent_gpu->smmu_war.smmu_cmdq) {
+        status = NV_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    // Initialise VINTF for the WAR
+    smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, SMMU_VINTF_CONFIG_ENABLE | SMMU_VINTF_CONFIG_HYP_OWN);
+    UVM_SPIN_WHILE(!(smmu_vintf_read32(smmu_cmdqv_base, SMMU_VINTF_STATUS) & SMMU_VINTF_STATUS_ENABLED), &spin);
+
+    // Allocate VCMDQ to VINTF
+    iowrite32((VINTF << SMMU_CMDQV_CMDQ_ALLOC_MAP_VIRT_INTF_INDX_SHIFT) | SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
+              smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
+
+    // Program the queue base (physical address of the page combined with the
+    // log2 of the number of entries), reset both indices, then enable the
+    // queue and wait until the hardware reports it as enabled.
+    smmu_vcmdq_write64(smmu_cmdqv_base, SMMU_VCMDQ_CMDQ_BASE,
+                       page_to_phys(parent_gpu->smmu_war.smmu_cmdq) | SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE);
+    smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONS, 0);
+    smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_PROD, 0);
+    smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, SMMU_VCMDQ_CONFIG_ENABLE);
+    UVM_SPIN_WHILE(!(smmu_vcmdq_read32(smmu_cmdqv_base, SMMU_VCMDQ_STATUS) & SMMU_VCMDQ_STATUS_ENABLED), &spin);
+
+    uvm_mutex_init(&parent_gpu->smmu_war.smmu_lock, UVM_LOCK_ORDER_LEAF);
+    parent_gpu->smmu_war.smmu_prod = 0;
+    parent_gpu->smmu_war.smmu_cons = 0;
+
+    return NV_OK;
+
+out:
+    iounmap(parent_gpu->smmu_war.smmu_cmdqv_base);
+    parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
+
+    return status;
+}
+
+// Tears down what uvm_ats_smmu_war_init() set up: disables the VCMDQ, clears
+// its allocation bit, disables the VINTF, then frees the command queue page
+// and unmaps the CMDQ-V registers. Resources that were never set up are
+// skipped, and the stored pointers are reset so a repeated call is harmless.
+static void uvm_ats_smmu_war_deinit(uvm_parent_gpu_t *parent_gpu)
+{
+    void __iomem *smmu_cmdqv_base = parent_gpu->smmu_war.smmu_cmdqv_base;
+    NvU32 cmdq_alloc_map;
+
+    if (smmu_cmdqv_base) {
+        smmu_vcmdq_write32(smmu_cmdqv_base, SMMU_VCMDQ_CONFIG, 0);
+
+        // Release the VCMDQ by clearing the ALLOC bit. Masking with the bit
+        // itself (instead of its complement) would keep the queue allocated.
+        cmdq_alloc_map = ioread32(smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
+        iowrite32(cmdq_alloc_map & ~SMMU_CMDQV_CMDQ_ALLOC_MAP_ALLOC,
+                  smmu_cmdqv_base + SMMU_CMDQV_CMDQ_ALLOC_MAP(VCMDQ));
+
+        smmu_vintf_write32(smmu_cmdqv_base, SMMU_VINTF_CONFIG, 0);
+    }
+
+    if (parent_gpu->smmu_war.smmu_cmdq) {
+        __free_page(parent_gpu->smmu_war.smmu_cmdq);
+        parent_gpu->smmu_war.smmu_cmdq = NULL;
+    }
+
+    if (smmu_cmdqv_base) {
+        iounmap(smmu_cmdqv_base);
+        parent_gpu->smmu_war.smmu_cmdqv_base = NULL;
+    }
+}
+
+// The SMMU on ARM64 can run under different translation regimes depending on
+// what features the OS and CPU variant support. The CPU for GH180 supports
+// virtualisation extensions and starts the kernel at EL2 meaning SMMU operates
+// under the NS-EL2-E2H translation regime. Therefore we need to use the
+// TLBI_EL2_* commands which invalidate TLB entries created under this
+// translation regime.
+//
+// The definitions carry no trailing semicolon so that they can be used inside
+// larger expressions as well as in plain assignments.
+#define CMDQ_OP_TLBI_EL2_ASID 0x21
+#define CMDQ_OP_TLBI_EL2_VA 0x22
+#define CMDQ_OP_CMD_SYNC 0x46
+
+// Use the same maximum as used for MAX_TLBI_OPS in the upstream
+// kernel.
+#define UVM_MAX_TLBI_OPS (1UL << (PAGE_SHIFT - 3))
+
+#if UVM_ATS_SMMU_WAR_REQUIRED()
+// Issues SMMU TLB invalidations covering [addr, addr + size) for the mm
+// associated with gpu_va_space, using the WAR command queue of the parent GPU,
+// and waits until the SMMU has consumed the commands. Returns immediately when
+// the WAR was not set up for this GPU (no CMDQ-V mapping).
+void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
+{
+    struct mm_struct *mm = gpu_va_space->va_space->va_space_mm.mm;
+    uvm_parent_gpu_t *parent_gpu = gpu_va_space->gpu->parent;
+    struct smmu_cmd *vcmdq;
+    struct smmu_cmd *cmd;
+    unsigned long vcmdq_prod;
+    NvU64 end;
+    uvm_spin_loop_t spin;
+    NvU16 asid;
+
+    if (!parent_gpu->smmu_war.smmu_cmdqv_base)
+        return;
+
+    // NOTE(review): the returned value is used as the ASID without checking
+    // for failure - confirm arm64_mm_context_get() cannot fail here.
+    asid = arm64_mm_context_get(mm);
+    vcmdq = kmap(parent_gpu->smmu_war.smmu_cmdq);
+    uvm_mutex_lock(&parent_gpu->smmu_war.smmu_lock);
+    vcmdq_prod = parent_gpu->smmu_war.smmu_prod;
+
+    // Our queue management is very simple. The mutex prevents multiple
+    // producers writing to the queue and all our commands require waiting for
+    // the queue to drain so we know it's empty. If we can't fit enough commands
+    // in the queue we just invalidate the whole ASID.
+    //
+    // The command queue is a circular buffer with the MSB representing a wrap
+    // bit that must toggle on each wrap. See the SMMU architecture
+    // specification for more details.
+    //
+    // SMMU_VCMDQ_CMDQ_ENTRIES - 1 because we need to leave space for the
+    // CMD_SYNC.
+    if ((size >> PAGE_SHIFT) > min(UVM_MAX_TLBI_OPS, SMMU_VCMDQ_CMDQ_ENTRIES - 1)) {
+        cmd = &vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES];
+        cmd->low = CMDQ_OP_TLBI_EL2_ASID;
+        cmd->low |= (NvU64) asid << 48;
+        cmd->high = 0;
+        vcmdq_prod++;
+    }
+    else {
+        // One invalidate-by-VA command per page, with the low 12 bits of the
+        // address cleared.
+        for (end = addr + size; addr < end; addr += PAGE_SIZE) {
+            cmd = &vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES];
+            cmd->low = CMDQ_OP_TLBI_EL2_VA;
+            cmd->low |= (NvU64) asid << 48;
+            cmd->high = addr & ~((1UL << 12) - 1);
+            vcmdq_prod++;
+        }
+    }
+
+    // Terminate the batch with a CMD_SYNC.
+    cmd = &vcmdq[vcmdq_prod % SMMU_VCMDQ_CMDQ_ENTRIES];
+    cmd->low = CMDQ_OP_CMD_SYNC;
+    cmd->high = 0x0;
+    vcmdq_prod++;
+
+    // MSB is the wrap bit
+    vcmdq_prod &= (1UL << (SMMU_VCMDQ_CMDQ_BASE_LOG2SIZE + 1)) - 1;
+    parent_gpu->smmu_war.smmu_prod = vcmdq_prod;
+    smmu_vcmdq_write32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_PROD, parent_gpu->smmu_war.smmu_prod);
+
+    // Wait until the consumer index catches up with the producer index, i.e.
+    // every queued command, including the CMD_SYNC, has been consumed.
+    UVM_SPIN_WHILE(
+        (smmu_vcmdq_read32(parent_gpu->smmu_war.smmu_cmdqv_base, SMMU_VCMDQ_CONS) & GENMASK(19, 0)) != vcmdq_prod,
+        &spin);
+
+    uvm_mutex_unlock(&parent_gpu->smmu_war.smmu_lock);
+    kunmap(parent_gpu->smmu_war.smmu_cmdq);
+    arm64_mm_context_put(mm);
+}
+#endif
+
 NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
 {
    int ret;

    ret = iommu_dev_enable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
+    if (ret)
+        return errno_to_nv_status(ret);

-    return errno_to_nv_status(ret);
+    if (UVM_ATS_SMMU_WAR_REQUIRED())
+        return uvm_ats_smmu_war_init(parent_gpu);
+    else
+        return NV_OK;
 }

 void uvm_ats_sva_remove_gpu(uvm_parent_gpu_t *parent_gpu)
 {
+    if (UVM_ATS_SMMU_WAR_REQUIRED())
+        uvm_ats_smmu_war_deinit(parent_gpu);
+
    iommu_dev_disable_feature(&parent_gpu->pci_dev->dev, IOMMU_DEV_FEAT_SVA);
 }

--- a/kernel-open/nvidia-uvm/uvm_ats_sva.h
+++ b/kernel-open/nvidia-uvm/uvm_ats_sva.h
@ -32,23 +32,38 @@
 // For ATS support on aarch64, arm_smmu_sva_bind() is needed for
 // iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
 // conftest-able. We instead look for the presence of ioasid_get() or
-// mm_pasid_set(). ioasid_get() was added in the same patch series as
-// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
+// mm_pasid_drop(). ioasid_get() was added in the same patch series as
+// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_drop() was added in the
 // same patch as the removal of ioasid_get(). We assume the presence of
-// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
+// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_drop(v5.18+) is
 // present.
 //
 // arm_smmu_sva_bind() was added with commit
 // 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
 // commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
 // 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
-// and added mm_pasid_set().
-    #if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
-        #define UVM_ATS_SVA_SUPPORTED() 1
+// and added mm_pasid_drop().
+    #if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
+        #if defined(CONFIG_IOMMU_SVA)
+            #define UVM_ATS_SVA_SUPPORTED() 1
+        #else
+            #define UVM_ATS_SVA_SUPPORTED() 0
+        #endif
    #else
        #define UVM_ATS_SVA_SUPPORTED() 0
    #endif

+// If NV_ARCH_INVALIDATE_SECONDARY_TLBS is defined, the upstream fix is in
+// place, so there is no need for the WAR from Bug 4130089: [GH180][r535] WAR
+// for kernel not issuing SMMU TLB invalidates on read-only to read-write
+// upgrades.
+#if defined(NV_ARCH_INVALIDATE_SECONDARY_TLBS)
+    #define UVM_ATS_SMMU_WAR_REQUIRED() 0
+#elif NVCPU_IS_AARCH64
+    #define UVM_ATS_SMMU_WAR_REQUIRED() 1
+#else
+    #define UVM_ATS_SMMU_WAR_REQUIRED() 0
+#endif
+
 typedef struct
 {
    int placeholder;
@ -77,6 +92,17 @@ typedef struct

    // LOCKING: None
    void uvm_ats_sva_unregister_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space);
+
+    // Fix for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU
+    // TLB invalidates on read-only to read-write upgrades
+    #if UVM_ATS_SMMU_WAR_REQUIRED()
+        void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size);
+    #else
+        // No-op stub used when UVM_ATS_SMMU_WAR_REQUIRED() is 0, so callers
+        // do not need their own conditional compilation.
+        static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
+        {
+
+        }
+    #endif
 #else
    static NV_STATUS uvm_ats_sva_add_gpu(uvm_parent_gpu_t *parent_gpu)
    {
@ -107,6 +133,11 @@ typedef struct
    {

    }
+
+    // No-op stub for the branch in which UVM_ATS_SVA_SUPPORTED() is 0: there
+    // is nothing to invalidate in that configuration.
+    static void uvm_ats_smmu_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space, NvU64 addr, size_t size)
+    {
+
+    }
 #endif // UVM_ATS_SVA_SUPPORTED

 #endif // __UVM_ATS_SVA_H__
--- a/kernel-open/nvidia-uvm/uvm_ce_test.c
+++ b/kernel-open/nvidia-uvm/uvm_ce_test.c
@ -191,7 +191,7 @@ static NV_STATUS test_membar(uvm_gpu_t *gpu)

    for (i = 0; i < REDUCTIONS; ++i) {
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-        gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS + 1);
+        gpu->parent->ce_hal->semaphore_reduction_inc(&push, host_mem_gpu_va, REDUCTIONS);
    }

    // Without a sys membar the channel tracking semaphore can and does complete
@ -577,7 +577,7 @@ static NV_STATUS test_semaphore_reduction_inc(uvm_gpu_t *gpu)

    for (i = 0; i < REDUCTIONS; i++) {
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, i+1);
+        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, REDUCTIONS);
    }

    status = uvm_push_end_and_wait(&push);
--- a/kernel-open/nvidia-uvm/uvm_common.h
+++ b/kernel-open/nvidia-uvm/uvm_common.h
@ -21,8 +21,8 @@

 *******************************************************************************/

-#ifndef _UVM_COMMON_H
-#define _UVM_COMMON_H
+#ifndef __UVM_COMMON_H__
+#define __UVM_COMMON_H__

 #ifdef DEBUG
    #define UVM_IS_DEBUG() 1
@ -413,4 +413,40 @@ static inline void uvm_touch_page(struct page *page)
 // Return true if the VMA is one used by UVM managed allocations.
 bool uvm_vma_is_managed(struct vm_area_struct *vma);

-#endif /* _UVM_COMMON_H */
+// Returns true if the host platform uses canonical form addresses. Every
+// platform other than PPC64LE is reported as doing so.
+static bool uvm_platform_uses_canonical_form_address(void)
+{
+    return NVCPU_IS_PPC64LE ? false : true;
+}
+
+// CPU counterpart of the GPU MMU HAL num_va_bits(): returns the CPU's
+// num_va_bits(), computed from the position of the highest set bit of the
+// last user address (TASK_SIZE - 1) plus one.
+static NvU32 uvm_cpu_num_va_bits(void)
+{
+    const NvU64 last_task_addr = TASK_SIZE - 1;
+
+    return fls64(last_task_addr) + 1;
+}
+
+// Return the unaddressable range in a num_va_bits-wide VA space, [first, outer)
+//
+// On platforms that use canonical form addresses, *first is the first address
+// above the lower half (1 << (num_va_bits - 1)) and *outer is the first
+// address of the upper half, i.e. the value with bit (num_va_bits - 1) and
+// all higher bits set. On other platforms, *first is 1 << num_va_bits and
+// *outer is the maximum 64-bit value.
+//
+// num_va_bits must be in the range [1, 63]; first and outer must not be NULL.
+static void uvm_get_unaddressable_range(NvU32 num_va_bits, NvU64 *first, NvU64 *outer)
+{
+    UVM_ASSERT(num_va_bits > 0);
+    UVM_ASSERT(num_va_bits < 64);
+    UVM_ASSERT(first);
+    UVM_ASSERT(outer);
+
+    if (uvm_platform_uses_canonical_form_address()) {
+        *first = 1ULL << (num_va_bits - 1);
+
+        // Same value as sign-extending from bit (num_va_bits - 1), but
+        // computed on an unsigned operand so that no negative value is
+        // right-shifted.
+        *outer = ~0ULL << (num_va_bits - 1);
+    }
+    else {
+        *first = 1ULL << num_va_bits;
+        *outer = ~0ULL;
+    }
+}
+
+// CPU variant of uvm_get_unaddressable_range(): stores the unaddressable
+// range, [first, outer), of a VA space as wide as uvm_cpu_num_va_bits().
+static void uvm_cpu_get_unaddressable_range(NvU64 *first, NvU64 *outer)
+{
+    // The callee returns void, so it is called as a plain statement rather
+    // than as the operand of a return.
+    uvm_get_unaddressable_range(uvm_cpu_num_va_bits(), first, outer);
+}
+
+#endif /* __UVM_COMMON_H__ */
--- a/kernel-open/nvidia-uvm/uvm_gpu.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu.c
@ -218,19 +218,12 @@ static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu)
    return parent_gpu->rm_info.subdeviceCount == 1;
 }

-static bool platform_uses_canonical_form_address(void)
-{
-    if (NVCPU_IS_PPC64LE)
-        return false;
-
-    return true;
-}
-
 bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
 {
    // Lower and upper address spaces are typically found in platforms that use
    // the canonical address form.
    NvU64 max_va_lower;
+    NvU64 min_va_upper;
    NvU64 addr_end = addr + size - 1;
    NvU8 gpu_addr_shift;
    NvU8 cpu_addr_shift;
@ -243,7 +236,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
    UVM_ASSERT(size > 0);

    gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
-    cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
+    cpu_addr_shift = uvm_cpu_num_va_bits();
    addr_shift = gpu_addr_shift;

    // Pascal+ GPUs are capable of accessing kernel pointers in various modes
@ -279,9 +272,7 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
    //               0 +----------------+               0 +----------------+

    // On canonical form address platforms and Pascal+ GPUs.
-    if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
-        NvU64 min_va_upper;
-
+    if (uvm_platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
        // On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
        // 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
        // wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
@ -292,15 +283,11 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
            addr_shift = gpu_addr_shift;
        else
            addr_shift = cpu_addr_shift;
+    }

-        min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
-        max_va_lower = 1ULL << (addr_shift - 1);
-        return (addr_end < max_va_lower) || (addr >= min_va_upper);
-    }
-    else {
-        max_va_lower = 1ULL << addr_shift;
-        return addr_end < max_va_lower;
-    }
+    uvm_get_unaddressable_range(addr_shift, &max_va_lower, &min_va_upper);
+
+    return (addr_end < max_va_lower) || (addr >= min_va_upper);
 }

 // The internal UVM VAS does not use canonical form addresses.
@ -326,14 +313,14 @@ NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
    NvU8 addr_shift;
    NvU64 input_addr = addr;

-    if (platform_uses_canonical_form_address()) {
+    if (uvm_platform_uses_canonical_form_address()) {
        // When the CPU VA width is larger than GPU's, it means that:
        // On ARM: the CPU is on LVA mode and the GPU is pre-Hopper.
        // On x86: the CPU uses 5-level paging and the GPU is pre-Hopper.
        // We sign-extend on the 48b on ARM and on the 47b on x86 to mirror the
        // behavior of CPUs with smaller (than GPU) VA widths.
        gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits();
-        cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
+        cpu_addr_shift = uvm_cpu_num_va_bits();

        if (cpu_addr_shift > gpu_addr_shift)
            addr_shift = NVCPU_IS_X86_64 ? 48 : 49;
--- a/kernel-open/nvidia-uvm/uvm_gpu.h
+++ b/kernel-open/nvidia-uvm/uvm_gpu.h
@ -57,14 +57,16 @@

 typedef struct
 {
-    // Number of faults from this uTLB that have been fetched but have not been serviced yet
+    // Number of faults from this uTLB that have been fetched but have not been
+    // serviced yet.
    NvU32 num_pending_faults;

    // Whether the uTLB contains fatal faults
    bool has_fatal_faults;

-    // We have issued a replay of type START_ACK_ALL while containing fatal faults. This puts
-    // the uTLB in lockdown mode and no new translations are accepted
+    // We have issued a replay of type START_ACK_ALL while containing fatal
+    // faults. This puts the uTLB in lockdown mode and no new translations are
+    // accepted.
    bool in_lockdown;

    // We have issued a cancel on this uTLB
@ -126,8 +128,8 @@ struct uvm_service_block_context_struct
        struct list_head service_context_list;

        // A mask of GPUs that need to be checked for ECC errors before the CPU
-        // fault handler returns, but after the VA space lock has been unlocked to
-        // avoid the RM/UVM VA space lock deadlocks.
+        // fault handler returns, but after the VA space lock has been unlocked
+        // to avoid the RM/UVM VA space lock deadlocks.
        uvm_processor_mask_t gpus_to_check_for_ecc;

        // This is set to throttle page fault thrashing.
@ -160,9 +162,9 @@ struct uvm_service_block_context_struct

    struct
    {
-        // Per-processor mask with the pages that will be resident after servicing.
-        // We need one mask per processor because we may coalesce faults that
-        // trigger migrations to different processors.
+        // Per-processor mask with the pages that will be resident after
+        // servicing. We need one mask per processor because we may coalesce
+        // faults that trigger migrations to different processors.
        uvm_page_mask_t new_residency;
    } per_processor_masks[UVM_ID_MAX_PROCESSORS];

@ -263,7 +265,10 @@ struct uvm_fault_service_batch_context_struct

    NvU32 num_coalesced_faults;

-    bool has_fatal_faults;
+    // One of the VA spaces in this batch which had fatal faults. If NULL, no
+    // faults were fatal. More than one VA space could have fatal faults, but we
+    // pick one to be the target of the cancel sequence.
+    uvm_va_space_t *fatal_va_space;

    bool has_throttled_faults;

@ -291,11 +296,8 @@ struct uvm_fault_service_batch_context_struct

 struct uvm_ats_fault_invalidate_struct
 {
-    // Whether the TLB batch contains any information
-    bool            write_faults_in_batch;
-
-    // Batch of TLB entries to be invalidated
-    uvm_tlb_batch_t write_faults_tlb_batch;
+    bool            tlb_batch_pending;
+    uvm_tlb_batch_t tlb_batch;
 };

 typedef struct
@ -440,20 +442,9 @@ struct uvm_access_counter_service_batch_context_struct
        NvU32                             num_notifications;

        // Boolean used to avoid sorting the fault batch by instance_ptr if we
-        // determine at fetch time that all the access counter notifications in the
-        // batch report the same instance_ptr
+        // determine at fetch time that all the access counter notifications in
+        // the batch report the same instance_ptr
        bool is_single_instance_ptr;
-
-        // Scratch space, used to generate artificial physically addressed notifications.
-        // Virtual address notifications are always aligned to 64k. This means up to 16
-        // different physical locations could have been accessed to trigger one notification.
-        // The sub-granularity mask can correspond to any of them.
-        struct
-        {
-            uvm_processor_id_t resident_processors[16];
-            uvm_gpu_phys_address_t phys_addresses[16];
-            uvm_access_counter_buffer_entry_t phys_entry;
-        } scratch;
    } virt;

    struct
@ -464,8 +455,8 @@ struct uvm_access_counter_service_batch_context_struct
        NvU32                              num_notifications;

        // Boolean used to avoid sorting the fault batch by aperture if we
-        // determine at fetch time that all the access counter notifications in the
-        // batch report the same aperture
+        // determine at fetch time that all the access counter notifications in
+        // the batch report the same aperture
        bool                              is_single_aperture;
    } phys;

@ -661,8 +652,8 @@ struct uvm_gpu_struct
    struct
    {
        // Big page size used by the internal UVM VA space
-        // Notably it may be different than the big page size used by a user's VA
-        // space in general.
+        // Notably it may be different than the big page size used by a user's
+        // VA space in general.
        NvU32 internal_size;
    } big_page;

@ -688,8 +679,8 @@ struct uvm_gpu_struct
        // lazily-populated array of peer GPUs, indexed by the peer's GPU index
        uvm_gpu_t *peer_gpus[UVM_ID_MAX_GPUS];

-        // Leaf spinlock used to synchronize access to the peer_gpus table so that
-        // it can be safely accessed from the access counters bottom half
+        // Leaf spinlock used to synchronize access to the peer_gpus table so
+        // that it can be safely accessed from the access counters bottom half
        uvm_spinlock_t peer_gpus_lock;
    } peer_info;

@ -980,6 +971,10 @@ struct uvm_parent_gpu_struct

    bool plc_supported;

+    // If true, page_tree initialization pre-populates no_ats_ranges. It only
+    // affects ATS systems.
+    bool no_ats_range_required;
+
    // Parameters used by the TLB batching API
    struct
    {
@ -1051,14 +1046,16 @@ struct uvm_parent_gpu_struct
    // Interrupt handling state and locks
    uvm_isr_info_t isr;

-    // Fault buffer info. This is only valid if supports_replayable_faults is set to true
+    // Fault buffer info. This is only valid if supports_replayable_faults is
+    // set to true.
    uvm_fault_buffer_info_t fault_buffer_info;

    // PMM lazy free processing queue.
    // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
    nv_kthread_q_t lazy_free_q;

-    // Access counter buffer info. This is only valid if supports_access_counters is set to true
+    // Access counter buffer info. This is only valid if
+    // supports_access_counters is set to true.
    uvm_access_counter_buffer_info_t access_counter_buffer_info;

    // Number of uTLBs per GPC. This information is only valid on Pascal+ GPUs.
@ -1108,7 +1105,7 @@ struct uvm_parent_gpu_struct
    uvm_rb_tree_t instance_ptr_table;
    uvm_spinlock_t instance_ptr_table_lock;

-    // This is set to true if the GPU belongs to an SLI group. Else, set to false.
+    // This is set to true if the GPU belongs to an SLI group.
    bool sli_enabled;

    struct
@ -1135,8 +1132,8 @@ struct uvm_parent_gpu_struct
    // environment, rather than using the peer-id field of the PTE (which can
    // only address 8 gpus), all gpus are assigned a 47-bit physical address
    // space by the fabric manager. Any physical address access to these
-    // physical address spaces are routed through the switch to the corresponding
-    // peer.
+    // physical address spaces are routed through the switch to the
+    // corresponding peer.
    struct
    {
        bool is_nvswitch_connected;
@ -1162,6 +1159,16 @@ struct uvm_parent_gpu_struct
        NvU64 memory_window_start;
        NvU64 memory_window_end;
    } system_bus;
+
+    // WAR to issue ATS TLB invalidation commands ourselves.
+    struct
+    {
+        uvm_mutex_t smmu_lock;
+        struct page *smmu_cmdq;
+        void __iomem *smmu_cmdqv_base;
+        unsigned long smmu_prod;
+        unsigned long smmu_cons;
+    } smmu_war;
 };

 static const char *uvm_gpu_name(uvm_gpu_t *gpu)
@ -1351,7 +1358,8 @@ void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
 // They must not be the same gpu.
 uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu);

-// Get the processor id accessible by the given GPU for the given physical address
+// Get the processor id accessible by the given GPU for the given physical
+// address.
 uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr);

 // Get the P2P capabilities between the gpus with the given indexes
@ -1448,9 +1456,9 @@ NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu);

 // Check for ECC errors without calling into RM
 //
-// Calling into RM is problematic in many places, this check is always safe to do.
-// Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error and
-// it's required to call uvm_gpu_check_ecc_error() to be sure.
+// Calling into RM is problematic in many places, this check is always safe to
+// do. Returns NV_WARN_MORE_PROCESSING_REQUIRED if there might be an ECC error
+// and it's required to call uvm_gpu_check_ecc_error() to be sure.
 NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);

 // Map size bytes of contiguous sysmem on the GPU for physical access
@ -1507,6 +1515,8 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
 // The GPU must be initialized before calling this function.
 bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);

+bool uvm_platform_uses_canonical_form_address(void);
+
 // Returns addr's canonical form for host systems that use canonical form
 // addresses.
 NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);
@ -1553,8 +1563,9 @@ uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu);
 // Debug print of GPU properties
 void uvm_gpu_print(uvm_gpu_t *gpu);

-// Add the given instance pointer -> user_channel mapping to this GPU. The bottom
-// half GPU page fault handler uses this to look up the VA space for GPU faults.
+// Add the given instance pointer -> user_channel mapping to this GPU. The
+// bottom half GPU page fault handler uses this to look up the VA space for GPU
+// faults.
 NV_STATUS uvm_gpu_add_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);
 void uvm_gpu_remove_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel);

--- a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c
@ -33,17 +33,17 @@
 #include "uvm_va_space_mm.h"
 #include "uvm_pmm_sysmem.h"
 #include "uvm_perf_module.h"
+#include "uvm_ats_ibm.h"

 #define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_MIN     1
 #define UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT 256
-#define UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT "2m"
+#define UVM_PERF_ACCESS_COUNTER_GRANULARITY         UVM_ACCESS_COUNTER_GRANULARITY_2M
 #define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MIN       1
 #define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MAX       ((1 << 16) - 1)
 #define UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT   256

-#define UVM_ACCESS_COUNTER_ACTION_NOTIFY 0x1
-#define UVM_ACCESS_COUNTER_ACTION_CLEAR  0x2
-#define UVM_ACCESS_COUNTER_ON_MANAGED    0x4
+#define UVM_ACCESS_COUNTER_ACTION_CLEAR     0x1
+#define UVM_ACCESS_COUNTER_PHYS_ON_MANAGED  0x2

 // Each page in a tracked physical range may belong to a different VA Block. We
 // preallocate an array of reverse map translations. However, access counter
@ -54,12 +54,6 @@
 #define UVM_MAX_TRANSLATION_SIZE (2 * 1024 * 1024ULL)
 #define UVM_SUB_GRANULARITY_REGIONS 32

-// The GPU offers the following tracking granularities: 64K, 2M, 16M, 16G
-//
-// Use the largest granularity to minimize the number of access counter
-// notifications. This is fine because we simply drop the notifications during
-// normal operation, and tests override these values.
-static UVM_ACCESS_COUNTER_GRANULARITY g_uvm_access_counter_granularity;
 static unsigned g_uvm_access_counter_threshold;

 // Per-VA space access counters information
@ -87,7 +81,6 @@ static int uvm_perf_access_counter_momc_migration_enable = -1;
 static unsigned uvm_perf_access_counter_batch_count = UVM_PERF_ACCESS_COUNTER_BATCH_COUNT_DEFAULT;

 // See module param documentation below
-static char *uvm_perf_access_counter_granularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT;
 static unsigned uvm_perf_access_counter_threshold = UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT;

 // Module parameters for the tunables
@ -100,10 +93,6 @@ MODULE_PARM_DESC(uvm_perf_access_counter_momc_migration_enable,
                 "Whether MOMC access counters will trigger migrations."
                 "Valid values: <= -1 (default policy), 0 (off), >= 1 (on)");
 module_param(uvm_perf_access_counter_batch_count, uint, S_IRUGO);
-module_param(uvm_perf_access_counter_granularity, charp, S_IRUGO);
-MODULE_PARM_DESC(uvm_perf_access_counter_granularity,
-                 "Size of the physical memory region tracked by each counter. Valid values as"
-                 "of Volta: 64k, 2m, 16m, 16g");
 module_param(uvm_perf_access_counter_threshold, uint, S_IRUGO);
 MODULE_PARM_DESC(uvm_perf_access_counter_threshold,
                 "Number of remote accesses on a region required to trigger a notification."
@ -136,7 +125,7 @@ static va_space_access_counters_info_t *va_space_access_counters_info_get(uvm_va

 // Whether access counter migrations are enabled or not. The policy is as
 // follows:
-// - MIMC migrations are enabled by default on P9 systems with ATS support
+// - MIMC migrations are disabled by default on all systems except P9.
 // - MOMC migrations are disabled by default on all systems
 // - Users can override this policy by specifying on/off
 static bool is_migration_enabled(uvm_access_counter_type_t type)
@ -159,7 +148,10 @@ static bool is_migration_enabled(uvm_access_counter_type_t type)
    if (type == UVM_ACCESS_COUNTER_TYPE_MOMC)
        return false;

-    return g_uvm_global.ats.supported;
+    if (UVM_ATS_IBM_SUPPORTED())
+        return g_uvm_global.ats.supported;
+
+    return false;
 }

 // Create the access counters tracking struct for the given VA space
@ -225,30 +217,18 @@ static NV_STATUS config_granularity_to_bytes(UVM_ACCESS_COUNTER_GRANULARITY gran
    return NV_OK;
 }

-// Clear the given access counter and add it to the per-GPU clear tracker
-static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
-                                               const uvm_access_counter_buffer_entry_t *entry)
+// Clear the given batch of access counter notifications and add it to the
+// per-GPU clear tracker.
+static NV_STATUS access_counter_clear_notifications(uvm_gpu_t *gpu,
+                                                    uvm_access_counter_buffer_entry_t **notification_start,
+                                                    NvU32 num_notifications)
 {
+    NvU32 i;
    NV_STATUS status;
    uvm_push_t push;
    uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;

-    if (entry->address.is_virtual) {
-        status = uvm_push_begin(gpu->channel_manager,
-                                UVM_CHANNEL_TYPE_MEMOPS,
-                                &push,
-                                "Clear access counter with virtual address: 0x%llx",
-                                entry->address.address);
-    }
-    else {
-        status = uvm_push_begin(gpu->channel_manager,
-                                UVM_CHANNEL_TYPE_MEMOPS,
-                                &push,
-                                "Clear access counter with physical address: 0x%llx:%s",
-                                entry->address.address,
-                                uvm_aperture_string(entry->address.aperture));
-    }
-
+    status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &push, "Clear access counter batch");
    if (status != NV_OK) {
        UVM_ERR_PRINT("Error creating push to clear access counters: %s, GPU %s\n",
                      nvstatusToString(status),
@ -256,7 +236,8 @@ static NV_STATUS access_counter_clear_targeted(uvm_gpu_t *gpu,
        return status;
    }

-    gpu->parent->host_hal->access_counter_clear_targeted(&push, entry);
+    for (i = 0; i < num_notifications; i++)
+        gpu->parent->host_hal->access_counter_clear_targeted(&push, notification_start[i]);

    uvm_push_end(&push);

@ -381,25 +362,6 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
        g_uvm_access_counter_threshold = uvm_perf_access_counter_threshold;
    }

-    if (strcmp(uvm_perf_access_counter_granularity, "64k") == 0) {
-        g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_64K;
-    }
-    else if (strcmp(uvm_perf_access_counter_granularity, "2m") == 0) {
-        g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
-    }
-    else if (strcmp(uvm_perf_access_counter_granularity, "16m") == 0) {
-        g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16M;
-    }
-    else if (strcmp(uvm_perf_access_counter_granularity, "16g") == 0) {
-        g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_16G;
-    }
-    else {
-        g_uvm_access_counter_granularity = UVM_ACCESS_COUNTER_GRANULARITY_2M;
-        pr_info("Invalid value '%s' for uvm_perf_access_counter_granularity, using '%s' instead",
-                uvm_perf_access_counter_granularity,
-                UVM_PERF_ACCESS_COUNTER_GRANULARITY_DEFAULT);
-    }
-
    uvm_assert_mutex_locked(&g_uvm_global.global_lock);
    UVM_ASSERT(parent_gpu->access_counter_buffer_hal != NULL);

@ -422,7 +384,7 @@ NV_STATUS uvm_gpu_init_access_counters(uvm_parent_gpu_t *parent_gpu)
    UVM_ASSERT(access_counters->rm_info.bufferSize %
               parent_gpu->access_counter_buffer_hal->entry_size(parent_gpu) == 0);

-    status = config_granularity_to_bytes(g_uvm_access_counter_granularity, &granularity_bytes);
+    status = config_granularity_to_bytes(UVM_PERF_ACCESS_COUNTER_GRANULARITY, &granularity_bytes);
    UVM_ASSERT(status == NV_OK);
    if (granularity_bytes > UVM_MAX_TRANSLATION_SIZE)
        UVM_ASSERT(granularity_bytes % UVM_MAX_TRANSLATION_SIZE == 0);
@ -641,8 +603,8 @@ NV_STATUS uvm_gpu_access_counters_enable(uvm_gpu_t *gpu, uvm_va_space_t *va_spac
    else {
        UvmGpuAccessCntrConfig default_config =
        {
-            .mimcGranularity = g_uvm_access_counter_granularity,
-            .momcGranularity = g_uvm_access_counter_granularity,
+            .mimcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
+            .momcGranularity = UVM_PERF_ACCESS_COUNTER_GRANULARITY,
            .mimcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
            .momcUseLimit = UVM_ACCESS_COUNTER_USE_LIMIT_FULL,
            .threshold = g_uvm_access_counter_threshold,
@ -767,6 +729,22 @@ static int cmp_sort_virt_notifications_by_instance_ptr(const void *_a, const voi
    return cmp_access_counter_instance_ptr(a, b);
 }

+// Sort comparator for pointers to GVA access counter notification buffer
+// entries that sorts by va_space and then by notification address.
+static int cmp_sort_virt_notifications_by_va_space_address(const void *_a, const void *_b)
+{
+    const uvm_access_counter_buffer_entry_t **a = (const uvm_access_counter_buffer_entry_t **)_a;
+    const uvm_access_counter_buffer_entry_t **b = (const uvm_access_counter_buffer_entry_t **)_b;
+
+    int result;
+
+    result = UVM_CMP_DEFAULT((*a)->virtual_info.va_space, (*b)->virtual_info.va_space);
+    if (result != 0)
+        return result;
+
+    return UVM_CMP_DEFAULT((*a)->address.address, (*b)->address.address);
+}
+
 // Sort comparator for pointers to GPA access counter notification buffer
 // entries that sorts by physical address' aperture
 static int cmp_sort_phys_notifications_by_processor_id(const void *_a, const void *_b)
@ -924,12 +902,11 @@ static void translate_virt_notifications_instance_ptrs(uvm_gpu_t *gpu,

 // GVA notifications provide an instance_ptr and ve_id that can be directly
 // translated to a VA space. In order to minimize translations, we sort the
-// entries by instance_ptr.
+// entries by instance_ptr, va_space and notification address in that order.
 static void preprocess_virt_notifications(uvm_gpu_t *gpu,
                                          uvm_access_counter_service_batch_context_t *batch_context)
 {
    if (!batch_context->virt.is_single_instance_ptr) {
-        // Sort by instance_ptr
        sort(batch_context->virt.notifications,
             batch_context->virt.num_notifications,
             sizeof(*batch_context->virt.notifications),
@ -938,6 +915,12 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
    }

    translate_virt_notifications_instance_ptrs(gpu, batch_context);
+
+    sort(batch_context->virt.notifications,
+         batch_context->virt.num_notifications,
+         sizeof(*batch_context->virt.notifications),
+         cmp_sort_virt_notifications_by_va_space_address,
+         NULL);
 }

 // GPA notifications provide a physical address and an aperture. Sort
@ -946,7 +929,6 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
 static void preprocess_phys_notifications(uvm_access_counter_service_batch_context_t *batch_context)
 {
    if (!batch_context->phys.is_single_aperture) {
-        // Sort by instance_ptr
        sort(batch_context->phys.notifications,
             batch_context->phys.num_notifications,
             sizeof(*batch_context->phys.notifications),
@ -955,6 +937,28 @@ static void preprocess_phys_notifications(uvm_access_counter_service_batch_conte
    }
 }

+static NV_STATUS notify_tools_and_process_flags(uvm_gpu_t *gpu,
+                                                uvm_access_counter_buffer_entry_t **notification_start,
+                                                NvU32 num_entries,
+                                                NvU32 flags)
+{
+    NV_STATUS status = NV_OK;
+
+    if (uvm_enable_builtin_tests) {
+        // TODO: Bug 4310744: [UVM][TOOLS] Attribute access counter tools events
+        //                    to va_space instead of broadcasting.
+        NvU32 i;
+
+        for (i = 0; i < num_entries; i++)
+            uvm_tools_broadcast_access_counter(gpu, notification_start[i], flags & UVM_ACCESS_COUNTER_PHYS_ON_MANAGED);
+    }
+
+    if (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR)
+        status = access_counter_clear_notifications(gpu, notification_start, num_entries);
+
+    return status;
+}
+
 static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
                                         uvm_va_block_t *va_block,
                                         uvm_va_block_retry_t *va_block_retry,
@ -1163,7 +1167,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
                                              const uvm_access_counter_buffer_entry_t *current_entry,
                                              const uvm_reverse_map_t *reverse_mappings,
                                              size_t num_reverse_mappings,
-                                              unsigned *out_flags)
+                                              NvU32 *out_flags)
 {
    size_t index;
    uvm_va_block_t *va_block = reverse_mappings[0].va_block;
@ -1190,7 +1194,6 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
        // If an mm is registered with the VA space, we have to retain it
        // in order to lock it before locking the VA space.
        mm = uvm_va_space_mm_retain_lock(va_space);
-
        uvm_va_space_down_read(va_space);

        // Re-check that the VA block is valid after taking the VA block lock.
@ -1251,7 +1254,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
                                        const uvm_access_counter_buffer_entry_t *current_entry,
                                        const uvm_reverse_map_t *reverse_mappings,
                                        size_t num_reverse_mappings,
-                                        unsigned *out_flags)
+                                        NvU32 *out_flags)
 {
    NV_STATUS status = NV_OK;
    size_t index;
@ -1259,7 +1262,7 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
    *out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;

    for (index = 0; index < num_reverse_mappings; ++index) {
-        unsigned out_flags_local = 0;
+        NvU32 out_flags_local = 0;
        status = service_phys_single_va_block(gpu,
                                              batch_context,
                                              current_entry,
@ -1318,7 +1321,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
                                                       NvU64 address,
                                                       unsigned long sub_granularity,
                                                       size_t *num_reverse_mappings,
-                                                       unsigned *out_flags)
+                                                       NvU32 *out_flags)
 {
    NV_STATUS status;
    NvU32 region_start, region_end;
@ -1327,7 +1330,10 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,

    // Get the reverse_map translations for all the regions set in the
    // sub_granularity field of the counter.
-    for_each_sub_granularity_region(region_start, region_end, sub_granularity, config->sub_granularity_regions_per_translation) {
+    for_each_sub_granularity_region(region_start,
+                                    region_end,
+                                    sub_granularity,
+                                    config->sub_granularity_regions_per_translation) {
        NvU64 local_address = address + region_start * config->sub_granularity_region_size;
        NvU32 local_translation_size = (region_end - region_start) * config->sub_granularity_region_size;
        uvm_reverse_map_t *local_reverse_mappings = batch_context->phys.translations + *num_reverse_mappings;
@ -1376,7 +1382,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
 static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
                                           uvm_access_counter_service_batch_context_t *batch_context,
                                           const uvm_access_counter_buffer_entry_t *current_entry,
-                                           unsigned *out_flags)
+                                           NvU32 *out_flags)
 {
    NvU64 address;
    NvU64 translation_index;
@ -1387,7 +1393,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
    size_t total_reverse_mappings = 0;
    uvm_gpu_t *resident_gpu = NULL;
    NV_STATUS status = NV_OK;
-    unsigned flags = 0;
+    NvU32 flags = 0;

    address = current_entry->address.address;
    UVM_ASSERT(address % config->translation_size == 0);
@ -1415,7 +1421,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,

    for (translation_index = 0; translation_index < config->translations_per_counter; ++translation_index) {
        size_t num_reverse_mappings;
-        unsigned out_flags_local = 0;
+        NvU32 out_flags_local = 0;
        status = service_phys_notification_translation(gpu,
                                                       resident_gpu,
                                                       batch_context,
@ -1437,11 +1443,8 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
        sub_granularity = sub_granularity >> config->sub_granularity_regions_per_translation;
    }

-    // Currently we only report events for our tests, not for tools
-    if (uvm_enable_builtin_tests) {
-        *out_flags |= UVM_ACCESS_COUNTER_ACTION_NOTIFY;
-        *out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_ON_MANAGED : 0);
-    }
+    if (uvm_enable_builtin_tests)
+        *out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_PHYS_ON_MANAGED : 0);

    if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
        *out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
@ -1454,22 +1457,21 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
                                            uvm_access_counter_service_batch_context_t *batch_context)
 {
    NvU32 i;
+    uvm_access_counter_buffer_entry_t **notifications = batch_context->phys.notifications;
+
    preprocess_phys_notifications(batch_context);

    for (i = 0; i < batch_context->phys.num_notifications; ++i) {
        NV_STATUS status;
-        uvm_access_counter_buffer_entry_t *current_entry = batch_context->phys.notifications[i];
-        unsigned flags = 0;
+        uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
+        NvU32 flags = 0;

        if (!UVM_ID_IS_VALID(current_entry->physical_info.resident_id))
            continue;

        status = service_phys_notification(gpu, batch_context, current_entry, &flags);
-        if (flags & UVM_ACCESS_COUNTER_ACTION_NOTIFY)
-            uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);

-        if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
-            status = access_counter_clear_targeted(gpu, current_entry);
+        notify_tools_and_process_flags(gpu, &notifications[i], 1, flags);

        if (status != NV_OK)
            return status;
@ -1478,152 +1480,218 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
    return NV_OK;
 }

-static int cmp_sort_gpu_phys_addr(const void *_a, const void *_b)
+static NV_STATUS service_notification_va_block_helper(struct mm_struct *mm,
+                                                      uvm_va_block_t *va_block,
+                                                      uvm_processor_id_t processor,
+                                                      uvm_access_counter_service_batch_context_t *batch_context)
 {
-    return uvm_gpu_phys_addr_cmp(*(uvm_gpu_phys_address_t*)_a,
-                                 *(uvm_gpu_phys_address_t*)_b);
-}
+    uvm_va_block_retry_t va_block_retry;
+    uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
+    uvm_service_block_context_t *service_context = &batch_context->block_service_context;

-static bool gpu_phys_same_region(uvm_gpu_phys_address_t a, uvm_gpu_phys_address_t b, NvU64 granularity)
-{
-    if (a.aperture != b.aperture)
-        return false;
-
-    UVM_ASSERT(is_power_of_2(granularity));
-
-    return UVM_ALIGN_DOWN(a.address, granularity) == UVM_ALIGN_DOWN(b.address, granularity);
-}
-
-static bool phys_address_in_accessed_sub_region(uvm_gpu_phys_address_t address,
-                                                NvU64 region_size,
-                                                NvU64 sub_region_size,
-                                                NvU32 accessed_mask)
-{
-    const unsigned accessed_index = (address.address % region_size) / sub_region_size;
-
-    // accessed_mask is only filled for tracking granularities larger than 64K
-    if (region_size == UVM_PAGE_SIZE_64K)
-        return true;
-
-    UVM_ASSERT(accessed_index < 32);
-    return ((1 << accessed_index) & accessed_mask) != 0;
-}
-
-static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
-                                           uvm_access_counter_service_batch_context_t *batch_context,
-                                           const uvm_access_counter_buffer_entry_t *current_entry,
-                                           unsigned *out_flags)
-{
-    NV_STATUS status = NV_OK;
-    NvU64 notification_size;
-    NvU64 address;
-    uvm_processor_id_t *resident_processors = batch_context->virt.scratch.resident_processors;
-    uvm_gpu_phys_address_t *phys_addresses = batch_context->virt.scratch.phys_addresses;
-    int num_addresses = 0;
-    int i;
-
-    // Virtual address notifications are always 64K aligned
-    NvU64 region_start = current_entry->address.address;
-    NvU64 region_end = current_entry->address.address + UVM_PAGE_SIZE_64K;
-    
-
-    uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
-    uvm_access_counter_type_t counter_type = current_entry->counter_type;
-
-    const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters, counter_type);
-
-    uvm_va_space_t *va_space = current_entry->virtual_info.va_space;
-
-    UVM_ASSERT(counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC);
-
-    // Entries with NULL va_space are simply dropped.
-    if (!va_space)
+    if (uvm_page_mask_empty(accessed_pages))
        return NV_OK;

-    status = config_granularity_to_bytes(config->rm.granularity, &notification_size);
-    if (status != NV_OK)
-        return status;
+    uvm_assert_mutex_locked(&va_block->lock);

-    // Collect physical locations that could have been touched
-    // in the reported 64K VA region. The notification mask can
-    // correspond to any of them.
-    uvm_va_space_down_read(va_space);
-    for (address = region_start; address < region_end;) {
-        uvm_va_block_t *va_block;
+    service_context->operation = UVM_SERVICE_OPERATION_ACCESS_COUNTERS;
+    service_context->num_retries = 0;
+    service_context->block_context.mm = mm;

-        NV_STATUS local_status = uvm_va_block_find(va_space, address, &va_block);
-        if (local_status == NV_ERR_INVALID_ADDRESS || local_status == NV_ERR_OBJECT_NOT_FOUND) {
-            address += PAGE_SIZE;
-            continue;
-        }
+    return UVM_VA_BLOCK_RETRY_LOCKED(va_block,
+                                     &va_block_retry,
+                                     service_va_block_locked(processor,
+                                                             va_block,
+                                                             &va_block_retry,
+                                                             service_context,
+                                                             accessed_pages));
+}

-        uvm_mutex_lock(&va_block->lock);
-        while (address < va_block->end && address < region_end) {
-            const unsigned page_index = uvm_va_block_cpu_page_index(va_block, address);
+static void expand_notification_block(struct mm_struct *mm,
+                                      uvm_gpu_va_space_t *gpu_va_space,
+                                      uvm_va_block_t *va_block,
+                                      uvm_page_mask_t *accessed_pages,
+                                      const uvm_access_counter_buffer_entry_t *current_entry)
+{
+    NvU64 addr;
+    NvU64 granularity = 0;
+    uvm_gpu_t *resident_gpu = NULL;
+    uvm_processor_id_t resident_id;
+    uvm_page_index_t page_index;
+    uvm_gpu_t *gpu = gpu_va_space->gpu;
+    const uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
+    const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters,
+                                                                             UVM_ACCESS_COUNTER_TYPE_MIMC);

-            // UVM va_block always maps the closest resident location to processor
-            const uvm_processor_id_t res_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
+    config_granularity_to_bytes(config->rm.granularity, &granularity);

-            // Add physical location if it's valid and not local vidmem
-            if (UVM_ID_IS_VALID(res_id) && !uvm_id_equal(res_id, gpu->id)) {
-                uvm_gpu_phys_address_t phys_address = uvm_va_block_res_phys_page_address(va_block, page_index, res_id, gpu);
-                if (phys_address_in_accessed_sub_region(phys_address,
-                                                        notification_size,
-                                                        config->sub_granularity_region_size,
-                                                        current_entry->sub_granularity)) {
-                    resident_processors[num_addresses] = res_id;
-                    phys_addresses[num_addresses] = phys_address;
-                    ++num_addresses;
-                }
-                else {
-                    UVM_DBG_PRINT_RL("Skipping phys address %llx:%s, because it couldn't have been accessed in mask %x",
-                                     phys_address.address,
-                                     uvm_aperture_string(phys_address.aperture),
-                                     current_entry->sub_granularity);
-                }
-            }
+    // Granularities other than 2MB can only be enabled by UVM tests. Do nothing
+    // in that case.
+    if (granularity != UVM_PAGE_SIZE_2M)
+        return;

-            address += PAGE_SIZE;
-        }
-        uvm_mutex_unlock(&va_block->lock);
+    addr = current_entry->address.address;
+
+    uvm_assert_rwsem_locked(&gpu_va_space->va_space->lock);
+    uvm_assert_mutex_locked(&va_block->lock);
+
+    page_index = uvm_va_block_cpu_page_index(va_block, addr);
+
+    resident_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
+
+    // resident_id might be invalid or might already be the same as the GPU
+    // which received the notification if the memory was already migrated before
+    // acquiring the locks either during the servicing of previous notifications
+    // or during faults or because of explicit migrations or if the VA range was
+    // freed after receiving the notification. Do nothing in such cases.
+    if (!UVM_ID_IS_VALID(resident_id) || uvm_id_equal(resident_id, gpu->id))
+        return;
+
+    if (UVM_ID_IS_GPU(resident_id))
+        resident_gpu = uvm_va_space_get_gpu(gpu_va_space->va_space, resident_id);
+
+    if (uvm_va_block_get_physical_size(va_block, resident_id, page_index) != granularity) {
+        uvm_page_mask_set(accessed_pages, page_index);
    }
-    uvm_va_space_up_read(va_space);
+    else {
+        NvU32 region_start;
+        NvU32 region_end;
+        unsigned long sub_granularity = current_entry->sub_granularity;
+        NvU32 num_regions = config->sub_granularity_regions_per_translation;
+        NvU32 num_sub_pages = config->sub_granularity_region_size / PAGE_SIZE;
+        uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);

-    // The addresses need to be sorted to aid coalescing.
-    sort(phys_addresses,
-         num_addresses,
-         sizeof(*phys_addresses),
-         cmp_sort_gpu_phys_addr,
-         NULL);
+        UVM_ASSERT(num_sub_pages >= 1);

-    for (i = 0; i < num_addresses; ++i) {
-        uvm_access_counter_buffer_entry_t *fake_entry = &batch_context->virt.scratch.phys_entry;
-
-        // Skip the current pointer if the physical region was already handled
-        if (i > 0 && gpu_phys_same_region(phys_addresses[i - 1], phys_addresses[i], notification_size)) {
-            UVM_ASSERT(uvm_id_equal(resident_processors[i - 1], resident_processors[i]));
-            continue;
+        // region_start and region_end refer to sub_granularity indices, not
+        // page_indices.
+        for_each_sub_granularity_region(region_start, region_end, sub_granularity, num_regions) {
+            uvm_page_mask_region_fill(accessed_pages,
+                                      uvm_va_block_region(region_start * num_sub_pages,
+                                                          region_end * num_sub_pages));
        }
-        UVM_DBG_PRINT_RL("Faking MIMC address[%i/%i]: %llx (granularity mask: %llx) in aperture %s on device %s\n",
-                         i,
-                         num_addresses,
-                         phys_addresses[i].address,
-                         notification_size - 1,
-                         uvm_aperture_string(phys_addresses[i].aperture),
-                         uvm_gpu_name(gpu));

-        // Construct a fake phys addr AC entry
-        fake_entry->counter_type = current_entry->counter_type;
-        fake_entry->address.address = UVM_ALIGN_DOWN(phys_addresses[i].address, notification_size);
-        fake_entry->address.aperture = phys_addresses[i].aperture;
-        fake_entry->address.is_virtual = false;
-        fake_entry->physical_info.resident_id = resident_processors[i];
-        fake_entry->counter_value = current_entry->counter_value;
-        fake_entry->sub_granularity = current_entry->sub_granularity;
+        // Remove pages in the va_block which are not resident on resident_id.
+        // If the GPU is heavily accessing those pages, future access counter
+        // migrations will migrate them to the GPU.
+        uvm_page_mask_and(accessed_pages, accessed_pages, resident_mask);
+    }
+}

-        status = service_phys_notification(gpu, batch_context, fake_entry, out_flags);
-        if (status != NV_OK)
+static NV_STATUS service_virt_notifications_in_block(struct mm_struct *mm,
+                                                     uvm_gpu_va_space_t *gpu_va_space,
+                                                     uvm_va_block_t *va_block,
+                                                     uvm_access_counter_service_batch_context_t *batch_context,
+                                                     NvU32 index,
+                                                     NvU32 *out_index)
+{
+    NvU32 i = index;
+    NvU32 flags = 0;
+    NV_STATUS status = NV_OK;
+    NV_STATUS flags_status;
+    uvm_gpu_t *gpu = gpu_va_space->gpu;
+    uvm_va_space_t *va_space = gpu_va_space->va_space;
+    uvm_page_mask_t *accessed_pages = &batch_context->accessed_pages;
+    uvm_access_counter_buffer_entry_t **notifications = batch_context->virt.notifications;
+
+    UVM_ASSERT(va_block);
+    UVM_ASSERT(i < batch_context->virt.num_notifications);
+
+    uvm_assert_rwsem_locked(&va_space->lock);
+
+    uvm_page_mask_zero(accessed_pages);
+
+    uvm_mutex_lock(&va_block->lock);
+
+    while (i < batch_context->virt.num_notifications) {
+        uvm_access_counter_buffer_entry_t *current_entry = notifications[i];
+        NvU64 address = current_entry->address.address;
+
+        if ((current_entry->virtual_info.va_space != va_space) || (address > va_block->end)) {
+            *out_index = i;
            break;
+        }
+
+        expand_notification_block(mm, gpu_va_space, va_block, accessed_pages, current_entry);
+
+        i++;
+        *out_index = i;
+    }
+
+    status = service_notification_va_block_helper(mm, va_block, gpu->id, batch_context);
+
+    uvm_mutex_unlock(&va_block->lock);
+
+    // At least one notification must have been processed so the caller advances.
+    UVM_ASSERT(index < *out_index);
+
+    if (status == NV_OK)
+        flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
+
+    flags_status = notify_tools_and_process_flags(gpu, &notifications[index], *out_index - index, flags);
+
+    if ((status == NV_OK) && (flags_status != NV_OK))
+        status = flags_status;
+
+    return status;
+}
+
+static NV_STATUS service_virt_notifications_batch(struct mm_struct *mm,
+                                                  uvm_gpu_va_space_t *gpu_va_space,
+                                                  uvm_access_counter_service_batch_context_t *batch_context,
+                                                  NvU32 index,
+                                                  NvU32 *out_index)
+{
+    NV_STATUS status;
+    uvm_va_block_t *va_block;
+    uvm_va_space_t *va_space = gpu_va_space->va_space;
+    uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[index];
+    NvU64 address = current_entry->address.address;
+
+    UVM_ASSERT(va_space);
+
+    uvm_assert_rwsem_locked(&va_space->lock);
+
+    // Virtual address notifications are always 64K aligned
+    UVM_ASSERT(IS_ALIGNED(address, UVM_PAGE_SIZE_64K));
+
+    // TODO: Bug 4309292: [UVM][HMM] Re-enable access counter HMM block
+    //                    migrations for virtual notifications on configs with
+    //                    4KB page size
+    status = uvm_va_block_find(va_space, address, &va_block);
+    if ((status == NV_OK) && !uvm_va_block_is_hmm(va_block)) {
+
+        UVM_ASSERT(va_block);
+
+        status = service_virt_notifications_in_block(mm, gpu_va_space, va_block, batch_context, index, out_index);
+    }
+    else {
+        NvU32 flags = 0;
+
+        UVM_ASSERT((status == NV_ERR_OBJECT_NOT_FOUND) ||
+                   (status == NV_ERR_INVALID_ADDRESS)  ||
+                   uvm_va_block_is_hmm(va_block));
+
+        // NV_ERR_OBJECT_NOT_FOUND is returned if the VA range is valid but no
+        // VA block has been allocated yet. This can happen if there are stale
+        // notifications in the batch. A new VA range may have been allocated in
+        // that range. So, clear the notification entry to continue getting
+        // notifications for the new VA range.
+        if (status == NV_ERR_OBJECT_NOT_FOUND)
+            flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
+
+        // NV_ERR_INVALID_ADDRESS is returned if the corresponding VA range
+        // doesn't exist or it's not a managed range. Access counter migrations
+        // are not currently supported on such ranges.
+        //
+        // TODO: Bug 1990466: [uvm] Use access counters to trigger migrations
+        // When support for SAM migrations is added, clear the notification
+        // entry if the VA range doesn't exist in order to receive notifications
+        // when a new VA range is allocated in that region.
+        status = notify_tools_and_process_flags(gpu_va_space->gpu, &batch_context->virt.notifications[index], 1, flags);
+        *out_index = index + 1;
+
+        status = NV_OK;
    }

    return status;
@ -1632,33 +1700,67 @@ static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
 static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
                                             uvm_access_counter_service_batch_context_t *batch_context)
 {
-    NvU32 i;
+    NvU32 i = 0;
    NV_STATUS status = NV_OK;
+    struct mm_struct *mm = NULL;
+    uvm_va_space_t *va_space = NULL;
+    uvm_va_space_t *prev_va_space = NULL;
+    uvm_gpu_va_space_t *gpu_va_space = NULL;
+
+    // TODO: Bug 4299018: Add support for virtual access counter migrations on
+    //                    4K page sizes. Until then, no servicing is attempted.
+    if (PAGE_SIZE == UVM_PAGE_SIZE_4K) {
+        return notify_tools_and_process_flags(gpu,
+                                              batch_context->virt.notifications,
+                                              batch_context->virt.num_notifications,
+                                              0);
+    }
+
    preprocess_virt_notifications(gpu, batch_context);

-    for (i = 0; i < batch_context->virt.num_notifications; ++i) {
-        unsigned flags = 0;
+    while (i < batch_context->virt.num_notifications) {
        uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[i];
+        va_space = current_entry->virtual_info.va_space;

-        status = service_virt_notification(gpu, batch_context, current_entry, &flags);
+        if (va_space != prev_va_space) {

-        UVM_DBG_PRINT_RL("Processed virt access counter (%d/%d): %sMANAGED (status: %d) clear: %s\n",
-                         i + 1,
-                         batch_context->virt.num_notifications,
-                         (flags & UVM_ACCESS_COUNTER_ON_MANAGED) ? "" : "NOT ",
-                         status,
-                         (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR) ? "YES" : "NO");
+            // New va_space detected: drop the locks held for the old va_space.
+            if (prev_va_space) {
+                uvm_va_space_up_read(prev_va_space);
+                uvm_va_space_mm_release_unlock(prev_va_space, mm);

-        if (uvm_enable_builtin_tests)
-            uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
+                mm = NULL;
+                gpu_va_space = NULL;
+            }

-        if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
-            status = access_counter_clear_targeted(gpu, current_entry);
+            // Acquire the locks for the new va_space, if the entry has one.
+            if (va_space) {
+                mm = uvm_va_space_mm_retain_lock(va_space);
+                uvm_va_space_down_read(va_space);
+
+                gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
+            }
+
+            prev_va_space = va_space;
+        }
+
+        if (va_space && gpu_va_space && uvm_va_space_has_access_counter_migrations(va_space)) {
+            status = service_virt_notifications_batch(mm, gpu_va_space, batch_context, i, &i);
+        }
+        else {
+            status = notify_tools_and_process_flags(gpu, &batch_context->virt.notifications[i], 1, 0);
+            i++;
+        }

        if (status != NV_OK)
            break;
    }

+    if (va_space) {
+        uvm_va_space_up_read(va_space);
+        uvm_va_space_mm_release_unlock(va_space, mm);
+    }
+
    return status;
 }

@ -1941,6 +2043,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
    }
    else {
        uvm_access_counter_buffer_entry_t entry = { 0 };
+        uvm_access_counter_buffer_entry_t *notification = &entry;

        if (params->counter_type == UVM_TEST_ACCESS_COUNTER_TYPE_MIMC)
            entry.counter_type = UVM_ACCESS_COUNTER_TYPE_MIMC;
@ -1950,7 +2053,7 @@ NV_STATUS uvm_test_reset_access_counters(UVM_TEST_RESET_ACCESS_COUNTERS_PARAMS *
        entry.bank = params->bank;
        entry.tag = params->tag;

-        status = access_counter_clear_targeted(gpu, &entry);
+        status = access_counter_clear_notifications(gpu, &notification, 1);
    }

    if (status == NV_OK)
--- a/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c
@ -235,17 +235,27 @@ static NV_STATUS fetch_non_replayable_fault_buffer_entries(uvm_parent_gpu_t *par
    return NV_OK;
 }

-// In SRIOV, the UVM (guest) driver does not have access to the privileged
-// registers used to clear the faulted bit. Instead, UVM requests host RM to do
-// the clearing on its behalf, using a SW method.
 static bool use_clear_faulted_channel_sw_method(uvm_gpu_t *gpu)
 {
-    if (uvm_gpu_is_virt_mode_sriov(gpu)) {
-        UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
-        return true;
-    }
+    // If true, UVM uses a SW method to request RM to clear the faulted bit on
+    // its behalf.
+    bool use_sw_method = false;

-    return false;
+    // In SRIOV, the UVM (guest) driver does not have access to the privileged
+    // registers used to clear the faulted bit.
+    if (uvm_gpu_is_virt_mode_sriov(gpu))
+        use_sw_method = true;
+
+    // In Confidential Computing, access to the privileged registers is
+    // blocked in order to prevent interference between guests, or between the
+    // (untrusted) host and the guests.
+    if (g_uvm_global.conf_computing_enabled)
+        use_sw_method = true;
+
+    if (use_sw_method)
+        UVM_ASSERT(gpu->parent->has_clear_faulted_channel_sw_method);
+
+    return use_sw_method;
 }

 static NV_STATUS clear_faulted_method_on_gpu(uvm_gpu_t *gpu,
@ -570,7 +580,7 @@ static NV_STATUS service_non_managed_fault(uvm_gpu_va_space_t *gpu_va_space,

        ats_context->client_type = UVM_FAULT_CLIENT_TYPE_HUB;

-        ats_invalidate->write_faults_in_batch = false;
+        ats_invalidate->tlb_batch_pending = false;

        va_range_next = uvm_va_space_iter_first(gpu_va_space->va_space, fault_entry->fault_address, ~0ULL);

--- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -362,7 +362,8 @@ static NV_STATUS push_cancel_on_gpu(uvm_gpu_t *gpu,
                                        "Cancel targeting instance_ptr {0x%llx:%s}\n",
                                        instance_ptr.address,
                                        uvm_aperture_string(instance_ptr.aperture));
-    } else {
+    }
+    else {
        status = uvm_push_begin_acquire(gpu->channel_manager,
                                        UVM_CHANNEL_TYPE_MEMOPS,
                                        &replayable_faults->replay_tracker,
@ -697,9 +698,6 @@ static inline int cmp_access_type(uvm_fault_access_type_t a, uvm_fault_access_ty

 typedef enum
 {
-    // Fetch a batch of faults from the buffer.
-    FAULT_FETCH_MODE_BATCH_ALL,
-
    // Fetch a batch of faults from the buffer. Stop at the first entry that is
    // not ready yet
    FAULT_FETCH_MODE_BATCH_READY,
@ -857,9 +855,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,
        // written out of order
        UVM_SPIN_WHILE(!gpu->parent->fault_buffer_hal->entry_is_valid(gpu->parent, get), &spin) {
            // We have some entry to work on. Let's do the rest later.
-            if (fetch_mode != FAULT_FETCH_MODE_ALL &&
-                fetch_mode != FAULT_FETCH_MODE_BATCH_ALL &&
-                fault_index > 0)
+            if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0)
                goto done;
        }

@ -888,6 +884,7 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu,

        current_entry->va_space = NULL;
        current_entry->filtered = false;
+        current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;

        if (current_entry->fault_source.utlb_id > batch_context->max_utlb_id) {
            UVM_ASSERT(current_entry->fault_source.utlb_id < replayable_faults->utlb_count);
@ -1184,7 +1181,11 @@ static void mark_fault_fatal(uvm_fault_service_batch_context_t *batch_context,
    fault_entry->replayable.cancel_va_mode = cancel_va_mode;

    utlb->has_fatal_faults = true;
-    batch_context->has_fatal_faults = true;
+
+    if (!batch_context->fatal_va_space) {
+        UVM_ASSERT(fault_entry->va_space);
+        batch_context->fatal_va_space = fault_entry->va_space;
+    }
 }

 static void fault_entry_duplicate_flags(uvm_fault_service_batch_context_t *batch_context,
@ -1378,7 +1379,10 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
        UVM_ASSERT(current_entry->fault_access_type ==
                   uvm_fault_access_type_mask_highest(current_entry->access_type_mask));

-        current_entry->is_fatal            = false;
+        // Unserviceable faults were already skipped by the caller. There are no
+        // unserviceable fault types that could be in the same VA block as a
+        // serviceable fault.
+        UVM_ASSERT(!current_entry->is_fatal);
        current_entry->is_throttled        = false;
        current_entry->is_invalid_prefetch = false;

@ -1512,7 +1516,7 @@ static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,

    ++block_context->num_retries;

-    if (status == NV_OK && batch_context->has_fatal_faults)
+    if (status == NV_OK && batch_context->fatal_va_space)
        status = uvm_va_block_set_cancel(va_block, &block_context->block_context, gpu);

    return status;
@ -1676,7 +1680,8 @@ static NV_STATUS service_fault_batch_ats_sub_vma(uvm_gpu_va_space_t *gpu_va_spac
        if (access_type <= UVM_FAULT_ACCESS_TYPE_READ) {
            cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
        }
-        else if (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE) {
+	else {
+            UVM_ASSERT(access_type >= UVM_FAULT_ACCESS_TYPE_WRITE);
            if (uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ) &&
                !uvm_page_mask_test(reads_serviced_mask, page_index))
                cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
@ -1735,6 +1740,10 @@ static NV_STATUS service_fault_batch_ats_sub(uvm_gpu_va_space_t *gpu_va_space,
        uvm_fault_access_type_t access_type = current_entry->fault_access_type;
        bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);

+        // ATS faults can't be unserviceable, since unserviceable faults require
+        // GMMU PTEs.
+        UVM_ASSERT(!current_entry->is_fatal);
+
        i++;

        update_batch_and_notify_fault(gpu_va_space->gpu,
@ -1934,14 +1943,198 @@ static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
    return status;
 }

+// Called when a fault in the batch has been marked fatal. Flush the buffer
+// under the VA and mmap locks to remove any potential stale fatal faults, then
+// service all new faults for just that VA space and cancel those which are
+// fatal. Faults in other VA spaces are replayed when done and will be processed
+// when normal fault servicing resumes.
+//
+// The target VA space is taken from batch_context->fatal_va_space, which must
+// be set on entry. It is reset to NULL before the faults are re-fetched.
+//
+// Returns the first error encountered. NV_WARN_MORE_PROCESSING_REQUIRED from
+// preprocess_fault_batch() or service_fault_batch_dispatch() is converted to
+// NV_OK, since it only means that servicing must restart from the top. When
+// the status is NV_OK on exit, the fault buffer is flushed and the status of
+// that flush is returned.
+static NV_STATUS service_fault_batch_for_cancel(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
+{
+    NV_STATUS status = NV_OK;
+    NvU32 i;
+    uvm_va_space_t *va_space = batch_context->fatal_va_space;
+    uvm_gpu_va_space_t *gpu_va_space = NULL;
+    struct mm_struct *mm;
+    uvm_replayable_fault_buffer_info_t *replayable_faults = &gpu->parent->fault_buffer_info.replayable;
+    uvm_service_block_context_t *service_context = &gpu->parent->fault_buffer_info.replayable.block_service_context;
+    uvm_va_block_context_t *va_block_context = &service_context->block_context;
+
+    UVM_ASSERT(gpu->parent->replayable_faults_supported);
+    UVM_ASSERT(va_space);
+
+    // Perform the flush and re-fetch while holding the mmap_lock and the
+    // VA space lock. This avoids stale faults because it prevents any vma
+    // modifications (mmap, munmap, mprotect) from happening between the time HW
+    // takes the fault and we cancel it.
+    mm = uvm_va_space_mm_retain_lock(va_space);
+    va_block_context->mm = mm;
+    uvm_va_space_down_read(va_space);
+
+    // We saw fatal faults in this VA space before. Flush while holding
+    // mmap_lock to make sure those faults come back (aren't stale).
+    //
+    // We need to wait until all old fault messages have arrived before
+    // flushing, hence UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT.
+    status = fault_buffer_flush_locked(gpu,
+                                       UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
+                                       UVM_FAULT_REPLAY_TYPE_START,
+                                       batch_context);
+    if (status != NV_OK)
+        goto done;
+
+    // Wait for the flush's replay to finish to give the legitimate faults a
+    // chance to show up in the buffer again.
+    status = uvm_tracker_wait(&replayable_faults->replay_tracker);
+    if (status != NV_OK)
+        goto done;
+
+    // We expect all replayed faults to have arrived in the buffer so we can re-
+    // service them. The replay-and-wait sequence above will ensure they're all
+    // in the HW buffer. When GSP owns the HW buffer, we also have to wait for
+    // GSP to copy all available faults from the HW buffer into the shadow
+    // buffer.
+    //
+    // TODO: Bug 2533557: This flush does not actually guarantee that GSP will
+    //       copy over all faults.
+    status = hw_fault_buffer_flush_locked(gpu->parent);
+    if (status != NV_OK)
+        goto done;
+
+    // If there is no GPU VA space for the GPU, ignore all faults in the VA
+    // space. This can happen if the GPU VA space has been destroyed since we
+    // unlocked the VA space in service_fault_batch. That means the fatal faults
+    // are stale, because unregistering the GPU VA space requires preempting the
+    // context and detaching all channels in that VA space. Restart fault
+    // servicing from the top.
+    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
+    if (!gpu_va_space)
+        goto done;
+
+    // Re-parse the new faults
+    batch_context->num_invalid_prefetch_faults = 0;
+    batch_context->num_duplicate_faults        = 0;
+    batch_context->num_replays                 = 0;
+    batch_context->fatal_va_space              = NULL;
+    batch_context->has_throttled_faults        = false;
+
+    status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_ALL);
+    if (status != NV_OK)
+        goto done;
+
+    // No more faults left. Either the previously-seen fatal entry was stale, or
+    // RM killed the context underneath us.
+    if (batch_context->num_cached_faults == 0)
+        goto done;
+
+    ++batch_context->batch_id;
+
+    status = preprocess_fault_batch(gpu, batch_context);
+    if (status != NV_OK) {
+        if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
+            // Another flush happened due to stale faults or a context-fatal
+            // error. The previously-seen fatal fault might not exist anymore,
+            // so restart fault servicing from the top.
+            status = NV_OK;
+        }
+
+        goto done;
+    }
+
+    // Search for the target VA space
+    for (i = 0; i < batch_context->num_coalesced_faults; i++) {
+        uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
+        UVM_ASSERT(current_entry->va_space);
+        if (current_entry->va_space == va_space)
+            break;
+    }
+
+    while (i < batch_context->num_coalesced_faults) {
+        uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
+
+        if (current_entry->va_space != va_space)
+            break;
+
+        // service_fault_batch_dispatch() doesn't expect unserviceable faults.
+        // Just cancel them directly.
+        if (current_entry->is_fatal) {
+            status = cancel_fault_precise_va(gpu, current_entry, UVM_FAULT_CANCEL_VA_MODE_ALL);
+            if (status != NV_OK)
+                break;
+
+            ++i;
+        }
+        else {
+            uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
+            NvU32 block_faults;
+
+            ats_invalidate->tlb_batch_pending = false;
+            uvm_hmm_service_context_init(service_context);
+
+            // Service all the faults that we can. We only really need to search
+            // for fatal faults, but attempting to service all is the easiest
+            // way to do that.
+            status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults, false);
+            if (status != NV_OK) {
+                // TODO: Bug 3900733: clean up locking in service_fault_batch().
+                // We need to drop lock and retry. That means flushing and
+                // starting over.
+                if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
+                    status = NV_OK;
+
+                break;
+            }
+
+            // Invalidate TLBs before cancel to ensure that fatal faults don't
+            // get stuck in HW behind non-fatal faults to the same line.
+            status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
+            if (status != NV_OK)
+                break;
+
+            while (block_faults-- > 0) {
+                current_entry = batch_context->ordered_fault_cache[i];
+                if (current_entry->is_fatal) {
+                    status = cancel_fault_precise_va(gpu, current_entry, current_entry->replayable.cancel_va_mode);
+                    if (status != NV_OK)
+                        break;
+                }
+
+                ++i;
+            }
+
+            // The break above only leaves the per-block loop. Stop scanning
+            // here as well, otherwise the failed entry would be cancelled
+            // again by the next iteration and the error could be overwritten.
+            if (status != NV_OK)
+                break;
+        }
+    }
+
+done:
+    uvm_va_space_up_read(va_space);
+    uvm_va_space_mm_release_unlock(va_space, mm);
+
+    if (status == NV_OK) {
+        // There are two reasons to flush the fault buffer here.
+        //
+        // 1) Functional. We need to replay both the serviced non-fatal faults
+        //    and the skipped faults in other VA spaces. The former need to be
+        //    restarted and the latter need to be replayed so the normal fault
+        //    service mechanism can fetch and process them.
+        //
+        // 2) Performance. After cancelling the fatal faults, a flush removes
+        //    any potential duplicated fault that may have been added while
+        //    processing the faults in this batch. This flush also avoids doing
+        //    unnecessary processing after the fatal faults have been cancelled,
+        //    so all the rest are unlikely to remain after a replay because the
+        //    context is probably in the process of dying.
+        status = fault_buffer_flush_locked(gpu,
+                                           UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
+                                           UVM_FAULT_REPLAY_TYPE_START,
+                                           batch_context);
+    }
+
+    return status;
+}
 // Scan the ordered view of faults and group them by different va_blocks
 // (managed faults) and service faults for each va_block, in batch.
 // Service non-managed faults one at a time as they are encountered during the
 // scan.
 //
-// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
-// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
-// space
+// Fatal faults are marked for later processing by the caller.
 static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
                                     fault_service_mode_t service_mode,
                                     uvm_fault_service_batch_context_t *batch_context)
@ -1960,7 +2153,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,

    UVM_ASSERT(gpu->parent->replayable_faults_supported);

-    ats_invalidate->write_faults_in_batch = false;
+    ats_invalidate->tlb_batch_pending = false;
    uvm_hmm_service_context_init(service_context);

    for (i = 0; i < batch_context->num_coalesced_faults;) {
@ -1995,38 +2188,25 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
            va_block_context->mm = mm;

            uvm_va_space_down_read(va_space);
-
            gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
-            if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
-                status = fault_buffer_flush_locked(gpu,
-                                                   UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT,
-                                                   UVM_FAULT_REPLAY_TYPE_START,
-                                                   batch_context);
-                if (status == NV_OK)
-                    status = NV_WARN_MORE_PROCESSING_REQUIRED;
-
-                break;
-            }
-
-            // The case where there is no valid GPU VA space for the GPU in this
-            // VA space is handled next
        }

        // Some faults could be already fatal if they cannot be handled by
        // the UVM driver
        if (current_entry->is_fatal) {
            ++i;
-            batch_context->has_fatal_faults = true;
+            if (!batch_context->fatal_va_space)
+                batch_context->fatal_va_space = va_space;
+
            utlb->has_fatal_faults = true;
            UVM_ASSERT(utlb->num_pending_faults > 0);
            continue;
        }

-        if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
+        if (!gpu_va_space) {
            // If there is no GPU VA space for the GPU, ignore the fault. This
            // can happen if a GPU VA space is destroyed without explicitly
-            // freeing all memory ranges (destroying the VA range triggers a
-            // flush of the fault buffer) and there are stale entries in the
+            // freeing all memory ranges and there are stale entries in the
            // buffer that got fixed by the servicing in a previous batch.
            ++i;
            continue;
@ -2044,15 +2224,17 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
            uvm_va_space_mm_release_unlock(va_space, mm);
            mm = NULL;
            va_space = NULL;
+            status = NV_OK;
            continue;
        }
+
        if (status != NV_OK)
            goto fail;

        i += block_faults;

        // Don't issue replays in cancel mode
-        if (replay_per_va_block && !batch_context->has_fatal_faults) {
+        if (replay_per_va_block && !batch_context->fatal_va_space) {
            status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
            if (status != NV_OK)
                goto fail;
@ -2064,8 +2246,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
        }
    }

-    // Only clobber status if invalidate_status != NV_OK, since status may also
-    // contain NV_WARN_MORE_PROCESSING_REQUIRED.
    if (va_space != NULL) {
        NV_STATUS invalidate_status = uvm_ats_invalidate_tlbs(gpu_va_space, ats_invalidate, &batch_context->tracker);
        if (invalidate_status != NV_OK)
@ -2273,77 +2453,48 @@ static NvU32 is_fatal_fault_in_buffer(uvm_fault_service_batch_context_t *batch_c
    return false;
 }

-typedef enum
-{
-    // Only cancel faults flagged as fatal
-    FAULT_CANCEL_MODE_FATAL,
-
-    // Cancel all faults in the batch unconditionally
-    FAULT_CANCEL_MODE_ALL,
-} fault_cancel_mode_t;
-
-// Cancel faults in the given fault service batch context. The function provides
-// two different modes depending on the value of cancel_mode:
-// - If cancel_mode == FAULT_CANCEL_MODE_FATAL, only faults flagged as fatal
-// will be cancelled. In this case, the reason reported to tools is the one
-// contained in the fault entry itself.
-// - If cancel_mode == FAULT_CANCEL_MODE_ALL, all faults will be cancelled
-// unconditionally. In this case, the reason reported to tools for non-fatal
-// faults is the one passed to this function.
-static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
-                                          uvm_fault_service_batch_context_t *batch_context,
-                                          fault_cancel_mode_t cancel_mode,
-                                          UvmEventFatalReason reason)
+// Cancel all faults in the given fault service batch context, even those not
+// marked as fatal.
+static NV_STATUS cancel_faults_all(uvm_gpu_t *gpu,
+                                   uvm_fault_service_batch_context_t *batch_context,
+                                   UvmEventFatalReason reason)
 {
    NV_STATUS status = NV_OK;
    NV_STATUS fault_status;
-    uvm_va_space_t *va_space = NULL;
-    NvU32 i;
+    NvU32 i = 0;

    UVM_ASSERT(gpu->parent->fault_cancel_va_supported);
-    if (cancel_mode == FAULT_CANCEL_MODE_ALL)
-        UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
+    UVM_ASSERT(reason != UvmEventFatalReasonInvalid);

-    for (i = 0; i < batch_context->num_coalesced_faults; ++i) {
+    while (i < batch_context->num_coalesced_faults && status == NV_OK) {
        uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
+        uvm_va_space_t *va_space = current_entry->va_space;
+        bool skip_va_space;

-        UVM_ASSERT(current_entry->va_space);
+        UVM_ASSERT(va_space);

-        if (current_entry->va_space != va_space) {
-            // Fault on a different va_space, drop the lock of the old one...
-            if (va_space != NULL)
-                uvm_va_space_up_read(va_space);
+        uvm_va_space_down_read(va_space);

-            va_space = current_entry->va_space;
+        // If there is no GPU VA space for the GPU, ignore all faults in
+        // that VA space. This can happen if the GPU VA space has been
+        // destroyed since we unlocked the VA space in service_fault_batch.
+        // Ignoring the fault avoids targeting a PDB that might have been
+        // reused by another process.
+        skip_va_space = !uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);

-            // ... and take the lock of the new one
-            uvm_va_space_down_read(va_space);
+        for (;
+             i < batch_context->num_coalesced_faults && current_entry->va_space == va_space;
+             current_entry = batch_context->ordered_fault_cache[++i]) {
+            uvm_fault_cancel_va_mode_t cancel_va_mode;

-            // We don't need to check whether a buffer flush is required
-            // (due to VA range destruction).
-            // - For cancel_mode == FAULT_CANCEL_MODE_FATAL, once a fault is
-            // flagged as fatal we need to cancel it, even if its VA range no
-            // longer exists.
-            // - For cancel_mode == FAULT_CANCEL_MODE_ALL we don't care about
-            // any of this, we just want to trigger RC in RM.
-        }
+            if (skip_va_space)
+                continue;

-        if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->parent->id)) {
-            // If there is no GPU VA space for the GPU, ignore the fault.
-            // This can happen if the GPU VA did not exist in
-            // service_fault_batch(), or it was destroyed since then.
-            // This is to avoid targetting a PDB that might have been reused
-            // by another process.
-            continue;
-        }
-
-        // Cancel the fault
-        if (cancel_mode == FAULT_CANCEL_MODE_ALL || current_entry->is_fatal) {
-            uvm_fault_cancel_va_mode_t cancel_va_mode = current_entry->replayable.cancel_va_mode;
-
-            // If cancelling unconditionally and the fault was not fatal,
-            // set the cancel reason passed to this function
-            if (!current_entry->is_fatal) {
+            if (current_entry->is_fatal) {
+                UVM_ASSERT(current_entry->fatal_reason != UvmEventFatalReasonInvalid);
+                cancel_va_mode = current_entry->replayable.cancel_va_mode;
+            }
+            else {
                current_entry->fatal_reason = reason;
                cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
            }
@ -2352,17 +2503,13 @@ static NV_STATUS cancel_faults_precise_va(uvm_gpu_t *gpu,
            if (status != NV_OK)
                break;
        }
+
+        uvm_va_space_up_read(va_space);
    }

-    if (va_space != NULL)
-        uvm_va_space_up_read(va_space);
-
-    // After cancelling the fatal faults, the fault buffer is flushed to remove
-    // any potential duplicated fault that may have been added while processing
-    // the faults in this batch. This flush also avoids doing unnecessary
-    // processing after the fatal faults have been cancelled, so all the rest
-    // are unlikely to remain after a replay because the context is probably in
-    // the process of dying.
+    // Because each cancel itself triggers a replay, there may be a large number
+    // of new duplicated faults in the buffer after cancelling all the known
+    // ones. Flushing the buffer discards them to avoid unnecessary processing.
    fault_status = fault_buffer_flush_locked(gpu,
                                             UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
                                             UVM_FAULT_REPLAY_TYPE_START,
@ -2410,12 +2557,12 @@ static void cancel_fault_batch(uvm_gpu_t *gpu,
                               uvm_fault_service_batch_context_t *batch_context,
                               UvmEventFatalReason reason)
 {
-    if (gpu->parent->fault_cancel_va_supported) {
-        cancel_faults_precise_va(gpu, batch_context, FAULT_CANCEL_MODE_ALL, reason);
-        return;
-    }
-
-    cancel_fault_batch_tlb(gpu, batch_context, reason);
+    // Return code is ignored since we're on a global error path and wouldn't be
+    // able to recover anyway.
+    if (gpu->parent->fault_cancel_va_supported)
+        cancel_faults_all(gpu, batch_context, reason);
+    else
+        cancel_fault_batch_tlb(gpu, batch_context, reason);
 }


@ -2502,7 +2649,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat

        batch_context->num_invalid_prefetch_faults = 0;
        batch_context->num_replays                 = 0;
-        batch_context->has_fatal_faults            = false;
+        batch_context->fatal_va_space              = NULL;
        batch_context->has_throttled_faults        = false;

        // 5) Fetch all faults from buffer
@ -2549,9 +2696,6 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
        // 8) Service all non-fatal faults and mark all non-serviceable faults
        // as fatal
        status = service_fault_batch(gpu, FAULT_SERVICE_MODE_CANCEL, batch_context);
-        if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
-            continue;
-
        UVM_ASSERT(batch_context->num_replays == 0);
        if (status == NV_ERR_NO_MEMORY)
            continue;
@ -2559,7 +2703,7 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat
            break;

        // No more fatal faults left, we are done
-        if (!batch_context->has_fatal_faults)
+        if (!batch_context->fatal_va_space)
            break;

        // 9) Search for uTLBs that contain fatal faults and meet the
@ -2581,13 +2725,9 @@ static NV_STATUS cancel_faults_precise_tlb(uvm_gpu_t *gpu, uvm_fault_service_bat

 static NV_STATUS cancel_faults_precise(uvm_gpu_t *gpu, uvm_fault_service_batch_context_t *batch_context)
 {
-    UVM_ASSERT(batch_context->has_fatal_faults);
-    if (gpu->parent->fault_cancel_va_supported) {
-        return cancel_faults_precise_va(gpu,
-                                        batch_context,
-                                        FAULT_CANCEL_MODE_FATAL,
-                                        UvmEventFatalReasonInvalid);
-    }
+    UVM_ASSERT(batch_context->fatal_va_space);
+    if (gpu->parent->fault_cancel_va_supported)
+        return service_fault_batch_for_cancel(gpu, batch_context);

    return cancel_faults_precise_tlb(gpu, batch_context);
 }
@ -2643,7 +2783,7 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
        batch_context->num_invalid_prefetch_faults = 0;
        batch_context->num_duplicate_faults        = 0;
        batch_context->num_replays                 = 0;
-        batch_context->has_fatal_faults            = false;
+        batch_context->fatal_va_space              = NULL;
        batch_context->has_throttled_faults        = false;

        status = fetch_fault_buffer_entries(gpu, batch_context, FAULT_FETCH_MODE_BATCH_READY);
@ -2671,9 +2811,6 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
        // was flushed
        num_replays += batch_context->num_replays;

-        if (status == NV_WARN_MORE_PROCESSING_REQUIRED)
-            continue;
-
        enable_disable_prefetch_faults(gpu->parent, batch_context);

        if (status != NV_OK) {
@ -2687,10 +2824,17 @@ void uvm_gpu_service_replayable_faults(uvm_gpu_t *gpu)
            break;
        }

-        if (batch_context->has_fatal_faults) {
+        if (batch_context->fatal_va_space) {
            status = uvm_tracker_wait(&batch_context->tracker);
-            if (status == NV_OK)
+            if (status == NV_OK) {
                status = cancel_faults_precise(gpu, batch_context);
+                if (status == NV_OK) {
+                    // Cancel handling should've issued at least one replay
+                    UVM_ASSERT(batch_context->num_replays > 0);
+                    ++num_batches;
+                    continue;
+                }
+            }

            break;
        }
--- a/kernel-open/nvidia-uvm/uvm_hopper.c
+++ b/kernel-open/nvidia-uvm/uvm_hopper.c
@ -103,5 +103,7 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
    parent_gpu->map_remap_larger_page_promotion = false;

    parent_gpu->plc_supported = true;
+
+    parent_gpu->no_ats_range_required = true;
 }

--- a/kernel-open/nvidia-uvm/uvm_hopper_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_hopper_mmu.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2020-2022 NVIDIA Corporation
+    Copyright (c) 2020-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -33,6 +33,7 @@

 #include "uvm_types.h"
 #include "uvm_global.h"
+#include "uvm_common.h"
 #include "uvm_hal.h"
 #include "uvm_hal_types.h"
 #include "uvm_hopper_fault_buffer.h"
@ -42,6 +43,10 @@
 #define MMU_BIG 0
 #define MMU_SMALL 1

+// Used in pde_pcf().
+#define ATS_ALLOWED 0
+#define ATS_NOT_ALLOWED 1
+
 uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
 {
    if (mmu_engine_id >= NV_PFAULT_MMU_ENG_ID_HOST0 && mmu_engine_id <= NV_PFAULT_MMU_ENG_ID_HOST44)
@ -260,7 +265,108 @@ static NvU64 poisoned_pte_hopper(void)
    return WRITE_HWCONST64(pte_bits, _MMU_VER3, PTE, PCF, PRIVILEGE_RO_NO_ATOMIC_UNCACHED_ACD);
 }

-static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 depth)
+typedef enum
+{
+    PDE_TYPE_SINGLE,
+    PDE_TYPE_DUAL_BIG,
+    PDE_TYPE_DUAL_SMALL,
+    PDE_TYPE_COUNT,
+} pde_type_t;
+
+static const NvU8 valid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_ALLOWED,
+                                       NV_MMU_VER3_PDE_PCF_VALID_UNCACHED_ATS_NOT_ALLOWED },
+                                     { NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_ALLOWED,
+                                       NV_MMU_VER3_DUAL_PDE_PCF_BIG_VALID_UNCACHED_ATS_NOT_ALLOWED },
+                                     { NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_ALLOWED,
+                                       NV_MMU_VER3_DUAL_PDE_PCF_SMALL_VALID_UNCACHED_ATS_NOT_ALLOWED } };
+
+static const NvU8 invalid_pcf[][2] = { { NV_MMU_VER3_PDE_PCF_INVALID_ATS_ALLOWED,
+                                         NV_MMU_VER3_PDE_PCF_INVALID_ATS_NOT_ALLOWED },
+                                       { NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_ALLOWED,
+                                         NV_MMU_VER3_DUAL_PDE_PCF_BIG_INVALID_ATS_NOT_ALLOWED },
+                                       { NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_ALLOWED,
+                                         NV_MMU_VER3_DUAL_PDE_PCF_SMALL_INVALID_ATS_NOT_ALLOWED } };
+
+static const NvU8 va_base[] = { 56, 47, 38, 29, 21 };
+
+static bool is_ats_range_valid(uvm_page_directory_t *dir, NvU32 child_index)
+{
+    NvU64 pde_base_va;
+    NvU64 min_va_upper;
+    NvU64 max_va_lower;
+    NvU32 index_in_dir;
+
+    uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
+
+    UVM_ASSERT(dir->depth < ARRAY_SIZE(va_base));
+
+    // We can use UVM_PAGE_SIZE_AGNOSTIC because page_size is only used in
+    // index_bits_hopper() for PTE table, i.e., depth 5+, which does not use a
+    // PDE PCF or an ATS_ALLOWED/NOT_ALLOWED setting.
+    UVM_ASSERT(child_index < (1ull << index_bits_hopper(dir->depth, UVM_PAGE_SIZE_AGNOSTIC)));
+
+    pde_base_va = 0;
+    index_in_dir = child_index;
+    while (dir) {
+        pde_base_va += index_in_dir * (1ull << va_base[dir->depth]);
+        index_in_dir = dir->index_in_parent;
+        dir = dir->host_parent;
+    }
+    pde_base_va = (NvU64)((NvS64)(pde_base_va << (64 - num_va_bits_hopper())) >> (64 - num_va_bits_hopper()));
+
+    if (pde_base_va < max_va_lower || pde_base_va >= min_va_upper)
+        return true;
+
+    return false;
+}
+
+// PDE Permission Control Flags
+static NvU32 pde_pcf(bool valid, pde_type_t pde_type, uvm_page_directory_t *dir, NvU32 child_index)
+{
+    const NvU8 (*pcf)[2] = valid ? valid_pcf : invalid_pcf;
+    NvU8 depth = dir->depth;
+
+    UVM_ASSERT(pde_type < PDE_TYPE_COUNT);
+    UVM_ASSERT(depth < 5);
+
+    // On non-ATS systems, PDE PCF only sets the valid and volatile/cache bits.
+    if (!g_uvm_global.ats.enabled)
+        return pcf[pde_type][ATS_ALLOWED];
+
+    // We assume all supported ATS platforms use canonical form addresses.
+    // See comments in uvm_gpu.c:uvm_gpu_can_address() and in
+    // uvm_mmu.c:page_tree_ats_init().
+    UVM_ASSERT(uvm_platform_uses_canonical_form_address());
+
+    // Hopper GPUs on ATS-enabled systems perform a parallel lookup on both
+    // ATS and GMMU page tables. For managed memory we need to prevent this
+    // parallel lookup since we would not get any GPU fault if the CPU has
+    // a valid mapping. Also, for external ranges that are known to be
+    // mapped entirely on the GMMU page table we can skip the ATS lookup
+    // for performance reasons. Parallel ATS lookup is disabled in PDE1
+    // (depth 3) and, therefore, it applies to the underlying 512MB VA
+    // range.
+    //
+    // UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
+    // This is fine because CUDA ensures that all managed and external
+    // allocations are properly compartmentalized in 512MB-aligned VA
+    // regions. For cudaHostRegister CUDA cannot control the VA range, but
+    // we rely on ATS for those allocations so they can't choose the
+    // ATS_NOT_ALLOWED mode.
+    // TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range to
+    //                    PTEs.
+    // HW complies with the leaf PDE's ATS_ALLOWED/ATS_NOT_ALLOWED settings,
+    // enabling us to treat any upper-level PDE as a don't care as long as there
+    // are leaf PDEs for the entire upper-level PDE range. We assume PDE4
+    // entries (depth == 0) are always ATS enabled, and the no_ats_range is in
+    // PDE3 or lower.
+    if (depth == 0 || (!valid && is_ats_range_valid(dir, child_index)))
+        return pcf[pde_type][ATS_ALLOWED];
+
+    return pcf[pde_type][ATS_NOT_ALLOWED];
+}
+
+static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
 {
    NvU64 pde_bits = 0;

@ -280,38 +386,17 @@ static NvU64 single_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, NvU32 dep
                break;
        }

-        // PCF (permission control flags) 5:3
-        // Hopper GPUs on ATS-enabled systems, perform a parallel lookup on both
-        // ATS and GMMU page tables. For managed memory we need to prevent this
-        // parallel lookup since we would not get any GPU fault if the CPU has
-        // a valid mapping. Also, for external ranges that are known to be
-        // mapped entirely on the GMMU page table we can skip the ATS lookup
-        // for performance reasons. Parallel ATS lookup is disabled in PDE1
-        // (depth 3) and, therefore, it applies to the underlying 512MB VA
-        // range.
-        //
-        // UVM sets ATS_NOT_ALLOWED for all Hopper+ mappings on ATS systems.
-        // This is fine because CUDA ensures that all managed and external
-        // allocations are properly compartmentalized in 512MB-aligned VA
-        // regions. For cudaHostRegister CUDA cannot control the VA range, but
-        // we rely on ATS for those allocations so they can't choose the
-        // ATS_NOT_ALLOWED mode.
-        //
-        // TODO: Bug 3254055: Relax the NO_ATS setting from 512MB (pde1) range
-        // to PTEs.
-        if (depth == 3 && g_uvm_global.ats.enabled)
-            pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_NOT_ALLOWED);
-        else
-            pde_bits |= HWCONST64(_MMU_VER3, PDE, PCF, VALID_UNCACHED_ATS_ALLOWED);
-
        // address 51:12
        pde_bits |= HWVALUE64(_MMU_VER3, PDE, ADDRESS, address);
    }

+    // PCF (permission control flags) 5:3
+    pde_bits |= HWVALUE64(_MMU_VER3, PDE, PCF, pde_pcf(phys_alloc != NULL, PDE_TYPE_SINGLE, dir, child_index));
+
    return pde_bits;
 }

-static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
+static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
 {
    NvU64 pde_bits = 0;

@ -330,17 +415,20 @@ static NvU64 big_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
                break;
        }

-        // PCF (permission control flags) 5:3
-        pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_BIG, VALID_UNCACHED_ATS_NOT_ALLOWED);
-
        // address 51:8
        pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_BIG, address);
    }

+    // PCF (permission control flags) 5:3
+    pde_bits |= HWVALUE64(_MMU_VER3,
+                          DUAL_PDE,
+                          PCF_BIG,
+                          pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_BIG, dir, child_index));
+
    return pde_bits;
 }

-static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
+static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc, uvm_page_directory_t *dir, NvU32 child_index)
 {
    NvU64 pde_bits = 0;

@ -359,29 +447,40 @@ static NvU64 small_half_pde_hopper(uvm_mmu_page_table_alloc_t *phys_alloc)
                break;
        }

-        // PCF (permission control flags) 69:67 [5:3]
-        pde_bits |= HWCONST64(_MMU_VER3, DUAL_PDE, PCF_SMALL, VALID_UNCACHED_ATS_NOT_ALLOWED);
-
        // address 115:76 [51:12]
        pde_bits |= HWVALUE64(_MMU_VER3, DUAL_PDE, ADDRESS_SMALL, address);
    }
+
+    // PCF (permission control flags) 69:67 [5:3]
+    pde_bits |= HWVALUE64(_MMU_VER3,
+                          DUAL_PDE,
+                          PCF_SMALL,
+                          pde_pcf(phys_alloc != NULL, PDE_TYPE_DUAL_SMALL, dir, child_index));
+
    return pde_bits;
 }

-static void make_pde_hopper(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_hopper(void *entry,
+                            uvm_mmu_page_table_alloc_t **phys_allocs,
+                            uvm_page_directory_t *dir,
+                            NvU32 child_index)
 {
-    NvU32 entry_count = entries_per_index_hopper(depth);
+    NvU32 entry_count;
    NvU64 *entry_bits = (NvU64 *)entry;

+    UVM_ASSERT(dir);
+
+    entry_count = entries_per_index_hopper(dir->depth);
+
    if (entry_count == 1) {
-        *entry_bits = single_pde_hopper(*phys_allocs, depth);
+        *entry_bits = single_pde_hopper(*phys_allocs, dir, child_index);
    }
    else if (entry_count == 2) {
-        entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG]);
-        entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL]);
+        entry_bits[MMU_BIG] = big_half_pde_hopper(phys_allocs[MMU_BIG], dir, child_index);
+        entry_bits[MMU_SMALL] = small_half_pde_hopper(phys_allocs[MMU_SMALL], dir, child_index);

        // This entry applies to the whole dual PDE but is stored in the lower
-        // bits
+        // bits.
        entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER3, DUAL_PDE, IS_PTE, FALSE);
    }
    else {
--- a/kernel-open/nvidia-uvm/uvm_linux.h
+++ b/kernel-open/nvidia-uvm/uvm_linux.h
@ -128,8 +128,9 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
 // present if we see the callback.
 //
 // The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
-// v3.19 (2014-11-13).
-    #if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
+// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
+    #if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
+        defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
        #define UVM_CAN_USE_MMU_NOTIFIERS() 1
    #else
        #define UVM_CAN_USE_MMU_NOTIFIERS() 0
@ -153,10 +154,6 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
 #define VM_MIXEDMAP    0x00000000
 #endif

-#if !defined(MPOL_PREFERRED_MANY)
-#define MPOL_PREFERRED_MANY    5
-#endif
-
 //
 // printk.h already defined pr_fmt, so we have to redefine it so the pr_*
 // routines pick up our version
--- a/kernel-open/nvidia-uvm/uvm_maxwell.c
+++ b/kernel-open/nvidia-uvm/uvm_maxwell.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2016-2021 NVIDIA Corporation
+    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -71,4 +71,6 @@ void uvm_hal_maxwell_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
    parent_gpu->smc.supported = false;

    parent_gpu->plc_supported = false;
+
+    parent_gpu->no_ats_range_required = false;
 }
--- a/kernel-open/nvidia-uvm/uvm_maxwell_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_maxwell_mmu.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2016-2021 NVIDIA Corporation
+    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -106,10 +106,16 @@ static NvU64 small_half_pde_maxwell(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_maxwell(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_maxwell(void *entry,
+                             uvm_mmu_page_table_alloc_t **phys_allocs,
+                             uvm_page_directory_t *dir,
+                             NvU32 child_index)
 {
    NvU64 pde_bits = 0;
-    UVM_ASSERT(depth == 0);
+
+    UVM_ASSERT(dir);
+    UVM_ASSERT(dir->depth == 0);
+
    pde_bits |= HWCONST64(_MMU, PDE, SIZE, FULL);
    pde_bits |= big_half_pde_maxwell(phys_allocs[MMU_BIG]) | small_half_pde_maxwell(phys_allocs[MMU_SMALL]);

--- a/kernel-open/nvidia-uvm/uvm_migrate_pageable.c
+++ b/kernel-open/nvidia-uvm/uvm_migrate_pageable.c
@ -672,14 +672,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
        .finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
    };

-    // WAR for Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
-    // invalidates on read-only to read-write upgrades
-    //
-    // This code path isn't used on GH180 but we need to maintain consistent
-    // behaviour on systems that do.
-    if (!vma_is_anonymous(args->vma))
-        return NV_WARN_NOTHING_TO_DO;
-
    ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
    if (ret < 0)
        return errno_to_nv_status(ret);
@ -693,24 +685,6 @@ static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *s
    if (ret < 0)
        return errno_to_nv_status(ret);

-    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
-    //       support for it is added to the Linux kernel
-    //
-    // A side-effect of migrate_vma_setup() is it calls mmu notifiers even if a
-    // page can't be migrated (eg. because it's a non-anonymous mapping). We
-    // need this side-effect for SMMU on GH180 to ensure any cached read-only
-    // entries are flushed from SMMU on permission upgrade.
-    //
-    // TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
-    // invalidates on read-only to read-write upgrades
-    //
-    // The above WAR doesn't work for HugeTLBfs mappings because
-    // migrate_vma_setup() will fail in that case.
-    if (!vma_is_anonymous(args->vma)) {
-        migrate_vma_finalize(args);
-        return NV_WARN_NOTHING_TO_DO;
-    }
-
    uvm_migrate_vma_alloc_and_copy(args, state);
    if (state->status == NV_OK) {
        migrate_vma_pages(args);
@ -862,6 +836,17 @@ static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
    return NV_OK;
 }

+NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
+{
+    uvm_va_space_t *va_space = uvm_va_space_get(filp);
+
+    uvm_va_space_down_write(va_space);
+    va_space->test.skip_migrate_vma = params->skip;
+    uvm_va_space_up_write(va_space);
+
+    return NV_OK;
+}
+
 static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
                                      unsigned long start,
                                      unsigned long outer,
@ -884,13 +869,12 @@ static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
    start = max(start, vma->vm_start);
    outer = min(outer, vma->vm_end);

-    // migrate_vma only supports anonymous VMAs. We check for those after
-    // calling migrate_vma_setup() to workaround Bug 4130089. We need to check
-    // for HugeTLB VMAs here because migrate_vma_setup() will return a fatal
-    // error for those.
-    // TODO: Bug 4130089: [GH180][r535] WAR for kernel not issuing SMMU TLB
-    // invalidates on read-only to read-write upgrades
-    if (is_vm_hugetlb_page(vma))
+    if (va_space->test.skip_migrate_vma)
+        return NV_WARN_NOTHING_TO_DO;
+
+    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
+    //       support for it is added to the Linux kernel
+    if (!vma_is_anonymous(vma))
        return NV_WARN_NOTHING_TO_DO;

    if (uvm_processor_mask_empty(&va_space->registered_gpus))
@ -950,7 +934,9 @@ static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
            bool touch = uvm_migrate_args->touch;
            uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;

-            UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));
+            UVM_ASSERT(va_space->test.skip_migrate_vma ||
+                       !vma_is_anonymous(vma) ||
+                       uvm_processor_mask_empty(&va_space->registered_gpus));

            // We can't use migrate_vma to move the pages as desired. Normally
            // this fallback path is supposed to populate the memory then inform
--- a/kernel-open/nvidia-uvm/uvm_migrate_pageable.h
+++ b/kernel-open/nvidia-uvm/uvm_migrate_pageable.h
@ -51,7 +51,7 @@ typedef struct
 #if defined(CONFIG_MIGRATE_VMA_HELPER)
 #define UVM_MIGRATE_VMA_SUPPORTED 1
 #else
-#if defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_VMA_SETUP_PRESENT)
+#if NV_IS_EXPORT_SYMBOL_PRESENT_migrate_vma_setup
 #define UVM_MIGRATE_VMA_SUPPORTED 1
 #endif
 #endif
@ -218,6 +218,9 @@ NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args);
 NV_STATUS uvm_migrate_pageable_init(void);

 void uvm_migrate_pageable_exit(void);
+
+NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp);
+
 #else // UVM_MIGRATE_VMA_SUPPORTED

 static NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
@ -251,6 +254,10 @@ static void uvm_migrate_pageable_exit(void)
 {
 }

+static inline NV_STATUS uvm_test_skip_migrate_vma(UVM_TEST_SKIP_MIGRATE_VMA_PARAMS *params, struct file *filp)
+{
+    return NV_OK;
+}
 #endif // UVM_MIGRATE_VMA_SUPPORTED

 #endif
--- a/kernel-open/nvidia-uvm/uvm_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_mmu.c
@ -323,37 +323,156 @@ static void uvm_mmu_page_table_cpu_memset_16(uvm_gpu_t *gpu,
    uvm_mmu_page_table_cpu_unmap(gpu, phys_alloc);
 }

+static void pde_fill_cpu(uvm_page_tree_t *tree,
+                         uvm_page_directory_t *directory,
+                         NvU32 start_index,
+                         NvU32 pde_count,
+                         uvm_mmu_page_table_alloc_t **phys_addr)
+{
+    NvU64 pde_data[2], entry_size;
+    NvU32 i;
+
+    UVM_ASSERT(uvm_mmu_use_cpu(tree));
+
+    entry_size = tree->hal->entry_size(directory->depth);
+    UVM_ASSERT(sizeof(pde_data) >= entry_size);
+
+    for (i = 0; i < pde_count; i++) {
+        tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i);
+
+        if (entry_size == sizeof(pde_data[0]))
+            uvm_mmu_page_table_cpu_memset_8(tree->gpu, &directory->phys_alloc, start_index + i, pde_data[0], 1);
+        else
+            uvm_mmu_page_table_cpu_memset_16(tree->gpu, &directory->phys_alloc, start_index + i, pde_data, 1);
+    }
+}
+
+static void pde_fill_gpu(uvm_page_tree_t *tree,
+                         uvm_page_directory_t *directory,
+                         NvU32 start_index,
+                         NvU32 pde_count,
+                         uvm_mmu_page_table_alloc_t **phys_addr,
+                         uvm_push_t *push)
+{
+    NvU64 pde_data[2], entry_size;
+    uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->phys_alloc.addr);
+    NvU32 max_inline_entries;
+    uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
+    uvm_gpu_address_t inline_data_addr;
+    uvm_push_inline_data_t inline_data;
+    NvU32 entry_count, i, j;
+
+    UVM_ASSERT(!uvm_mmu_use_cpu(tree));
+
+    entry_size = tree->hal->entry_size(directory->depth);
+    UVM_ASSERT(sizeof(pde_data) >= entry_size);
+
+    max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / entry_size;
+
+    if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
+        push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
+    else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
+        push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
+
+    pde_entry_addr.address += start_index * entry_size;
+
+    for (i = 0; i < pde_count;) {
+        // All but the first memory operation can be pipelined. We respect the
+        // caller's pipelining settings for the first push.
+        if (i != 0)
+            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
+
+        entry_count = min(pde_count - i, max_inline_entries);
+
+        // No membar is needed until the last memory operation. Otherwise,
+        // use caller's membar flag.
+        if ((i + entry_count) < pde_count)
+            uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
+        else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
+            uvm_push_set_flag(push, push_membar_flag);
+
+        uvm_push_inline_data_begin(push, &inline_data);
+        for (j = 0; j < entry_count; j++) {
+            tree->hal->make_pde(pde_data, phys_addr, directory, start_index + i + j);
+            uvm_push_inline_data_add(&inline_data, pde_data, entry_size);
+        }
+        inline_data_addr = uvm_push_inline_data_end(&inline_data);
+
+        tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * entry_size);
+
+        i += entry_count;
+        pde_entry_addr.address += entry_size * entry_count;
+    }
+}
+
+// pde_fill() populates pde_count PDE entries (starting at start_index) with
+// the same mapping, i.e., with the same physical address (phys_addr).
+// pde_fill() is optimized for pde_count == 1, which is the common case.
+static void pde_fill(uvm_page_tree_t *tree,
+                     uvm_page_directory_t *directory,
+                     NvU32 start_index,
+                     NvU32 pde_count,
+                     uvm_mmu_page_table_alloc_t **phys_addr,
+                     uvm_push_t *push)
+{
+    UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, directory->depth, UVM_PAGE_SIZE_AGNOSTIC));
+
+    if (push)
+        pde_fill_gpu(tree, directory, start_index, pde_count, phys_addr, push);
+    else
+        pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
+}
+
 static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
 {
-    NvU64 clear_bits[2];
-    uvm_mmu_mode_hal_t *hal = tree->hal;
+    NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
+    NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;

-    if (dir->depth == tree->hal->page_table_depth(page_size)) {
-        *clear_bits = 0; // Invalid PTE
-    }
-    else {
-        // passing in NULL for the phys_allocs will mark the child entries as invalid
-        uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
-        hal->make_pde(clear_bits, phys_allocs, dir->depth);
+    // Passing in NULL for the phys_allocs will mark the child entries as
+    // invalid.
+    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};

-        // Make sure that using only clear_bits[0] will work
-        UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
-    }
+    // Init with an invalid PTE or clean PDE. Only Maxwell PDEs can have more
+    // than 512 entries. In this case, we initialize them all with the same
+    // clean PDE. ATS systems may require clean PDEs with
+    // ATS_ALLOWED/ATS_NOT_ALLOWED bit settings based on the mapping VA.
+    // We only set clear_bits to 0 at the lowest page table level (PTE table),
+    // i.e., when depth is greater than the max_pde_depth.
+    if ((dir->depth > max_pde_depth) || (entries_count > 512 && !g_uvm_global.ats.enabled)) {
+        NvU64 clear_bits[2];

-    // initialize the memory to a reasonable value
-    if (push) {
-        tree->gpu->parent->ce_hal->memset_8(push,
-                                            uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
+        // If it is not a PTE, make a clean PDE.
+        if (dir->depth != tree->hal->page_table_depth(page_size)) {
+            // make_pde() child index is zero/ignored, since it is only used in
+            // PDEs on ATS-enabled systems where pde_fill() is preferred.
+            tree->hal->make_pde(clear_bits, phys_allocs, dir, 0);
+
+            // Make sure that using only clear_bits[0] will work.
+            UVM_ASSERT(tree->hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
+        }
+        else {
+            *clear_bits = 0;
+        }
+
+        // Initialize the memory to a reasonable value.
+        if (push) {
+            tree->gpu->parent->ce_hal->memset_8(push,
+                                                uvm_mmu_gpu_address(tree->gpu, dir->phys_alloc.addr),
+                                                *clear_bits,
+                                                dir->phys_alloc.size);
+        }
+        else {
+            uvm_mmu_page_table_cpu_memset_8(tree->gpu,
+                                            &dir->phys_alloc,
+                                            0,
                                            *clear_bits,
-                                            dir->phys_alloc.size);
+                                            dir->phys_alloc.size / sizeof(*clear_bits));
+        }
    }
    else {
-        uvm_mmu_page_table_cpu_memset_8(tree->gpu,
-                                        &dir->phys_alloc,
-                                        0,
-                                        *clear_bits,
-                                        dir->phys_alloc.size / sizeof(*clear_bits));
+        pde_fill(tree, dir, 0, entries_count, phys_allocs, push);
    }
+
 }

 static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
@ -367,8 +486,10 @@ static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
    NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
    uvm_page_directory_t *dir;

-    // The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
-    // 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
+    // The page tree doesn't cache PTEs so space is not allocated for entries
+    // that are always PTEs.
+    // 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not
+    // page_size.
    if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
        entry_count = 0;
    else
@ -409,108 +530,6 @@ static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, N
    return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
 }

-static void pde_fill_cpu(uvm_page_tree_t *tree,
-                         NvU32 depth,
-                         uvm_mmu_page_table_alloc_t *directory,
-                         NvU32 start_index,
-                         NvU32 pde_count,
-                         uvm_mmu_page_table_alloc_t **phys_addr)
-{
-    NvU64 pde_data[2], entry_size;
-
-    UVM_ASSERT(uvm_mmu_use_cpu(tree));
-    entry_size = tree->hal->entry_size(depth);
-    UVM_ASSERT(sizeof(pde_data) >= entry_size);
-
-    tree->hal->make_pde(pde_data, phys_addr, depth);
-
-    if (entry_size == sizeof(pde_data[0]))
-        uvm_mmu_page_table_cpu_memset_8(tree->gpu, directory, start_index, pde_data[0], pde_count);
-    else
-        uvm_mmu_page_table_cpu_memset_16(tree->gpu, directory, start_index, pde_data, pde_count);
-}
-
-static void pde_fill_gpu(uvm_page_tree_t *tree,
-                         NvU32 depth,
-                         uvm_mmu_page_table_alloc_t *directory,
-                         NvU32 start_index,
-                         NvU32 pde_count,
-                         uvm_mmu_page_table_alloc_t **phys_addr,
-                         uvm_push_t *push)
-{
-    NvU64 pde_data[2], entry_size;
-    uvm_gpu_address_t pde_entry_addr = uvm_mmu_gpu_address(tree->gpu, directory->addr);
-
-    UVM_ASSERT(!uvm_mmu_use_cpu(tree));
-
-    entry_size = tree->hal->entry_size(depth);
-    UVM_ASSERT(sizeof(pde_data) >= entry_size);
-
-    tree->hal->make_pde(pde_data, phys_addr, depth);
-    pde_entry_addr.address += start_index * entry_size;
-
-    if (entry_size == sizeof(pde_data[0])) {
-        tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
-    }
-    else {
-        NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
-        uvm_gpu_address_t inline_data_addr;
-        uvm_push_inline_data_t inline_data;
-        NvU32 membar_flag = 0;
-        NvU32 i;
-
-        if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
-            membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
-        else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
-            membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
-
-        for (i = 0; i < pde_count;) {
-            NvU32 j;
-            NvU32 entry_count = min(pde_count - i, max_inline_entries);
-
-            uvm_push_inline_data_begin(push, &inline_data);
-            for (j = 0; j < entry_count; j++)
-                uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
-            inline_data_addr = uvm_push_inline_data_end(&inline_data);
-
-            // All but the first memcopy can be pipelined. We respect the
-            // caller's pipelining settings for the first push.
-            if (i != 0)
-                uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
-
-            // No membar is needed until the last copy. Otherwise, use
-            // caller's membar flag.
-            if (i + entry_count < pde_count)
-                uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
-            else if (membar_flag)
-                uvm_push_set_flag(push, membar_flag);
-
-            tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
-
-            i += entry_count;
-            pde_entry_addr.address += sizeof(pde_data) * entry_count;
-        }
-    }
-}
-
-// pde_fill() populates pde_count PDE entries (starting at start_index) with
-// the same mapping, i.e., with the same physical address (phys_addr).
-static void pde_fill(uvm_page_tree_t *tree,
-                     NvU32 depth,
-                     uvm_mmu_page_table_alloc_t *directory,
-                     NvU32 start_index,
-                     NvU32 pde_count,
-                     uvm_mmu_page_table_alloc_t **phys_addr,
-                     uvm_push_t *push)
-{
-    UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
-
-    if (push)
-        pde_fill_gpu(tree, depth, directory, start_index, pde_count, phys_addr, push);
-    else
-        pde_fill_cpu(tree, depth, directory, start_index, pde_count, phys_addr);
-}
-
 static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
                                            uvm_page_directory_t *parent,
                                            NvU32 index_in_parent)
@ -540,7 +559,7 @@ static void pde_write(uvm_page_tree_t *tree,
            phys_allocs[i] = &entry->phys_alloc;
    }

-    pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
+    pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
 }

 static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
@ -800,7 +819,6 @@ static void free_unused_directories(uvm_page_tree_t *tree,
            }
        }
    }
-
 }

 static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
@ -811,10 +829,93 @@ static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm
    return phys_mem_allocate(tree, alloc_size, tree->location, UVM_PMM_ALLOC_FLAGS_EVICT, out);
 }

+// Returns true when the tree needs the no_ats_ranges handled by
+// page_tree_ats_init()/page_tree_ats_deinit(): the tree is not a kernel tree,
+// its GPU VA space has ATS enabled, and the parent GPU reports
+// no_ats_range_required.
+static bool page_tree_ats_init_required(uvm_page_tree_t *tree)
+{
+    // We have full control of the kernel page tables mappings, no ATS address
+    // aliases are expected.
+    if (tree->type == UVM_PAGE_TREE_TYPE_KERNEL)
+        return false;
+
+    // Enable uvm_page_tree_init() from the page_tree test.
+    // NOTE(review): a NULL gpu_va_space is only filtered out here when builtin
+    // tests are enabled, yet it is dereferenced unconditionally below. Confirm
+    // that non-kernel trees always carry a gpu_va_space outside of the tests.
+    if (uvm_enable_builtin_tests && tree->gpu_va_space == NULL)
+        return false;
+
+    if (!tree->gpu_va_space->ats.enabled)
+        return false;
+
+    return tree->gpu->parent->no_ats_range_required;
+}
+
+// Acquires the PTE ranges stored in tree->no_ats_ranges. Returns NV_OK without
+// doing anything when page_tree_ats_init_required() is false. Otherwise, it
+// gets a single entry of the tree's biggest page size starting at max_va_lower
+// (no_ats_ranges[0]) and, on platforms using canonical form addresses, a
+// second one covering [min_va_upper - page_size, min_va_upper)
+// (no_ats_ranges[1]).
+//
+// On failure, the status of uvm_page_tree_get_ptes() is returned. A range
+// acquired before the failure is not released here; page_tree_ats_deinit()
+// puts every range with a non-zero entry_count.
+static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
+{
+    NV_STATUS status;
+    NvU64 min_va_upper, max_va_lower;
+    NvU32 page_size;
+
+    if (!page_tree_ats_init_required(tree))
+        return NV_OK;
+
+    page_size = uvm_mmu_biggest_page_size(tree);
+
+    // Presumably max_va_lower/min_va_upper delimit the VA range the CPU cannot
+    // address; verify against uvm_cpu_get_unaddressable_range().
+    uvm_cpu_get_unaddressable_range(&max_va_lower, &min_va_upper);
+
+    // Potential violation of the UVM internal get/put_ptes contract. get_ptes()
+    // creates and initializes enough PTEs to populate all PDEs covering the
+    // no_ats_ranges. We store the no_ats_ranges in the tree, so they can be
+    // put_ptes()'ed on deinit(). It doesn't preclude the range from being used
+    // by a future get_ptes(), since we don't write to the PTEs (range->table)
+    // from the tree->no_ats_ranges.
+    //
+    // Lower half
+    status = uvm_page_tree_get_ptes(tree,
+                                    page_size,
+                                    max_va_lower,
+                                    page_size,
+                                    UVM_PMM_ALLOC_FLAGS_EVICT,
+                                    &tree->no_ats_ranges[0]);
+    if (status != NV_OK)
+        return status;
+
+    UVM_ASSERT(tree->no_ats_ranges[0].entry_count == 1);
+
+    if (uvm_platform_uses_canonical_form_address()) {
+        // Upper half
+        status = uvm_page_tree_get_ptes(tree,
+                                        page_size,
+                                        min_va_upper - page_size,
+                                        page_size,
+                                        UVM_PMM_ALLOC_FLAGS_EVICT,
+                                        &tree->no_ats_ranges[1]);
+        if (status != NV_OK)
+            return status;
+
+        UVM_ASSERT(tree->no_ats_ranges[1].entry_count == 1);
+    }
+
+    return NV_OK;
+}
+
+// Releases the ranges acquired by page_tree_ats_init() and zeroes
+// tree->no_ats_ranges. Does nothing when page_tree_ats_init_required() is
+// false.
+static void page_tree_ats_deinit(uvm_page_tree_t *tree)
+{
+    size_t i;
+
+    if (page_tree_ats_init_required(tree)) {
+        for (i = 0; i < ARRAY_SIZE(tree->no_ats_ranges); i++) {
+            // A zero entry_count means the range was never acquired, e.g. the
+            // upper half is only acquired on canonical form address platforms,
+            // or init returned early on failure.
+            if (tree->no_ats_ranges[i].entry_count)
+                uvm_page_tree_put_ptes(tree, &tree->no_ats_ranges[i]);
+        }
+
+        // Clear the ranges so a repeated deinit finds nothing to put.
+        memset(tree->no_ats_ranges, 0, sizeof(tree->no_ats_ranges));
+    }
+}
+
 static void map_remap_deinit(uvm_page_tree_t *tree)
 {
-    if (tree->map_remap.pde0.size)
-        phys_mem_deallocate(tree, &tree->map_remap.pde0);
+    if (tree->map_remap.pde0) {
+        phys_mem_deallocate(tree, &tree->map_remap.pde0->phys_alloc);
+        uvm_kvfree(tree->map_remap.pde0);
+        tree->map_remap.pde0 = NULL;
+    }

    if (tree->map_remap.ptes_invalid_4k.size)
        phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
@ -839,10 +940,16 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
    // PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
    // return the PTE for the get_ptes()'s caller.
    if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
-        status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
-        if (status != NV_OK)
+        tree->map_remap.pde0 = allocate_directory(tree,
+                                                  UVM_PAGE_SIZE_2M,
+                                                  tree->hal->page_table_depth(UVM_PAGE_SIZE_2M),
+                                                  UVM_PMM_ALLOC_FLAGS_EVICT);
+        if (tree->map_remap.pde0 == NULL) {
+            status = NV_ERR_NO_MEMORY;
            goto error;
+        }
    }
+
    status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
    if (status != NV_OK)
        goto error;
@ -864,22 +971,23 @@ static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
        uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
        NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
        size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
-
-        // pde0 depth equals UVM_PAGE_SIZE_2M.
-        NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
-        NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
+        NvU32 pde0_entries = tree->map_remap.pde0->phys_alloc.size / tree->hal->entry_size(tree->map_remap.pde0->depth);

        // The big-page entry is NULL which makes it an invalid entry.
        phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;

        // By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
        // sufficient when pde0 is allocated in VIDMEM.
-        if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
+        if (tree->map_remap.pde0->phys_alloc.addr.aperture == UVM_APERTURE_VID)
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);

+        // This is an orphan directory; make_pde() requires a directory to
+        // compute the VA. The UVM depth map_remap() operates on is not in the
+        // range make_pde() must operate on. We only need to supply the fields
+        // used by make_pde() to avoid accessing invalid memory addresses.
+
        pde_fill(tree,
-                 pde0_depth,
-                 &tree->map_remap.pde0,
+                 tree->map_remap.pde0,
                 0,
                 pde0_entries,
                 (uvm_mmu_page_table_alloc_t **)&phys_allocs,
@ -1006,11 +1114,22 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
        return status;

    phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, &push);
-    return page_tree_end_and_wait(tree, &push);
+
+    status = page_tree_end_and_wait(tree, &push);
+    if (status != NV_OK)
+        return status;
+
+    status = page_tree_ats_init(tree);
+    if (status != NV_OK)
+        return status;
+
+    return NV_OK;
 }

 void uvm_page_tree_deinit(uvm_page_tree_t *tree)
 {
+    page_tree_ats_deinit(tree);
+
    UVM_ASSERT(tree->root->ref_count == 0);

    // Take the tree lock only to avoid assertions. It is not required for
@ -1249,7 +1368,6 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
        UVM_ASSERT(uvm_gpu_can_address_kernel(tree->gpu, start, size));

    while (true) {
-
        // index of the entry, for the first byte of the range, within its
        // containing directory
        NvU32 start_index;
@ -1281,7 +1399,8 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
                if (dir_cache[dir->depth] == NULL) {
                    *cur_depth = dir->depth;

-                    // Undo the changes to the tree so that the dir cache remains private to the thread
+                    // Undo the changes to the tree so that the dir cache
+                    // remains private to the thread.
                    for (i = 0; i < used_count; i++)
                        host_pde_clear(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, page_size);

@ -1332,10 +1451,9 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
    if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);

-    phys_alloc[0] = &tree->map_remap.pde0;
+    phys_alloc[0] = &tree->map_remap.pde0->phys_alloc;
    pde_fill(tree,
-             range->table->depth,
-             &range->table->phys_alloc,
+             range->table,
             range->start_index,
             range->entry_count,
             (uvm_mmu_page_table_alloc_t **)&phys_alloc,
@ -1380,7 +1498,8 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
                                  dir_cache)) == NV_ERR_MORE_PROCESSING_REQUIRED) {
        uvm_mutex_unlock(&tree->lock);

-        // try_get_ptes never needs depth 0, so store a directory at its parent's depth
+        // try_get_ptes never needs depth 0, so store a directory at its
+        // parent's depth.
        // TODO: Bug 1766655: Allocate everything below cur_depth instead of
        //       retrying for every level.
        dir_cache[cur_depth] = allocate_directory(tree, page_size, cur_depth + 1, pmm_flags);
@ -1663,8 +1782,12 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
                                              range);
        if (status != NV_OK) {
            UVM_ERR_PRINT("Failed to get PTEs for subrange %zd [0x%llx, 0x%llx) size 0x%llx, part of [0x%llx, 0x%llx)\n",
-                    i, range_start, range_start + range_size, range_size,
-                    start, size);
+                          i,
+                          range_start,
+                          range_start + range_size,
+                          range_size,
+                          start,
+                          size);
            goto out;
        }
    }
--- a/kernel-open/nvidia-uvm/uvm_mmu.h
+++ b/kernel-open/nvidia-uvm/uvm_mmu.h
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -215,11 +215,14 @@ struct uvm_mmu_mode_hal_struct
    // memory out-of-range error so we can immediately identify bad PTE usage.
    NvU64 (*poisoned_pte)(void);

-    // write a PDE bit-pattern to entry based on the data in entries (which may
+    // Write a PDE bit-pattern to entry based on the data in allocs (which may
    // point to two items for dual PDEs).
-    // any of allocs are allowed to be NULL, in which case they are to be
-    // treated as empty.
-    void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, NvU32 depth);
+    // Any of allocs are allowed to be NULL, in which case they are to be
+    // treated as empty. make_pde() uses dir and child_index to compute the
+    // mapping PDE VA. On ATS-enabled systems, we may set PDE's PCF as
+    // ATS_ALLOWED or ATS_NOT_ALLOWED based on the mapping PDE VA, even for
+    // invalid/clean PDE entries.
+    void (*make_pde)(void *entry, uvm_mmu_page_table_alloc_t **allocs, uvm_page_directory_t *dir, NvU32 child_index);

    // size of an entry in a directory/table.  Generally either 8 or 16 bytes.
    // (in the case of Pascal dual PDEs)
@ -229,7 +232,7 @@ struct uvm_mmu_mode_hal_struct
    NvU32 (*entries_per_index)(NvU32 depth);

    // For dual PDEs, this is either 1 or 0, depending on the page size.
-    // This is used to index the host copy only.  GPU PDEs are always entirely
+    // This is used to index the host copy only. GPU PDEs are always entirely
    // re-written using make_pde.
    NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);

@ -295,11 +298,16 @@ struct uvm_page_tree_struct

        // PDE0 where all big-page entries are invalid, and small-page entries
        // point to ptes_invalid_4k.
-        // pde0 is only used on Pascal-Ampere, i.e., they have the same PDE
-        // format.
-        uvm_mmu_page_table_alloc_t pde0;
+        // pde0 is used on Pascal+ GPUs, i.e., they have the same PDE format.
+        uvm_page_directory_t *pde0;
    } map_remap;

+    // On ATS-enabled systems where the CPU VA width is smaller than the GPU VA
+    // width, the excess address range is set with ATS_NOT_ALLOWED on all leaf
+    // PDEs covering that range. We have at most 2 no_ats_ranges, due to
+    // canonical form address systems.
+    uvm_page_table_range_t no_ats_ranges[2];
+
    // Tracker for all GPU operations on the tree
    uvm_tracker_t tracker;
 };
@ -365,21 +373,32 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
 // the same page size without an intervening put_ptes. To duplicate a subset of
 // an existing range or change the size of an existing range, use
 // uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
-NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
-        uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
+NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
+                                 NvU32 page_size,
+                                 NvU64 start,
+                                 NvLength size,
+                                 uvm_pmm_alloc_flags_t pmm_flags,
+                                 uvm_page_table_range_t *range);

 // Same as uvm_page_tree_get_ptes(), but doesn't synchronize the GPU work.
 //
 // All pending operations can be waited on with uvm_page_tree_wait().
-NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvLength size,
-        uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *range);
+NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
+                                       NvU32 page_size,
+                                       NvU64 start,
+                                       NvLength size,
+                                       uvm_pmm_alloc_flags_t pmm_flags,
+                                       uvm_page_table_range_t *range);

 // Returns a single-entry page table range for the addresses passed.
 // The size parameter must be a page size supported by this tree.
 // This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
 // page_size.
-NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start,
-        uvm_pmm_alloc_flags_t pmm_flags, uvm_page_table_range_t *single);
+NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
+                                  NvU32 page_size,
+                                  NvU64 start,
+                                  uvm_pmm_alloc_flags_t pmm_flags,
+                                  uvm_page_table_range_t *single);

 // For a single-entry page table range, write the PDE (which could be a dual
 // PDE) to the GPU.
@ -478,8 +497,8 @@ NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
 // new_range_vec will contain the upper portion of range_vec, starting at
 // new_end + 1.
 //
-// new_end + 1 is required to be within the address range of range_vec and be aligned to
-// range_vec's page_size.
+// new_end + 1 is required to be within the address range of range_vec and be
+// aligned to range_vec's page_size.
 //
 // On failure, the original range vector is left unmodified.
 NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
@ -501,18 +520,22 @@ void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec);
 // for each offset.
 // The caller_data pointer is what the caller passed in as caller_data to
 // uvm_page_table_range_vec_write_ptes().
-typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
-        void *caller_data);
+typedef NvU64 (*uvm_page_table_range_pte_maker_t)(uvm_page_table_range_vec_t *range_vec,
+                                                  NvU64 offset,
+                                                  void *caller_data);

-// Write all PTEs covered by the range vector using the given PTE making function.
+// Write all PTEs covered by the range vector using the given PTE making
+// function.
 //
 // After writing all the PTEs a TLB invalidate operation is performed including
 // the passed in tlb_membar.
 //
 // See comments about uvm_page_table_range_pte_maker_t for details about the
 // PTE making callback.
-NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar,
-        uvm_page_table_range_pte_maker_t pte_maker, void *caller_data);
+NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
+                                              uvm_membar_t tlb_membar,
+                                              uvm_page_table_range_pte_maker_t pte_maker,
+                                              void *caller_data);

 // Set all PTEs covered by the range vector to an empty PTE
 //
@ -636,8 +659,9 @@ static NvU64 uvm_page_table_range_size(uvm_page_table_range_t *range)

 // Get the physical address of the entry at entry_index within the range
 // (counted from range->start_index).
-static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree, uvm_page_table_range_t *range,
-        size_t entry_index)
+static uvm_gpu_phys_address_t uvm_page_table_range_entry_address(uvm_page_tree_t *tree,
+                                                                 uvm_page_table_range_t *range,
+                                                                 size_t entry_index)
 {
    NvU32 entry_size = uvm_mmu_pte_size(tree, range->page_size);
    uvm_gpu_phys_address_t entry = range->table->phys_alloc.addr;
--- a/kernel-open/nvidia-uvm/uvm_page_tree_test.c
+++ b/kernel-open/nvidia-uvm/uvm_page_tree_test.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2022 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -146,9 +146,15 @@ static void fake_tlb_invals_disable(void)
    g_fake_tlb_invals_tracking_enabled = false;
 }

-// Fake TLB invalidate VA that just saves off the parameters so that they can be verified later
-static void fake_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb,
-        NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
+// Fake TLB invalidate VA that just saves off the parameters so that they can be
+// verified later.
+static void fake_tlb_invalidate_va(uvm_push_t *push,
+                                   uvm_gpu_phys_address_t pdb,
+                                   NvU32 depth,
+                                   NvU64 base,
+                                   NvU64 size,
+                                   NvU32 page_size,
+                                   uvm_membar_t membar)
 {
    if (!g_fake_tlb_invals_tracking_enabled)
        return;
@ -210,8 +216,8 @@ static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected
    }
    if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
        UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
-                expected_membar ? "a" : "no",
-                uvm_membar_string(g_last_fake_inval->membar));
+                       expected_membar ? "a" : "no",
+                       uvm_membar_string(g_last_fake_inval->membar));
        result = false;
    }

@ -230,7 +236,8 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
    }
    if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
        UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
-                g_last_fake_inval->base, g_last_fake_inval->base + g_last_fake_inval->size);
+                       g_last_fake_inval->base,
+                       g_last_fake_inval->base + g_last_fake_inval->size);
        return false;
    }
    if (g_last_fake_inval->depth != expected_depth) {
@ -247,15 +254,16 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
    UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);

    if (g_fake_invals_count == 0) {
-        UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
-                base, base + size);
+        UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
        return false;
    }

    if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
        UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
-                base, base + size,
-                inval->base, inval->base + inval->size);
+                        base,
+                        base + size,
+                        inval->base,
+                        inval->base + inval->size);
        return false;
    }
    if (inval->depth != expected_depth) {
@ -270,7 +278,13 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
    return true;
 }

-static bool assert_invalidate_range(NvU64 base, NvU64 size, NvU32 page_size, bool allow_inval_all, NvU32 range_depth, NvU32 all_depth, bool expected_membar)
+static bool assert_invalidate_range(NvU64 base,
+                                    NvU64 size,
+                                    NvU32 page_size,
+                                    bool allow_inval_all,
+                                    NvU32 range_depth,
+                                    NvU32 all_depth,
+                                    bool expected_membar)
 {
    NvU32 i;

@ -488,7 +502,6 @@ static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
    return NV_OK;
 }

-
 static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
 {
    uvm_page_tree_t tree;
@ -842,6 +855,7 @@ static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
    TEST_CHECK_RET(range2.entry_count == 256);
    TEST_CHECK_RET(range2.table->ref_count == 512);
    TEST_CHECK_RET(range1.table == range2.table);
+
    // 4k page is second entry in a dual PDE
    TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
    TEST_CHECK_RET(range1.start_index == 256);
@ -871,6 +885,7 @@ static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
    MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
    TEST_CHECK_RET(range64k.entry_count == 16);
    TEST_CHECK_RET(range64k.table->ref_count == 16);
+
    // 4k page is second entry in a dual PDE
    TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
    TEST_CHECK_RET(range64k.start_index == 16);
@ -1030,10 +1045,13 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)

    // Depth 4
    NvU64 extent_pte = UVM_PAGE_SIZE_2M;
+
    // Depth 3
    NvU64 extent_pde0 = extent_pte * (1ull << 8);
+
    // Depth 2
    NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
+
    // Depth 1
    NvU64 extent_pde2 = extent_pde1 * (1ull << 9);

@ -1081,7 +1099,11 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
    return status;
 }

-static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree, NvU64 base, NvU64 size, NvU32 min_page_size, NvU32 max_page_size)
+static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
+                                                 NvU64 base,
+                                                 NvU64 size,
+                                                 NvU32 min_page_size,
+                                                 NvU32 max_page_size)
 {
    NV_STATUS status = NV_OK;
    uvm_push_t push;
@ -1205,7 +1227,11 @@ static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool ex
            NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
            if (*pte != expected_pte) {
                UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
-                        *pte, expected_pte, offset, range_vec->start, range_vec->size);
+                               *pte,
+                               expected_pte,
+                               offset,
+                               range_vec->start,
+                               range_vec->size);
                return false;
            }
            offset += range_vec->page_size;
@ -1226,7 +1252,11 @@ static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec
    TEST_CHECK_RET(data.status == NV_OK);
    TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
    TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
-            range_vec->start, range_vec->size, range_vec->page_size, page_table_depth, membar != UVM_MEMBAR_NONE));
+                                                    range_vec->start,
+                                                    range_vec->size,
+                                                    range_vec->page_size,
+                                                    page_table_depth,
+                                                    membar != UVM_MEMBAR_NONE));
    TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));

    fake_tlb_invals_disable();
@ -1249,7 +1279,11 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
    return NV_OK;
 }

-static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree, NvU64 start, NvU64 size, NvU32 page_size, uvm_page_table_range_vec_t **range_vec_out)
+static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
+                                       NvU64 start,
+                                       NvU64 size,
+                                       NvU32 page_size,
+                                       uvm_page_table_range_vec_t **range_vec_out)
 {
    uvm_page_table_range_vec_t *range_vec;
    uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
@ -1544,25 +1578,28 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
    uvm_mmu_mode_hal_t *hal;
+    uvm_page_directory_t dir;
    NvU32 i, j, big_page_size, page_size;

+    dir.depth = 0;
+
    for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
        big_page_size = big_page_sizes[i];
        hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);

        memset(phys_allocs, 0, sizeof(phys_allocs));

-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
        TEST_CHECK_RET(pde_bits == 0x0L);

        phys_allocs[0] = &alloc_sys;
        phys_allocs[1] = &alloc_vid;
-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
        TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);

        phys_allocs[0] = &alloc_vid;
        phys_allocs[1] = &alloc_sys;
-        hal->make_pde(&pde_bits, phys_allocs, 0);
+        hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
        TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);

        for (j = 0; j <= 2; j++) {
@ -1632,38 +1669,47 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
+    uvm_page_directory_t dir;
+
    // big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
    uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);

    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

+    dir.index_in_parent = 0;
+    dir.host_parent = NULL;
+    dir.depth = 0;
+
    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0);

    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    dir.depth = 0;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);

    // Dual PDEs
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);

    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Clear
@ -1719,6 +1765,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
+    uvm_page_directory_t dir;

    // big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
@ -1726,37 +1773,45 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr

    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

+    dir.index_in_parent = 0;
+    dir.host_parent = NULL;
+    dir.depth = 0;
+
    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0);

    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);

    // Sys and vidmem PDEs
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    dir.depth = 0;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);

    // Dual PDEs
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    dir.depth = 3;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 3);
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);

    // NO_ATS PDE1 (depth 2)
    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 2);
+    dir.depth = 2;
+    hal->make_pde(pde_bits, phys_allocs, &dir, 0);
    if (g_uvm_global.ats.enabled)
        TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
    else
@ -1791,104 +1846,203 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent

 static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
 {
+    NV_STATUS status = NV_OK;
    NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
    NvU64 pde_bits[2];
+    uvm_page_directory_t *dirs[5];
    size_t i, num_page_sizes;
    uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
    uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999000LL);
    uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBB000LL);

-    // big versions have [11:8] set as well to test the page table merging
+    // Big versions have [11:8] set as well to test the page table merging
    uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999900LL);
    uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBBB00LL);

    uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);

-    // Make sure cleared PDEs work as expected
-    hal->make_pde(pde_bits, phys_allocs, 0);
-    TEST_CHECK_RET(pde_bits[0] == 0);
+    memset(dirs, 0, sizeof(dirs));
+    // Fake directory tree.
+    for (i = 0; i < ARRAY_SIZE(dirs); i++) {
+        dirs[i] = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dirs[i]->entries[0]) * 512);
+        TEST_CHECK_GOTO(dirs[i] != NULL, cleanup);
+
+        dirs[i]->depth = i;
+        dirs[i]->index_in_parent = 0;
+
+        if (i == 0)
+            dirs[i]->host_parent = NULL;
+        else
+            dirs[i]->host_parent = dirs[i - 1];
+    }
+
+    // Make sure cleared PDEs work as expected.
+    hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0, cleanup);

    // Cleared PDEs work as expected for big and small PDEs.
    memset(pde_bits, 0xFF, sizeof(pde_bits));
-    hal->make_pde(pde_bits, phys_allocs, 4);
-    TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
+    hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0 && pde_bits[1] == 0, cleanup);

    // Sys and vidmem PDEs, uncached ATS allowed.
    phys_allocs[0] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 0);
-    TEST_CHECK_RET(pde_bits[0] == 0x999999999900C);
+    hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);

    phys_allocs[0] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 0);
-    TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBB00A);
+    hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+    TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBB00A, cleanup);

-    // Dual PDEs, uncached.
+    // Dual PDEs, uncached. We don't use child_dir in the depth 4 checks because
+    // our policy decides the PDE's PCF without using it.
    phys_allocs[0] = &alloc_big_sys;
    phys_allocs[1] = &alloc_vid;
-    hal->make_pde(pde_bits, phys_allocs, 4);
-    TEST_CHECK_RET(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A);
+    hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
+    if (g_uvm_global.ats.enabled)
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A, cleanup);
+    else
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999990C && pde_bits[1] == 0xBBBBBBB00A, cleanup);

    phys_allocs[0] = &alloc_big_vid;
    phys_allocs[1] = &alloc_sys;
-    hal->make_pde(pde_bits, phys_allocs, 4);
-    TEST_CHECK_RET(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C);
+    hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
+    if (g_uvm_global.ats.enabled)
+        TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C, cleanup);
+    else
+        TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB0A && pde_bits[1] == 0x999999999900C, cleanup);
+
+    // We only need to test make_pde() on ATS when the CPU VA width < GPU's.
+    if (g_uvm_global.ats.enabled && uvm_cpu_num_va_bits() < hal->num_va_bits()) {
+        phys_allocs[0] = &alloc_sys;
+
+        dirs[1]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
+
+        dirs[2]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 1;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 1);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 2;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 511;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 511);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[1]->index_in_parent = 1;
+        hal->make_pde(pde_bits, phys_allocs, dirs[0], 1);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
+
+        dirs[2]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 509;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        dirs[2]->index_in_parent = 510;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
+
+        phys_allocs[0] = NULL;
+
+        dirs[1]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
+
+        dirs[2]->index_in_parent = 0;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
+
+        dirs[2]->index_in_parent = 2;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
+
+        dirs[1]->index_in_parent = 1;
+        dirs[2]->index_in_parent = 509;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
+
+        dirs[2]->index_in_parent = 510;
+        hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
+        TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
+    }

    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache, and
    // access counters disabled.
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE_ATOMIC,
-                                 UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE_ATOMIC,
+                                  UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D,
+                    cleanup);

    // change to cached.
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE_ATOMIC,
-                                 UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
-                   0x9999999999685);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE_ATOMIC,
+                                  UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
+                                  0x9999999999685,
+                    cleanup);

    // enable access counters.
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE_ATOMIC,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE_ATOMIC,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605,
+                    cleanup);

    // remove atomic
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_WRITE,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_WRITE,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645,
+                    cleanup);

    // read only
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
-                                 0x9999999999000LL,
-                                 UVM_PROT_READ_ONLY,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
+                                  0x9999999999000LL,
+                                  UVM_PROT_READ_ONLY,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665,
+                    cleanup);

    // local video
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
-                                 0xBBBBBBB000LL,
-                                 UVM_PROT_READ_ONLY,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_VID,
+                                  0xBBBBBBB000LL,
+                                  UVM_PROT_READ_ONLY,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661,
+                    cleanup);

    // peer 1
-    TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_1,
-                                 0xBBBBBBB000LL,
-                                 UVM_PROT_READ_ONLY,
-                                 UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663);
+    TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_PEER_1,
+                                  0xBBBBBBB000LL,
+                                  UVM_PROT_READ_ONLY,
+                                  UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663,
+                    cleanup);

    // sparse
-    TEST_CHECK_RET(hal->make_sparse_pte() == 0x8);
+    TEST_CHECK_GOTO(hal->make_sparse_pte() == 0x8, cleanup);

    // sked reflected
-    TEST_CHECK_RET(hal->make_sked_reflected_pte() == 0xF09);
+    TEST_CHECK_GOTO(hal->make_sked_reflected_pte() == 0xF09, cleanup);

    num_page_sizes = get_page_sizes(gpu, page_sizes);

    for (i = 0; i < num_page_sizes; i++)
-        TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
+        TEST_NV_CHECK_GOTO(entry_test_page_size(gpu, page_sizes[i]), cleanup);

-    return NV_OK;
+cleanup:
+    for (i = 0; i < ARRAY_SIZE(dirs); i++)
+        uvm_kvfree(dirs[i]);
+
+    return status;
 }

 static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
@ -2303,7 +2457,8 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
    gpu->parent = parent_gpu;

    // At least test_tlb_invalidates() relies on global state
-    // (g_tlb_invalidate_*) so make sure only one test instance can run at a time.
+    // (g_tlb_invalidate_*) so make sure only one test instance can run at a
+    // time.
    uvm_mutex_lock(&g_uvm_global.global_lock);

    // Allocate the fake TLB tracking state. Notably tests still need to enable
@ -2311,7 +2466,13 @@ NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *fil
    // calls.
    TEST_NV_CHECK_GOTO(fake_tlb_invals_alloc(), done);

-    TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
+    // We prevent the maxwell_test_page_tree test from running on ATS-enabled
+    // systems. On "fake" Maxwell-based ATS systems pde_fill() may push more
+    // methods than what we support in UVM. Specifically, this can happen in
+    // uvm_page_tree_init(), which eventually calls phys_mem_init(). On Maxwell,
+    // upper PDE levels have more than 512 entries.
+    if (!g_uvm_global.ats.enabled)
+        TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
    TEST_NV_CHECK_GOTO(pascal_test_page_tree(gpu), done);
    TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
    TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);
--- a/kernel-open/nvidia-uvm/uvm_pascal.c
+++ b/kernel-open/nvidia-uvm/uvm_pascal.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2016-2020 NVIDIA Corporation
+    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -100,4 +100,6 @@ void uvm_hal_pascal_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
    parent_gpu->smc.supported = false;

    parent_gpu->plc_supported = false;
+
+    parent_gpu->no_ats_range_required = false;
 }
--- a/kernel-open/nvidia-uvm/uvm_pascal_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_pascal_mmu.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2015-2020 NVIDIA Corporation
+    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -140,11 +140,18 @@ static NvU64 small_half_pde_pascal(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_pascal(void *entry,
+                            uvm_mmu_page_table_alloc_t **phys_allocs,
+                            uvm_page_directory_t *dir,
+                            NvU32 child_index)
 {
-    NvU32 entry_count = entries_per_index_pascal(depth);
+    NvU32 entry_count;
    NvU64 *entry_bits = (NvU64 *)entry;

+    UVM_ASSERT(dir);
+
+    entry_count = entries_per_index_pascal(dir->depth);
+
    if (entry_count == 1) {
        *entry_bits = single_pde_pascal(*phys_allocs);
    }
@ -152,7 +159,8 @@ static void make_pde_pascal(void *entry, uvm_mmu_page_table_alloc_t **phys_alloc
        entry_bits[MMU_BIG] = big_half_pde_pascal(phys_allocs[MMU_BIG]);
        entry_bits[MMU_SMALL] = small_half_pde_pascal(phys_allocs[MMU_SMALL]);

-        // This entry applies to the whole dual PDE but is stored in the lower bits
+        // This entry applies to the whole dual PDE but is stored in the lower
+        // bits.
        entry_bits[MMU_BIG] |= HWCONST64(_MMU_VER2, DUAL_PDE, IS_PDE, TRUE);
    }
    else {
--- a/kernel-open/nvidia-uvm/uvm_test.c
+++ b/kernel-open/nvidia-uvm/uvm_test.c
@ -36,6 +36,7 @@
 #include "uvm_mmu.h"
 #include "uvm_gpu_access_counters.h"
 #include "uvm_pmm_sysmem.h"
+#include "uvm_migrate_pageable.h"

 static NV_STATUS uvm_test_get_gpu_ref_count(UVM_TEST_GET_GPU_REF_COUNT_PARAMS *params, struct file *filp)
 {
@ -331,6 +332,7 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
        UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
        UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CPU_CHUNK_API, uvm_test_cpu_chunk_api);
+        UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SKIP_MIGRATE_VMA, uvm_test_skip_migrate_vma);
    }

    return -EINVAL;
--- a/kernel-open/nvidia-uvm/uvm_test_ioctl.h
+++ b/kernel-open/nvidia-uvm/uvm_test_ioctl.h
@ -28,6 +28,13 @@
 #include "uvm_ioctl.h"
 #include "nv_uvm_types.h"

+#define UVM_TEST_SKIP_MIGRATE_VMA                        UVM_TEST_IOCTL_BASE(103)
+typedef struct
+{
+    NvBool skip;                                         // In
+    NV_STATUS rmStatus;                                  // Out
+} UVM_TEST_SKIP_MIGRATE_VMA_PARAMS;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
--- a/kernel-open/nvidia-uvm/uvm_tools.c
+++ b/kernel-open/nvidia-uvm/uvm_tools.c
@ -1082,25 +1082,19 @@ void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
 }


-void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
-                                     NvU32 batch_id,
-                                     uvm_fault_client_type_t client_type)
+void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type)
 {
    UVM_ASSERT(!gpu->parent->has_clear_faulted_channel_method);

    if (!tools_is_event_enabled_in_any_va_space(UvmEventTypeGpuFaultReplay))
        return;

-    record_replay_event_helper(gpu->id,
-                               batch_id,
-                               client_type,
-                               NV_GETTIME(),
-                               gpu->parent->host_hal->get_time(gpu));
+    record_replay_event_helper(gpu->id, batch_id, client_type, NV_GETTIME(), gpu->parent->host_hal->get_time(gpu));
 }

 void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
                                        const uvm_access_counter_buffer_entry_t *buffer_entry,
-                                        bool on_managed)
+                                        bool on_managed_phys)
 {
    UvmEventEntry entry;
    UvmEventTestAccessCounterInfo *info = &entry.testEventData.accessCounter;
@ -1119,6 +1113,7 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
    info->srcIndex            = uvm_id_value(gpu->id);
    info->address             = buffer_entry->address.address;
    info->isVirtual           = buffer_entry->address.is_virtual? 1: 0;
+
    if (buffer_entry->address.is_virtual) {
        info->instancePtr         = buffer_entry->virtual_info.instance_ptr.address;
        info->instancePtrAperture = g_hal_to_tools_aperture_table[buffer_entry->virtual_info.instance_ptr.aperture];
@ -1126,9 +1121,10 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
    }
    else {
        info->aperture            = g_hal_to_tools_aperture_table[buffer_entry->address.aperture];
+        info->physOnManaged       = on_managed_phys? 1 : 0;
    }
+
    info->isFromCpu           = buffer_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MOMC? 1: 0;
-    info->onManaged           = on_managed? 1 : 0;
    info->value               = buffer_entry->counter_value;
    info->subGranularity      = buffer_entry->sub_granularity;
    info->bank                = buffer_entry->bank;
--- a/kernel-open/nvidia-uvm/uvm_tools.h
+++ b/kernel-open/nvidia-uvm/uvm_tools.h
@ -102,18 +102,13 @@ void uvm_tools_record_read_duplicate_invalidate(uvm_va_block_t *va_block,
                                                uvm_va_block_region_t region,
                                                const uvm_page_mask_t *page_mask);

-void uvm_tools_broadcast_replay(uvm_gpu_t *gpu,
-                                uvm_push_t *push,
-                                NvU32 batch_id,
-                                uvm_fault_client_type_t client_type);
+void uvm_tools_broadcast_replay(uvm_gpu_t *gpu, uvm_push_t *push, NvU32 batch_id, uvm_fault_client_type_t client_type);

-void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu,
-                                     NvU32 batch_id,
-                                     uvm_fault_client_type_t client_type);
+void uvm_tools_broadcast_replay_sync(uvm_gpu_t *gpu, NvU32 batch_id, uvm_fault_client_type_t client_type);

 void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
                                        const uvm_access_counter_buffer_entry_t *buffer_entry,
-                                        bool on_managed);
+                                        bool on_managed_phys);

 void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space);

--- a/kernel-open/nvidia-uvm/uvm_turing.c
+++ b/kernel-open/nvidia-uvm/uvm_turing.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2017-2021 NVIDIA Corporation
+    Copyright (c) 2017-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -93,4 +93,6 @@ void uvm_hal_turing_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
    parent_gpu->smc.supported = false;

    parent_gpu->plc_supported = true;
+
+    parent_gpu->no_ats_range_required = false;
 }
--- a/kernel-open/nvidia-uvm/uvm_types.h
+++ b/kernel-open/nvidia-uvm/uvm_types.h
@ -967,8 +967,10 @@ typedef struct
    NvU8 isFromCpu;

    NvU8 veId;
-    NvU8 onManaged;         // The access counter notification was triggered on
-                            // a managed memory region
+
+    // The physical access counter notification was triggered on a managed
+    // memory region. This is not set for virtual access counter notifications.
+    NvU8 physOnManaged;

    NvU32 value;
    NvU32 subGranularity;
--- a/kernel-open/nvidia-uvm/uvm_va_block.c
+++ b/kernel-open/nvidia-uvm/uvm_va_block.c
@ -1760,6 +1760,21 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
    return (NvU32)chunk_size;
 }

+NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
+                                     uvm_processor_id_t processor,
+                                     uvm_page_index_t page_index)
+{
+    block_phys_page_t page;
+    // Non-static entry point to the file-local block_phys_page_size() helper.
+    UVM_ASSERT(block);
+    // Locking: the caller must already hold the va_block lock.
+    uvm_assert_mutex_locked(&block->lock);
+    // Package (processor, page_index) in the form block_phys_page_size() takes.
+    page = block_phys_page(processor, page_index);
+
+    return block_phys_page_size(block, page);
+}
+
 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
 {
    uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
@ -8248,14 +8263,6 @@ void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
    event_data.block_munmap.region = region;
    uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);

-    // Set a flag so that GPU fault events are flushed since they might refer
-    // to the region being unmapped.
-    // Note that holding the va_block lock prevents GPU VA spaces from
-    // being removed so the registered_gpu_va_spaces mask is stable.
-    for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
-        uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
-    }
-
    // Release any remaining vidmem chunks in the given region.
    for_each_gpu_id(gpu_id) {
        uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
@ -10155,6 +10162,34 @@ static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
        uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
        return preferred_location;

+    // Check whether a remote CPU fault should map the closest resident processor remotely.
+    //
+    // A CPU fault is taken on behalf of a Linux process, which is associated with a unique
+    // VM pointed to by current->mm. A block of memory residing on a GPU is also associated
+    // with a VM, pointed to by va_block_context->mm. If the two match, it's a regular
+    // (local) fault, and we may want to migrate the page from the GPU to the CPU.
+    // If it's a 'remote' fault, i.e. the faulting Linux process differs from the one
+    // associated with the block's VM, we might preserve the residency.
+    //
+    // Establishing a remote mapping without access counters means the memory could stay in
+    // the wrong spot for a long time, which is why we prefer to avoid creating remote
+    // mappings. However, when a NIC accesses memory residing on a GPU, it is worth keeping
+    // that memory in place for the NIC accesses.
+    //
+    // The logic used to detect remote faulting also keeps memory in place for ptrace
+    // accesses. We would prefer to control those policies separately, but the NIC case
+    // takes priority.
+    // If the accessing processor is the CPU, we're either handling a fault from a
+    // process other than the owning one, or we're handling an MOMC notification. Only
+    // prevent migration for the former.
+    if (UVM_ID_IS_CPU(processor_id) &&
+        operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&        
+        uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
+        va_block_context->mm != current->mm) {
+        UVM_ASSERT(va_block_context->mm != NULL);
+        return closest_resident_processor;
+    }
+
    // If the page is resident on a processor other than the preferred location,
    // or the faulting processor can't access the preferred location, we select
    // the faulting processor as the new residency.
@ -10713,7 +10748,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
                                                 uvm_va_block_context_t *va_block_context,
                                                 uvm_processor_id_t processor_id,
                                                 uvm_page_index_t page_index,
-                                                 uvm_fault_type_t access_type,
+                                                 uvm_fault_access_type_t access_type,
                                                 bool allow_migration)
 {
    uvm_va_range_t *va_range = va_block->va_range;
--- a/kernel-open/nvidia-uvm/uvm_va_block.h
+++ b/kernel-open/nvidia-uvm/uvm_va_block.h
@ -1000,7 +1000,7 @@ NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
                                                 uvm_va_block_context_t *va_block_context,
                                                 uvm_processor_id_t processor_id,
                                                 uvm_page_index_t page_index,
-                                                 uvm_fault_type_t access_type,
+                                                 uvm_fault_access_type_t access_type,
                                                 bool allow_migration);

 // API for access privilege revocation
@ -2072,6 +2072,14 @@ void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
 // Locking: The va_block lock must be held.
 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);

+// Get the size of the physical allocation backing the page at page_index on the
+// specified processor in the block. Returns 0 if the address is not resident on
+// the specified processor.
+// Locking: The va_block lock must be held.
+NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
+                                     uvm_processor_id_t processor,
+                                     uvm_page_index_t page_index);
+
 // Get CPU page size or 0 if it is not mapped
 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
                                 uvm_page_index_t page_index);
--- a/kernel-open/nvidia-uvm/uvm_va_policy.h
+++ b/kernel-open/nvidia-uvm/uvm_va_policy.h
@ -193,7 +193,8 @@ uvm_va_policy_node_t *uvm_va_policy_node_iter_next(uvm_va_block_t *va_block, uvm
    for ((node) = uvm_va_policy_node_iter_first((va_block), (start), (end)),  \
         (next) = uvm_va_policy_node_iter_next((va_block), (node), (end));    \
         (node);                                                              \
-         (node) = (next))
+         (node) = (next),                                                     \
+         (next) = uvm_va_policy_node_iter_next((va_block), (node), (end)))

 // Returns the first policy in the range [start, end], if any.
 // Locking: The va_block lock must be held.
--- a/kernel-open/nvidia-uvm/uvm_va_space.c
+++ b/kernel-open/nvidia-uvm/uvm_va_space.c
@ -1540,7 +1540,6 @@ static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
    atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);

    uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu_va_space->gpu->id);
-    uvm_processor_mask_clear_atomic(&va_space->needs_fault_buffer_flush, gpu_va_space->gpu->id);
    va_space->gpu_va_spaces[uvm_id_gpu_index(gpu_va_space->gpu->id)] = NULL;
    gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
 }
--- a/kernel-open/nvidia-uvm/uvm_va_space.h
+++ b/kernel-open/nvidia-uvm/uvm_va_space.h
@ -253,17 +253,6 @@ struct uvm_va_space_struct
    // corrupting state.
    uvm_processor_mask_t gpu_unregister_in_progress;

-    // On VMA destruction, the fault buffer needs to be flushed for all the GPUs
-    // registered in the VA space to avoid leaving stale entries of the VA range
-    // that is going to be destroyed. Otherwise, these fault entries can be
-    // attributed to new VA ranges reallocated at the same addresses. However,
-    // uvm_vm_close is called with mm->mmap_lock taken and we cannot take the
-    // ISR lock. Therefore, we use a flag to notify the GPU fault handler that
-    // the fault buffer needs to be flushed, before servicing the faults that
-    // belong to the va_space. The bits are set and cleared atomically so no
-    // va_space lock is required.
-    uvm_processor_mask_t needs_fault_buffer_flush;
-
    // Mask of processors that are participating in system-wide atomics
    uvm_processor_mask_t system_wide_atomics_enabled_processors;

@ -353,6 +342,7 @@ struct uvm_va_space_struct
    struct
    {
        bool  page_prefetch_enabled;
+        bool  skip_migrate_vma;

        atomic_t migrate_vma_allocation_fail_nth;

--- a/kernel-open/nvidia-uvm/uvm_va_space_mm.c
+++ b/kernel-open/nvidia-uvm/uvm_va_space_mm.c
@ -215,7 +215,13 @@ bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)

    static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
    {
+#if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE)
        .invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
+#elif defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
+        .arch_invalidate_secondary_tlbs = uvm_mmu_notifier_invalidate_range_ats,
+#else
+        #error One of invalidate_range/arch_invalid_secondary must be present
+#endif
    };

    static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)
--- a/kernel-open/nvidia-uvm/uvm_volta.c
+++ b/kernel-open/nvidia-uvm/uvm_volta.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2016-2021 NVIDIA Corporation
+    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -98,4 +98,6 @@ void uvm_hal_volta_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
    parent_gpu->smc.supported = false;

    parent_gpu->plc_supported = false;
+
+    parent_gpu->no_ats_range_required = false;
 }
--- a/kernel-open/nvidia-uvm/uvm_volta_mmu.c
+++ b/kernel-open/nvidia-uvm/uvm_volta_mmu.c
@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2017-2021 NVIDIA Corporation
+    Copyright (c) 2017-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@ -145,13 +145,20 @@ static NvU64 small_half_pde_volta(uvm_mmu_page_table_alloc_t *phys_alloc)
    return pde_bits;
 }

-static void make_pde_volta(void *entry, uvm_mmu_page_table_alloc_t **phys_allocs, NvU32 depth)
+static void make_pde_volta(void *entry,
+                           uvm_mmu_page_table_alloc_t **phys_allocs,
+                           uvm_page_directory_t *dir,
+                           NvU32 child_index)
 {
-    NvU32 entry_count = entries_per_index_volta(depth);
+    NvU32 entry_count;
    NvU64 *entry_bits = (NvU64 *)entry;

+    UVM_ASSERT(dir);
+
+    entry_count = entries_per_index_volta(dir->depth);
+
    if (entry_count == 1) {
-        *entry_bits = single_pde_volta(*phys_allocs, depth);
+        *entry_bits = single_pde_volta(*phys_allocs, dir->depth);
    }
    else if (entry_count == 2) {
        entry_bits[MMU_BIG] = big_half_pde_volta(phys_allocs[MMU_BIG]);
--- a/kernel-open/nvidia/libspdm_shash.c
+++ b/kernel-open/nvidia/libspdm_shash.c
@ -23,10 +23,16 @@

 #include "internal_crypt_lib.h"

+#ifdef USE_LKCA
+#ifndef NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT
+#include <crypto/internal/hash.h>
+#endif
+#endif
+
 void *lkca_hash_new(const char* alg_name)
 {
 #ifndef USE_LKCA
-    return false;
+    return NULL;
 #else
    //XXX: can we reuse crypto_shash part and just allocate desc
    struct crypto_shash *alg;
@ -87,9 +93,24 @@ bool lkca_hmac_duplicate(struct shash_desc *dst, struct shash_desc const *src)

        struct crypto_shash *src_tfm = src->tfm;
        struct crypto_shash *dst_tfm = dst->tfm;
+        int ss = crypto_shash_statesize(dst_tfm);
+
+#ifdef NV_CRYPTO_TFM_CTX_ALIGNED_PRESENT
        char *src_ipad = crypto_tfm_ctx_aligned(&src_tfm->base);
        char *dst_ipad = crypto_tfm_ctx_aligned(&dst_tfm->base);
-        int ss = crypto_shash_statesize(dst_tfm);
+#else
+        int ctx_size = crypto_shash_alg(dst_tfm)->base.cra_ctxsize;
+        char *src_ipad = crypto_shash_ctx(src_tfm);
+        char *dst_ipad = crypto_shash_ctx(dst_tfm);
+        /*
+         * Actual struct definition is hidden, so I assume data we need is at
+         * the end. In 6.0 the struct has a pointer to crypto_shash followed by:
+         * 'u8 ipad[statesize];', then 'u8 opad[statesize];'
+         */
+        src_ipad += ctx_size - 2 * ss;
+        dst_ipad += ctx_size - 2 * ss;
+#endif
+
        memcpy(dst_ipad, src_ipad, crypto_shash_blocksize(src->tfm));
        memcpy(dst_ipad + ss, src_ipad + ss, crypto_shash_blocksize(src->tfm));
        crypto_shash_clear_flags(dst->tfm, CRYPTO_TFM_NEED_KEY);
--- a/kernel-open/nvidia/nv-msi.c
+++ b/kernel-open/nvidia/nv-msi.c
@ -156,7 +156,7 @@ NvS32 NV_API_CALL nv_request_msix_irq(nv_linux_state_t *nvl)
        {
            for( j = 0; j < i; j++)
            {
-                free_irq(nvl->msix_entries[i].vector, (void *)nvl);
+                free_irq(nvl->msix_entries[j].vector, (void *)nvl);
            }
            break;
        }
--- a/kernel-open/nvidia/nv-p2p.c
+++ b/kernel-open/nvidia/nv-p2p.c
@ -316,14 +316,14 @@ int nvidia_p2p_init_mapping(
    return -ENOTSUPP;
 }

-EXPORT_SYMBOL(nvidia_p2p_init_mapping);
+NV_EXPORT_SYMBOL(nvidia_p2p_init_mapping);

 int nvidia_p2p_destroy_mapping(uint64_t p2p_token)
 {
    return -ENOTSUPP;
 }

-EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);
+NV_EXPORT_SYMBOL(nvidia_p2p_destroy_mapping);

 static void nv_p2p_mem_info_free_callback(void *data)
 {
@ -506,8 +506,13 @@ static int nv_p2p_get_pages(
    (*page_table)->page_size = page_size_index;

    os_free_mem(physical_addresses);
+    physical_addresses = NULL;
+
    os_free_mem(wreqmb_h);
+    wreqmb_h = NULL;
+
    os_free_mem(rreqmb_h);
+    rreqmb_h = NULL;

    if (free_callback != NULL)
    {
@ -582,7 +587,7 @@ int nvidia_p2p_get_pages(
                            p2p_token, va_space, virtual_address,
                            length, page_table, free_callback, data);
 }
-EXPORT_SYMBOL(nvidia_p2p_get_pages);
+NV_EXPORT_SYMBOL(nvidia_p2p_get_pages);

 int nvidia_p2p_get_pages_persistent(
    uint64_t virtual_address,
@ -600,7 +605,7 @@ int nvidia_p2p_get_pages_persistent(
                            virtual_address, length, page_table,
                            NULL, NULL);
 }
-EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);
+NV_EXPORT_SYMBOL(nvidia_p2p_get_pages_persistent);

 /*
 * This function is a no-op, but is left in place (for now), in order to allow
@ -613,7 +618,7 @@ int nvidia_p2p_free_page_table(struct nvidia_p2p_page_table *page_table)
    return 0;
 }

-EXPORT_SYMBOL(nvidia_p2p_free_page_table);
+NV_EXPORT_SYMBOL(nvidia_p2p_free_page_table);

 int nvidia_p2p_put_pages(
    uint64_t p2p_token,
@ -645,7 +650,7 @@ int nvidia_p2p_put_pages(

    return nvidia_p2p_map_status(status);
 }
-EXPORT_SYMBOL(nvidia_p2p_put_pages);
+NV_EXPORT_SYMBOL(nvidia_p2p_put_pages);

 int nvidia_p2p_put_pages_persistent(
    uint64_t virtual_address,
@ -685,7 +690,7 @@ int nvidia_p2p_put_pages_persistent(

    return nvidia_p2p_map_status(status);
 }
-EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);
+NV_EXPORT_SYMBOL(nvidia_p2p_put_pages_persistent);

 int nvidia_p2p_dma_map_pages(
    struct pci_dev *peer,
@ -800,7 +805,7 @@ failed:
    return nvidia_p2p_map_status(status);
 }

-EXPORT_SYMBOL(nvidia_p2p_dma_map_pages);
+NV_EXPORT_SYMBOL(nvidia_p2p_dma_map_pages);

 int nvidia_p2p_dma_unmap_pages(
    struct pci_dev *peer,
@ -840,7 +845,7 @@ int nvidia_p2p_dma_unmap_pages(
    return 0;
 }

-EXPORT_SYMBOL(nvidia_p2p_dma_unmap_pages);
+NV_EXPORT_SYMBOL(nvidia_p2p_dma_unmap_pages);

 /*
 * This function is a no-op, but is left in place (for now), in order to allow
@ -855,7 +860,7 @@ int nvidia_p2p_free_dma_mapping(
    return 0;
 }

-EXPORT_SYMBOL(nvidia_p2p_free_dma_mapping);
+NV_EXPORT_SYMBOL(nvidia_p2p_free_dma_mapping);

 int nvidia_p2p_register_rsync_driver(
    nvidia_p2p_rsync_driver_t *driver,
@ -884,7 +889,7 @@ int nvidia_p2p_register_rsync_driver(
                                    driver->wait_for_rsync, data);
 }

-EXPORT_SYMBOL(nvidia_p2p_register_rsync_driver);
+NV_EXPORT_SYMBOL(nvidia_p2p_register_rsync_driver);

 void nvidia_p2p_unregister_rsync_driver(
    nvidia_p2p_rsync_driver_t *driver,
@ -916,7 +921,7 @@ void nvidia_p2p_unregister_rsync_driver(
                               driver->wait_for_rsync, data);
 }

-EXPORT_SYMBOL(nvidia_p2p_unregister_rsync_driver);
+NV_EXPORT_SYMBOL(nvidia_p2p_unregister_rsync_driver);

 int nvidia_p2p_get_rsync_registers(
    nvidia_p2p_rsync_reg_info_t **reg_info
@ -1009,7 +1014,7 @@ int nvidia_p2p_get_rsync_registers(
    return 0;
 }

-EXPORT_SYMBOL(nvidia_p2p_get_rsync_registers);
+NV_EXPORT_SYMBOL(nvidia_p2p_get_rsync_registers);

 void nvidia_p2p_put_rsync_registers(
    nvidia_p2p_rsync_reg_info_t *reg_info
@ -1041,4 +1046,4 @@ void nvidia_p2p_put_rsync_registers(
    os_free_mem(reg_info);
 }

-EXPORT_SYMBOL(nvidia_p2p_put_rsync_registers);
+NV_EXPORT_SYMBOL(nvidia_p2p_put_rsync_registers);
--- a/kernel-open/nvidia/nv.c
+++ b/kernel-open/nvidia/nv.c
@ -1224,12 +1224,11 @@ static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
            rm_read_registry_dword(sp, nv, NV_REG_ENABLE_MSI, &msi_config);
            if (msi_config == 1)
            {
-                if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSIX))
+                if (nvl->pci_dev->msix_cap && rm_is_msix_allowed(sp, nv))
                {
                    nv_init_msix(nv);
                }
-                if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSI) &&
-                    !(nv->flags & NV_FLAG_USES_MSIX))
+                if (nvl->pci_dev->msi_cap && !(nv->flags & NV_FLAG_USES_MSIX))
                {
                    nv_init_msi(nv);
                }
--- a/kernel-open/nvidia/nvidia.Kbuild
+++ b/kernel-open/nvidia/nvidia.Kbuild
@ -195,6 +195,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += devm_clk_bulk_get_all
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_task_ioprio
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mdev_set_iommu_device
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += offline_and_remove_memory
+NV_CONFTEST_FUNCTION_COMPILE_TESTS += crypto_tfm_ctx_aligned

 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_of_node_to_nid
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_sme_active
@ -215,6 +216,7 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_get_dram_num_channe
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_dram_types
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_pxm_to_node
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_screen_info
+NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_gpl_screen_info
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_i2c_bus_status
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_fuse_control_read
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_get_platform
--- a/kernel-open/nvidia/nvlink_export.h
+++ b/kernel-open/nvidia/nvlink_export.h
@ -46,6 +46,11 @@ NvlStatus nvlink_lib_unload(void);
 */
 NvlStatus nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params *ctrl_params);

+/*
+* Gets number of devices with type deviceType
+*/
+NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
+
 #ifdef __cplusplus
 }
 #endif
--- a/kernel-open/nvidia/os-interface.c
+++ b/kernel-open/nvidia/os-interface.c
@ -28,6 +28,11 @@

 #include "nv-time.h"

+#include <linux/mmzone.h>
+#include <linux/numa.h>
+
+#include <linux/pid.h>
+
 extern char *NVreg_TemporaryFilePath;

 #define MAX_ERROR_STRING 512
@ -1242,9 +1247,12 @@ void NV_API_CALL os_get_screen_info(
     * SYSFB_SIMPLEFB registers a dummy framebuffer which does not contain the
     * information required by os_get_screen_info(), therefore you need to
     * fall back onto the screen_info structure.
+     *
+     * After commit b8466fe82b79 ("efi: move screen_info into efi init code")
+     * in v6.7, 'screen_info' is exported as a GPL-licensed symbol for ARM64.
     */

-#if NV_IS_EXPORT_SYMBOL_PRESENT_screen_info
+#if NV_CHECK_EXPORT_SYMBOL(screen_info)
    /*
     * If there is not a framebuffer console, return 0 size.
     *
@ -2122,6 +2130,43 @@ void NV_API_CALL os_nv_cap_close_fd
    nv_cap_close_fd(fd);
 }

+/*
+ * Reads the total memory and free memory of a NUMA node from the kernel.
+ */
+NV_STATUS NV_API_CALL os_get_numa_node_memory_usage
+(
+    NvS32 node_id,
+    NvU64 *free_memory_bytes,
+    NvU64 *total_memory_bytes
+)
+{
+    struct pglist_data *pgdat;
+    struct zone *zone;
+    NvU32 zone_id;
+
+    if (node_id >= MAX_NUMNODES)
+    {
+        nv_printf(NV_DBG_ERRORS, "Invalid NUMA node ID\n");
+        return NV_ERR_INVALID_ARGUMENT;
+    }
+
+    pgdat = NODE_DATA(node_id);
+
+    *free_memory_bytes = 0;
+    *total_memory_bytes = 0;
+
+    for (zone_id = 0; zone_id < MAX_NR_ZONES; zone_id++)
+    {
+        zone = &(pgdat->node_zones[zone_id]);
+        if (!populated_zone(zone))
+            continue;
+        *free_memory_bytes += (zone_page_state_snapshot(zone, NR_FREE_PAGES) * PAGE_SIZE);
+        *total_memory_bytes += (zone->present_pages * PAGE_SIZE);
+    }
+
+    return NV_OK;
+}
+
 typedef struct os_numa_gpu_mem_hotplug_notifier_s
 {
    NvU64 start_pa;
@ -2373,3 +2418,28 @@ NV_STATUS NV_API_CALL os_offline_page_at_address
 #endif
 }

+void* NV_API_CALL os_get_pid_info(void)
+{
+    return get_task_pid(current, PIDTYPE_PID);
+}
+
+void NV_API_CALL os_put_pid_info(void *pid_info)
+{
+    if (pid_info != NULL)
+        put_pid(pid_info);
+}
+
+NV_STATUS NV_API_CALL os_find_ns_pid(void *pid_info, NvU32 *ns_pid)
+{
+    if ((pid_info == NULL) || (ns_pid == NULL))
+        return NV_ERR_INVALID_ARGUMENT;
+
+    *ns_pid = pid_vnr((struct pid *)pid_info);
+
+    // The call returns 0 if the PID is not found in the current ns
+    if (*ns_pid == 0)
+        return NV_ERR_OBJECT_NOT_FOUND;
+
+    return NV_OK;
+}
+
--- a/src/common/displayport/src/dp_connectorimpl.cpp
+++ b/src/common/displayport/src/dp_connectorimpl.cpp
@ -1360,7 +1360,7 @@ bool ConnectorImpl::compoundQueryAttach(Group * target,
                        if (dev->pconCaps.maxHdmiLinkBandwidthGbps != 0)
                        {
                            NvU64 requiredBW = (NvU64)(modesetParams.modesetInfo.pixelClockHz * modesetParams.modesetInfo.depth);
-                            NvU64 availableBw = (NvU64)(dev->pconCaps.maxHdmiLinkBandwidthGbps * 1000000000);
+                            NvU64 availableBw = (NvU64)(dev->pconCaps.maxHdmiLinkBandwidthGbps * (NvU64)1000000000);
                            if (requiredBW > availableBw)
                            {
                                compoundQueryResult = false;
@ -1375,10 +1375,10 @@ bool ConnectorImpl::compoundQueryAttach(Group * target,
                        else if (dev->pconCaps.maxTmdsClkRate != 0)
                        {
                            NvU64 maxTmdsClkRateU64 = (NvU64)(dev->pconCaps.maxTmdsClkRate);
-                            NvU64 requireBw =  (NvU64)(modesetParams.modesetInfo.pixelClockHz * modesetParams.modesetInfo.depth);
+                            NvU64 requiredBw =  (NvU64)(modesetParams.modesetInfo.pixelClockHz * modesetParams.modesetInfo.depth);
                            if (modesetParams.colorFormat == dpColorFormat_YCbCr420)
                            {
-                                if (maxTmdsClkRateU64 < ((requireBw/24)/2))
+                                if (maxTmdsClkRateU64 < ((requiredBw/24)/2))
                                {
                                    compoundQueryResult = false;
                                    return false;
@ -1386,7 +1386,7 @@ bool ConnectorImpl::compoundQueryAttach(Group * target,
                            }
                            else
                            {
-                                if (maxTmdsClkRateU64 < (requireBw/24))
+                                if (maxTmdsClkRateU64 < (requiredBw/24))
                                {
                                    compoundQueryResult = false;
                                    return false;
@ -4740,7 +4740,7 @@ bool ConnectorImpl::train(const LinkConfiguration & lConfig, bool force,
 {
    LinkTrainingType preferredTrainingType = trainType;
    bool result;
-    bool bEnableFecOnSor;
+
    //
    //  Validate link config against caps
    //
@ -4832,16 +4832,7 @@ bool ConnectorImpl::train(const LinkConfiguration & lConfig, bool force,
        result = postLTAdjustment(activeLinkConfig, force);
    }

-    bEnableFecOnSor = lConfig.bEnableFEC;
-
-    if (main->isEDP())
-    {
-        DeviceImpl * nativeDev = findDeviceInList(Address());
-        if (nativeDev && nativeDev->bIsPreviouslyFakedMuxDevice)
-            bEnableFecOnSor = activeLinkConfig.bEnableFEC;
-    }
-
-    if((lConfig.lanes != 0) && result && bEnableFecOnSor)
+    if((lConfig.lanes != 0) && result && activeLinkConfig.bEnableFEC)
    {
        //
        // Extended latency from link-train end to FEC enable pattern
@ -6057,7 +6048,7 @@ void ConnectorImpl::notifyLongPulseInternal(bool statusConnected)
                if (this->bReassessMaxLink)
                {
                    //
-                    // If the highest assessed LC is not equal to 
+                    // If the highest assessed LC is not equal to
                    // max possible link config, re-assess link
                    //
                    NvU8 retries = 0U;
--- a/src/common/inc/nvBldVer.h
+++ b/src/common/inc/nvBldVer.h
@ -43,18 +43,18 @@
 #endif

 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS)
-#define NV_BUILD_BRANCH_VERSION         "rel/gpu_drv/r535/VK535_87-147"
-#define NV_BUILD_CHANGELIST_NUM         (33800935)
+#define NV_BUILD_BRANCH_VERSION         "rel/gpu_drv/r535/VK535_87-148"
+#define NV_BUILD_CHANGELIST_NUM         (33833102)
 #define NV_BUILD_TYPE                   "Official"
-#define NV_BUILD_NAME                   "rel/gpu_drv/r535/VK535_87-147"
-#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33800935)
+#define NV_BUILD_NAME                   "rel/gpu_drv/r535/VK535_87-148"
+#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33833102)

 #else     /* Windows builds */
-#define NV_BUILD_BRANCH_VERSION         "VK535_87-24"
-#define NV_BUILD_CHANGELIST_NUM         (33800935)
+#define NV_BUILD_BRANCH_VERSION         "VK535_87-25"
+#define NV_BUILD_CHANGELIST_NUM         (33833102)
 #define NV_BUILD_TYPE                   "Official"
-#define NV_BUILD_NAME                   "538.31"
-#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33800935)
+#define NV_BUILD_NAME                   "538.35"
+#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33833102)
 #define NV_BUILD_BRANCH_BASE_VERSION    R535
 #endif
 // End buildmeister python edited section
--- a/src/common/inc/nvUnixVersion.h
+++ b/src/common/inc/nvUnixVersion.h
@ -4,7 +4,7 @@
 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \
    (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1)

-#define NV_VERSION_STRING               "535.43.23"
+#define NV_VERSION_STRING               "535.43.24"

 #else

--- a/src/common/inc/nvVer.h
+++ b/src/common/inc/nvVer.h
@ -3,7 +3,7 @@
 #define NV_COMPANY_NAME_STRING_SHORT    "NVIDIA"
 #define NV_COMPANY_NAME_STRING_FULL     "NVIDIA Corporation"
 #define NV_COMPANY_NAME_STRING          NV_COMPANY_NAME_STRING_FULL
-#define NV_COPYRIGHT_YEAR               "2023"
+#define NV_COPYRIGHT_YEAR               "2024"
 #define NV_COPYRIGHT                    "(C) " NV_COPYRIGHT_YEAR " NVIDIA Corporation. All rights reserved."  // Please do not use the non-ascii copyright symbol for (C).

 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \
--- a/src/common/inc/nveGPUConfig.h
+++ b/src/common/inc/nveGPUConfig.h
@ -39,48 +39,63 @@ extern "C" {
 #endif  //NV_UNIX
 #endif  //!__cplusplus

-// Surprise removal capable TB3 and TB2 BUS Device ID
-#define BUS_DEVICE_ID_TB3_ALPINE_RIDGE_01          0x1578
-#define BUS_DEVICE_ID_TB3_02                       0x1576
-#define BUS_DEVICE_ID_TB3_03                       0x15C0
-#define BUS_DEVICE_ID_TB3_04                       0x15D3
-#define BUS_DEVICE_ID_TB3_05                       0x15DA
-#define BUS_DEVICE_ID_TB3_06                       0x15EA
-#define BUS_DEVICE_ID_TB3_07                       0x15E7
-#define BUS_DEVICE_ID_TB3_08                       0x15EF
-#define BUS_DEVICE_ID_TB3_09                       0x1133
-#define BUS_DEVICE_ID_TB3_10                       0x1136
+#define PARENT_EGPU_BUS_DEVICE_43           0x57A4
+#define PARENT_EGPU_BUS_DEVICE_42           0x5786
+#define PARENT_EGPU_BUS_DEVICE_41           0x1578
+#define PARENT_EGPU_BUS_DEVICE_40           0x1576
+#define PARENT_EGPU_BUS_DEVICE_39           0x15C0
+#define PARENT_EGPU_BUS_DEVICE_38           0x15D3
+#define PARENT_EGPU_BUS_DEVICE_37           0x15DA
+#define PARENT_EGPU_BUS_DEVICE_36           0x15EA
+#define PARENT_EGPU_BUS_DEVICE_35           0x15E7
+#define PARENT_EGPU_BUS_DEVICE_34           0x15EF
+#define PARENT_EGPU_BUS_DEVICE_33           0x1133
+#define PARENT_EGPU_BUS_DEVICE_32           0x1136

-// IceLake-U TB3 device ids. Below TB3 would be integrated to CPU.
-#define BUS_DEVICE_ID_ICELAKE_TB3_01               0x8A1D
-#define BUS_DEVICE_ID_ICELAKE_TB3_02               0x8A1F
-#define BUS_DEVICE_ID_ICELAKE_TB3_03               0x8A21
-#define BUS_DEVICE_ID_ICELAKE_TB3_04               0x8A23
-#define BUS_DEVICE_ID_ICELAKE_TB3_05               0x8A0D
-#define BUS_DEVICE_ID_ICELAKE_TB3_06               0x8A17
+#define PARENT_EGPU_BUS_DEVICE_31           0x8A1D
+#define PARENT_EGPU_BUS_DEVICE_30           0x8A1F
+#define PARENT_EGPU_BUS_DEVICE_29           0x8A21
+#define PARENT_EGPU_BUS_DEVICE_28           0x8A23
+#define PARENT_EGPU_BUS_DEVICE_27           0x8A0D
+#define PARENT_EGPU_BUS_DEVICE_26           0x8A17

-// TigerLake Thunderbolt device ids.
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_01             0x9A1B
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_02             0x9A1D
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_03             0x9A1F
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_04             0x9A21
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_05             0x9A23
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_06             0x9A25
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_07             0x9A27
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_08             0x9A29
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_09             0x9A2B
-#define BUS_DEVICE_ID_TIGERLAKE_TB3_10             0x9A2D

-//#define BUS_DEVICE_ID_TB2_FALCON_RIDGE_DSL5520_01  0X156C   // obsolete
-#define BUS_DEVICE_ID_TB2_FALCON_RIDGE_DSL5520_02  0X156D
-#define BUS_DEVICE_ID_TB2_03                       0x157E
-#define BUS_DEVICE_ID_TB2_04                       0x156B
-#define BUS_DEVICE_ID_TB2_05                       0x1567
-#define BUS_DEVICE_ID_TB2_06                       0x1569
-//#define BUS_DEVICE_ID_TB2_07                       0x1548   // obsolete
-#define BUS_DEVICE_ID_TB2_08                       0x151B
-#define BUS_DEVICE_ID_TB2_09                       0x1549
-#define BUS_DEVICE_ID_TB2_10                       0x1513
+#define PARENT_EGPU_BUS_DEVICE_25           0x9A1B
+#define PARENT_EGPU_BUS_DEVICE_24           0x9A1D
+#define PARENT_EGPU_BUS_DEVICE_23           0x9A1F
+#define PARENT_EGPU_BUS_DEVICE_22           0x9A21
+#define PARENT_EGPU_BUS_DEVICE_21           0x9A23
+#define PARENT_EGPU_BUS_DEVICE_20           0x9A25
+#define PARENT_EGPU_BUS_DEVICE_19           0x9A27
+#define PARENT_EGPU_BUS_DEVICE_18           0x9A29
+#define PARENT_EGPU_BUS_DEVICE_17           0x9A2B
+#define PARENT_EGPU_BUS_DEVICE_16           0x9A2D
+
+#define PARENT_EGPU_BUS_DEVICE_15           0x7EB2
+#define PARENT_EGPU_BUS_DEVICE_14           0x7EC2
+#define PARENT_EGPU_BUS_DEVICE_13           0x7EC3
+#define PARENT_EGPU_BUS_DEVICE_12           0x7EB4
+#define PARENT_EGPU_BUS_DEVICE_11           0x7EC4
+#define PARENT_EGPU_BUS_DEVICE_10           0x7EB5
+#define PARENT_EGPU_BUS_DEVICE_09           0x7EC5
+#define PARENT_EGPU_BUS_DEVICE_08           0x7EC6
+#define PARENT_EGPU_BUS_DEVICE_07           0x7EC7
+
+#define PARENT_EGPU_BUS_DEVICE_06           0xA73E
+#define PARENT_EGPU_BUS_DEVICE_05           0xA76D
+#define PARENT_EGPU_BUS_DEVICE_04           0x466E
+#define PARENT_EGPU_BUS_DEVICE_03           0x463F
+#define PARENT_EGPU_BUS_DEVICE_02           0x462F
+#define PARENT_EGPU_BUS_DEVICE_01           0x461F
+
+#define PARENT_EGPU_BUS_DEVICE_02_08        0X156D
+#define PARENT_EGPU_BUS_DEVICE_02_07        0x157E
+#define PARENT_EGPU_BUS_DEVICE_02_06        0x156B
+#define PARENT_EGPU_BUS_DEVICE_02_05        0x1567
+#define PARENT_EGPU_BUS_DEVICE_02_04        0x1569
+#define PARENT_EGPU_BUS_DEVICE_02_03        0x151B
+#define PARENT_EGPU_BUS_DEVICE_02_02        0x1549
+#define PARENT_EGPU_BUS_DEVICE_02_01        0x1513

 //*****************************************************************************
 // Function:  isTB3DeviceID
@ -103,33 +118,51 @@ extern "C" {
 EGPU_INLINE NvBool isTB3DeviceID(NvU16 deviceID)
 {
    NvU32   index;
-    NvU16   tb3DeviceIDList[]={ BUS_DEVICE_ID_TB3_ALPINE_RIDGE_01,
-                                BUS_DEVICE_ID_TB3_02,
-                                BUS_DEVICE_ID_TB3_03,
-                                BUS_DEVICE_ID_TB3_04,
-                                BUS_DEVICE_ID_TB3_05,
-                                BUS_DEVICE_ID_TB3_06,
-                                BUS_DEVICE_ID_TB3_07,
-                                BUS_DEVICE_ID_TB3_08,
-                                BUS_DEVICE_ID_TB3_09,
-                                BUS_DEVICE_ID_TB3_10,
-                                BUS_DEVICE_ID_ICELAKE_TB3_01,
-                                BUS_DEVICE_ID_ICELAKE_TB3_02,
-                                BUS_DEVICE_ID_ICELAKE_TB3_03,
-                                BUS_DEVICE_ID_ICELAKE_TB3_04,
-                                BUS_DEVICE_ID_ICELAKE_TB3_05,
-                                BUS_DEVICE_ID_ICELAKE_TB3_06,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_01,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_02,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_03,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_04,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_05,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_06,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_07,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_08,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_09,
-                                BUS_DEVICE_ID_TIGERLAKE_TB3_10
-                              };
+    NvU16   tb3DeviceIDList[]={ PARENT_EGPU_BUS_DEVICE_01,
+                                PARENT_EGPU_BUS_DEVICE_02,
+                                PARENT_EGPU_BUS_DEVICE_03,
+                                PARENT_EGPU_BUS_DEVICE_04,
+                                PARENT_EGPU_BUS_DEVICE_05,
+                                PARENT_EGPU_BUS_DEVICE_06,
+                                PARENT_EGPU_BUS_DEVICE_07,
+                                PARENT_EGPU_BUS_DEVICE_08,
+                                PARENT_EGPU_BUS_DEVICE_09,
+                                PARENT_EGPU_BUS_DEVICE_10,
+                                PARENT_EGPU_BUS_DEVICE_11,
+								PARENT_EGPU_BUS_DEVICE_12,
+                                PARENT_EGPU_BUS_DEVICE_13,
+								PARENT_EGPU_BUS_DEVICE_14,
+                                PARENT_EGPU_BUS_DEVICE_15,
+                                PARENT_EGPU_BUS_DEVICE_16,
+                                PARENT_EGPU_BUS_DEVICE_17,
+                                PARENT_EGPU_BUS_DEVICE_18,
+                                PARENT_EGPU_BUS_DEVICE_19,
+                                PARENT_EGPU_BUS_DEVICE_20,
+                                PARENT_EGPU_BUS_DEVICE_21,
+                                PARENT_EGPU_BUS_DEVICE_22,
+                                PARENT_EGPU_BUS_DEVICE_23,
+                                PARENT_EGPU_BUS_DEVICE_24,
+                                PARENT_EGPU_BUS_DEVICE_25,
+                                PARENT_EGPU_BUS_DEVICE_26,
+                                PARENT_EGPU_BUS_DEVICE_27,
+                                PARENT_EGPU_BUS_DEVICE_28,
+                                PARENT_EGPU_BUS_DEVICE_29,
+                                PARENT_EGPU_BUS_DEVICE_30,
+                                PARENT_EGPU_BUS_DEVICE_31,
+                                PARENT_EGPU_BUS_DEVICE_32,
+                                PARENT_EGPU_BUS_DEVICE_33,
+                                PARENT_EGPU_BUS_DEVICE_34,
+                                PARENT_EGPU_BUS_DEVICE_35,
+                                PARENT_EGPU_BUS_DEVICE_36,
+                                PARENT_EGPU_BUS_DEVICE_37,
+                                PARENT_EGPU_BUS_DEVICE_38,
+                                PARENT_EGPU_BUS_DEVICE_39,
+                                PARENT_EGPU_BUS_DEVICE_40,
+                                PARENT_EGPU_BUS_DEVICE_41,
+                                PARENT_EGPU_BUS_DEVICE_42,
+                                PARENT_EGPU_BUS_DEVICE_43
+							};
+
    for (index = 0; index < (sizeof(tb3DeviceIDList)/sizeof(NvU16)); index++)
    {
        if (deviceID == tb3DeviceIDList[index])
@ -161,11 +194,14 @@ EGPU_INLINE NvBool isTB3DeviceID(NvU16 deviceID)
 EGPU_INLINE NvBool isTB2DeviceID(NvU16 deviceID)
 {
    NvU32   index;
-    NvU16   tb2DeviceIDList[]={ BUS_DEVICE_ID_TB2_FALCON_RIDGE_DSL5520_02,
-                                BUS_DEVICE_ID_TB2_03, BUS_DEVICE_ID_TB2_04,
-                                BUS_DEVICE_ID_TB2_05, BUS_DEVICE_ID_TB2_06,
-                                BUS_DEVICE_ID_TB2_08, BUS_DEVICE_ID_TB2_09,
-                                BUS_DEVICE_ID_TB2_10
+    NvU16   tb2DeviceIDList[]={ PARENT_EGPU_BUS_DEVICE_02_01,
+                                PARENT_EGPU_BUS_DEVICE_02_02,
+                                PARENT_EGPU_BUS_DEVICE_02_03,
+                                PARENT_EGPU_BUS_DEVICE_02_04,
+                                PARENT_EGPU_BUS_DEVICE_02_05,
+                                PARENT_EGPU_BUS_DEVICE_02_06,
+                                PARENT_EGPU_BUS_DEVICE_02_07,
+                                PARENT_EGPU_BUS_DEVICE_02_08
                              };
    for (index = 0; index < (sizeof(tb2DeviceIDList)/sizeof(NvU16)); index++)
    {
--- a/src/common/inc/swref/published/ampere/ga100/hwproject.h
+++ b/src/common/inc/swref/published/ampere/ga100/hwproject.h
@ -0,0 +1,28 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __ga100_hwproject_h__
+#define __ga100_hwproject_h__
+
+#define NV_SCAL_LITTER_NUM_FBPAS                       24
+
+#endif // __ga100_hwproject_h__
--- a/src/common/inc/swref/published/hopper/gh100/dev_fb.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_fb.h
@ -20,7 +20,7 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
- 
+
 #ifndef __gh100_dev_fb_h_
 #define __gh100_dev_fb_h_
 #define NV_PFB_NISO_FLUSH_SYSMEM_ADDR_SHIFT                       8 /*       */
@ -29,4 +29,5 @@
 #define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI           0x00100A38 /* RW-4R */
 #define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI_ADR             31:0 /* RWIVF */
 #define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI_ADR_MASK  0x000FFFFF /* ----V */
+
 #endif // __gh100_dev_fb_h_
--- a/src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __gh100_dev_fbpa_h_
+#define __gh100_dev_fbpa_h_
+
+#define NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1               4 /*       */
+#define NV_PFB_FBPA_0_ECC_DED_COUNT(i)                   (0x009025A0+(i)*4) /* RW-4A */
+#endif // __gh100_dev_fbpa_h_
--- a/src/common/inc/swref/published/hopper/gh100/dev_gsp.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_gsp.h
@ -31,4 +31,22 @@
 #define NV_PGSP_FALCON_ENGINE_RESET_STATUS_ASSERTED                                                      0x00000000     /* R-E-V */
 #define NV_PGSP_FALCON_ENGINE_RESET_STATUS_DEASSERTED                                                    0x00000002     /* R---V */
 #define NV_PGSP_MAILBOX(i)                                                                               (0x110804+(i)*4) /* RW-4A */
+#define NV_PGSP_EMEMC(i)                                                                                 (0x110ac0+(i)*8) /* RW-4A */
+#define NV_PGSP_EMEMC__SIZE_1                                                                            8              /*       */
+#define NV_PGSP_EMEMC_OFFS                                                                               7:2            /* RWIVF */
+#define NV_PGSP_EMEMC_OFFS_INIT                                                                          0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_BLK                                                                                15:8           /* RWIVF */
+#define NV_PGSP_EMEMC_BLK_INIT                                                                           0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_AINCW                                                                              24:24          /* RWIVF */
+#define NV_PGSP_EMEMC_AINCW_INIT                                                                         0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_AINCW_TRUE                                                                         0x00000001     /* RW--V */
+#define NV_PGSP_EMEMC_AINCW_FALSE                                                                        0x00000000     /* RW--V */
+#define NV_PGSP_EMEMC_AINCR                                                                              25:25          /* RWIVF */
+#define NV_PGSP_EMEMC_AINCR_INIT                                                                         0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_AINCR_TRUE                                                                         0x00000001     /* RW--V */
+#define NV_PGSP_EMEMC_AINCR_FALSE                                                                        0x00000000     /* RW--V */
+#define NV_PGSP_EMEMD(i)                                                                                 (0x110ac4+(i)*8) /* RW-4A */
+#define NV_PGSP_EMEMD__SIZE_1                                                                            8              /*       */
+#define NV_PGSP_EMEMD_DATA                                                                               31:0           /* RWXVF */
+
 #endif // __gh100_dev_gsp_h__
--- a/src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h
@ -0,0 +1,52 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __gh100_dev_nv_xpl_h_
+#define __gh100_dev_nv_xpl_h_
+#define NV_XPL_DL_ERR_COUNT_RBUF                                               0x00000a54 /* R--4R */
+#define NV_XPL_DL_ERR_COUNT_RBUF__PRIV_LEVEL_MASK                              0x00000b08 /*       */
+#define NV_XPL_DL_ERR_COUNT_RBUF_CORR_ERR                                            15:0 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_RBUF_CORR_ERR_INIT                                     0x0000 /* R-E-V */
+#define NV_XPL_DL_ERR_COUNT_RBUF_UNCORR_ERR                                         31:16 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_RBUF_UNCORR_ERR_INIT                                   0x0000 /* R-E-V */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT                                            0x00000a58 /* R--4R */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT__PRIV_LEVEL_MASK                           0x00000b08 /*       */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_CORR_ERR                                         15:0 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_CORR_ERR_INIT                                  0x0000 /* R-E-V */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_UNCORR_ERR                                      31:16 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_UNCORR_ERR_INIT                                0x0000 /* R-E-V */
+
+#define NV_XPL_DL_ERR_RESET                                                    0x00000a5c /* RW-4R */
+#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT                                       0:0 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT_DONE                                  0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT_PENDING                               0x1 /* -W--T */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT                                    1:1 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT_DONE                               0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT_PENDING                            0x1 /* -W--T */
+#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT                                   16:16 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT_DONE                                0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT_PENDING                             0x1 /* -W--T */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT                                17:17 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT_DONE                             0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT_PENDING                          0x1 /* -W--T */
+#endif // __gh100_dev_nv_xpl_h_
--- a/src/common/inc/swref/published/hopper/gh100/dev_xtl_ep_pri.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_xtl_ep_pri.h
@ -24,4 +24,7 @@
 #ifndef __gh100_dev_xtl_ep_pri_h__
 #define __gh100_dev_xtl_ep_pri_h__
 #define NV_EP_PCFGM                                                              0x92FFF:0x92000        /* RW--D */
+
+#define NV_XTL_EP_PRI_DED_ERROR_STATUS                                           0x0000043C    /* RW-4R */
+#define NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS                                      0x000003C8    /* RW-4R */
 #endif // __gh100_dev_xtl_ep_pri_h__
--- a/src/common/inc/swref/published/hopper/gh100/hwproject.h
+++ b/src/common/inc/swref/published/hopper/gh100/hwproject.h
@ -21,3 +21,6 @@
 * DEALINGS IN THE SOFTWARE.
 */
 #define NV_CHIP_EXTENDED_SYSTEM_PHYSICAL_ADDRESS_BITS              52
+#define NV_XPL_BASE_ADDRESS                    540672
+#define NV_XTL_BASE_ADDRESS                    593920
+#define NV_FBPA_PRI_STRIDE                      16384
--- a/src/common/inc/swref/published/hopper/gh100/pri_nv_xal_ep.h
+++ b/src/common/inc/swref/published/hopper/gh100/pri_nv_xal_ep.h
@ -47,5 +47,17 @@
 #define NV_XAL_EP_INTR_0_PRI_RSP_TIMEOUT                                              3:3
 #define NV_XAL_EP_INTR_0_PRI_RSP_TIMEOUT_PENDING                                      0x1
 #define NV_XAL_EP_SCPM_PRI_DUMMY_DATA_PATTERN_INIT                             0xbadf0200
+
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT                            0x0010f364 /* RW-4R */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_TOTAL                            15:0 /* RWIUF */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT                     0x0000 /* RWI-V */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_UNIQUE                          31:16 /* RWIUF */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT                    0x0000 /* RWI-V */
+
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT                             0x0010f37c /* RW-4R */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_TOTAL                             15:0 /* RWIUF */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT                      0x0000 /* RWI-V */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_UNIQUE                           31:16 /* RWIUF */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT                     0x0000 /* RWI-V */
 #endif // __gh100_pri_nv_xal_ep_h__

--- a/src/common/inc/swref/published/nvswitch/ls10/dev_nvlipt_lnk_ip.h
+++ b/src/common/inc/swref/published/nvswitch/ls10/dev_nvlipt_lnk_ip.h
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2003-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2003-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -635,4 +635,7 @@
 #define NV_NVLIPT_LNK_CTRL_CAP_LOCAL_LINK_CHANNEL_ALI_SUPPORT 28:28           /* RWIVF */
 #define NV_NVLIPT_LNK_CTRL_CAP_LOCAL_LINK_CHANNEL_ALI_SUPPORT_SUPPORTED 0x00000001 /* RWI-V */
 #define NV_NVLIPT_LNK_CTRL_CAP_LOCAL_LINK_CHANNEL_ALI_SUPPORT_NOT_SUPPORTED 0x00000000 /* RW--V */
+#define NV_NVLIPT_LNK_SCRATCH_WARM                            0x000007c0      /* RW-4R */
+#define NV_NVLIPT_LNK_SCRATCH_WARM_DATA                       31:0            /* RWEVF */
+#define NV_NVLIPT_LNK_SCRATCH_WARM_DATA_INIT                  0xdeadbaad      /* RWE-V */
 #endif // __ls10_dev_nvlipt_lnk_ip_h__
--- a/src/common/inc/swref/published/nvswitch/ls10/ptop_discovery_ip.h
+++ b/src/common/inc/swref/published/nvswitch/ls10/ptop_discovery_ip.h
@ -0,0 +1,28 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2003-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the Software),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __ls10_ptop_discovery_ip_h__
+#define __ls10_ptop_discovery_ip_h__
+/* This file is autogenerated.  Do not edit */
+#define  NV_PTOP_UNICAST_SW_DEVICE_BASE_SAW_0                                0x00028000                    /*       */
+#endif // __ls10_ptop_discovery_ip_h__
--- a/src/common/inc/swref/published/turing/tu102/dev_fb.h
+++ b/src/common/inc/swref/published/turing/tu102/dev_fb.h
@ -38,4 +38,25 @@
 #define NV_PFB_PRI_MMU_WPR2_ADDR_HI_VAL                                              31:4 /* RWEVF */
 #define NV_PFB_PRI_MMU_WPR2_ADDR_HI_ALIGNMENT                                  0x0000000c /*       */

+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E78 /* RW-4R */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E78 /* RW-4R */ /* NOTE(review): exact duplicate of the previous line; redundant, can be dropped */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL               15:0 /* RWEVF */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT             0 /* RWE-V */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE             31:16 /* RWEVF */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT            0 /* RWE-V */
+
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E8C /* RW-4R */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E8C /* RW-4R */ /* NOTE(review): exact duplicate of the previous line; redundant, can be dropped */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL               15:0 /* RWEVF */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT             0 /* RWE-V */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE             31:16 /* RWEVF */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT            0 /* RWE-V */
+
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT               0x00100EA0 /* RW-4R */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT               0x00100EA0 /* RW-4R */ /* NOTE(review): exact duplicate of the previous line; redundant, can be dropped */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_TOTAL               15:0 /* RWEVF */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT             0 /* RWE-V */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_UNIQUE             31:16 /* RWEVF */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT            0 /* RWE-V */
+
 #endif // __tu102_dev_fb_h__
--- a/src/common/inc/swref/published/turing/tu102/dev_fbpa.h
+++ b/src/common/inc/swref/published/turing/tu102/dev_fbpa.h
@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __tu102_dev_fbpa_h_
+#define __tu102_dev_fbpa_h_
+
+#define NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1               2 /*       */
+#define NV_PFB_FBPA_0_ECC_DED_COUNT(i)                   (0x00900488+(i)*4) /* RW-4A */
+#endif // __tu102_dev_fbpa_h_
--- a/src/common/inc/swref/published/turing/tu102/dev_gc6_island.h
+++ b/src/common/inc/swref/published/turing/tu102/dev_gc6_island.h
@ -24,6 +24,7 @@
 #ifndef __tu102_dev_gc6_island_h__
 #define __tu102_dev_gc6_island_h__

+#define NV_PGC6                                                                          0x118fff:0x118000 /* RW--D */
 #define NV_PGC6_AON_SECURE_SCRATCH_GROUP_05_PRIV_LEVEL_MASK                                     0x00118128 /* RW-4R */
 #define NV_PGC6_AON_SECURE_SCRATCH_GROUP_05_PRIV_LEVEL_MASK_READ_PROTECTION                            3:0 /* RWIVF */
 #define NV_PGC6_AON_SECURE_SCRATCH_GROUP_05_PRIV_LEVEL_MASK_READ_PROTECTION_LEVEL0                     0:0 /*       */
--- a/src/common/inc/swref/published/turing/tu102/dev_gsp.h
+++ b/src/common/inc/swref/published/turing/tu102/dev_gsp.h
@ -38,5 +38,22 @@
 #define NV_PGSP_QUEUE_HEAD(i)                                                                            (0x110c00+(i)*8) /* RW-4A */
 #define NV_PGSP_QUEUE_HEAD__SIZE_1                                                                       8              /*       */
 #define NV_PGSP_QUEUE_HEAD_ADDRESS                                                                       31:0           /* RWIVF */
+#define NV_PGSP_EMEMC(i)                                                                                 (0x110ac0+(i)*8) /* RW-4A */
+#define NV_PGSP_EMEMC__SIZE_1                                                                            4              /*       */
+#define NV_PGSP_EMEMC_OFFS                                                                               7:2            /* RWIVF */
+#define NV_PGSP_EMEMC_OFFS_INIT                                                                          0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_BLK                                                                                15:8           /* RWIVF */
+#define NV_PGSP_EMEMC_BLK_INIT                                                                           0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_AINCW                                                                              24:24          /* RWIVF */
+#define NV_PGSP_EMEMC_AINCW_INIT                                                                         0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_AINCW_TRUE                                                                         0x00000001     /* RW--V */
+#define NV_PGSP_EMEMC_AINCW_FALSE                                                                        0x00000000     /* RW--V */
+#define NV_PGSP_EMEMC_AINCR                                                                              25:25          /* RWIVF */
+#define NV_PGSP_EMEMC_AINCR_INIT                                                                         0x00000000     /* RWI-V */
+#define NV_PGSP_EMEMC_AINCR_TRUE                                                                         0x00000001     /* RW--V */
+#define NV_PGSP_EMEMC_AINCR_FALSE                                                                        0x00000000     /* RW--V */
+#define NV_PGSP_EMEMD(i)                                                                                 (0x110ac4+(i)*8) /* RW-4A */
+#define NV_PGSP_EMEMD__SIZE_1                                                                            4              /*       */
+#define NV_PGSP_EMEMD_DATA                                                                               31:0           /* RW-VF */

 #endif // __tu102_dev_gsp_h__
--- a/src/common/inc/swref/published/turing/tu102/dev_ltc.h
+++ b/src/common/inc/swref/published/turing/tu102/dev_ltc.h
@ -0,0 +1,33 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __tu102_dev_ltc_h_
+#define __tu102_dev_ltc_h_
+
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT                  0x001404f8 /* RW-4R */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_TOTAL                  15:0 /* RWIVF */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT           0x0000 /* RWI-V */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_UNIQUE                31:16 /* RWIVF */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT          0x0000 /* RWI-V */
+
+#endif // __tu102_dev_ltc_h_
--- a/src/common/inc/swref/published/turing/tu102/dev_nv_xve.h
+++ b/src/common/inc/swref/published/turing/tu102/dev_nv_xve.h
@ -28,6 +28,10 @@
 #define NV_XVE_MSIX_CAP_HDR_ENABLE                                  31:31 /* RWIVF */
 #define NV_XVE_MSIX_CAP_HDR_ENABLE_ENABLED                     0x00000001 /* RW--V */
 #define NV_XVE_MSIX_CAP_HDR_ENABLE_DISABLED                    0x00000000 /* RWI-V */
+#define NV_XVE_PRIV_MISC_1                                     0x0000041C /* RW-4R */
+#define NV_XVE_PRIV_MISC_1_CYA_HIDE_MSIX_CAP                        29:29 /* RWCVF */
+#define NV_XVE_PRIV_MISC_1_CYA_HIDE_MSIX_CAP_TRUE              0x00000001 /* RW--V */
+#define NV_XVE_PRIV_MISC_1_CYA_HIDE_MSIX_CAP_FALSE             0x00000000 /* RWC-V */
 #define NV_XVE_SRIOV_CAP_HDR3                                  0x00000BD8 /* R--4R */
 #define NV_XVE_SRIOV_CAP_HDR3_TOTAL_VFS                             31:16 /* R-EVF */
 #define NV_XVE_SRIOV_CAP_HDR5                                  0x00000BE0 /* R--4R */
--- a/src/common/inc/swref/published/turing/tu102/hwproject.h
+++ b/src/common/inc/swref/published/turing/tu102/hwproject.h
@ -25,5 +25,9 @@
 #define __tu102_hwproject_h__

 #define NV_CHIP_EXTENDED_SYSTEM_PHYSICAL_ADDRESS_BITS              47
+#define NV_SCAL_LITTER_NUM_FBPAS                       16
+#define NV_FBPA_PRI_STRIDE                      16384
+#define NV_LTC_PRI_STRIDE                            8192
+#define NV_LTS_PRI_STRIDE                             512

 #endif // __tu102_hwproject_h__
--- a/src/common/nvlink/interface/nvlink.h
+++ b/src/common/nvlink/interface/nvlink.h
@ -439,6 +439,11 @@ NvlStatus nvlink_lib_register_link(nvlink_device *dev, nvlink_link *link);
 */
 NvlStatus nvlink_lib_unregister_link(nvlink_link *link);

+/*
+* Counts the registered devices whose type equals deviceType.
+*
+* deviceType - device type value matched against each registered device
+* numDevices - output; receives the number of matching devices
+*
+* Returns NVL_SUCCESS once the count has been stored (the count is 0 when
+* the library is not initialized), or the failing status when the top-level
+* lock cannot be acquired.
+*/
+NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
+

 /************************************************************************************************/
 /******************************* NVLink link management functions *******************************/
--- a/src/common/nvlink/interface/nvlink_export.h
+++ b/src/common/nvlink/interface/nvlink_export.h
@ -46,6 +46,11 @@ NvlStatus nvlink_lib_unload(void);
 */
 NvlStatus nvlink_lib_ioctl_ctrl(nvlink_ioctrl_params *ctrl_params);

+/*
+* Counts the registered devices whose type equals deviceType.
+*
+* deviceType - device type value matched against each registered device
+* numDevices - output; receives the number of matching devices
+*
+* Returns NVL_SUCCESS once the count has been stored (the count is 0 when
+* the library is not initialized), or the failing status when the top-level
+* lock cannot be acquired.
+*
+* NOTE(review): the same prototype is also declared in nvlink.h; confirm that
+* both declarations are intended and keep them in sync.
+*/
+NvlStatus nvlink_lib_return_device_count_by_type(NvU32 deviceType, NvU32 *numDevices);
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/common/nvlink/kernel/nvlink/nvlink_lib_mgmt.c
+++ b/src/common/nvlink/kernel/nvlink/nvlink_lib_mgmt.c
@ -198,3 +198,48 @@ nvlink_lib_is_registerd_device_with_reduced_config(void)

    return bIsReducedConfg;
 }
+
+/*
+* Get the number of registered devices that have the device type deviceType
+*
+* deviceType - value compared against the type field of each registered device
+* numDevices - output; set to the number of matching devices. It is written
+*              without a NULL check, so the caller must pass a valid pointer.
+*
+* Returns NVL_SUCCESS after storing the count. When the library is not
+* initialized the device list is not walked and the stored count is 0.
+* When the top-level lock cannot be acquired, that failing status is
+* returned and *numDevices is left untouched.
+*/
+NvlStatus
+nvlink_lib_return_device_count_by_type
+(
+    NvU32 deviceType,
+    NvU32 *numDevices
+)
+{
+    NvlStatus lock_status = NVL_SUCCESS;
+    nvlink_device *dev = NULL;
+    NvU32 device_count = 0;
+
+    if (nvlink_lib_is_initialized())
+    {
+        // Acquire the top-level lock before touching the device list
+        lock_status = nvlink_lib_top_lock_acquire();
+        if (lock_status != NVL_SUCCESS)
+        {
+            NVLINK_PRINT((DBG_MODULE_NVLINK_CORE, NVLINK_DBG_LEVEL_ERRORS,
+                "%s: Failed to acquire top-level lock\n",
+                __FUNCTION__));
+
+            return lock_status; // early exit: *numDevices is not written on this path
+         }
+
+        // Top-level lock is now held
+
+        // Walk the registered device list and count the entries of the requested type
+        FOR_EACH_DEVICE_REGISTERED(dev, nvlinkLibCtx.nv_devicelist_head, node)
+        {
+            if (dev->type == deviceType)
+            {
+                device_count++;
+            }
+        }
+
+        // Release top-level lock
+        nvlink_lib_top_lock_release(); 
+    }
+    *numDevices = device_count; // still 0 when the library is not initialized
+    return NVL_SUCCESS;
+}
--- a/src/common/nvswitch/common/inc/soe/soeififr.h
+++ b/src/common/nvswitch/common/inc/soe/soeififr.h
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -37,6 +37,17 @@ enum
    RM_SOE_IFR_BBX_SHUTDOWN,
    RM_SOE_IFR_BBX_SXID_ADD,
    RM_SOE_IFR_BBX_SXID_GET,
+    RM_SOE_IFR_BBX_DATA_GET,
+};
+
+enum // BBX data selectors, implicitly numbered from 0; presumably the dataType values of RM_SOE_IFR_CMD_BBX_GET_DATA_PARAMS -- TODO confirm
+{
+    RM_SOE_IFR_BBX_GET_NONE,
+    RM_SOE_IFR_BBX_GET_SXID,
+    RM_SOE_IFR_BBX_GET_SYS_INFO,
+    RM_SOE_IFR_BBX_GET_TIME_INFO,
+    RM_SOE_IFR_BBX_GET_TEMP_DATA,
+    RM_SOE_IFR_BBX_GET_TEMP_SAMPLES,
 };

 typedef struct
@ -75,6 +86,14 @@ typedef struct
    RM_FLCN_U64 dmaHandle;
 } RM_SOE_IFR_CMD_BBX_SXID_GET_PARAMS;

+typedef struct // Parameters carried by the bbxDataGet member of RM_SOE_IFR_CMD
+{
+    NvU8 cmdType;           // presumably RM_SOE_IFR_BBX_DATA_GET -- TODO confirm
+    NvU32 sizeInBytes;      // presumably the size of the buffer behind dmaHandle -- TODO confirm
+    RM_FLCN_U64 dmaHandle;  // presumably the DMA address of the buffer that receives the data -- TODO confirm
+    NvU8 dataType;          // presumably one of the RM_SOE_IFR_BBX_GET_* selectors -- TODO confirm
+} RM_SOE_IFR_CMD_BBX_GET_DATA_PARAMS;
+
 typedef union
 {
 	NvU8	cmdType;
@ -82,6 +101,7 @@ typedef union
    RM_SOE_IFR_CMD_BBX_INIT_PARAMS bbxInit;
    RM_SOE_IFR_CMD_BBX_SXID_ADD_PARAMS bbxSxidAdd;
    RM_SOE_IFR_CMD_BBX_SXID_GET_PARAMS bbxSxidGet;
+    RM_SOE_IFR_CMD_BBX_GET_DATA_PARAMS bbxDataGet;
 } RM_SOE_IFR_CMD;

 // entry of getSxid
@ -99,4 +119,81 @@ typedef struct
    RM_SOE_BBX_SXID_ENTRY sxidLast[INFOROM_BBX_OBJ_XID_ENTRIES];
 } RM_SOE_BBX_GET_SXID_DATA;

+// NVSwitch system version information returned by the GET_SYS_INFO command
+typedef struct
+{
+    NvU32 driverLo;             //Driver Version Low 32 bits
+    NvU16 driverHi;             //Driver Version High 16 bits
+    NvU32 vbiosVersion;         //VBIOS Version 
+    NvU8 vbiosVersionOem;       //VBIOS OEM Version byte
+    NvU8  osType;               //OS Type (UNIX/WIN/WIN2K/WIN9x/OTHER)
+    NvU32 osVersion;            //OS Version (Build|MINOR|MAJOR)
+} RM_SOE_BBX_GET_SYS_INFO_DATA;
+
+// NVSwitch time information returned by the GET_TIME_INFO command
+typedef struct
+{
+    NvU32 timeStart;            //Timestamp (EPOCH) when the driver was loaded on the GPU for the first time
+    NvU32 timeEnd;              //Timestamp (EPOCH) when the data was last flushed
+    NvU32 timeRun;              //Amount of time (in seconds) driver was loaded, and GPU has run
+    NvU32 time24Hours;          //Timestamp (EPOCH) of when the first 24 operational hours is hit
+    NvU32 time100Hours;         //Timestamp (EPOCH) of when the first 100 operational hours is hit
+} RM_SOE_BBX_GET_TIME_INFO_DATA;
+
+#define RM_SOE_BBX_TEMP_DAY_ENTRIES               5
+#define RM_SOE_BBX_TEMP_WEEK_ENTRIES              5
+#define RM_SOE_BBX_TEMP_MNT_ENTRIES               5
+#define RM_SOE_BBX_TEMP_ALL_ENTRIES               5
+#define RM_SOE_BBX_TEMP_SUM_HOUR_ENTRIES          23
+#define RM_SOE_BBX_TEMP_SUM_DAY_ENTRIES           5
+#define RM_SOE_BBX_TEMP_SUM_MNT_ENTRIES           3
+#define RM_SOE_BBX_TEMP_HISTOGRAM_THLD_ENTRIES    20
+#define RM_SOE_BBX_TEMP_HISTOGRAM_TIME_ENTRIES    21
+#define RM_SOE_BBX_TEMP_HOURLY_MAX_ENTRIES        168
+#define RM_SOE_BBX_TEMP_COMPRESS_BUFFER_ENTRIES   1096
+#define RM_SOE_BBX_NUM_COMPRESSION_PERIODS        8
+
+// NVSwitch Temperature Entry
+typedef struct
+{
+    NvU16 value;                //Temperature (SFXP 9.7 format in Celsius)
+    NvU32 timestamp;            //Timestamp (EPOCH) of when the entry is recorded
+} RM_SOE_BBX_TEMP_ENTRY;
+
+// NVSwitch Temperature Data returned by the GET_TEMP_DATA command
+typedef struct
+{
+    NvU32                 tempMaxDayIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMaxDay[RM_SOE_BBX_TEMP_DAY_ENTRIES];
+    NvU32                 tempMaxWeekIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMaxWeek[RM_SOE_BBX_TEMP_WEEK_ENTRIES];
+    NvU32                 tempMaxMntIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMaxMnt[RM_SOE_BBX_TEMP_MNT_ENTRIES];
+    NvU32                 tempMaxAllIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMaxAll[RM_SOE_BBX_TEMP_ALL_ENTRIES];
+    NvU32                 tempMinDayIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMinDay[RM_SOE_BBX_TEMP_DAY_ENTRIES];
+    NvU32                 tempMinWeekIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMinWeek[RM_SOE_BBX_TEMP_WEEK_ENTRIES];
+    NvU32                 tempMinMntIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMinMnt[RM_SOE_BBX_TEMP_MNT_ENTRIES];
+    NvU32                 tempMinAllIdx;
+    RM_SOE_BBX_TEMP_ENTRY tempMinAll[RM_SOE_BBX_TEMP_ALL_ENTRIES];
+    NvU32                 tempSumDelta;
+    NvU32                 tempSumHour[RM_SOE_BBX_TEMP_SUM_HOUR_ENTRIES];
+    NvU32                 tempSumDay[RM_SOE_BBX_TEMP_SUM_DAY_ENTRIES];
+    NvU32                 tempSumMnt[RM_SOE_BBX_TEMP_SUM_MNT_ENTRIES];
+    NvU32                 tempHistogramThld[RM_SOE_BBX_TEMP_HISTOGRAM_THLD_ENTRIES];
+    NvU32                 tempHistogramTime[RM_SOE_BBX_TEMP_HISTOGRAM_TIME_ENTRIES];
+    RM_SOE_BBX_TEMP_ENTRY tempHourlyMaxSample[RM_SOE_BBX_TEMP_HOURLY_MAX_ENTRIES];
+} RM_SOE_BBX_GET_TEMP_DATA;
+
+// NVSwitch Temperature Compressed Samples returned by the GET_TEMP_SAMPLES command
+typedef struct
+{
+    NvU32                 compressionPeriodIdx;
+    NvU32                 compressionPeriod[RM_SOE_BBX_NUM_COMPRESSION_PERIODS];
+    RM_SOE_BBX_TEMP_ENTRY tempCompressionBuffer[RM_SOE_BBX_TEMP_COMPRESS_BUFFER_ENTRIES];
+} RM_SOE_BBX_GET_TEMP_SAMPLES;
+
 #endif // _SOEIFIFR_H_
--- a/src/common/nvswitch/interface/ctrl_dev_nvswitch.h
+++ b/src/common/nvswitch/interface/ctrl_dev_nvswitch.h
@ -830,6 +830,7 @@ typedef enum nvswitch_err_type
    NVSWITCH_ERR_HW_HOST_THERMAL_SHUTDOWN                              = 10006,
    NVSWITCH_ERR_HW_HOST_IO_FAILURE                                    = 10007,
    NVSWITCH_ERR_HW_HOST_FIRMWARE_INITIALIZATION_FAILURE               = 10008,
+    NVSWITCH_ERR_HW_HOST_FIRMWARE_RECOVERY_MODE                        = 10009,
    NVSWITCH_ERR_HW_HOST_LAST,


@ -2973,6 +2974,197 @@ typedef struct
    NVSWITCH_SXID_ENTRY sxidLast[NVSWITCH_SXID_ENTRIES_NUM];
 } NVSWITCH_GET_SXIDS_PARAMS;

+/*
+ * CTRL_NVSWITCH_GET_SYS_INFO
+ *
+ * Control to get the NVSwitch system version information from inforom cache 
+ *
+ * Parameters:
+ *    driverLo [OUT]
+ *      The driver version low 32 bits. Example: driverLo = 54531 (Driver 545.31)
+ *    driverHi [OUT]
+ *      The driver version high 16 bits
+ *    vbiosVersion [OUT]
+ *      The vbios version number. Example: vbiosVersion=0x96104100 (release 96.10.41.00)
+ *    vbiosVersionOem [OUT]
+ *      The vbios OEM version byte.
+ *    osType [OUT]
+ *      The OS type. Example:  osType=0x05 (UNIX)
+ *    osVersion [OUT]
+ *      The OS version number. [BUILD[31:16]|MINOR[15:8]|MAJOR[7:0]]
+ */
+
+typedef struct
+{
+    NvU32 driverLo;
+    NvU16 driverHi;
+    NvU32 vbiosVersion;
+    NvU8  vbiosVersionOem;
+    NvU8  osType;
+    NvU32 osVersion;
+} NVSWITCH_GET_SYS_INFO_PARAMS;
+
+/*
+ * CTRL_NVSWITCH_GET_TIME_INFO
+ *
+ * Control to get the NVSwitch time information from inforom cache 
+ *
+ * Parameters:
+ *    timeStart [OUT]
+ *      The timestamp (EPOCH) when the driver was loaded onto the NVSwitch for the first time
+ *    timeEnd [OUT]
+ *      The timestamp (EPOCH) when the data was last flushed
+ *    timeRun [OUT]
+ *      The amount of time (in seconds) driver was loaded/running
+ *    time24Hours [OUT]
+ *      The timestamp (EPOCH) when the first 24 operational hours is hit
+ *    time100Hours [OUT]
+ *      The timestamp (EPOCH) when the first 100 operational hours is hit
+ */
+
+typedef struct
+{
+    NvU32 timeStart;
+    NvU32 timeEnd;
+    NvU32 timeRun;
+    NvU32 time24Hours;
+    NvU32 time100Hours;
+} NVSWITCH_GET_TIME_INFO_PARAMS;
+
+#define NVSWITCH_TEMP_DAY_ENTRIES               5
+#define NVSWITCH_TEMP_WEEK_ENTRIES              5
+#define NVSWITCH_TEMP_MNT_ENTRIES               5
+#define NVSWITCH_TEMP_ALL_ENTRIES               5
+#define NVSWITCH_TEMP_SUM_HOUR_ENTRIES          23
+#define NVSWITCH_TEMP_SUM_DAY_ENTRIES           5
+#define NVSWITCH_TEMP_SUM_MNT_ENTRIES           3
+#define NVSWITCH_TEMP_HISTOGRAM_THLD_ENTRIES    20
+#define NVSWITCH_TEMP_HISTOGRAM_TIME_ENTRIES    21
+#define NVSWITCH_TEMP_HOURLY_MAX_ENTRIES        168
+
+/*
+ * NVSWITCH_TEMP_ENTRY
+ *
+ * This structure represents the NVSwitch TEMP with its timestamp.
+ *
+ *   value
+ *     This parameter specifies the NVSwitch Temperature
+ *     (SFXP 9.7 format in Celsius).
+ *
+ *   timestamp
+ *     This parameter specifies the timestamp (EPOCH) of the entry.
+ */
+typedef struct
+{
+    NvU16 value;
+    NvU32 timestamp;
+} NVSWITCH_TEMP_ENTRY;
+
+/*
+ * CTRL_NVSWITCH_GET_TEMP_DATA
+ *
+ * Control to get the NVSwitch device historical temperature information from inforom cache 
+ *
+ * Parameters:
+ *    tempMaxDayIdx [OUT]
+ *      The current index to the maximum day temperature array
+ *    tempMaxDay[] [OUT]
+ *      The maximum temperature array for last NVSWITCH_TEMP_DAY_ENTRIES days
+ *    tempMaxWeekIdx [OUT]
+ *      The current index to the maximum week temperature array
+ *    tempMaxWeek[] [OUT]
+ *      The maximum temperature array for last NVSWITCH_TEMP_WEEK_ENTRIES weeks
+ *    tempMaxMntIdx [OUT]
+ *      The current index to the maximum month temperature array
+ *    tempMaxMnt[] [OUT]
+ *      The maximum temperature array for last NVSWITCH_TEMP_MNT_ENTRIES months
+ *    tempMaxAllIdx [OUT]
+ *      The current index to the maximum temperature array
+ *    tempMaxAll[] [OUT]
+ *      The maximum temperature array for the device 
+ *    tempMinDayIdx [OUT]
+ *      The current index to the minimum day temperature array
+ *    tempMinDay[] [OUT]
+ *      The minimum temperature array for last NVSWITCH_TEMP_DAY_ENTRIES days
+ *    tempMinWeekIdx [OUT]
+ *      The current index to the minimum week temperature array
+ *    tempMinWeek[] [OUT]
+ *      The minimum temperature array for last NVSWITCH_TEMP_WEEK_ENTRIES weeks
+ *    tempMinMntIdx [OUT]
+ *      The current index to the minimum month temperature array
+ *    tempMinMnt[] [OUT]
+ *      The minimum temperature array for last NVSWITCH_TEMP_MNT_ENTRIES months
+ *    tempMinAllIdx [OUT]
+ *      The current index to the minimum temperature array
+ *    tempMinAll[] [OUT]
+ *      The minimum temperature array for the device
+ *    tempSumDelta [OUT]
+ *      The total sum of temperature change in 0.1C granularity
+ *    tempSumHour[] [OUT]
+ *      The moving average of temperature per hour, for last NVSWITCH_TEMP_SUM_HOUR_ENTRIES hours
+ *    tempSumDay[] [OUT]
+ *      The moving average of temperature per day, for last NVSWITCH_TEMP_SUM_DAY_ENTRIES days
+ *    tempSumMnt[] [OUT]
+ *      The moving average of temperature per month, for last NVSWITCH_TEMP_SUM_MNT_ENTRIES months
+ *    tempHistogramThld[] [OUT]
+ *      The histogram of temperature crossing various thresholds (5/10/15/.../95/100)
+ *    tempHistogramTime[] [OUT]
+ *      The histogram of time spent in various temperature ranges (0..5/5..10/.../100..)
+ *    tempHourlyMaxSample[] [OUT]
+ *      The maximum hourly temperature array for the device
+ */
+
+typedef struct
+{
+    NvU32               tempMaxDayIdx;
+    NVSWITCH_TEMP_ENTRY tempMaxDay[NVSWITCH_TEMP_DAY_ENTRIES];
+    NvU32               tempMaxWeekIdx;
+    NVSWITCH_TEMP_ENTRY tempMaxWeek[NVSWITCH_TEMP_WEEK_ENTRIES];
+    NvU32               tempMaxMntIdx;
+    NVSWITCH_TEMP_ENTRY tempMaxMnt[NVSWITCH_TEMP_MNT_ENTRIES];
+    NvU32               tempMaxAllIdx;
+    NVSWITCH_TEMP_ENTRY tempMaxAll[NVSWITCH_TEMP_ALL_ENTRIES];
+    NvU32               tempMinDayIdx;
+    NVSWITCH_TEMP_ENTRY tempMinDay[NVSWITCH_TEMP_DAY_ENTRIES];
+    NvU32               tempMinWeekIdx;
+    NVSWITCH_TEMP_ENTRY tempMinWeek[NVSWITCH_TEMP_WEEK_ENTRIES];
+    NvU32               tempMinMntIdx;
+    NVSWITCH_TEMP_ENTRY tempMinMnt[NVSWITCH_TEMP_MNT_ENTRIES];
+    NvU32               tempMinAllIdx;
+    NVSWITCH_TEMP_ENTRY tempMinAll[NVSWITCH_TEMP_ALL_ENTRIES];
+    NvU32               tempSumDelta;
+    NvU32               tempSumHour[NVSWITCH_TEMP_SUM_HOUR_ENTRIES];
+    NvU32               tempSumDay[NVSWITCH_TEMP_SUM_DAY_ENTRIES];
+    NvU32               tempSumMnt[NVSWITCH_TEMP_SUM_MNT_ENTRIES];
+    NvU32               tempHistogramThld[NVSWITCH_TEMP_HISTOGRAM_THLD_ENTRIES];
+    NvU32               tempHistogramTime[NVSWITCH_TEMP_HISTOGRAM_TIME_ENTRIES];
+    NVSWITCH_TEMP_ENTRY tempHourlyMaxSample[NVSWITCH_TEMP_HOURLY_MAX_ENTRIES];
+} NVSWITCH_GET_TEMP_DATA_PARAMS;
+
+#define NVSWITCH_TEMP_COMPRESS_BUFFER_ENTRIES   1096
+#define NVSWITCH_NUM_COMPRESSION_PERIODS        8
+
+/*
+ * CTRL_NVSWITCH_GET_TEMP_SAMPLES
+ *
+ * Control to get the NVSwitch device compressed temperature samples from inforom cache
+ *
+ * Parameters:
+ *    compressionPeriodIdx [OUT]
+ *      The current index to the sample period array
+ *    compressionPeriod[] [OUT]
+ *      The samples period array (seconds)
+ *    tempCompressionBuffer[] [OUT]
+ *      The temperature array sampling at a specific period in compressionPeriod[]
+ */
+
+typedef struct
+{
+    NvU32               compressionPeriodIdx;
+    NvU32               compressionPeriod[NVSWITCH_NUM_COMPRESSION_PERIODS];
+    NVSWITCH_TEMP_ENTRY tempCompressionBuffer[NVSWITCH_TEMP_COMPRESS_BUFFER_ENTRIES];
+} NVSWITCH_GET_TEMP_SAMPLES_PARAMS;
+
 /*
 * CTRL_NVSWITCH_GET_FOM_VALUES
 *   This command gives the FOM values to MODS
@ -3848,6 +4040,10 @@ typedef struct
 #define CTRL_NVSWITCH_RESERVED_11                           0x55
 #define CTRL_NVSWITCH_GET_BOARD_PART_NUMBER                 0x56
 #define CTRL_NVSWITCH_GET_POWER                             0x57
+#define CTRL_NVSWITCH_GET_SYS_INFO                          0x58
+#define CTRL_NVSWITCH_GET_TIME_INFO                         0x59
+#define CTRL_NVSWITCH_GET_TEMP_DATA                         0x60
+#define CTRL_NVSWITCH_GET_TEMP_SAMPLES                      0x61

 #ifdef __cplusplus
 }
--- a/src/common/nvswitch/kernel/inc/haldef_nvswitch.h
+++ b/src/common/nvswitch/kernel/inc/haldef_nvswitch.h
@ -158,6 +158,7 @@
    _op(NvlStatus, nvswitch_bbx_unload, (nvswitch_device *device), _arch)  \
    _op(NvlStatus, nvswitch_bbx_load, (nvswitch_device *device, NvU64 time_ns, NvU8 osType, NvU32 osVersion), _arch)  \
    _op(NvlStatus, nvswitch_bbx_get_sxid, (nvswitch_device *device, NVSWITCH_GET_SXIDS_PARAMS * params), _arch)  \
+    _op(NvlStatus, nvswitch_bbx_get_data, (nvswitch_device *device, NvU8 dataType, void * params), _arch)  \
    _op(NvlStatus, nvswitch_smbpbi_alloc,           (nvswitch_device *device), _arch)  \
    _op(NvlStatus, nvswitch_smbpbi_post_init_hal,   (nvswitch_device *device), _arch)  \
    _op(void,      nvswitch_smbpbi_destroy_hal,     (nvswitch_device *device), _arch)  \
@ -213,6 +214,7 @@
    _op(void,      nvswitch_reset_persistent_link_hw_state, (nvswitch_device *device, NvU32 linkNumber), _arch)\
    _op(void,      nvswitch_store_topology_information, (nvswitch_device *device, nvlink_link *link), _arch) \
    _op(void,      nvswitch_init_lpwr_regs, (nvlink_link *link), _arch) \
+    _op(void,      nvswitch_program_l1_scratch_reg, (nvswitch_device *device, NvU32 linkNumber), _arch) \
    _op(NvlStatus, nvswitch_set_training_mode, (nvswitch_device *device), _arch) \
    _op(NvU32,     nvswitch_get_sublink_width, (nvswitch_device *device, NvU32 linkNumber), _arch) \
    _op(NvBool,    nvswitch_i2c_is_device_access_allowed, (nvswitch_device *device, NvU32 port, NvU8 addr, NvBool bIsRead), _arch) \
@ -234,6 +236,7 @@
    _op(NvlStatus, nvswitch_ctrl_therm_read_power, (nvswitch_device *device, NVSWITCH_GET_POWER_PARAMS *info), _arch) \
    _op(NvBool,    nvswitch_does_link_need_termination_enabled, (nvswitch_device *device, nvlink_link *link), _arch) \
    _op(NvlStatus, nvswitch_link_termination_setup, (nvswitch_device *device, nvlink_link *link), _arch) \
+    _op(NvlStatus, nvswitch_check_io_sanity, (nvswitch_device *device), _arch) \

 #define NVSWITCH_HAL_FUNCTION_LIST_LS10(_op, _arch) \
    _op(NvlStatus, nvswitch_launch_ALI, (nvswitch_device *device), _arch) \
--- a/src/common/nvswitch/kernel/inc/inforom/inforom_nvswitch.h
+++ b/src/common/nvswitch/kernel/inc/inforom/inforom_nvswitch.h
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -184,6 +184,7 @@ NvlStatus nvswitch_inforom_bbx_add_sxid(nvswitch_device *device,
                                    NvU32 data1, NvU32 data2);
 NvlStatus nvswitch_inforom_bbx_get_sxid(nvswitch_device *device,
                            NVSWITCH_GET_SXIDS_PARAMS *params);
+NvlStatus nvswitch_inforom_bbx_get_data(nvswitch_device *device, NvU8 dataType, void *params);

 // InfoROM DEM APIs
 NvlStatus nvswitch_inforom_dem_load(nvswitch_device *device);
--- a/src/common/nvswitch/kernel/inc/lr10/inforom_lr10.h
+++ b/src/common/nvswitch/kernel/inc/lr10/inforom_lr10.h
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -169,4 +169,12 @@ nvswitch_bbx_get_sxid_lr10
    NVSWITCH_GET_SXIDS_PARAMS * params
 );

+NvlStatus
+nvswitch_bbx_get_data_lr10
+(
+    nvswitch_device *device,
+    NvU8 dataType,
+    void *params
+);
+
 #endif //_INFOROM_LR10_H_
--- a/src/common/nvswitch/kernel/inc/lr10/lr10.h
+++ b/src/common/nvswitch/kernel/inc/lr10/lr10.h
@ -583,9 +583,12 @@ typedef struct
    NvBool bDisabledRemoteEndLinkMaskCached;
 } lr10_device;

+#define NVSWITCH_NUM_DEVICES_PER_DELTA_LR10 6
+
 typedef struct {
    NvU32 switchPhysicalId;
-    NvU64  linkMask;
+    NvU64 accessLinkMask;
+    NvU64 trunkLinkMask;
 } lr10_links_connected_to_disabled_remote_end;

 #define NVSWITCH_GET_CHIP_DEVICE_LR10(_device)                  \
@ -649,6 +652,7 @@ void      nvswitch_setup_link_loopback_mode_lr10(nvswitch_device *device, NvU32
 void nvswitch_reset_persistent_link_hw_state_lr10(nvswitch_device *device, NvU32 linkNumber);
 void nvswitch_store_topology_information_lr10(nvswitch_device *device, nvlink_link *link);
 void nvswitch_init_lpwr_regs_lr10(nvlink_link *link);
+void nvswitch_program_l1_scratch_reg_lr10(nvswitch_device *device, NvU32 linkNumber);
 NvlStatus nvswitch_set_training_mode_lr10(nvswitch_device *device);
 NvBool nvswitch_i2c_is_device_access_allowed_lr10(nvswitch_device *device, NvU32 port, NvU8 addr, NvBool bIsRead);
 NvU32     nvswitch_get_sublink_width_lr10(nvswitch_device *device,NvU32 linkNumber);
--- a/src/common/nvswitch/kernel/inc/ls10/inforom_ls10.h
+++ b/src/common/nvswitch/kernel/inc/ls10/inforom_ls10.h
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -154,4 +154,11 @@ nvswitch_bbx_get_sxid_ls10
    NVSWITCH_GET_SXIDS_PARAMS * params
 );

+NvlStatus
+nvswitch_bbx_get_data_ls10
+(
+    nvswitch_device *device,
+    NvU8 dataType,
+    void *params
+);
 #endif //_INFOROM_LS10_H_
--- a/src/common/nvswitch/kernel/inc/ls10/ls10.h
+++ b/src/common/nvswitch/kernel/inc/ls10/ls10.h
@ -529,10 +529,20 @@ typedef struct
 {
    NvBool bLinkErrorsCallBackEnabled;
    NvBool bLinkStateCallBackEnabled;
-    NvBool bResetAndDrainRetry;
+    NvU64  lastRetrainTime;
+    NvU64  lastLinkUpTime;
+} NVLINK_LINK_ERROR_REPORTING_STATE;

+typedef struct
+{
    NVLINK_LINK_ERROR_INFO_ERR_MASKS fatalIntrMask;
    NVLINK_LINK_ERROR_INFO_ERR_MASKS nonFatalIntrMask;
+} NVLINK_LINK_ERROR_REPORTING_DATA;
+
+typedef struct
+{
+    NVLINK_LINK_ERROR_REPORTING_STATE state;
+    NVLINK_LINK_ERROR_REPORTING_DATA  data;
 } NVLINK_LINK_ERROR_REPORTING;

 typedef struct
@ -834,7 +844,6 @@ typedef const struct
 #define nvswitch_setup_link_loopback_mode_ls10       nvswitch_setup_link_loopback_mode_lr10

 #define nvswitch_link_lane_reversed_ls10             nvswitch_link_lane_reversed_lr10
-#define nvswitch_request_tl_link_state_ls10          nvswitch_request_tl_link_state_lr10

 #define nvswitch_i2c_get_port_info_ls10             nvswitch_i2c_get_port_info_lr10
 #define nvswitch_i2c_set_hw_speed_mode_ls10         nvswitch_i2c_set_hw_speed_mode_lr10
@ -929,6 +938,7 @@ void   nvswitch_corelib_clear_link_state_lr10(nvlink_link *link);
 NvlStatus nvswitch_corelib_set_dl_link_mode_ls10(nvlink_link *link, NvU64 mode, NvU32 flags);
 NvlStatus nvswitch_corelib_set_tx_mode_ls10(nvlink_link *link, NvU64 mode, NvU32 flags);
 void nvswitch_init_lpwr_regs_ls10(nvlink_link *link);
+void nvswitch_program_l1_scratch_reg_ls10(nvswitch_device *device, NvU32 linkNumber);

 NvlStatus nvswitch_minion_service_falcon_interrupts_ls10(nvswitch_device *device, NvU32 instance);

@ -986,6 +996,7 @@ NvlStatus nvswitch_reset_and_drain_links_ls10(nvswitch_device *device, NvU64 lin
 void      nvswitch_service_minion_all_links_ls10(nvswitch_device *device);
 NvlStatus nvswitch_ctrl_get_board_part_number_ls10(nvswitch_device *device, NVSWITCH_GET_BOARD_PART_NUMBER_VECTOR *p);
 void      nvswitch_create_deferred_link_state_check_task_ls10(nvswitch_device *device, NvU32 nvlipt_instance, NvU32 link);
+NvlStatus nvswitch_request_tl_link_state_ls10(nvlink_link *link, NvU32 tlLinkState, NvBool bSync);

 //
 // SU generated functions
--- a/src/common/nvswitch/kernel/inc/ls10/minion_nvlink_defines_public_ls10.h
+++ b/src/common/nvswitch/kernel/inc/ls10/minion_nvlink_defines_public_ls10.h
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -46,6 +46,9 @@ typedef enum _MINION_STATUS
    MINION_ALARM_BUSY                                   = 80,
 } MINION_STATUS;

+  #define LINKSTATUS_RESET                      0x0
+  #define LINKSTATUS_UNINIT                     0x1
+  #define LINKSTATUS_LANESHUTDOWN               0x13
  #define LINKSTATUS_EMERGENCY_SHUTDOWN         0x29
-  #define LINKSTATUS_INITPHASE1                 0x24
+  #define LINKSTATUS_ACTIVE_PENDING             0x25
 #endif // _MINION_NVLINK_DEFINES_PUBLIC_H_
--- a/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_dbg.h
+++ b/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_dbg.h
@ -751,7 +751,7 @@ const NvU32 soe_ucode_data_lr10_dbg[] = {
   0x00f0b305, 0x0a09584a, 0x90014afe, 0xafb508aa, 0x010f9801, 0xb60093f0, 0xa9b50294, 0x02afb503, 
   0xb2100918, 0x18a9351b, 0xb5020f98, 0x099804af, 0x05a9b503, 0xa0a000bf, 0x005b0b7e, 0xf001a6b0, 
   0x9a120b9c, 0x59ab3e01, 0xfb020a00, 0xe27e1c15, 0x943d0059, 0xf001a6b0, 0xa6f00bac, 0xa29a3c01, 
-   0x548900f8, 0x9ebf0005, 0xb5019f98, 0x9ea0019f, 0x005a267e, 0x0801a4b3, 0x00f8a43d, 0xff0a09f8, 
+   0x548900f8, 0x9ebf0005, 0xb5019f98, 0x9ea0019f, 0x005a267e, 0x0801a4b3, 0x00f8a43d, 0xff0a02f8, 
   0x12f900f8, 0x000f8c89, 0xf20a99bf, 0x380090b3, 0x000fa881, 0xf10a10bf, 0x2c0004b3, 0x000a747e, 
   0x19a00109, 0x000f9889, 0x948990a0, 0xff0f0010, 0x90899fa0, 0x90a0000f, 0x000f9489, 0x587e9fa0, 
   0x10a00037, 0x12f911fb, 0x000f8c89, 0xb4bd04bd, 0xb44c90a0, 0x0fac8a00, 0x0b947e00, 0x0cb4bd00, 
@ -1370,7 +1370,7 @@ const NvU32 soe_ucode_data_lr10_dbg[] = {
   0xb232f900, 0xbdb2b2a1, 0x3ef00304, 0xbf00a6f0, 0x01009019, 0x93a61ab2, 0x0a090df4, 0xa6f73e03, 
   0xf493a600, 0x020a091b, 0x00a6f73e, 0x00a6aa7e, 0x08f402a6, 0xfba4bddd, 0xf830f431, 0x0005dcdf, 
   0xbf82f900, 0x0149feff, 0xb2289990, 0xb29fa0a3, 0x00a9b3b8, 0xb0b30084, 0x47fe7f00, 0x05a49801, 
-   0x54bd24bd, 0x779014bd, 0xa7613e24, 0x0c3a9800, 0x02bc94bd, 0xb279a0b0, 0xb65f7e7c, 0x0f79bf00, 
+   0x14bd54bd, 0x779024bd, 0xa7613e24, 0x0c3a9800, 0x02bc94bd, 0xb279a0b0, 0xb65f7e7c, 0x0f79bf00, 
   0xf49fa6ff, 0x643d090b, 0x00a74f3e, 0x90015590, 0x04a60100, 0x33d908f4, 0x90070060, 0x24bc0111, 
   0x03399820, 0x18f429a6, 0xbd01060b, 0xa7523e04, 0xb24bb200, 0x16fc7e1a, 0xf45aa600, 0x1190060d, 
   0x06399801, 0x19a6f43d, 0x0f050cf4, 0xbd8f2001, 0xa7973ea4, 0xfe020a00, 0x99900149, 0xd99fbf28, 
@ -1420,7 +1420,7 @@ const NvU32 soe_ucode_data_lr10_dbg[] = {
   0x070b943a, 0xb200804c, 0xb7797e2d, 0x0ca1b000, 0xb600adb3, 0x05291801, 0x76042f18, 0xf4f00894, 
   0xe59fffff, 0xe966ff09, 0x01980bf5, 0xffffe9e4, 0x08f589a6, 0xf4bd018e, 0x18902fbc, 0x9d330999, 
   0x90018200, 0xf4b301ff, 0xfc3ef207, 0x8e3c00ae, 0xf59f26f2, 0xc4016d08, 0x94f0fffd, 0x529dbcff, 
-   0x0df456a6, 0x9065b205, 0xe4bd10d9, 0x3db029bc, 0x3ec43da4, 0xb100ada7, 0xf5006fd6, 0xb401450c, 
+   0x0df456a6, 0x9065b205, 0xa43d10d9, 0x3db029bc, 0x3ee4bdc4, 0xb100ada7, 0xf5006fd6, 0xb401450c, 
   0xbe3c0b10, 0xf81e3c98, 0x0bf4f926, 0xff94f017, 0xfd009939, 0x9033049f, 0x010a0600, 0x0ce9bf3c, 
   0x01ee9001, 0xa601dd90, 0xce08f4e5, 0xed00c933, 0xf0293f00, 0x0bf40894, 0x00a93308, 0x94bd00d0, 
   0x91b03ab2, 0x1391b014, 0x301291b0, 0x4bfe5b91, 0x5bbb9001, 0x00a6f97e, 0xadb3a0b2, 0x3400ef00, 
@ -2269,8 +2269,8 @@ const NvU32 soe_ucode_data_lr10_dbg[] = {
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
-   0xf0cc97fc, 0xc5e27e17, 0x63cc4ffc, 0xc48564fa, 0x979b9cb7, 0x7359186e, 0x8b211603, 0x878da8fe, 
-   0x956b7a40, 0x90bcaaf7, 0xdea25edb, 0x9aaef423, 0x269562e0, 0x626d8a06, 0xc3df044b, 0x11ecee8e, 
+   0xf0cc97fc, 0xc5e27e17, 0x63cc4ffc, 0xc48564fa, 0x6073f3d9, 0x573ea3ef, 0xf0764322, 0xf8dacef7, 
+   0x956b7a40, 0x90bcaaf7, 0xdea25edb, 0x9aaef423, 0xe0830635, 0xb9c7326b, 0x27f96395, 0x7078f754, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
--- a/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_prd.h
+++ b/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_prd.h
@ -751,7 +751,7 @@ const NvU32 soe_ucode_data_lr10_prd[] = {
   0x00f0b305, 0x0a09584a, 0x90014afe, 0xafb508aa, 0x010f9801, 0xb60093f0, 0xa9b50294, 0x02afb503, 
   0xb2100918, 0x18a9351b, 0xb5020f98, 0x099804af, 0x05a9b503, 0xa0a000bf, 0x005b0b7e, 0xf001a6b0, 
   0x9a120b9c, 0x59ab3e01, 0xfb020a00, 0xe27e1c15, 0x943d0059, 0xf001a6b0, 0xa6f00bac, 0xa29a3c01, 
-   0x548900f8, 0x9ebf0005, 0xb5019f98, 0x9ea0019f, 0x005a267e, 0x0801a4b3, 0x00f8a43d, 0xff0a09f8, 
+   0x548900f8, 0x9ebf0005, 0xb5019f98, 0x9ea0019f, 0x005a267e, 0x0801a4b3, 0x00f8a43d, 0xff0a02f8, 
   0x12f900f8, 0x000f8c89, 0xf20a99bf, 0x380090b3, 0x000fa881, 0xf10a10bf, 0x2c0004b3, 0x000a747e, 
   0x19a00109, 0x000f9889, 0x948990a0, 0xff0f0010, 0x90899fa0, 0x90a0000f, 0x000f9489, 0x587e9fa0, 
   0x10a00037, 0x12f911fb, 0x000f8c89, 0xb4bd04bd, 0xb44c90a0, 0x0fac8a00, 0x0b947e00, 0x0cb4bd00, 
@ -1370,7 +1370,7 @@ const NvU32 soe_ucode_data_lr10_prd[] = {
   0xb232f900, 0xbdb2b2a1, 0x3ef00304, 0xbf00a6f0, 0x01009019, 0x93a61ab2, 0x0a090df4, 0xa6f73e03, 
   0xf493a600, 0x020a091b, 0x00a6f73e, 0x00a6aa7e, 0x08f402a6, 0xfba4bddd, 0xf830f431, 0x0005dcdf, 
   0xbf82f900, 0x0149feff, 0xb2289990, 0xb29fa0a3, 0x00a9b3b8, 0xb0b30084, 0x47fe7f00, 0x05a49801, 
-   0x54bd24bd, 0x779014bd, 0xa7613e24, 0x0c3a9800, 0x02bc94bd, 0xb279a0b0, 0xb65f7e7c, 0x0f79bf00, 
+   0x14bd54bd, 0x779024bd, 0xa7613e24, 0x0c3a9800, 0x02bc94bd, 0xb279a0b0, 0xb65f7e7c, 0x0f79bf00, 
   0xf49fa6ff, 0x643d090b, 0x00a74f3e, 0x90015590, 0x04a60100, 0x33d908f4, 0x90070060, 0x24bc0111, 
   0x03399820, 0x18f429a6, 0xbd01060b, 0xa7523e04, 0xb24bb200, 0x16fc7e1a, 0xf45aa600, 0x1190060d, 
   0x06399801, 0x19a6f43d, 0x0f050cf4, 0xbd8f2001, 0xa7973ea4, 0xfe020a00, 0x99900149, 0xd99fbf28, 
@ -1420,7 +1420,7 @@ const NvU32 soe_ucode_data_lr10_prd[] = {
   0x070b943a, 0xb200804c, 0xb7797e2d, 0x0ca1b000, 0xb600adb3, 0x05291801, 0x76042f18, 0xf4f00894, 
   0xe59fffff, 0xe966ff09, 0x01980bf5, 0xffffe9e4, 0x08f589a6, 0xf4bd018e, 0x18902fbc, 0x9d330999, 
   0x90018200, 0xf4b301ff, 0xfc3ef207, 0x8e3c00ae, 0xf59f26f2, 0xc4016d08, 0x94f0fffd, 0x529dbcff, 
-   0x0df456a6, 0x9065b205, 0xe4bd10d9, 0x3db029bc, 0x3ec43da4, 0xb100ada7, 0xf5006fd6, 0xb401450c, 
+   0x0df456a6, 0x9065b205, 0xa43d10d9, 0x3db029bc, 0x3ee4bdc4, 0xb100ada7, 0xf5006fd6, 0xb401450c, 
   0xbe3c0b10, 0xf81e3c98, 0x0bf4f926, 0xff94f017, 0xfd009939, 0x9033049f, 0x010a0600, 0x0ce9bf3c, 
   0x01ee9001, 0xa601dd90, 0xce08f4e5, 0xed00c933, 0xf0293f00, 0x0bf40894, 0x00a93308, 0x94bd00d0, 
   0x91b03ab2, 0x1391b014, 0x301291b0, 0x4bfe5b91, 0x5bbb9001, 0x00a6f97e, 0xadb3a0b2, 0x3400ef00, 
@ -2269,8 +2269,8 @@ const NvU32 soe_ucode_data_lr10_prd[] = {
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
-   0xf0cc97fc, 0xc5e27e17, 0x63cc4ffc, 0xc48564fa, 0x979b9cb7, 0x7359186e, 0x8b211603, 0x878da8fe, 
-   0x956b7a40, 0x90bcaaf7, 0xdea25edb, 0x9aaef423, 0x269562e0, 0x626d8a06, 0xc3df044b, 0x11ecee8e, 
+   0xf0cc97fc, 0xc5e27e17, 0x63cc4ffc, 0xc48564fa, 0x6073f3d9, 0x573ea3ef, 0xf0764322, 0xf8dacef7, 
+   0x956b7a40, 0x90bcaaf7, 0xdea25edb, 0x9aaef423, 0xe0830635, 0xb9c7326b, 0x27f96395, 0x7078f754, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
   0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
--- a/src/common/nvswitch/kernel/inforom/ifrbbx_nvswitch.c
+++ b/src/common/nvswitch/kernel/inforom/ifrbbx_nvswitch.c
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -130,3 +130,21 @@ nvswitch_inforom_bbx_get_sxid
    return status;
 }

+NvlStatus
+nvswitch_inforom_bbx_get_data
+(
+    nvswitch_device *device,
+    NvU8 dataType,
+    void *params
+)
+{
+    NvlStatus status;
+
+    status = device->hal.nvswitch_bbx_get_data(device, dataType, params);
+    if (status != NVL_SUCCESS)
+    {
+        NVSWITCH_PRINT(device, ERROR, "%s: (type=%d) failed, status=%d\n", __FUNCTION__, dataType, status);
+    }
+
+    return status;
+}
--- a/src/common/nvswitch/kernel/lr10/inforom_lr10.c
+++ b/src/common/nvswitch/kernel/lr10/inforom_lr10.c
@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -32,6 +32,7 @@
 #include "nvVer.h"
 #include "regkey_nvswitch.h"
 #include "inforom/inforom_nvl_v3_nvswitch.h"
+#include "soe/soeififr.h"

 //
 // TODO: Split individual object hals to their own respective files
@ -1280,3 +1281,14 @@ nvswitch_bbx_get_sxid_lr10
    return -NVL_ERR_NOT_SUPPORTED;
 }

+NvlStatus
+nvswitch_bbx_get_data_lr10
+(
+    nvswitch_device *device,
+    NvU8 dataType,
+    void *params
+)
+{
+    return -NVL_ERR_NOT_SUPPORTED;
+}
+
--- a/Show More
+++ b/Show More