Mirror of https://github.com/NVIDIA/open-gpu-kernel-modules.git, synced 2025-01-31 21:52:11 +01:00

Commit 3084c04453 (parent caa2dd11a0): 555.42.02
(cherry picked from commit 5a1c474040e1c3ed20760267510cc9d9332898f1)
@@ -1,11 +1,11 @@
# Changelog

## Release 555 Entries

### [555.42.02] 2024-05-21

## Release 550 Entries

### [550.100] 2024-07-09

### [550.90.07] 2024-06-04

### [550.78] 2024-04-25

### [550.76] 2024-04-17
README.md
@@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source

This is the source release of the NVIDIA Linux open GPU kernel modules,
version 550.100.
version 555.42.02.

## How to Build
@@ -17,7 +17,7 @@ as root:

Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
550.100 driver release. This can be achieved by installing
555.42.02 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,

@@ -74,7 +74,7 @@ kernel.

The NVIDIA open kernel modules support the same range of Linux kernel
versions that are supported with the proprietary NVIDIA kernel modules.
This is currently Linux kernel 3.10 or newer.
This is currently Linux kernel 4.15 or newer.

## How to Contribute
@@ -188,7 +188,7 @@ encountered specific to them.
For details on feature support and limitations, see the NVIDIA GPU driver
end user README here:

https://us.download.nvidia.com/XFree86/Linux-x86_64/550.100/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/555.42.02/README/kernel_open.html

For vGPU support, please refer to the README.vgpu packaged in the vGPU Host
Package for more details.
@@ -757,8 +757,6 @@ Subsystem Device ID.
| NVIDIA H100 80GB HBM3 | 2330 10DE 16C0 |
| NVIDIA H100 80GB HBM3 | 2330 10DE 16C1 |
| NVIDIA H100 PCIe | 2331 10DE 1626 |
| NVIDIA H200 | 2335 10DE 18BE |
| NVIDIA H200 | 2335 10DE 18BF |
| NVIDIA H100 | 2339 10DE 17FC |
| NVIDIA H800 NVL | 233A 10DE 183A |
| NVIDIA GH200 120GB | 2342 10DE 16EB |
@@ -858,6 +856,7 @@ Subsystem Device ID.
| NVIDIA RTX A500 Embedded GPU | 25FB |
| NVIDIA GeForce RTX 4090 | 2684 |
| NVIDIA GeForce RTX 4090 D | 2685 |
| NVIDIA GeForce RTX 4070 Ti SUPER | 2689 |
| NVIDIA RTX 6000 Ada Generation | 26B1 1028 16A1 |
| NVIDIA RTX 6000 Ada Generation | 26B1 103C 16A1 |
| NVIDIA RTX 6000 Ada Generation | 26B1 10DE 16A1 |
@@ -875,7 +874,6 @@ Subsystem Device ID.
| NVIDIA L40S | 26B9 10DE 1851 |
| NVIDIA L40S | 26B9 10DE 18CF |
| NVIDIA L20 | 26BA 10DE 1957 |
| NVIDIA L20 | 26BA 10DE 1990 |
| NVIDIA GeForce RTX 4080 SUPER | 2702 |
| NVIDIA GeForce RTX 4080 | 2704 |
| NVIDIA GeForce RTX 4070 Ti SUPER | 2705 |
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.100\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"555.42.02\"

ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@@ -118,7 +118,7 @@ ifeq ($(ARCH),x86_64)
endif

ifeq ($(ARCH),powerpc)
EXTRA_CFLAGS += -mlittle-endian -mno-strict-align -mno-altivec
EXTRA_CFLAGS += -mlittle-endian -mno-strict-align
endif

EXTRA_CFLAGS += -DNV_UVM_ENABLE
@@ -172,6 +172,7 @@ NV_CFLAGS_FROM_CONFTEST := $(shell $(NV_CONFTEST_CMD) build_cflags)
NV_CONFTEST_CFLAGS = $(NV_CFLAGS_FROM_CONFTEST) $(EXTRA_CFLAGS) -fno-pie
NV_CONFTEST_CFLAGS += $(call cc-disable-warning,pointer-sign)
NV_CONFTEST_CFLAGS += $(call cc-option,-fshort-wchar,)
NV_CONFTEST_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types,)
NV_CONFTEST_CFLAGS += -Wno-error

NV_CONFTEST_COMPILE_TEST_HEADERS := $(obj)/conftest/macros.h
@@ -28,7 +28,7 @@ else
else
KERNEL_UNAME ?= $(shell uname -r)
KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME)
KERNEL_SOURCES := $(shell test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source || echo $(KERNEL_MODLIB)/build)
KERNEL_SOURCES := $(shell ((test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source) || (test -d $(KERNEL_MODLIB)/build/source && echo $(KERNEL_MODLIB)/build/source)) || echo $(KERNEL_MODLIB)/build)
endif

KERNEL_OUTPUT := $(KERNEL_SOURCES)
@@ -42,7 +42,11 @@ else
else
KERNEL_UNAME ?= $(shell uname -r)
KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME)
ifeq ($(KERNEL_SOURCES), $(KERNEL_MODLIB)/source)
# $(filter pattern...,text) - Returns all whitespace-separated words in text that
# do match any of the pattern words, removing any words that do not match.
# Set the KERNEL_OUTPUT only if either $(KERNEL_MODLIB)/source or
# $(KERNEL_MODLIB)/build/source path matches the KERNEL_SOURCES.
ifneq ($(filter $(KERNEL_SOURCES),$(KERNEL_MODLIB)/source $(KERNEL_MODLIB)/build/source),)
KERNEL_OUTPUT := $(KERNEL_MODLIB)/build
KBUILD_PARAMS := KBUILD_OUTPUT=$(KERNEL_OUTPUT)
endif
@@ -58,14 +58,10 @@
#include <linux/version.h>
#include <linux/utsname.h>

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
#error "This driver does not support kernels older than 2.6.32!"
#elif LINUX_VERSION_CODE < KERNEL_VERSION(2, 7, 0)
# define KERNEL_2_6
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3, 0, 0)
# define KERNEL_3
#else
#error "This driver does not support development kernels!"
#if LINUX_VERSION_CODE == KERNEL_VERSION(4, 4, 0)
// Version 4.4 is allowed, temporarily, although not officially supported.
#elif LINUX_VERSION_CODE < KERNEL_VERSION(4, 15, 0)
#error "This driver does not support kernels older than Linux 4.15!"
#endif

#if defined (CONFIG_SMP) && !defined (__SMP__)
@@ -836,16 +832,16 @@ static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
#define NV_PRINT_AT(nv_debug_level,at) \
{ \
nv_printf(nv_debug_level, \
"NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, flags = 0x%08x, " \
"NVRM: VM: %s:%d: 0x%p, %d page(s), count = %d, " \
"page_table = 0x%p\n", __FUNCTION__, __LINE__, at, \
at->num_pages, NV_ATOMIC_READ(at->usage_count), \
at->flags, at->page_table); \
at->page_table); \
}

#define NV_PRINT_VMA(nv_debug_level,vma) \
{ \
nv_printf(nv_debug_level, \
"NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08x bytes @ 0x%016llx, 0x%p, 0x%p\n", \
"NVRM: VM: %s:%d: 0x%lx - 0x%lx, 0x%08lx bytes @ 0x%016llx, 0x%p, 0x%p\n", \
__FUNCTION__, __LINE__, vma->vm_start, vma->vm_end, NV_VMA_SIZE(vma), \
NV_VMA_OFFSET(vma), NV_VMA_PRIVATE(vma), NV_VMA_FILE(vma)); \
}
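One of the NV_PRINT_VMA changes above swaps %08x for %08lx because NV_VMA_SIZE() expands to an unsigned long (vm_end - vm_start are unsigned long). A standalone illustration of the format-specifier mismatch the fix avoids; this is generic C, not driver code:

```c
#include <stdio.h>

int main(void)
{
    unsigned long vma_size = 0x123456UL;

    /* Correct: %lx matches unsigned long. */
    printf("size = 0x%08lx\n", vma_size);

    /* A plain %x here would expect unsigned int, which is undefined
     * behaviour on LP64 targets where unsigned long is 64-bit; that is
     * exactly the mismatch the format-string fix above removes. */
    return 0;
}
```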
@@ -1078,6 +1074,8 @@ static inline void nv_kmem_ctor_dummy(void *arg)
kmem_cache_destroy(kmem_cache); \
}

#define NV_KMEM_CACHE_ALLOC_ATOMIC(kmem_cache) \
kmem_cache_alloc(kmem_cache, GFP_ATOMIC)
#define NV_KMEM_CACHE_ALLOC(kmem_cache) \
kmem_cache_alloc(kmem_cache, GFP_KERNEL)
#define NV_KMEM_CACHE_FREE(ptr, kmem_cache) \
@@ -1104,6 +1102,23 @@ static inline void *nv_kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
#endif
}

static inline int nv_kmem_cache_alloc_stack_atomic(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
sp = NV_KMEM_CACHE_ALLOC_ATOMIC(nvidia_stack_t_cache);
if (sp == NULL)
return -ENOMEM;
sp->size = sizeof(sp->stack);
sp->top = sp->stack + sp->size;
}
#endif
*stack = sp;
return 0;
}

static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
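The nv_kmem_cache_alloc_stack_atomic() helper added above parallels nv_kmem_cache_alloc_stack() but allocates with GFP_ATOMIC and only hands out an alternate stack when rm_is_altstack_in_use() says one is needed. A hypothetical caller sketch, assuming only the nvidia_stack_t_cache and NV_KMEM_CACHE_FREE definitions shown in this header; the real call sites live elsewhere in the driver:

```c
/*
 * Hypothetical caller: obtain an altstack in atomic context, use it for an
 * RM call, then return it to the cache. Error handling and the RM call
 * itself are illustrative only.
 */
static inline int example_atomic_rm_call(void)
{
    nvidia_stack_t *sp = NULL;
    int ret = nv_kmem_cache_alloc_stack_atomic(&sp);

    if (ret != 0)
        return ret;            /* -ENOMEM when the cache allocation fails */

    /* ... issue the RM call that needs an alternate stack here ... */

    if (sp != NULL)            /* sp stays NULL when no altstack is in use */
        NV_KMEM_CACHE_FREE(sp, nvidia_stack_t_cache);

    return 0;
}
```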
@@ -29,17 +29,17 @@
typedef int vm_fault_t;
#endif

/* pin_user_pages
/*
* pin_user_pages()
*
* Presence of pin_user_pages() also implies the presence of unpin_user_page().
* Both were added in the v5.6-rc1
* Both were added in the v5.6.
*
* pin_user_pages() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6-rc1 (2020-01-30)
*
* Removed vmas parameter from pin_user_pages() by commit 40896a02751
* ("mm/gup: remove vmas parameter from pin_user_pages()")
* in linux-next, expected in v6.5-rc1 (2023-05-17)
* pin_user_pages() was added by commit eddb1c228f79
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6.
*
* Removed vmas parameter from pin_user_pages() by commit 4c630f307455
* ("mm/gup: remove vmas parameter from pin_user_pages()") in v6.5.
*/

#include <linux/mm.h>
@@ -63,25 +63,28 @@ typedef int vm_fault_t;
#define NV_UNPIN_USER_PAGE put_page
#endif // NV_PIN_USER_PAGES_PRESENT

/* get_user_pages
/*
* get_user_pages()
*
* The 8-argument version of get_user_pages was deprecated by commit
* (2016 Feb 12: cde70140fed8429acf7a14e2e2cbd3e329036653) for the non-remote case
* The 8-argument version of get_user_pages() was deprecated by commit
* cde70140fed8 ("mm/gup: Overload get_user_pages() functions") in v4.6-rc1.
* (calling get_user_pages with current and current->mm).
*
* Completely moved to the 6 argument version of get_user_pages -
* 2016 Apr 4: c12d2da56d0e07d230968ee2305aaa86b93a6832
* Completely moved to the 6 argument version of get_user_pages() by
* commit c12d2da56d0e ("mm/gup: Remove the macro overload API migration
* helpers from the get_user*() APIs") in v4.6-rc4.
*
* write and force parameters were replaced with gup_flags by -
* 2016 Oct 12: 768ae309a96103ed02eb1e111e838c87854d8b51
* write and force parameters were replaced with gup_flags by
* commit 768ae309a961 ("mm: replace get_user_pages() write/force parameters
* with gup_flags") in v4.9.
*
* A 7-argument version of get_user_pages was introduced into linux-4.4.y by
* commit 8e50b8b07f462ab4b91bc1491b1c91bd75e4ad40 which cherry-picked the
* replacement of the write and force parameters with gup_flags
* commit 8e50b8b07f462 ("mm: replace get_user_pages() write/force parameters
* with gup_flags") which cherry-picked the replacement of the write and
* force parameters with gup_flags.
*
* Removed vmas parameter from get_user_pages() by commit 7bbf9c8c99
* ("mm/gup: remove unused vmas parameter from get_user_pages()")
* in linux-next, expected in v6.5-rc1 (2023-05-17)
* Removed vmas parameter from get_user_pages() by commit 54d020692b34
* ("mm/gup: remove unused vmas parameter from get_user_pages()") in v6.5.
*
*/

@@ -112,18 +115,19 @@ typedef int vm_fault_t;
}
#endif // NV_GET_USER_PAGES_HAS_ARGS_FLAGS

/* pin_user_pages_remote
/*
* pin_user_pages_remote()
*
* pin_user_pages_remote() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6 (2020-01-30)
* pin_user_pages_remote() was added by commit eddb1c228f79
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6.
*
* pin_user_pages_remote() removed 'tsk' parameter by commit
* 64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
* in v5.9-rc1 (2020-08-11). *
* in v5.9.
*
* Removed unused vmas parameter from pin_user_pages_remote() by commit
* 83bcc2e132 ("mm/gup: remove unused vmas parameter from pin_user_pages_remote()")
* in linux-next, expected in v6.5-rc1 (2023-05-14)
* 0b295316b3a9 ("mm/gup: remove unused vmas parameter from
* pin_user_pages_remote()") in v6.5.
*
*/

@@ -143,7 +147,7 @@ typedef int vm_fault_t;

/*
* get_user_pages_remote() was added by commit 1e9877902dc7
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6 (2016-02-12).
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6.
*
* Note that get_user_pages_remote() requires the caller to hold a reference on
* the task_struct (if non-NULL and if this API has tsk argument) and the mm_struct.
@@ -153,19 +157,17 @@ typedef int vm_fault_t;
*
* get_user_pages_remote() write/force parameters were replaced
* with gup_flags by commit 9beae1ea8930 ("mm: replace get_user_pages_remote()
* write/force parameters with gup_flags") in v4.9 (2016-10-13).
* write/force parameters with gup_flags") in v4.9.
*
* get_user_pages_remote() added 'locked' parameter by commit 5b56d49fc31d
* ("mm: add locked parameter to get_user_pages_remote()") in
* v4.10 (2016-12-14).
* ("mm: add locked parameter to get_user_pages_remote()") in v4.10.
*
* get_user_pages_remote() removed 'tsk' parameter by
* commit 64019a2e467a ("mm/gup: remove task_struct pointer for
* all gup code") in v5.9-rc1 (2020-08-11).
* all gup code") in v5.9.
*
* Removed vmas parameter from get_user_pages_remote() by commit a4bde14d549
* ("mm/gup: remove vmas parameter from get_user_pages_remote()")
* in linux-next, expected in v6.5-rc1 (2023-05-14)
* Removed vmas parameter from get_user_pages_remote() by commit ca5e863233e8
* ("mm/gup: remove vmas parameter from get_user_pages_remote()") in v6.5.
*
*/
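The comment blocks above track how the pin_user_pages()/get_user_pages() family changed signature across kernel releases, which is what the conftest-selected NV_PIN_USER_PAGES_PRESENT / NV_GET_USER_PAGES_HAS_ARGS_FLAGS macros key off. A minimal sketch of the v6.5+ calling convention those comments describe (no 'vmas' parameter, release with unpin_user_page()); it is illustrative and not the driver's actual wrapper:

```c
#include <linux/mm.h>

/*
 * Pin the pages backing a user buffer and release them again, using the
 * v6.5+ signatures documented in the comments above. Error handling is
 * abbreviated; a real caller would also validate the range.
 */
static inline long example_pin_user_buffer(unsigned long uaddr,
                                           unsigned long nr_pages,
                                           struct page **pages)
{
    long pinned;

    mmap_read_lock(current->mm);
    pinned = pin_user_pages(uaddr, nr_pages, FOLL_WRITE, pages);
    mmap_read_unlock(current->mm);

    return pinned; /* number of pages pinned, or a negative errno */
}

static inline void example_unpin_user_buffer(struct page **pages, long pinned)
{
    long i;

    for (i = 0; i < pinned; i++)
        unpin_user_page(pages[i]);
}
```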
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -609,6 +609,15 @@ typedef enum
NV_POWER_STATE_RUNNING
} nv_power_state_t;

typedef struct
{
const char *vidmem_power_status;
const char *dynamic_power_status;
const char *gc6_support;
const char *gcoff_support;
const char *s0ix_status;
} nv_power_info_t;

#define NV_PRIMARY_VGA(nv) ((nv)->primary_vga)

#define NV_IS_CTL_DEVICE(nv) ((nv)->flags & NV_FLAG_CONTROL)
@@ -778,7 +787,7 @@ nv_state_t* NV_API_CALL nv_get_ctl_state (void);

void NV_API_CALL nv_set_dma_address_size (nv_state_t *, NvU32 );

NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU64, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvU64, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_free_pages (nv_state_t *, NvU32, NvBool, NvU32, void *);

@@ -822,6 +831,7 @@ void NV_API_CALL nv_acpi_methods_init (NvU32 *);
void NV_API_CALL nv_acpi_methods_uninit (void);

NV_STATUS NV_API_CALL nv_acpi_method (NvU32, NvU32, NvU32, void *, NvU16, NvU32 *, void *, NvU16 *);
NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port (nv_state_t *, NvU8 *, NvU32, NvU32, NvU32 *);
NV_STATUS NV_API_CALL nv_acpi_dsm_method (nv_state_t *, NvU8 *, NvU32, NvBool, NvU32, void *, NvU16, NvU32 *, void *, NvU16 *);
NV_STATUS NV_API_CALL nv_acpi_ddc_method (nv_state_t *, void *, NvU32 *, NvBool);
NV_STATUS NV_API_CALL nv_acpi_dod_method (nv_state_t *, NvU32 *, NvU32 *);
@@ -990,10 +1000,10 @@ NV_STATUS NV_API_CALL rm_p2p_init_mapping (nvidia_stack_t *, NvU64, NvU6
NV_STATUS NV_API_CALL rm_p2p_destroy_mapping (nvidia_stack_t *, NvU64);
NV_STATUS NV_API_CALL rm_p2p_get_pages (nvidia_stack_t *, NvU64, NvU32, NvU64, NvU64, NvU64 *, NvU32 *, NvU32 *, NvU32 *, NvU8 **, void *);
NV_STATUS NV_API_CALL rm_p2p_get_gpu_info (nvidia_stack_t *, NvU64, NvU64, NvU8 **, void **);
NV_STATUS NV_API_CALL rm_p2p_get_pages_persistent (nvidia_stack_t *, NvU64, NvU64, void **, NvU64 *, NvU32 *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_get_pages_persistent (nvidia_stack_t *, NvU64, NvU64, void **, NvU64 *, NvU32 *, void *, void *, void **);
NV_STATUS NV_API_CALL rm_p2p_register_callback (nvidia_stack_t *, NvU64, NvU64, NvU64, void *, void (*)(void *), void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages (nvidia_stack_t *, NvU64, NvU32, NvU64, void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_put_pages_persistent(nvidia_stack_t *, void *, void *, void *);
NV_STATUS NV_API_CALL rm_p2p_dma_map_pages (nvidia_stack_t *, nv_dma_device_t *, NvU8 *, NvU64, NvU32, NvU64 *, void **);
NV_STATUS NV_API_CALL rm_dma_buf_dup_mem_handle (nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle, NvHandle, NvHandle, void *, NvHandle, NvU64, NvU64, NvHandle *, void **);
void NV_API_CALL rm_dma_buf_undup_mem_handle(nvidia_stack_t *, nv_state_t *, NvHandle, NvHandle);
@@ -1027,9 +1037,7 @@ void NV_API_CALL rm_enable_dynamic_power_management(nvidia_stack_t *, nv_s
NV_STATUS NV_API_CALL rm_ref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
void NV_API_CALL rm_unref_dynamic_power(nvidia_stack_t *, nv_state_t *, nv_dynamic_power_mode_t);
NV_STATUS NV_API_CALL rm_transition_dynamic_power(nvidia_stack_t *, nv_state_t *, NvBool, NvBool *);
const char* NV_API_CALL rm_get_vidmem_power_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_dynamic_power_management_status(nvidia_stack_t *, nv_state_t *);
const char* NV_API_CALL rm_get_gpu_gcx_support(nvidia_stack_t *, nv_state_t *, NvBool);
void NV_API_CALL rm_get_power_info(nvidia_stack_t *, nv_state_t *, nv_power_info_t *);

void NV_API_CALL rm_acpi_notify(nvidia_stack_t *, nv_state_t *, NvU32);
void NV_API_CALL rm_acpi_nvpcf_notify(nvidia_stack_t *);
@@ -1462,6 +1462,29 @@ NV_STATUS nvUvmInterfacePagingChannelPushStream(UvmGpuPagingChannelHandle channe
char *methodStream,
NvU32 methodStreamSize);

/*******************************************************************************
nvUvmInterfaceKeyRotationChannelDisable

This function notifies RM that the given channels are idle.

This function is called after RM has notified UVM that keys need to be rotated.
When called RM will disable the channels, rotate their keys, and then re-enable
the channels.

Locking: This function acquires an API and GPU lock.
Memory : This function dynamically allocates memory.

Arguments:
channelList[IN] - An array of channel handles whose channels are idle.
channelListCount[IN] - Number of channels in channelList. Its value must be
greater than 0.

Error codes:
NV_ERR_INVALID_ARGUMENT - channelList is NULL or channeListCount is 0.
*/
NV_STATUS nvUvmInterfaceKeyRotationChannelDisable(uvmGpuChannelHandle channelList[],
NvU32 channeListCount);

/*******************************************************************************
Cryptography Services Library (CSL) Interface
*/
@@ -1505,21 +1528,15 @@ NV_STATUS nvUvmInterfaceCslInitContext(UvmCslContext *uvmCslContext,
void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);

/*******************************************************************************
nvUvmInterfaceCslRotateKey
nvUvmInterfaceCslUpdateContext

Disables channels and rotates keys.
Updates contexts after a key rotation event and can only be called once per
key rotation event. Following a key rotation event, and before
nvUvmInterfaceCslUpdateContext is called, data encrypted by the GPU with the
previous key can be decrypted with nvUvmInterfaceCslDecrypt.

This function disables channels and rotates associated keys. The channels
associated with the given CSL contexts must be idled before this function is
called. To trigger key rotation all allocated channels for a given key must
be present in the list. If the function returns successfully then the CSL
contexts have been updated with the new key.

Locking: This function attempts to acquire the GPU lock. In case of failure
to acquire the return code is NV_ERR_STATE_IN_USE. The caller must
guarantee that no CSL function, including this one, is invoked
concurrently with the CSL contexts in contextList.
Memory : This function dynamically allocates memory.
Locking: This function acquires an API lock.
Memory : This function does not dynamically allocate memory.

Arguments:
contextList[IN/OUT] - An array of pointers to CSL contexts.
@@ -1527,12 +1544,8 @@ void nvUvmInterfaceDeinitCslContext(UvmCslContext *uvmCslContext);
must be greater than 0.
Error codes:
NV_ERR_INVALID_ARGUMENT - contextList is NULL or contextListCount is 0.
NV_ERR_STATE_IN_USE - Unable to acquire lock / resource. Caller
can retry at a later time.
NV_ERR_GENERIC - A failure other than _STATE_IN_USE occurred
when attempting to acquire a lock.
*/
NV_STATUS nvUvmInterfaceCslRotateKey(UvmCslContext *contextList[],
NV_STATUS nvUvmInterfaceCslUpdateContext(UvmCslContext *contextList[],
NvU32 contextListCount);

/*******************************************************************************
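The hunk above shows both the nvUvmInterfaceCslRotateKey and nvUvmInterfaceCslUpdateContext variants of this interface, which share the same argument list. A minimal caller sketch for the rotate-key form follows, assuming that variant of the header and the NV_ERR_STATE_IN_USE retry behaviour described in its comment; it is illustrative, not UVM's actual call site:

```c
/*
 * Hypothetical caller: ask CSL to rotate keys for the contexts of a set of
 * idle channels, retrying when the GPU lock could not be taken. Assumes
 * the nvUvmInterfaceCslRotateKey() variant of this header.
 */
static NV_STATUS example_rotate_keys(UvmCslContext *contextList[], NvU32 count)
{
    NV_STATUS status;

    do {
        status = nvUvmInterfaceCslRotateKey(contextList, count);
        /* NV_ERR_STATE_IN_USE means the lock was busy; per the comment
         * above, the caller may retry at a later time. */
    } while (status == NV_ERR_STATE_IN_USE);

    return status;
}
```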
@@ -1541,13 +1554,17 @@ NV_STATUS nvUvmInterfaceCslRotateKey(UvmCslContext *contextList[],
Rotates the IV for a given channel and operation.

This function will rotate the IV on both the CPU and the GPU.
For a given operation the channel must be idle before calling this function.
This function can be called regardless of the value of the IV's message counter.
Outstanding messages that have been encrypted by the GPU should first be
decrypted before calling this function with operation equal to
UVM_CSL_OPERATION_DECRYPT. Similarly, outstanding messages that have been
encrypted by the CPU should first be decrypted before calling this function
with operation equal to UVM_CSL_OPERATION_ENCRYPT. For a given operation
the channel must be idle before calling this function. This function can be
called regardless of the value of the IV's message counter.

Locking: This function attempts to acquire the GPU lock. In case of failure to
acquire the return code is NV_ERR_STATE_IN_USE. The caller must guarantee
that no CSL function, including this one, is invoked concurrently with
the same CSL context.
Locking: This function attempts to acquire the GPU lock.
In case of failure to acquire the return code
is NV_ERR_STATE_IN_USE.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1581,8 +1598,8 @@ NV_STATUS nvUvmInterfaceCslRotateIv(UvmCslContext *uvmCslContext,
However, it is optional. If it is NULL, the next IV in line will be used.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1618,14 +1635,9 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
maximized when the input and output buffers are 16-byte aligned. This is
natural alignment for AES block.

During a key rotation event the previous key is stored in the CSL context.
This allows data encrypted by the GPU to be decrypted with the previous key.
The keyRotationId parameter identifies which key is used. The first key rotation
ID has a value of 0 that increments by one for each key rotation event.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1635,8 +1647,6 @@ NV_STATUS nvUvmInterfaceCslEncrypt(UvmCslContext *uvmCslContext,
decryptIv[IN] - IV used to decrypt the ciphertext. Its value can either be given by
nvUvmInterfaceCslIncrementIv, or, if NULL, the CSL context's
internal counter is used.
keyRotationId[IN] - Specifies the key that is used for decryption.
A value of NV_U32_MAX specifies the current key.
inputBuffer[IN] - Address of ciphertext input buffer.
outputBuffer[OUT] - Address of plaintext output buffer.
addAuthData[IN] - Address of the plaintext additional authenticated data used to
@@ -1657,7 +1667,6 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
NvU32 bufferSize,
NvU8 const *inputBuffer,
UvmCslIv const *decryptIv,
NvU32 keyRotationId,
NvU8 *outputBuffer,
NvU8 const *addAuthData,
NvU32 addAuthDataSize,
@@ -1672,8 +1681,8 @@ NV_STATUS nvUvmInterfaceCslDecrypt(UvmCslContext *uvmCslContext,
undefined behavior.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1701,8 +1710,8 @@ NV_STATUS nvUvmInterfaceCslSign(UvmCslContext *uvmCslContext,

Locking: This function does not acquire an API or GPU lock.
Memory : This function does not dynamically allocate memory.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.

Arguments:
uvmCslContext[IN/OUT] - The CSL context.
@@ -1727,8 +1736,8 @@ NV_STATUS nvUvmInterfaceCslQueryMessagePool(UvmCslContext *uvmCslContext,
the returned IV can be used in nvUvmInterfaceCslDecrypt.

Locking: This function does not acquire an API or GPU lock.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.
Memory : This function does not dynamically allocate memory.

Arguments:
@@ -1750,13 +1759,13 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,
UvmCslIv *iv);

/*******************************************************************************
nvUvmInterfaceCslLogEncryption
nvUvmInterfaceCslLogExternalEncryption

Checks and logs information about encryptions associated with the given
CSL context.
Checks and logs information about non-CSL encryptions, such as those that
originate from the GPU.

For contexts associated with channels, this function does not modify elements of
the UvmCslContext, and must be called for every CPU/GPU encryption.
the UvmCslContext and must be called for each external encryption invocation.

For the context associated with fault buffers, bufferSize can encompass multiple
encryption invocations, and the UvmCslContext will be updated following a key
@@ -1766,25 +1775,19 @@ NV_STATUS nvUvmInterfaceCslIncrementIv(UvmCslContext *uvmCslContext,

Locking: This function does not acquire an API or GPU lock.
Memory : This function does not dynamically allocate memory.
The caller must guarantee that no CSL function, including this one,
is invoked concurrently with the same CSL context.
If called concurrently in different threads with the same UvmCslContext
the caller must guarantee exclusion.

Arguments:
uvmCslContext[IN/OUT] - The CSL context.
operation[IN] - If the CSL context is associated with a fault
buffer, this argument is ignored. If it is
associated with a channel, it must be either
- UVM_CSL_OPERATION_ENCRYPT
- UVM_CSL_OPERATION_DECRYPT
bufferSize[IN] - The size of the buffer(s) encrypted by the
bufferSize[OUT] - The size of the buffer(s) encrypted by the
external entity in units of bytes.

Error codes:
NV_ERR_INSUFFICIENT_RESOURCES - The encryption would cause a counter
NV_ERR_INSUFFICIENT_RESOURCES - The device encryption would cause a counter
to overflow.
*/
NV_STATUS nvUvmInterfaceCslLogEncryption(UvmCslContext *uvmCslContext,
UvmCslOperation operation,
NV_STATUS nvUvmInterfaceCslLogExternalEncryption(UvmCslContext *uvmCslContext,
NvU32 bufferSize);

#endif // _NV_UVM_INTERFACE_H_
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -39,12 +39,12 @@
// are multiple BIG page sizes in RM. These defines are used as flags to "0"
// should be OK when user is not sure which pagesize allocation it wants
//
#define UVM_PAGE_SIZE_DEFAULT 0x0
#define UVM_PAGE_SIZE_4K 0x1000
#define UVM_PAGE_SIZE_64K 0x10000
#define UVM_PAGE_SIZE_128K 0x20000
#define UVM_PAGE_SIZE_2M 0x200000
#define UVM_PAGE_SIZE_512M 0x20000000
#define UVM_PAGE_SIZE_DEFAULT 0x0ULL
#define UVM_PAGE_SIZE_4K 0x1000ULL
#define UVM_PAGE_SIZE_64K 0x10000ULL
#define UVM_PAGE_SIZE_128K 0x20000ULL
#define UVM_PAGE_SIZE_2M 0x200000ULL
#define UVM_PAGE_SIZE_512M 0x20000000ULL

//
// When modifying flags, make sure they are compatible with the mirrored
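The page-size constants appear in this hunk both with and without ULL suffixes. The suffix matters once a constant is scaled or shifted before landing in an NvU64, because an unsuffixed hex literal of this size is a 32-bit int. A generic illustration, independent of the driver headers:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_512M_ULL 0x20000000ULL   /* 64-bit constant, like the ULL defines */

int main(void)
{
    /* 16 x 512M = 8 GiB: fine here because the constant is already 64-bit.
     * With a plain 0x20000000 (a 32-bit int) the multiplication would be
     * performed in 32-bit arithmetic and overflow before being widened,
     * which is exactly the hazard the ULL suffixes guard against. */
    uint64_t total = PAGE_SIZE_512M_ULL * 16;

    printf("total = 0x%llx bytes\n", (unsigned long long)total);
    return 0;
}
```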
@@ -605,8 +605,6 @@ typedef struct UvmGpuConfComputeCaps_tag
{
// Out: GPU's confidential compute mode
UvmGpuConfComputeMode mode;
// Is key rotation enabled for UVM keys
NvBool bKeyRotationEnabled;
} UvmGpuConfComputeCaps;

#define UVM_GPU_NAME_LENGTH 0x40
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -494,6 +494,23 @@ do \
//
#define NV_TWO_N_MINUS_ONE(n) (((1ULL<<(n/2))<<((n+1)/2))-1)

//
// Create a 64b bitmask with n bits set
// This is the same as ((1ULL<<n) - 1), but it doesn't overflow for n=64
//
// ...
// n=-1, 0x0000000000000000
// n=0, 0x0000000000000000
// n=1, 0x0000000000000001
// ...
// n=63, 0x7FFFFFFFFFFFFFFF
// n=64, 0xFFFFFFFFFFFFFFFF
// n=65, 0xFFFFFFFFFFFFFFFF
// n=66, 0xFFFFFFFFFFFFFFFF
// ...
//
#define NV_BITMASK64(n) ((n<1) ? 0ULL : (NV_U64_MAX>>((n>64) ? 0 : (64-n))))

#define DRF_READ_1WORD_BS(d,r,f,v) \
((DRF_EXTENT_MW(NV##d##r##f)<8)?DRF_READ_1BYTE_BS(NV##d##r##f,(v)): \
((DRF_EXTENT_MW(NV##d##r##f)<16)?DRF_READ_2BYTE_BS(NV##d##r##f,(v)): \
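The table in the NV_BITMASK64() comment can be checked directly. A small standalone program, with a stand-in for NV_U64_MAX so the macro compiles outside the tree:

```c
#include <stdio.h>

/* Stand-in for the driver's constant: the all-ones 64-bit value. */
#define NV_U64_MAX 0xFFFFFFFFFFFFFFFFULL
#define NV_BITMASK64(n) ((n<1) ? 0ULL : (NV_U64_MAX>>((n>64) ? 0 : (64-n))))

int main(void)
{
    /* Matches the table in the comment: clamped to 0 below n=1 and to
     * all-ones at and above n=64. */
    printf("n=0  -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(0));
    printf("n=1  -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(1));
    printf("n=63 -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(63));
    printf("n=64 -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(64));
    printf("n=65 -> 0x%016llx\n", (unsigned long long)NV_BITMASK64(65));
    return 0;
}
```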
@@ -574,6 +591,13 @@ nvMaskPos32(const NvU32 mask, const NvU32 bitIdx)
n32 = BIT_IDX_32(LOWESTBIT(n32));\
}

// Destructive operation on n64
#define LOWESTBITIDX_64(n64) \
{ \
n64 = BIT_IDX_64(LOWESTBIT(n64));\
}

// Destructive operation on n32
#define HIGHESTBITIDX_32(n32) \
{ \
@@ -918,6 +942,11 @@ static NV_FORCEINLINE void *NV_NVUPTR_TO_PTR(NvUPtr address)
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_SET_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) |= NVBIT64(b); else (hi) |= NVBIT64( b & 0x3F ); }
//
// Clear the bit at pos (b) for U64 which is < 128.
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_CLEAR_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) &= ~NVBIT64(b); else (hi) &= ~NVBIT64( b & 0x3F ); }

// Get the number of elements the specified fixed-size array
#define NV_ARRAY_ELEMENTS(x) ((sizeof(x)/sizeof((x)[0])))
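NV_BIT_SET_128()/NV_BIT_CLEAR_128() treat a 128-bit value as a (lo, hi) pair of 64-bit words and pick the half from the bit position. A standalone sketch with stand-in nvAssert/NVBIT64 definitions (the real ones come from this header and may differ in detail):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins so the macros above can be exercised in isolation. */
#define nvAssert(x)   assert(x)
#define NVBIT64(b)    (1ULL << (b))
#define NV_BIT_SET_128(b, lo, hi)   { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) |= NVBIT64(b); else (hi) |= NVBIT64( b & 0x3F ); }
#define NV_BIT_CLEAR_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) &= ~NVBIT64(b); else (hi) &= ~NVBIT64( b & 0x3F ); }

int main(void)
{
    uint64_t lo = 0, hi = 0;

    NV_BIT_SET_128(5, lo, hi);     /* lands in lo: bit 5            */
    NV_BIT_SET_128(70, lo, hi);    /* lands in hi: bit 70 - 64 = 6  */
    printf("lo=0x%llx hi=0x%llx\n", (unsigned long long)lo, (unsigned long long)hi);

    NV_BIT_CLEAR_128(70, lo, hi);  /* clears bit 6 of hi again      */
    printf("lo=0x%llx hi=0x%llx\n", (unsigned long long)lo, (unsigned long long)hi);
    return 0;
}
```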
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@@ -152,6 +152,7 @@ NV_STATUS_CODE(NV_ERR_FABRIC_MANAGER_NOT_PRESENT, 0x0000007A, "Fabric Manag
NV_STATUS_CODE(NV_ERR_ALREADY_SIGNALLED, 0x0000007B, "Semaphore Surface value already >= requested wait value")
NV_STATUS_CODE(NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE, 0x0000007C, "PMU RPC error due to no queue slot available for this event")
NV_STATUS_CODE(NV_ERR_KEY_ROTATION_IN_PROGRESS, 0x0000007D, "Operation not allowed as key rotation is in progress")
NV_STATUS_CODE(NV_ERR_TEST_ONLY_CODE_NOT_ENABLED, 0x0000007E, "Test-only code path not enabled")

// Warnings:
NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch")
@@ -152,6 +152,12 @@ typedef signed short NvS16; /* -32768 to 32767 */
(((NvU32)(c) & 0xff) << 8) | \
(((NvU32)(d) & 0xff))))

// Macro to build an NvU64 from two DWORDS, listed from msb to lsb
#define NvU64_BUILD(a, b) \
((NvU64)( \
(((NvU64)(a) & ~0U) << 32) | \
(((NvU64)(b) & ~0U))))

#if NVTYPES_USE_STDINT
typedef uint32_t NvV32; /* "void": enumerated or multiple fields */
typedef uint32_t NvU32; /* 0 to 4294967295 */
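NvU64_BUILD() packs two 32-bit words, most significant first. A quick standalone check, assuming NvU64/NvU32 are ordinary 64/32-bit unsigned integers as the surrounding typedefs suggest:

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the nvtypes typedefs so the macro can be tested standalone. */
typedef uint64_t NvU64;
typedef uint32_t NvU32;

#define NvU64_BUILD(a, b) \
    ((NvU64)( \
    (((NvU64)(a) & ~0U) << 32) | \
    (((NvU64)(b) & ~0U))))

int main(void)
{
    NvU32 hi = 0xDEADBEEFu, lo = 0x01234567u;
    NvU64 v  = NvU64_BUILD(hi, lo);

    printf("0x%016llx\n", (unsigned long long)v);  /* 0xdeadbeef01234567 */
    return 0;
}
```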
@@ -101,16 +101,17 @@ NV_STATUS NV_API_CALL rm_gpu_ops_paging_channels_map(nvidia_stack_t *, nvgpuAdd
void NV_API_CALL rm_gpu_ops_paging_channels_unmap(nvidia_stack_t *, nvgpuAddressSpaceHandle_t, NvU64, nvgpuDeviceHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_paging_channel_push_stream(nvidia_stack_t *, nvgpuPagingChannelHandle_t, char *, NvU32);

NV_STATUS NV_API_CALL rm_gpu_ops_key_rotation_channel_disable(nvidia_stack_t *, nvgpuChannelHandle_t [], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_init(nvidia_stack_t *, struct ccslContext_t **, nvgpuChannelHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_clear(nvidia_stack_t *, struct ccslContext_t *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_rotate_key(nvidia_stack_t *, UvmCslContext *[], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_context_update(nvidia_stack_t *, UvmCslContext *[], NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_rotate_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_encrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_encrypt_with_iv(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8*, NvU8 *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU32, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_decrypt(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 const *, NvU8 *, NvU8 const *, NvU32, NvU8 const *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_sign(nvidia_stack_t *, struct ccslContext_t *, NvU32, NvU8 const *, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_query_message_pool(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_increment_iv(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU64, NvU8 *);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_log_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU8, NvU32);
NV_STATUS NV_API_CALL rm_gpu_ops_ccsl_log_device_encryption(nvidia_stack_t *, struct ccslContext_t *, NvU32);

#endif
@@ -5252,23 +5252,25 @@ compile_test() {
compile_check_conftest "$CODE" "NV_PCI_CLASS_MULTIMEDIA_HD_AUDIO_PRESENT" "" "generic"
;;

follow_pfn)
unsafe_follow_pfn)
#
# Determine if follow_pfn() is present.
# Determine if unsafe_follow_pfn() is present.
#
# follow_pfn() was added by commit 3b6748e2dd69
# ("mm: introduce follow_pfn()") in v2.6.31-rc1, and removed
# by commit 233eb0bf3b94 ("mm: remove follow_pfn")
# from linux-next 233eb0bf3b94.
# unsafe_follow_pfn() was added by commit 69bacee7f9ad
# ("mm: Add unsafe_follow_pfn") in v5.13-rc1.
#
# Note: this commit never made it to the linux kernel, so
# unsafe_follow_pfn() never existed.
#
CODE="
#include <linux/mm.h>
void conftest_follow_pfn(void) {
follow_pfn();
void conftest_unsafe_follow_pfn(void) {
unsafe_follow_pfn();
}"

compile_check_conftest "$CODE" "NV_FOLLOW_PFN_PRESENT" "" "functions"
compile_check_conftest "$CODE" "NV_UNSAFE_FOLLOW_PFN_PRESENT" "" "functions"
;;

drm_plane_atomic_check_has_atomic_state_arg)
#
# Determine if drm_plane_helper_funcs::atomic_check takes 'state'
@@ -5554,7 +5556,8 @@ compile_test() {

of_dma_configure)
#
# Determine if of_dma_configure() function is present
# Determine if of_dma_configure() function is present, and how
# many arguments it takes.
#
# Added by commit 591c1ee465ce ("of: configure the platform
# device dma parameters") in v3.16. However, it was a static,
@@ -5564,17 +5567,69 @@ compile_test() {
# commit 1f5c69aa51f9 ("of: Move of_dma_configure() to device.c
# to help re-use") in v4.1.
#
CODE="
# It subsequently began taking a third parameter with commit
# 3d6ce86ee794 ("drivers: remove force dma flag from buses")
# in v4.18.
#

echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif

void conftest_of_dma_configure(void)
{
of_dma_configure();
}
"
" > conftest$$.c

compile_check_conftest "$CODE" "NV_OF_DMA_CONFIGURE_PRESENT" "" "functions"
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c

if [ -f conftest$$.o ]; then
rm -f conftest$$.o

echo "#undef NV_OF_DMA_CONFIGURE_PRESENT" | append_conftest "functions"
echo "#undef NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT" | append_conftest "functions"
else
echo "#define NV_OF_DMA_CONFIGURE_PRESENT" | append_conftest "functions"

echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif

void conftest_of_dma_configure(void) {
of_dma_configure(NULL, NULL, false);
}" > conftest$$.c

$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c

if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#define NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT 3" | append_conftest "functions"
return
fi

echo "$CONFTEST_PREAMBLE
#if defined(NV_LINUX_OF_DEVICE_H_PRESENT)
#include <linux/of_device.h>
#endif

void conftest_of_dma_configure(void) {
of_dma_configure(NULL, NULL);
}" > conftest$$.c

$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c

if [ -f conftest$$.o ]; then
rm -f conftest$$.o
echo "#define NV_OF_DMA_CONFIGURE_ARGUMENT_COUNT 2" | append_conftest "functions"
return
fi
fi
;;

icc_get)
@@ -6795,12 +6850,45 @@ compile_test() {
compile_check_conftest "$CODE" "NV_DRM_MODE_CREATE_DP_COLORSPACE_PROPERTY_HAS_SUPPORTED_COLORSPACES_ARG" "" "types"
;;

drm_syncobj_features_present)
# Determine if DRIVER_SYNCOBJ and DRIVER_SYNCOBJ_TIMELINE DRM
# driver features are present. Timeline DRM synchronization objects
# may only be used if both of these are supported by the driver.
#
# DRIVER_SYNCOBJ_TIMELINE Added by commit 060cebb20cdb ("drm:
# introduce a capability flag for syncobj timeline support") in
# v5.2
#
# DRIVER_SYNCOBJ Added by commit e9083420bbac ("drm: introduce
# sync objects (v4)") in v4.12
CODE="
#if defined(NV_DRM_DRM_DRV_H_PRESENT)
#include <drm/drm_drv.h>
#endif
int features = DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE;"

compile_check_conftest "$CODE" "NV_DRM_SYNCOBJ_FEATURES_PRESENT" "" "types"
;;

stack_trace)
# Determine if functions stack_trace_{save,print} are present.
# Added by commit e9b98e162 ("stacktrace: Provide helpers for
# common stack trace operations") in v5.2.
CODE="
#include <linux/stacktrace.h>
void conftest_stack_trace(void) {
stack_trace_save();
stack_trace_print();
}"

compile_check_conftest "$CODE" "NV_STACK_TRACE_PRESENT" "" "functions"
;;

drm_unlocked_ioctl_flag_present)
# Determine if DRM_UNLOCKED IOCTL flag is present.
#
# DRM_UNLOCKED was removed by commit 2798ffcc1d6a ("drm: Remove
# locking for legacy ioctls and DRM_UNLOCKED") in Linux
# next-20231208.
# locking for legacy ioctls and DRM_UNLOCKED") in v6.8.
#
# DRM_UNLOCKED definition was moved from drmP.h to drm_ioctl.h by
# commit 2640981f3600 ("drm: document drm_ioctl.[hc]") in v4.12.
@@ -52,6 +52,7 @@ NV_HEADER_PRESENCE_TESTS = \
linux/dma-resv.h \
soc/tegra/chip-id.h \
soc/tegra/fuse.h \
soc/tegra/fuse-helper.h \
soc/tegra/tegra_bpmp.h \
video/nv_internal.h \
linux/platform/tegra/dce/dce-client-ipc.h \
@@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if (i == (attempts - 1))
if ((i == (attempts - 1)))
break;

// Get the NUMA node where the first page of the stack is resident. If
@@ -176,12 +176,10 @@ cursor_plane_req_config_update(struct drm_plane *plane,
return;
}

*req_config = (struct NvKmsKapiCursorRequestedConfig) {
.surface = to_nv_framebuffer(plane_state->fb)->pSurface,

.dstX = plane_state->crtc_x,
.dstY = plane_state->crtc_y,
};
memset(req_config, 0, sizeof(*req_config));
req_config->surface = to_nv_framebuffer(plane_state->fb)->pSurface;
req_config->dstX = plane_state->crtc_x;
req_config->dstY = plane_state->crtc_y;

#if defined(NV_DRM_ALPHA_BLENDING_AVAILABLE)
if (plane->blend_mode_property != NULL && plane->alpha_property != NULL) {
@@ -275,24 +273,22 @@ plane_req_config_update(struct drm_plane *plane,
return 0;
}

*req_config = (struct NvKmsKapiLayerRequestedConfig) {
.config = {
.surface = to_nv_framebuffer(plane_state->fb)->pSurface,
memset(req_config, 0, sizeof(*req_config));

req_config->config.surface = to_nv_framebuffer(plane_state->fb)->pSurface;

/* Source values are 16.16 fixed point */
.srcX = plane_state->src_x >> 16,
.srcY = plane_state->src_y >> 16,
.srcWidth = plane_state->src_w >> 16,
.srcHeight = plane_state->src_h >> 16,
req_config->config.srcX = plane_state->src_x >> 16;
req_config->config.srcY = plane_state->src_y >> 16;
req_config->config.srcWidth = plane_state->src_w >> 16;
req_config->config.srcHeight = plane_state->src_h >> 16;

.dstX = plane_state->crtc_x,
.dstY = plane_state->crtc_y,
.dstWidth = plane_state->crtc_w,
.dstHeight = plane_state->crtc_h,
req_config->config.dstX = plane_state->crtc_x;
req_config->config.dstY = plane_state->crtc_y;
req_config->config.dstWidth = plane_state->crtc_w;
req_config->config.dstHeight = plane_state->crtc_h;

.csc = old_config.csc
},
};
req_config->config.csc = old_config.csc;

#if defined(NV_DRM_ROTATION_AVAILABLE)
/*
@@ -688,9 +684,7 @@ static int nv_drm_plane_atomic_set_property(
to_nv_drm_plane_state(state);

if (property == nv_dev->nv_out_fence_property) {
#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
nv_drm_plane_state->fd_user_ptr = u64_to_user_ptr(val);
#endif
nv_drm_plane_state->fd_user_ptr = (void __user *)(uintptr_t)(val);
return 0;
} else if (property == nv_dev->nv_input_colorspace_property) {
nv_drm_plane_state->input_colorspace = val;
@@ -875,14 +869,12 @@ static inline void nv_drm_crtc_duplicate_req_head_modeset_config(
* there is no change in new configuration yet with respect
* to older one!
*/
*new = (struct NvKmsKapiHeadRequestedConfig) {
.modeSetConfig = old->modeSetConfig,
};
memset(new, 0, sizeof(*new));
new->modeSetConfig = old->modeSetConfig;

for (i = 0; i < ARRAY_SIZE(old->layerRequestedConfig); i++) {
new->layerRequestedConfig[i] = (struct NvKmsKapiLayerRequestedConfig) {
.config = old->layerRequestedConfig[i].config,
};
new->layerRequestedConfig[i].config =
old->layerRequestedConfig[i].config;
}
}
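Several hunks above rewrite designated-initializer compound-literal assignments as an explicit memset() followed by per-field assignments. Both forms leave the unnamed members zeroed; the change is structural. A generic illustration of the two equivalent styles (not driver code):

```c
#include <string.h>

/* A stand-in struct to illustrate the two initialization styles. */
struct example_config {
    int x;
    int y;
    int flags;
};

void init_with_compound_literal(struct example_config *cfg, int x, int y)
{
    /* Whole-struct assignment: unnamed members (flags) are zeroed implicitly. */
    *cfg = (struct example_config) {
        .x = x,
        .y = y,
    };
}

void init_with_memset(struct example_config *cfg, int x, int y)
{
    /* Same effect, written as an explicit clear plus field assignments. */
    memset(cfg, 0, sizeof(*cfg));
    cfg->x = x;
    cfg->y = y;
}
```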
@@ -373,18 +373,14 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
len++;
}

#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
if (!nv_dev->supportsSyncpts) {
return 0;
}

if (nv_dev->supportsSyncpts) {
nv_dev->nv_out_fence_property =
drm_property_create_range(nv_dev->dev, DRM_MODE_PROP_ATOMIC,
"NV_DRM_OUT_FENCE_PTR", 0, U64_MAX);
if (nv_dev->nv_out_fence_property == NULL) {
return -ENOMEM;
}
#endif
}

nv_dev->nv_input_colorspace_property =
drm_property_create_enum(nv_dev->dev, 0, "NV_INPUT_COLORSPACE",
@@ -480,6 +476,22 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags)
return -ENODEV;
}

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
/*
* If fbdev is enabled, take modeset ownership now before other DRM clients
* can take master (and thus NVKMS ownership).
*/
if (nv_drm_fbdev_module_param) {
if (!nvKms->grabOwnership(pDevice)) {
nvKms->freeDevice(pDevice);
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
return -EBUSY;
}

nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif

mutex_lock(&nv_dev->lock);

/* Set NvKmsKapiDevice */
@@ -590,6 +602,15 @@ static void __nv_drm_unload(struct drm_device *dev)
return;
}

/* Release modeset ownership if fbdev is enabled */

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (nv_dev->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(nv_dev->pDevice);
}
#endif

cancel_delayed_work_sync(&nv_dev->hotplug_event_work);
mutex_lock(&nv_dev->lock);

@@ -781,6 +802,14 @@ static int nv_drm_get_dev_info_ioctl(struct drm_device *dev,
return 0;
}

static int nv_drm_get_drm_file_unique_id_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
struct drm_nvidia_get_drm_file_unique_id_params *params = data;
params->id = (u64)(filep->driver_priv);
return 0;
}

static int nv_drm_dmabuf_supported_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
@@ -1279,6 +1308,17 @@ static void nv_drm_postclose(struct drm_device *dev, struct drm_file *filep)
}
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */

static int nv_drm_open(struct drm_device *dev, struct drm_file *filep)
{
_Static_assert(sizeof(filep->driver_priv) >= sizeof(u64),
"filep->driver_priv can not hold an u64");
static atomic64_t id = ATOMIC_INIT(0);

filep->driver_priv = (void *)atomic64_inc_return(&id);

return 0;
}

#if defined(NV_DRM_MASTER_HAS_LEASES)
static struct drm_master *nv_drm_find_lessee(struct drm_master *master,
int lessee_id)
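The new nv_drm_open() above tags each DRM file with a monotonically increasing 64-bit id taken from atomic64_inc_return(), which the NVIDIA_GET_DRM_FILE_UNIQUE_ID ioctl then reports. A stripped-down user-space analogue of that counter pattern, using C11 atomics; hypothetical, purely to show why concurrent opens can never receive the same id:

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Process-wide counter whose increment-and-read is a single atomic step,
 * mirroring the atomic64_inc_return() use in nv_drm_open(). */
static _Atomic uint64_t next_file_id = 0;

static uint64_t example_alloc_file_id(void)
{
    /* fetch_add returns the previous value, so add 1 to mirror inc_return. */
    return atomic_fetch_add(&next_file_id, 1) + 1;
}

int main(void)
{
    printf("%llu\n", (unsigned long long)example_alloc_file_id()); /* 1 */
    printf("%llu\n", (unsigned long long)example_alloc_file_id()); /* 2 */
    return 0;
}
```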
@@ -1522,6 +1562,9 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DEV_INFO,
nv_drm_get_dev_info_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DRM_FILE_UNIQUE_ID,
nv_drm_get_drm_file_unique_id_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),

#if defined(NV_DRM_FENCE_AVAILABLE)
DRM_IOCTL_DEF_DRV(NVIDIA_FENCE_SUPPORTED,
@@ -1604,6 +1647,9 @@ static struct drm_driver nv_drm_driver = {
.driver_features =
#if defined(NV_DRM_DRIVER_PRIME_FLAG_PRESENT)
DRIVER_PRIME |
#endif
#if defined(NV_DRM_SYNCOBJ_FEATURES_PRESENT)
DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE |
#endif
DRIVER_GEM | DRIVER_RENDER,

@@ -1615,14 +1661,14 @@ static struct drm_driver nv_drm_driver = {
.num_ioctls = ARRAY_SIZE(nv_drm_ioctls),

/*
* linux-next commit 71a7974ac701 ("drm/prime: Unexport helpers for fd/handle
* conversion") unexports drm_gem_prime_handle_to_fd() and
* Linux kernel v6.6 commit 71a7974ac701 ("drm/prime: Unexport helpers
* for fd/handle conversion") unexports drm_gem_prime_handle_to_fd() and
* drm_gem_prime_fd_to_handle().
*
* Prior linux-next commit 6b85aa68d9d5 ("drm: Enable PRIME import/export for
* all drivers") made these helpers the default when .prime_handle_to_fd /
* .prime_fd_to_handle are unspecified, so it's fine to just skip specifying
* them if the helpers aren't present.
* Prior Linux kernel v6.6 commit 6b85aa68d9d5 ("drm: Enable PRIME
* import/export for all drivers") made these helpers the default when
* .prime_handle_to_fd / .prime_fd_to_handle are unspecified, so it's fine
* to just skip specifying them if the helpers aren't present.
*/
#if NV_IS_EXPORT_SYMBOL_PRESENT_drm_gem_prime_handle_to_fd
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
@@ -1656,6 +1702,7 @@ static struct drm_driver nv_drm_driver = {
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
.postclose = nv_drm_postclose,
#endif
.open = nv_drm_open,

.fops = &nv_drm_fops,

@@ -1714,6 +1761,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
struct nv_drm_device *nv_dev = NULL;
struct drm_device *dev = NULL;
struct device *device = gpu_info->os_device_ptr;
bool bus_is_pci;

DRM_DEBUG(
"Registering device for NVIDIA GPU ID 0x08%x",
@@ -1747,7 +1795,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
dev->dev_private = nv_dev;
nv_dev->dev = dev;

bool bus_is_pci =
bus_is_pci =
#if defined(NV_LINUX)
device->bus == &pci_bus_type;
#elif defined(NV_BSD)
@@ -1771,11 +1819,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
if (nv_drm_fbdev_module_param &&
drm_core_check_feature(dev, DRIVER_MODESET)) {

if (!nvKms->grabOwnership(nv_dev->pDevice)) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Failed to grab NVKMS modeset ownership");
goto failed_grab_ownership;
}

if (bus_is_pci) {
struct pci_dev *pdev = to_pci_dev(device);

@@ -1786,8 +1829,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)
#endif
}
drm_fbdev_generic_setup(dev, 32);

nv_dev->hasFramebufferConsole = NV_TRUE;
}
#endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */

@@ -1798,12 +1839,6 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info)

return; /* Success */

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
failed_grab_ownership:

drm_dev_unregister(dev);
#endif

failed_drm_register:

nv_drm_dev_free(dev);
@@ -1870,12 +1905,6 @@ void nv_drm_remove_devices(void)
struct nv_drm_device *next = dev_list->next;
struct drm_device *dev = dev_list->dev;

#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE)
if (dev_list->hasFramebufferConsole) {
drm_atomic_helper_shutdown(dev);
nvKms->releaseOwnership(dev_list->pDevice);
}
#endif
drm_dev_unregister(dev);
nv_drm_dev_free(dev);
@@ -293,14 +293,12 @@ __nv_drm_prime_fence_context_new(
* to check a return value.
*/

*nv_prime_fence_context = (struct nv_drm_prime_fence_context) {
.base.ops = &nv_drm_prime_fence_context_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pLinearAddress = pLinearAddress,
};
nv_prime_fence_context->base.ops = &nv_drm_prime_fence_context_ops;
nv_prime_fence_context->base.nv_dev = nv_dev;
nv_prime_fence_context->base.context = nv_dma_fence_context_alloc(1);
nv_prime_fence_context->base.fenceSemIndex = p->index;
nv_prime_fence_context->pSemSurface = pSemSurface;
nv_prime_fence_context->pLinearAddress = pLinearAddress;

INIT_LIST_HEAD(&nv_prime_fence_context->pending);

@@ -1261,18 +1259,16 @@ __nv_drm_semsurf_fence_ctx_new(
* to check a return value.
*/

*ctx = (struct nv_drm_semsurf_fence_ctx) {
.base.ops = &nv_drm_semsurf_fence_ctx_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.base.fenceSemIndex = p->index,
.pSemSurface = pSemSurface,
.pSemMapping.pVoid = semMapping,
.pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping,
.callback.local = NULL,
.callback.nvKms = NULL,
.current_wait_value = 0,
};
ctx->base.ops = &nv_drm_semsurf_fence_ctx_ops;
ctx->base.nv_dev = nv_dev;
ctx->base.context = nv_dma_fence_context_alloc(1);
ctx->base.fenceSemIndex = p->index;
ctx->pSemSurface = pSemSurface;
ctx->pSemMapping.pVoid = semMapping;
ctx->pMaxSubmittedMapping = (volatile NvU64 *)maxSubmittedMapping;
ctx->callback.local = NULL;
ctx->callback.nvKms = NULL;
ctx->current_wait_value = 0;

spin_lock_init(&ctx->lock);
INIT_LIST_HEAD(&ctx->pending_fences);
@ -551,14 +551,12 @@ static struct drm_gem_object *__nv_drm_gem_nvkms_prime_dup(
|
||||
{
|
||||
struct nv_drm_device *nv_dev = to_nv_device(dev);
|
||||
const struct nv_drm_device *nv_dev_src;
|
||||
const struct nv_drm_gem_nvkms_memory *nv_nvkms_memory_src;
|
||||
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory;
|
||||
struct NvKmsKapiMemory *pMemory;
|
||||
|
||||
BUG_ON(nv_gem_src == NULL || nv_gem_src->ops != &nv_gem_nvkms_memory_ops);
|
||||
|
||||
nv_dev_src = to_nv_device(nv_gem_src->base.dev);
|
||||
nv_nvkms_memory_src = to_nv_nvkms_memory_const(nv_gem_src);
|
||||
|
||||
if ((nv_nvkms_memory =
|
||||
nv_drm_calloc(1, sizeof(*nv_nvkms_memory))) == NULL) {
|
||||
|
@ -45,8 +45,7 @@
|
||||
|
||||
/*
|
||||
* The inclusion of drm_framebuffer.h was removed from drm_crtc.h by commit
|
||||
* 720cf96d8fecde29b72e1101f8a567a0ce99594f ("drm: Drop drm_framebuffer.h from
|
||||
* drm_crtc.h") in linux-next, expected in v5.19-rc7.
|
||||
* 720cf96d8fec ("drm: Drop drm_framebuffer.h from drm_crtc.h") in v6.0.
|
||||
*
|
||||
* We only need drm_framebuffer.h for drm_framebuffer_put(), and it is always
|
||||
* present (v4.9+) when drm_framebuffer_{put,get}() is present (v4.12+), so it
|
||||
|
@ -613,8 +613,8 @@ static inline int nv_drm_format_num_planes(uint32_t format)
|
||||
#endif /* defined(NV_DRM_FORMAT_MODIFIERS_PRESENT) */
|
||||
|
||||
/*
|
||||
* DRM_UNLOCKED was removed with linux-next commit 2798ffcc1d6a ("drm: Remove
|
||||
* locking for legacy ioctls and DRM_UNLOCKED"), but it was previously made
|
||||
* DRM_UNLOCKED was removed with commit 2798ffcc1d6a ("drm: Remove locking for
|
||||
* legacy ioctls and DRM_UNLOCKED") in v6.8, but it was previously made
|
||||
* implicit for all non-legacy DRM driver IOCTLs since Linux v4.10 commit
|
||||
* fa5386459f06 "drm: Used DRM_LEGACY for all legacy functions" (Linux v4.4
|
||||
* commit ea487835e887 "drm: Enforce unlocked ioctl operation for kms driver
|
||||
|
@ -52,6 +52,7 @@
|
||||
#define DRM_NVIDIA_SEMSURF_FENCE_CREATE 0x15
|
||||
#define DRM_NVIDIA_SEMSURF_FENCE_WAIT 0x16
|
||||
#define DRM_NVIDIA_SEMSURF_FENCE_ATTACH 0x17
|
||||
#define DRM_NVIDIA_GET_DRM_FILE_UNIQUE_ID 0x18
|
||||
|
||||
#define DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY \
|
||||
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_IMPORT_NVKMS_MEMORY), \
|
||||
@ -157,6 +158,11 @@
|
||||
DRM_NVIDIA_SEMSURF_FENCE_ATTACH), \
|
||||
struct drm_nvidia_semsurf_fence_attach_params)
|
||||
|
||||
#define DRM_IOCTL_NVIDIA_GET_DRM_FILE_UNIQUE_ID \
|
||||
DRM_IOWR((DRM_COMMAND_BASE + \
|
||||
DRM_NVIDIA_GET_DRM_FILE_UNIQUE_ID), \
|
||||
struct drm_nvidia_get_drm_file_unique_id_params)
|
||||
|
||||
struct drm_nvidia_gem_import_nvkms_memory_params {
|
||||
uint64_t mem_size; /* IN */
|
||||
|
||||
@ -385,4 +391,8 @@ struct drm_nvidia_semsurf_fence_attach_params {
|
||||
uint64_t wait_value; /* IN Semaphore value to reach before signal */
|
||||
};
|
||||
|
||||
struct drm_nvidia_get_drm_file_unique_id_params {
|
||||
uint64_t id; /* OUT Unique ID of the DRM file */
|
||||
};
|
||||
|
||||
#endif /* _UAPI_NVIDIA_DRM_IOCTL_H_ */
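/*
 * A minimal user-space sketch (not part of the diff) of how the new
 * GET_DRM_FILE_UNIQUE_ID ioctl added above could be called. The ioctl macro
 * and parameter struct come from the header changes; the header path, the
 * device node, and the error handling are assumptions for illustration only.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "nvidia-drm-ioctl.h" /* assumed include name for the UAPI header above */

static int query_drm_file_unique_id(const char *node)
{
    struct drm_nvidia_get_drm_file_unique_id_params params = { 0 };
    int fd = open(node, O_RDWR);

    if (fd < 0)
        return -1;

    /* The id field is an OUT parameter filled in by the driver. */
    if (ioctl(fd, DRM_IOCTL_NVIDIA_GET_DRM_FILE_UNIQUE_ID, &params) != 0) {
        close(fd);
        return -1;
    }

    printf("DRM file unique ID: %llu\n", (unsigned long long)params.id);
    close(fd);
    return 0;
}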

@ -587,6 +587,9 @@ int nv_drm_atomic_commit(struct drm_device *dev,
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Flip event timeout on head %u", nv_crtc->head);
while (!list_empty(&nv_crtc->flip_list)) {
__nv_drm_handle_flip_event(nv_crtc);
}
}
}
}

@ -128,4 +128,5 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_driver_has_dumb_destroy
NV_CONFTEST_TYPE_COMPILE_TESTS += fence_ops_use_64bit_seqno
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers_has_driver_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_syncobj_features_present
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present

@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if (i == (attempts - 1))
if ((i == (attempts - 1)))
break;

// Get the NUMA node where the first page of the stack is resident. If

@ -77,10 +77,10 @@ module_param_named(disable_hdmi_frl, disable_hdmi_frl, bool, 0400);
static bool disable_vrr_memclk_switch = false;
module_param_named(disable_vrr_memclk_switch, disable_vrr_memclk_switch, bool, 0400);

static bool hdmi_deepcolor = false;
static bool hdmi_deepcolor = true;
module_param_named(hdmi_deepcolor, hdmi_deepcolor, bool, 0400);

static bool vblank_sem_control = false;
static bool vblank_sem_control = true;
module_param_named(vblank_sem_control, vblank_sem_control, bool, 0400);

static bool opportunistic_display_sync = true;
@ -139,6 +139,20 @@ NvBool nvkms_opportunistic_display_sync(void)
return opportunistic_display_sync;
}

NvBool nvkms_kernel_supports_syncpts(void)
{
/*
* Note this only checks that the kernel has the prerequisite
* support for syncpts; callers must also check that the hardware
* supports syncpts.
*/
#if (defined(CONFIG_TEGRA_GRHOST) || defined(NV_LINUX_HOST1X_NEXT_H_PRESENT))
return NV_TRUE;
#else
return NV_FALSE;
#endif
}
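/*
 * A minimal sketch of how a caller might combine the kernel-support check
 * above with the hardware check the comment asks for. The device handle and
 * device_hw_supports_syncpts() helper are hypothetical and stand in for
 * whatever per-device capability query the caller already has.
 */
static NvBool can_use_syncpts(void *device /* hypothetical device handle */)
{
    if (!nvkms_kernel_supports_syncpts())
        return NV_FALSE;

    /* Hypothetical per-device capability query; not part of this change. */
    return device_hw_supports_syncpts(device);
}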

#define NVKMS_SYNCPT_STUBS_NEEDED

/*************************************************************************
@ -1234,6 +1248,26 @@ void nvkms_close_from_kapi(struct nvkms_per_open *popen)
nvkms_close_pm_unlocked(popen);
}

NvBool nvkms_ioctl_from_kapi_try_pmlock
(
struct nvkms_per_open *popen,
NvU32 cmd, void *params_address, const size_t param_size
)
{
NvBool ret;

if (nvkms_read_trylock_pm_lock()) {
return NV_FALSE;
}

ret = nvkms_ioctl_common(popen,
cmd,
(NvU64)(NvUPtr)params_address, param_size) == 0;
nvkms_read_unlock_pm_lock();

return ret;
}

NvBool nvkms_ioctl_from_kapi
(
struct nvkms_per_open *popen,

@ -304,6 +304,11 @@ NvU32 nvkms_enumerate_gpus(nv_gpu_info_t *gpu_info);

NvBool nvkms_allow_write_combining(void);

/*!
* Check if OS supports syncpoints.
*/
NvBool nvkms_kernel_supports_syncpts(void);

/*!
* Checks whether the fd is associated with an nvidia character device.
*/
@ -328,6 +333,16 @@ NvBool nvkms_ioctl_from_kapi
NvU32 cmd, void *params_address, const size_t params_size
);

/*!
* Like nvkms_ioctl_from_kapi, but return NV_FALSE instead of waiting if the
* power management read lock cannot be acquired.
*/
NvBool nvkms_ioctl_from_kapi_try_pmlock
(
struct nvkms_per_open *popen,
NvU32 cmd, void *params_address, const size_t params_size
);
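/*
 * A minimal usage sketch, assuming a caller that must not block on the power
 * management read lock: when the try-variant returns NV_FALSE the request is
 * deferred instead of waiting. The request struct, its fields, and
 * defer_request() are hypothetical and only illustrate the intended pattern.
 */
struct my_request; /* hypothetical */

static void submit_or_defer(struct nvkms_per_open *popen,
                            NvU32 cmd, void *params, size_t params_size,
                            struct my_request *req)
{
    if (!nvkms_ioctl_from_kapi_try_pmlock(popen, cmd, params, params_size)) {
        /* PM read lock was not available; retry later rather than sleep. */
        defer_request(req); /* hypothetical deferral helper */
    }
}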

/*!
* APIs for locking.
*/

@ -105,3 +105,4 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_real_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += acpi_video_backlight_use_native
NV_CONFTEST_FUNCTION_COMPILE_TESTS += kernel_read_has_pointer_pos_arg

@ -201,7 +201,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),

// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if (i == (attempts - 1))
if ((i == (attempts - 1)))
break;

// Get the NUMA node where the first page of the stack is resident. If

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2023 NVIDIA Corporation
Copyright (c) 2013-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -1448,9 +1448,7 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
// the CPU.
//
// Error codes:
// NV_ERR_INVALID_ADDRESS:
@ -1464,11 +1462,6 @@ NV_STATUS UvmAllocSemaphorePool(void *base,
// The VA range exceeds the largest virtual address supported by the
// destination processor.
//
// NV_ERR_INVALID_ARGUMENT:
// preferredCpuMemoryNode is not a valid CPU NUMA node or it corresponds
// to a NUMA node ID for a registered GPU. If NUMA is disabled, it
// indicates that preferredCpuMemoryNode was not either 0 or -1.
//
// NV_ERR_INVALID_DEVICE:
// destinationUuid does not represent a valid processor such as a CPU or
// a GPU with a GPU VA space registered for it. Or destinationUuid is a
@ -1535,9 +1528,8 @@ NV_STATUS UvmMigrate(void *base,
//
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if the destination processor is
// the CPU. -1 indicates no preference, in which case the pages used
// can be on any of the available CPU NUMA nodes. If NUMA is disabled
// only 0 and -1 are allowed.
// the CPU. This argument is ignored if the given virtual address range
// corresponds to managed memory.
//
// semaphoreAddress: (INPUT)
// Base address of the semaphore.
@ -1594,8 +1586,8 @@ NV_STATUS UvmMigrateAsync(void *base,
//
// Migrates the backing of all virtual address ranges associated with the given
// range group to the specified destination processor. The behavior of this API
// is equivalent to calling UvmMigrate with preferredCpuMemoryNode = -1 on each
// VA range associated with this range group.
// is equivalent to calling UvmMigrate on each VA range associated with this
// range group.
//
// Any errors encountered during migration are returned immediately. No attempt
// is made to migrate the remaining unmigrated ranges and the ranges that are
@ -2177,8 +2169,7 @@ NV_STATUS UvmMapDynamicParallelismRegion(void *base,
//
// If any page in the VA range has a preferred location, then the migration and
// mapping policies associated with this API take precedence over those related
// to the preferred location. If the preferred location is a specific CPU NUMA
// node, that NUMA node will be used for a CPU-resident copy of the page.
// to the preferred location.
//
// If any pages in this VA range have any processors present in their
// accessed-by list, the migration and mapping policies associated with this
@ -2309,7 +2300,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// UvmPreventMigrationRangeGroups has not been called on the range group that
// those pages are associated with, then the migration and mapping policies
// associated with UvmEnableReadDuplication override the policies outlined
// above. Note that enabling read duplication on any pages in this VA range
// above. Note that enabling read duplication on on any pages in this VA range
// does not clear the state set by this API for those pages. It merely overrides
// the policies associated with this state until read duplication is disabled
// for those pages.
@ -2342,8 +2333,7 @@ NV_STATUS UvmDisableReadDuplication(void *base,
// preferredCpuMemoryNode: (INPUT)
// Preferred CPU NUMA memory node used if preferredLocationUuid is the
// UUID of the CPU. -1 is a special value which indicates all CPU nodes
// allowed by the global and thread memory policies. If NUMA is disabled
// only 0 and -1 are allowed.
// allowed by the global and thread memory policies.
//
// Errors:
// NV_ERR_INVALID_ADDRESS:
@ -3473,8 +3463,7 @@ NV_STATUS UvmToolsDestroySession(UvmToolsSessionHandle session);
//

#if UVM_API_REV_IS_AT_MOST(10)
// This is deprecated and replaced by sizeof(UvmToolsEventControlData_V1) or
// sizeof(UvmToolsEventControlData_V2).
// This is deprecated and replaced by sizeof(UvmToolsEventControlData).
NvLength UvmToolsGetEventControlSize(void);

// This is deprecated and replaced by sizeof(UvmEventEntry_V1) or
@ -3498,8 +3487,6 @@ NvLength UvmToolsGetNumberOfCounters(void);
// version: (INPUT)
// Requested version for events or counters.
// See UvmEventEntry_V1 and UvmEventEntry_V2.
// UvmToolsEventControlData_V2::version records the entry version that
// will be generated.
//
// event_buffer: (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
@ -3512,8 +3499,7 @@ NvLength UvmToolsGetNumberOfCounters(void);
//
// event_control (INPUT)
// User allocated buffer. Must be page-aligned. Must be large enough to
// hold UvmToolsEventControlData_V1 if version is UvmEventEntry_V1 or
// UvmToolsEventControlData_V2 (although single page-size allocation
// hold UvmToolsEventControlData (although single page-size allocation
// should be more than enough). Gets pinned until queue is destroyed.
//
// queue: (OUTPUT)

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2023 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -205,7 +205,7 @@ void uvm_hal_ampere_host_clear_faulted_channel_sw_method(uvm_push_t *push,
CLEAR_FAULTED_B, HWVALUE(C076, CLEAR_FAULTED_B, INST_HI, instance_ptr_hi));
}

// Copy from Pascal, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
@ -216,6 +216,7 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;

UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);

@ -230,8 +231,8 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
pdb_lo = pdb.address & HWMASK(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);

// PDE3 is the highest level on Pascal, see the comment in uvm_pascal_mmu.c
// for details.
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;

@ -242,7 +243,12 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);

NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
@ -255,16 +261,18 @@ void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
MEM_OP_D, HWCONST(C56F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));

uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}
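/*
 * A minimal sketch distilling the pattern introduced in the hunks above: the
 * SYSMEMBAR field of MEM_OP_A is chosen up front from the requested membar,
 * and only a GPU-scope membar still needs the explicit host_hal->membar_gpu()
 * method afterwards. This helper is hypothetical and assumes the same headers
 * and HW macros as the surrounding code.
 */
static NvU32 tlb_invalidate_sysmembar_value(uvm_membar_t membar)
{
    if (membar == UVM_MEMBAR_SYS)
        return HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);

    return HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
}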

// Copy from Volta, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
@ -272,6 +280,7 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 pdb_lo;
NvU32 pdb_hi;
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 va_lo;
NvU32 va_hi;
NvU64 end;
@ -281,9 +290,9 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
NvU32 log2_invalidation_size;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);

UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);

// The invalidation size must be a power-of-two number of pages containing
@ -325,7 +334,7 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
pdb_lo = pdb.address & HWMASK(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);

// PDE3 is the highest level on Pascal-Ampere , see the comment in
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
UVM_ASSERT_MSG(depth < NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
page_table_level = NVC56F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
@ -337,10 +346,15 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

if (membar == UVM_MEMBAR_SYS)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);

NV_PUSH_4U(C56F, MEM_OP_A, HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
sysmembar_value |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C56F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
HWVALUE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
@ -352,21 +366,23 @@ void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
MEM_OP_D, HWCONST(C56F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));

uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (membar == UVM_MEMBAR_GPU)
gpu->parent->host_hal->membar_gpu(push);
}

// Copy from Pascal, this version sets TLB_INVALIDATE_INVAL_SCOPE.
// Copy from Turing, this version sets TLB_INVALIDATE_INVAL_SCOPE.
void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
{
NvU32 ack_value = 0;
NvU32 sysmembar_value = 0;
NvU32 invalidate_gpc_value = 0;
NvU32 aperture_value = 0;
NvU32 pdb_lo = 0;
NvU32 pdb_hi = 0;
NvU32 page_table_level = 0;
uvm_membar_t membar;

UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
if (pdb.aperture == UVM_APERTURE_VID)
@ -381,7 +397,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
pdb_hi = pdb.address >> HWSIZE(C56F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);

if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
// PDE3 is the highest level on Pascal, see the comment in
// PDE3 is the highest level on Pascal-Ampere, see the comment in
// uvm_pascal_mmu.c for details.
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde3, params->page_table_level) - 1;
}
@ -393,6 +409,11 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
ack_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

if (params->membar == UvmInvalidateTlbMemBarSys)
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
else
sysmembar_value = HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);

if (params->disable_gpc_invalidate)
invalidate_gpc_value = HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
else
@ -403,9 +424,9 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,

NvU32 va_lo = va & HWMASK(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NvU32 va_hi = va >> HWSIZE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo) |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
HWVALUE(C56F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
MEM_OP_B, HWVALUE(C56F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
HWVALUE(C56F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
@ -418,7 +439,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}
else {
NV_PUSH_4U(C56F, MEM_OP_A, HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
NV_PUSH_4U(C56F, MEM_OP_A, sysmembar_value |
HWCONST(C56F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
MEM_OP_B, 0,
MEM_OP_C, HWCONST(C56F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
@ -432,12 +453,7 @@ void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
HWVALUE(C56F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
}

if (params->membar == UvmInvalidateTlbMemBarSys)
membar = UVM_MEMBAR_SYS;
else if (params->membar == UvmInvalidateTlbMemBarLocal)
membar = UVM_MEMBAR_GPU;
else
membar = UVM_MEMBAR_NONE;

uvm_hal_tlb_invalidate_membar(push, membar);
// GPU membar still requires an explicit membar method.
if (params->membar == UvmInvalidateTlbMemBarLocal)
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
}

@ -51,7 +51,7 @@ uvm_mmu_engine_type_t uvm_hal_ampere_mmu_engine_id_to_type(NvU16 mmu_engine_id)
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
}

static NvU32 page_table_depth_ampere(NvU32 page_size)
static NvU32 page_table_depth_ampere(NvU64 page_size)
{
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check
if (page_size == UVM_PAGE_SIZE_2M)
@ -62,14 +62,14 @@ static NvU32 page_table_depth_ampere(NvU32 page_size)
return 4;
}

static NvU32 page_sizes_ampere(void)
static NvU64 page_sizes_ampere(void)
{
return UVM_PAGE_SIZE_512M | UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}

static uvm_mmu_mode_hal_t ampere_mmu_mode_hal;

uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size)
{
static bool initialized = false;

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -29,10 +29,9 @@
#include "uvm_ats_ibm.h"
#include "nv_uvm_types.h"
#include "uvm_lock.h"
#include "uvm_ats_sva.h"

#include "uvm_ats_sva.h"

#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())
#define UVM_ATS_SUPPORTED() (UVM_ATS_IBM_SUPPORTED() || UVM_ATS_SVA_SUPPORTED())

typedef struct
{

@ -855,7 +855,6 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@ -870,7 +869,6 @@ static NV_STATUS cpu_decrypt_in_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@ -881,7 +879,6 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
uvm_mem_t *dst_mem,
uvm_mem_t *src_mem,
const UvmCslIv *decrypt_iv,
NvU32 key_version,
uvm_mem_t *auth_tag_mem,
size_t size,
NvU32 copy_size)
@ -899,7 +896,6 @@ static NV_STATUS cpu_decrypt_out_of_order(uvm_channel_t *channel,
dst_plain + i * copy_size,
src_cipher + i * copy_size,
decrypt_iv + i,
key_version,
copy_size,
auth_tag_buffer + i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE));
}
@ -963,7 +959,7 @@ static void gpu_encrypt(uvm_push_t *push,
i * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
dst_cipher);

uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);
uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);

if (i > 0)
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@ -1024,7 +1020,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
UvmCslIv *decrypt_iv = NULL;
UvmCslIv *encrypt_iv = NULL;
NvU32 key_version;
uvm_tracker_t tracker;
size_t src_plain_size;

@ -1094,11 +1089,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,

gpu_encrypt(&push, dst_cipher, dst_plain_gpu, auth_tag_mem, decrypt_iv, size, copy_size);

// There shouldn't be any key rotation between the end of the push and the
// CPU decryption(s), but it is more robust against test changes to force
// decryption to use the saved key.
key_version = uvm_channel_pool_key_version(push.channel->pool);

TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher, size), out);
@ -1111,7 +1101,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),
@ -1122,7 +1111,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu,
dst_plain,
dst_cipher,
decrypt_iv,
key_version,
auth_tag_mem,
size,
copy_size),

@ -38,32 +38,6 @@
#include "clb06f.h"
#include "uvm_conf_computing.h"

// WLC push is decrypted by SEC2 or CE (in WLC schedule).
// In sysmem it's followed by auth tag.
#define WLC_PUSHBUFFER_ALIGNMENT max3(UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, \
UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT, \
UVM_CONF_COMPUTING_BUF_ALIGNMENT)
#define WLC_ALIGNED_MAX_PUSH_SIZE UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, WLC_PUSHBUFFER_ALIGNMENT)

// WLC uses the following structures in unprotected sysmem:
// * Encrypted pushbuffer location. This gets populated via cpu_encrypt to
// launch work on a WLC channel.
// * Auth tag associated with the above encrypted (push)buffer
// * Another auth tag used to encrypt another channel's pushbuffer during
// indirect work launch. This can be allocated with the launched work
// but since WLC can oly launch one pushbuffer at a time it's easier
// to include it here.
#define WLC_SYSMEM_TOTAL_SIZE UVM_ALIGN_UP(WLC_ALIGNED_MAX_PUSH_SIZE + 2 * UVM_CONF_COMPUTING_AUTH_TAG_SIZE, \
WLC_PUSHBUFFER_ALIGNMENT)

#define WLC_SYSMEM_PUSHBUFFER_OFFSET 0
#define WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET (WLC_SYSMEM_PUSHBUFFER_OFFSET + WLC_ALIGNED_MAX_PUSH_SIZE)
#define WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET (WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET + UVM_CONF_COMPUTING_AUTH_TAG_SIZE)

// LCIC pushbuffer is populated by SEC2
#define LCIC_PUSHBUFFER_ALIGNMENT UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT
#define LCIC_ALIGNED_PUSH_SIZE UVM_ALIGN_UP(UVM_LCIC_PUSH_SIZE, LCIC_PUSHBUFFER_ALIGNMENT)

static unsigned uvm_channel_num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT;

#define UVM_CHANNEL_GPFIFO_LOC_DEFAULT "auto"
@ -306,16 +280,16 @@ static void unlock_channel_for_push(uvm_channel_t *channel)
index = uvm_channel_index_in_pool(channel);

uvm_channel_pool_assert_locked(channel->pool);
UVM_ASSERT(test_bit(index, channel->pool->conf_computing.push_locks));
UVM_ASSERT(test_bit(index, channel->pool->push_locks));

__clear_bit(index, channel->pool->conf_computing.push_locks);
uvm_up_out_of_order(&channel->pool->conf_computing.push_sem);
__clear_bit(index, channel->pool->push_locks);
uvm_up_out_of_order(&channel->pool->push_sem);
}

bool uvm_channel_is_locked_for_push(uvm_channel_t *channel)
{
if (g_uvm_global.conf_computing_enabled)
return test_bit(uvm_channel_index_in_pool(channel), channel->pool->conf_computing.push_locks);
return test_bit(uvm_channel_index_in_pool(channel), channel->pool->push_locks);

// For CE and proxy channels, we always return that the channel is locked,
// which has no functional impact in the UVM channel code-flow, this is only
@ -329,21 +303,19 @@ static void lock_channel_for_push(uvm_channel_t *channel)

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
uvm_channel_pool_assert_locked(channel->pool);
UVM_ASSERT(!test_bit(index, channel->pool->conf_computing.push_locks));
UVM_ASSERT(!test_bit(index, channel->pool->push_locks));

__set_bit(index, channel->pool->conf_computing.push_locks);
__set_bit(index, channel->pool->push_locks);
}

static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
{
NvU32 index = uvm_channel_index_in_pool(channel);

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
uvm_channel_pool_assert_locked(channel->pool);

// Already locked by someone else
if (uvm_channel_is_locked_for_push(channel))
return false;

if (try_claim_channel_locked(channel, num_gpfifo_entries)) {
if (!test_bit(index, channel->pool->push_locks) && try_claim_channel_locked(channel, num_gpfifo_entries)) {
lock_channel_for_push(channel);
return true;
}
@ -351,112 +323,6 @@ static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo
return false;
}

// Reserve, or release, all channels in the given pool.
//
// One scenario where reservation of the entire pool is useful is key rotation,
// because the reservation blocks addition of new work to the pool while
// rotation is in progress.
static void channel_pool_reserve_release_all_channels(uvm_channel_pool_t *pool, bool reserve)
{
NvU32 i;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);

// Disable lock tracking: a single thread is acquiring multiple locks of
// the same order
uvm_thread_context_lock_disable_tracking();

for (i = 0; i < pool->num_channels; i++) {
if (reserve)
uvm_down(&pool->conf_computing.push_sem);
else
uvm_up(&pool->conf_computing.push_sem);
}

uvm_thread_context_lock_enable_tracking();
}

static void channel_pool_reserve_all_channels(uvm_channel_pool_t *pool)
{
channel_pool_reserve_release_all_channels(pool, true);
}

static void channel_pool_release_all_channels(uvm_channel_pool_t *pool)
{
channel_pool_reserve_release_all_channels(pool, false);
}

static NV_STATUS channel_pool_rotate_key_locked(uvm_channel_pool_t *pool)
{
uvm_channel_t *channel;

// A rotation is not necessarily pending, because UVM can trigger rotations
// at will.
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));

uvm_assert_mutex_locked(&pool->conf_computing.key_rotation.mutex);

uvm_for_each_channel_in_pool(channel, pool) {
NV_STATUS status = uvm_channel_wait(channel);
if (status != NV_OK)
return status;

if (uvm_channel_pool_is_wlc(pool)) {
uvm_spin_loop_t spin;
uvm_channel_t *lcic_channel = uvm_channel_wlc_get_paired_lcic(channel);

// LCIC pushes don't exist as such. Rely on the tracking semaphore
// to determine completion, instead of uvm_channel_wait
UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin);
}
}

return uvm_conf_computing_rotate_pool_key(pool);
}

static NV_STATUS channel_pool_rotate_key(uvm_channel_pool_t *pool, bool force_rotation)
{
NV_STATUS status = NV_OK;

uvm_mutex_lock(&pool->conf_computing.key_rotation.mutex);

if (force_rotation || uvm_conf_computing_is_key_rotation_pending_in_pool(pool)) {
channel_pool_reserve_all_channels(pool);

status = channel_pool_rotate_key_locked(pool);

channel_pool_release_all_channels(pool);
}

uvm_mutex_unlock(&pool->conf_computing.key_rotation.mutex);

return status;
}

static NV_STATUS channel_pool_rotate_key_if_pending(uvm_channel_pool_t *pool)
{
NV_STATUS status;
bool force_rotation = false;

if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
return NV_OK;

status = channel_pool_rotate_key(pool, force_rotation);

// RM couldn't acquire the locks it needed, so UVM will try again later.
if (status == NV_ERR_STATE_IN_USE)
status = NV_OK;

return status;
}

NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool)
{
bool force_rotation = true;

return channel_pool_rotate_key(pool, force_rotation);
}

// Reserve a channel in the specified pool. The channel is locked until the push
// ends
static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out)
@ -464,28 +330,20 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
uvm_channel_t *channel;
uvm_spin_loop_t spin;
NvU32 index;
NV_STATUS status;

UVM_ASSERT(pool);
UVM_ASSERT(g_uvm_global.conf_computing_enabled);

// LCIC channels are reserved directly during GPU initialization.
UVM_ASSERT(!uvm_channel_pool_is_lcic(pool));

status = channel_pool_rotate_key_if_pending(pool);
if (status != NV_OK)
return status;

// This semaphore is uvm_up() in unlock_channel_for_push() as part of the
// uvm_channel_end_push() routine.
uvm_down(&pool->conf_computing.push_sem);
uvm_down(&pool->push_sem);

// At least one channel is unlocked. We check if any unlocked channel is
// available, i.e., if it has free GPFIFO entries.

channel_pool_lock(pool);

for_each_clear_bit(index, pool->conf_computing.push_locks, pool->num_channels) {
for_each_clear_bit(index, pool->push_locks, pool->num_channels) {
channel = &pool->channels[index];
if (try_claim_channel_locked(channel, 1)) {
lock_channel_for_push(channel);
@ -500,7 +358,10 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_
uvm_spin_loop_init(&spin);
while (1) {
uvm_for_each_channel_in_pool(channel, pool) {
NV_STATUS status;

uvm_channel_update_progress(channel);
index = uvm_channel_index_in_pool(channel);

channel_pool_lock(pool);

@ -511,7 +372,7 @@ static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_

status = uvm_channel_check_errors(channel);
if (status != NV_OK) {
uvm_up(&pool->conf_computing.push_sem);
uvm_up(&pool->push_sem);
return status;
}

@ -629,47 +490,31 @@ static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel)
return push_info - channel->push_infos;
}

static unsigned channel_pool_num_gpfifo_entries(uvm_channel_pool_t *pool)
{
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));

// WLC benefits from larger number of entries since more available entries
// result in less frequent calls to uvm_channel_update_progress. 16 is the
// maximum size that can re-use static pb preallocated memory when uploading
// the WLC schedule.
if (uvm_channel_pool_is_wlc(pool))
return 16;

// Every channel needs at least 3 entries; 1 for sentinel and 2 for
// submitting GPFIFO control entries. The number also has to be power of 2,
// as the HW stores the size as log2 value. LCIC does not accept external
// pushes, uvm_channel_update_progress is not a concern.
if (uvm_channel_pool_is_lcic(pool))
return 4;

return pool->manager->conf.num_gpfifo_entries;
}

static void channel_semaphore_gpu_encrypt_payload(uvm_push_t *push, NvU64 semaphore_va)
{
NvU32 iv_index;
uvm_gpu_address_t notifier_gpu_va;
uvm_gpu_address_t auth_tag_gpu_va;
uvm_gpu_address_t semaphore_gpu_va;
uvm_gpu_address_t encrypted_payload_gpu_va;
uvm_gpu_t *gpu = push->gpu;
uvm_channel_t *channel = push->channel;
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;
uvm_gpu_address_t notifier_gpu_va = uvm_gpu_semaphore_get_notifier_gpu_va(semaphore);
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(semaphore);
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(semaphore);
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);
UvmCslIv *iv_cpu_addr = semaphore->conf_computing.ivs;
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
uvm_gpu_semaphore_notifier_t *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;
NvU32 payload_size = sizeof(*semaphore->payload);
NvU32 *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

encrypted_payload_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.encrypted_payload, gpu, false);
notifier_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.notifier, gpu, false);
auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.auth_tag, gpu, false);
semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va);

iv_index = ((*last_pushed_notifier + 2) / 2) % channel->num_gpfifo_entries;

uvm_conf_computing_log_gpu_encryption(channel, payload_size, &iv_cpu_addr[iv_index]);
uvm_conf_computing_log_gpu_encryption(channel, &iv_cpu_addr[iv_index]);

gpu->parent->ce_hal->memset_4(push, notifier_gpu_va, ++(*last_pushed_notifier), sizeof(*last_pushed_notifier));
gpu->parent->ce_hal->encrypt(push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va);
@ -690,35 +535,18 @@ static void push_reserve_csl_sign_buf(uvm_push_t *push)
UVM_ASSERT((buf - UVM_METHOD_SIZE / sizeof(*buf)) == push->begin);
}

static uvm_channel_pool_t *get_paired_pool(uvm_channel_pool_t *pool)
{
uvm_channel_type_t paired_channel_type;
uvm_channel_pool_t *paired_pool;

UVM_ASSERT(pool);
UVM_ASSERT(uvm_channel_pool_is_wlc(pool) || uvm_channel_pool_is_lcic(pool));

paired_channel_type = uvm_channel_pool_is_wlc(pool) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC;
paired_pool = pool->manager->pool_to_use.default_for_type[paired_channel_type];

// Prevent accessing a non-existing paired pool. This can happen if, for
// example, the function is invoked when the WLC pool exists, but the LCIC
// doesn't (it hasn't been created yet, or it has been already destroyed).
UVM_ASSERT(paired_pool);

return paired_pool;
}

static uvm_channel_t *get_paired_channel(uvm_channel_t *channel)
{
uvm_channel_pool_t *paired_pool;
unsigned index;
uvm_channel_pool_t *paired_pool;
uvm_channel_type_t paired_channel_type;

UVM_ASSERT(channel);
UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel));

paired_pool = get_paired_pool(channel->pool);
index = uvm_channel_index_in_pool(channel);

paired_channel_type = uvm_channel_is_wlc(channel) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC;
paired_pool = channel->pool->manager->pool_to_use.default_for_type[paired_channel_type];
return paired_pool->channels + index;
}

@ -738,101 +566,6 @@ uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel)
|
||||
return get_paired_channel(wlc_channel);
|
||||
}
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel)
|
||||
{
|
||||
unsigned channel_index;
|
||||
NvU64 pool_vidmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel));
|
||||
|
||||
channel_index = uvm_channel_index_in_pool(channel);
|
||||
pool_vidmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_vidmem,
|
||||
uvm_channel_get_gpu(channel));
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
return pool_vidmem_base + channel_index * LCIC_ALIGNED_PUSH_SIZE;
|
||||
|
||||
return pool_vidmem_base + 2 * channel_index * WLC_ALIGNED_MAX_PUSH_SIZE;
|
||||
}
|
||||
|
||||
static NvU64 get_channel_unprotected_sysmem_gpu_va(uvm_channel_t *channel)
|
||||
{
|
||||
unsigned channel_index;
|
||||
NvU64 pool_sysmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(uvm_channel_is_wlc(channel));
|
||||
|
||||
channel_index = uvm_channel_index_in_pool(channel);
|
||||
pool_sysmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_sysmem,
|
||||
uvm_channel_get_gpu(channel));
|
||||
|
||||
return pool_sysmem_base + (channel_index * WLC_SYSMEM_TOTAL_SIZE);
|
||||
}
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel)
|
||||
{
|
||||
return get_channel_unprotected_sysmem_gpu_va(channel) + WLC_SYSMEM_PUSHBUFFER_OFFSET;
|
||||
}
|
||||
|
||||
static char* get_channel_unprotected_sysmem_cpu(uvm_channel_t *channel)
|
||||
{
|
||||
unsigned channel_index;
|
||||
char* pool_sysmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(uvm_channel_is_wlc(channel));
|
||||
|
||||
channel_index = uvm_channel_index_in_pool(channel);
|
||||
pool_sysmem_base = uvm_rm_mem_get_cpu_va(channel->pool->conf_computing.pool_sysmem);
|
||||
|
||||
return pool_sysmem_base + (channel_index * WLC_SYSMEM_TOTAL_SIZE);
|
||||
}
|
||||
|
||||
char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel)
|
||||
{
|
||||
return get_channel_unprotected_sysmem_cpu(channel) + WLC_SYSMEM_PUSHBUFFER_OFFSET;
|
||||
}
|
||||
|
||||
char *uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(uvm_channel_t *channel, unsigned tag_index)
|
||||
{
|
||||
char *pool_sysmem_base;
|
||||
unsigned index;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(!uvm_channel_is_wlc(channel));
|
||||
UVM_ASSERT(!uvm_channel_is_lcic(channel));
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
UVM_ASSERT(channel->num_gpfifo_entries == channel_pool_num_gpfifo_entries(channel->pool));
|
||||
UVM_ASSERT(tag_index < channel->num_gpfifo_entries);
|
||||
|
||||
index = uvm_channel_index_in_pool(channel) * channel->num_gpfifo_entries + tag_index;
|
||||
pool_sysmem_base = uvm_rm_mem_get_cpu_va(channel->pool->conf_computing.pool_sysmem);
|
||||
|
||||
return pool_sysmem_base + index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
|
||||
static NvU64 get_push_crypto_bundle_auth_tags_gpu_va(uvm_channel_t *channel, unsigned tag_index)
|
||||
{
|
||||
unsigned index;
|
||||
NvU64 pool_sysmem_base;
|
||||
|
||||
UVM_ASSERT(channel);
|
||||
UVM_ASSERT(!uvm_channel_is_wlc(channel));
|
||||
UVM_ASSERT(!uvm_channel_is_lcic(channel));
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
UVM_ASSERT(channel->num_gpfifo_entries == channel_pool_num_gpfifo_entries(channel->pool));
|
||||
UVM_ASSERT(tag_index < channel->num_gpfifo_entries);
|
||||
|
||||
index = uvm_channel_index_in_pool(channel) * channel->num_gpfifo_entries + tag_index;
|
||||
pool_sysmem_base = uvm_rm_mem_get_gpu_uvm_va(channel->pool->conf_computing.pool_sysmem,
|
||||
uvm_channel_get_gpu(channel));
|
||||
|
||||
|
||||
return pool_sysmem_base + index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_rotate_and_reserve_launch_channel(uvm_channel_t *channel, uvm_channel_t **launch_channel)
|
||||
{
|
||||
uvm_channel_manager_t *manager = channel->pool->manager;
|
||||
@ -1008,52 +741,16 @@ static void uvm_channel_tracking_semaphore_release(uvm_push_t *push, NvU64 semap
|
||||
channel_semaphore_gpu_encrypt_payload(push, semaphore_va);
|
||||
}
|
||||
|
||||
static uvm_gpu_semaphore_notifier_t *lcic_static_entry_notifier_cpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_semaphore_notifier_t *notifier_base;
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_lcic(lcic));
|
||||
|
||||
notifier_base = uvm_rm_mem_get_cpu_va(lcic->pool->conf_computing.pool_sysmem);
|
||||
return notifier_base + uvm_channel_index_in_pool(lcic) * 2;
|
||||
}
|
||||
|
||||
static uvm_gpu_semaphore_notifier_t *lcic_static_exit_notifier_cpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
return lcic_static_entry_notifier_cpu_va(lcic) + 1;
|
||||
}
|
||||
|
||||
static uvm_gpu_address_t lcic_static_entry_notifier_gpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
NvU64 notifier_base;
|
||||
const NvU64 offset = uvm_channel_index_in_pool(lcic) * 2 * sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_lcic(lcic));
|
||||
|
||||
notifier_base = uvm_rm_mem_get_gpu_uvm_va(lcic->pool->conf_computing.pool_sysmem, uvm_channel_get_gpu(lcic));
|
||||
return uvm_gpu_address_virtual_unprotected(notifier_base + offset);
|
||||
}
|
||||
|
||||
static uvm_gpu_address_t lcic_static_exit_notifier_gpu_va(uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_address_t notifier_address = lcic_static_entry_notifier_gpu_va(lcic);
|
||||
|
||||
notifier_address.address += sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
return notifier_address;
|
||||
}
|
||||
|
||||
static void internal_channel_submit_work_wlc(uvm_push_t *push)
|
||||
{
|
||||
size_t payload_size;
|
||||
uvm_channel_t *wlc_channel = push->channel;
|
||||
uvm_channel_t *lcic_channel = uvm_channel_wlc_get_paired_lcic(wlc_channel);
|
||||
uvm_gpu_semaphore_t *lcic_semaphore = &lcic_channel->tracking_sem.semaphore;
|
||||
UvmCslIv *iv_cpu_addr = lcic_semaphore->conf_computing.ivs;
|
||||
uvm_gpu_semaphore_notifier_t *last_pushed_notifier;
|
||||
UvmCslIv *iv_cpu_addr = lcic_channel->tracking_sem.semaphore.conf_computing.ivs;
|
||||
NvU32 *last_pushed_notifier;
|
||||
NvU32 iv_index;
|
||||
uvm_spin_loop_t spin;
|
||||
void* auth_tag_cpu = get_channel_unprotected_sysmem_cpu(wlc_channel) + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET;
|
||||
|
||||
UVM_ASSERT(lcic_channel);
|
||||
|
||||
// Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2
|
||||
// and a WLC doorbell ring is enough to start work.
|
||||
@ -1069,21 +766,19 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push)
|
||||
|
||||
// Handles the CPU part of the setup for the LCIC to be able to do GPU
|
||||
// encryption of its tracking semaphore value. See setup_lcic_schedule().
|
||||
last_pushed_notifier = &lcic_semaphore->conf_computing.last_pushed_notifier;
|
||||
*lcic_static_entry_notifier_cpu_va(lcic_channel) = ++(*last_pushed_notifier);
|
||||
*lcic_static_exit_notifier_cpu_va(lcic_channel) = ++(*last_pushed_notifier);
|
||||
last_pushed_notifier = &lcic_channel->tracking_sem.semaphore.conf_computing.last_pushed_notifier;
|
||||
*lcic_channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu = ++(*last_pushed_notifier);
|
||||
*lcic_channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu = ++(*last_pushed_notifier);
|
||||
iv_index = (*last_pushed_notifier / 2) % lcic_channel->num_gpfifo_entries;
|
||||
|
||||
payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
|
||||
uvm_conf_computing_log_gpu_encryption(lcic_channel, payload_size, &iv_cpu_addr[iv_index]);
|
||||
uvm_conf_computing_log_gpu_encryption(lcic_channel, &iv_cpu_addr[iv_index]);
|
||||
|
||||
// Move push data
|
||||
uvm_conf_computing_cpu_encrypt(wlc_channel,
|
||||
uvm_channel_get_static_pb_unprotected_sysmem_cpu(wlc_channel),
|
||||
wlc_channel->conf_computing.static_pb_unprotected_sysmem_cpu,
|
||||
push->begin,
|
||||
&push->launch_iv,
|
||||
UVM_MAX_WLC_PUSH_SIZE,
|
||||
auth_tag_cpu);
|
||||
wlc_channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu);
|
||||
|
||||
// Make sure all encrypted data is observable before ringing the doorbell.
|
||||
wmb();
|
||||
@ -1103,7 +798,7 @@ static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 ol
|
||||
|
||||
void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push);
|
||||
NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);
|
||||
void *push_enc_auth_tag_cpu;
|
||||
void *push_enc_auth_tag;
|
||||
uvm_gpu_address_t push_enc_auth_tag_gpu;
|
||||
NvU64 gpfifo_gpu_va = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry);
|
||||
|
||||
@ -1127,16 +822,15 @@ static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 ol
|
||||
|
||||
// Move over the pushbuffer data
|
||||
// WLC channels use a static preallocated space for launch auth tags
|
||||
push_enc_auth_tag_cpu = get_channel_unprotected_sysmem_cpu(indirect_push.channel) + WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET;
|
||||
push_enc_auth_tag_gpu = uvm_gpu_address_virtual_unprotected(
|
||||
get_channel_unprotected_sysmem_gpu_va(indirect_push.channel) + WLC_SYSMEM_LAUNCH_AUTH_TAG_OFFSET);
|
||||
push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu;
|
||||
push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va);
|
||||
|
||||
uvm_conf_computing_cpu_encrypt(indirect_push.channel,
|
||||
push_enc_cpu,
|
||||
push->begin,
|
||||
NULL,
|
||||
uvm_push_get_size(push),
|
||||
push_enc_auth_tag_cpu);
|
||||
push_enc_auth_tag);
|
||||
|
||||
uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
||||
|
||||
@ -1382,13 +1076,14 @@ static void encrypt_push(uvm_push_t *push)
|
||||
{
|
||||
NvU64 push_protected_gpu_va;
|
||||
NvU64 push_unprotected_gpu_va;
|
||||
NvU64 auth_tag_gpu_va;
|
||||
uvm_gpu_address_t auth_tag_gpu_va;
|
||||
uvm_channel_t *channel = push->channel;
|
||||
uvm_push_crypto_bundle_t *crypto_bundle;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
NvU32 push_size = uvm_push_get_size(push);
|
||||
uvm_push_info_t *push_info = uvm_push_info_from_push(push);
|
||||
uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel);
|
||||
unsigned auth_tag_offset = UVM_CONF_COMPUTING_AUTH_TAG_SIZE * push->push_info_index;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
@@ -1407,20 +1102,19 @@ static void encrypt_push(uvm_push_t *push)
UVM_ASSERT(channel->conf_computing.push_crypto_bundles != NULL);

crypto_bundle = channel->conf_computing.push_crypto_bundles + push->push_info_index;
auth_tag_gpu_va = get_push_crypto_bundle_auth_tags_gpu_va(channel, push->push_info_index);
auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(channel->conf_computing.push_crypto_bundle_auth_tags, gpu, false);
auth_tag_gpu_va.address += auth_tag_offset;

crypto_bundle->push_size = push_size;
push_protected_gpu_va = uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push);
push_unprotected_gpu_va = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push);

uvm_conf_computing_log_gpu_encryption(channel, push_size, &crypto_bundle->iv);
crypto_bundle->key_version = uvm_channel_pool_key_version(channel->pool);

uvm_conf_computing_log_gpu_encryption(channel, &crypto_bundle->iv);
gpu->parent->ce_hal->encrypt(push,
uvm_gpu_address_virtual_unprotected(push_unprotected_gpu_va),
uvm_gpu_address_virtual(push_protected_gpu_va),
push_size,
uvm_gpu_address_virtual_unprotected(auth_tag_gpu_va));
auth_tag_gpu_va);
}
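
A compact sketch of the per-push bookkeeping visible above, with hypothetical names and a stand-in tag size: an IV is logged for the upcoming GPU-side encryption, the pool's current key version is recorded in the crypto bundle, and the authentication tag for this push lives at a fixed per-push-index offset inside the channel's tag buffer.

    #include <stddef.h>
    #include <stdint.h>

    #define TOY_AUTH_TAG_SIZE 32u /* stand-in for UVM_CONF_COMPUTING_AUTH_TAG_SIZE */

    /* Hypothetical mirror of the per-push crypto bundle tracked by the channel. */
    struct toy_crypto_bundle {
        uint32_t push_size;   /* bytes the CE will encrypt into unprotected sysmem */
        uint32_t key_version; /* key version to use when the CPU decrypts later */
        uint64_t iv;          /* simplified IV; the driver keeps a larger structure */
    };

    /* Offset of the auth tag reserved for the push at 'push_index'. */
    static size_t toy_auth_tag_offset(unsigned push_index)
    {
        return (size_t)push_index * TOY_AUTH_TAG_SIZE;
    }
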
void uvm_channel_end_push(uvm_push_t *push)
|
||||
@ -1435,6 +1129,7 @@ void uvm_channel_end_push(uvm_push_t *push)
|
||||
NvU32 push_size;
|
||||
NvU32 cpu_put;
|
||||
NvU32 new_cpu_put;
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
bool needs_sec2_work_submit = false;
|
||||
|
||||
channel_pool_lock(channel->pool);
|
||||
@ -1448,7 +1143,6 @@ void uvm_channel_end_push(uvm_push_t *push)
|
||||
uvm_channel_tracking_semaphore_release(push, semaphore_va, new_payload);
|
||||
|
||||
if (uvm_channel_is_wlc(channel) && uvm_channel_manager_is_wlc_ready(channel_manager)) {
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
uvm_channel_t *paired_lcic = uvm_channel_wlc_get_paired_lcic(channel);
|
||||
|
||||
gpu->parent->ce_hal->semaphore_reduction_inc(push,
|
||||
@ -1743,16 +1437,9 @@ NV_STATUS uvm_channel_write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_
|
||||
|
||||
static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_spin_loop_t spin;
|
||||
uvm_channel_pool_t *pool = channel->pool;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
status = channel_pool_rotate_key_if_pending(pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// This semaphore is uvm_up() in unlock_channel_for_push() as part of the
|
||||
// uvm_channel_end_push() routine. Note that different than in
|
||||
// channel_reserve_and_lock_in_pool, we cannot pick an unlocked channel from
|
||||
@ -1760,7 +1447,7 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
|
||||
// Not a concern given that uvm_channel_reserve() is not the common-case for
|
||||
// channel reservation, and only used for channel initialization, GPFIFO
|
||||
// control work submission, and testing.
|
||||
uvm_down(&pool->conf_computing.push_sem);
|
||||
uvm_down(&pool->push_sem);
|
||||
|
||||
channel_pool_lock(pool);
|
||||
|
||||
@ -1771,6 +1458,8 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
|
||||
|
||||
uvm_spin_loop_init(&spin);
|
||||
while (1) {
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_channel_update_progress(channel);
|
||||
|
||||
channel_pool_lock(pool);
|
||||
@ -1782,7 +1471,7 @@ static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfi
|
||||
|
||||
status = uvm_channel_check_errors(channel);
|
||||
if (status != NV_OK) {
|
||||
uvm_up(&pool->conf_computing.push_sem);
|
||||
uvm_up(&pool->push_sem);
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -1852,14 +1541,14 @@ static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *ch
|
||||
NV_STATUS uvm_channel_get_status(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
NvNotification *errorNotifier;
|
||||
NvNotification *error_notifier;
|
||||
|
||||
if (uvm_channel_is_proxy(channel))
|
||||
errorNotifier = channel->proxy.channel_info.shadowErrorNotifier;
|
||||
error_notifier = channel->proxy.channel_info.shadowErrorNotifier;
|
||||
else
|
||||
errorNotifier = channel->channel_info.errorNotifier;
|
||||
error_notifier = channel->channel_info.errorNotifier;
|
||||
|
||||
if (errorNotifier->status == 0)
|
||||
if (error_notifier->status == 0)
|
||||
return NV_OK;
|
||||
|
||||
// In case we hit a channel error, check the ECC error notifier as well so
|
||||
@ -1972,8 +1661,6 @@ NV_STATUS uvm_channel_wait(uvm_channel_t *channel)
|
||||
static NV_STATUS csl_init(uvm_channel_t *channel)
|
||||
{
|
||||
NV_STATUS status;
|
||||
unsigned context_index = uvm_channel_index_in_pool(channel);
|
||||
uvm_channel_pool_t *pool = channel->pool;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
@ -1990,38 +1677,17 @@ static NV_STATUS csl_init(uvm_channel_t *channel)
|
||||
uvm_mutex_init(&channel->csl.ctx_lock, UVM_LOCK_ORDER_CSL_CTX);
|
||||
channel->csl.is_ctx_initialized = true;
|
||||
|
||||
if (uvm_channel_is_lcic(channel)) {
|
||||
pool = get_paired_pool(pool);
|
||||
context_index += pool->num_channels;
|
||||
}
|
||||
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
|
||||
|
||||
pool->conf_computing.key_rotation.csl_contexts[context_index] = &channel->csl.ctx;
|
||||
|
||||
return NV_OK;
|
||||
}

static void csl_destroy(uvm_channel_t *channel)
{
uvm_channel_pool_t *pool = channel->pool;
unsigned context_index = uvm_channel_index_in_pool(channel);

if (!channel->csl.is_ctx_initialized)
return;

uvm_assert_mutex_unlocked(&channel->csl.ctx_lock);
UVM_ASSERT(!uvm_channel_is_locked_for_push(channel));

if (uvm_channel_is_lcic(channel)) {
pool = get_paired_pool(pool);
context_index += pool->num_channels;
}

UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);

pool->conf_computing.key_rotation.csl_contexts[context_index] = NULL;

uvm_rm_locked_call_void(nvUvmInterfaceDeinitCslContext(&channel->csl.ctx));
channel->csl.is_ctx_initialized = false;
}
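
The index arithmetic above (and its mirror in csl_init) relies on the paired WLC pool owning one contiguous CSL-context array for both channel kinds: slots [0, num_channels) hold the WLC contexts and slots [num_channels, 2*num_channels) hold the paired LCIC contexts. A small sketch of that indexing with invented names:

    /* Hypothetical mirror of the per-pool CSL context registry. */
    struct toy_ctx_registry {
        void **csl_contexts;   /* length 2 * num_channels for WLC pools */
        unsigned num_channels; /* channels in the WLC pool itself */
    };

    /* Slot used by a WLC channel. */
    static unsigned toy_wlc_slot(unsigned index_in_pool)
    {
        return index_in_pool;
    }

    /* Slot used by the paired LCIC channel with the same index in its pool. */
    static unsigned toy_lcic_slot(const struct toy_ctx_registry *r, unsigned index_in_pool)
    {
        return index_in_pool + r->num_channels;
    }
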
@ -2031,45 +1697,187 @@ static void free_conf_computing_buffers(uvm_channel_t *channel)
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
|
||||
uvm_rm_mem_free(channel->conf_computing.static_pb_protected_vidmem);
|
||||
uvm_rm_mem_free(channel->conf_computing.static_pb_unprotected_sysmem);
|
||||
uvm_rm_mem_free(channel->conf_computing.static_notifier_unprotected_sysmem);
|
||||
uvm_rm_mem_free(channel->conf_computing.push_crypto_bundle_auth_tags);
|
||||
uvm_kvfree(channel->conf_computing.static_pb_protected_sysmem);
|
||||
channel->conf_computing.static_pb_protected_sysmem = NULL;
|
||||
|
||||
uvm_kvfree(channel->conf_computing.push_crypto_bundles);
|
||||
channel->conf_computing.static_pb_protected_vidmem = NULL;
|
||||
channel->conf_computing.static_pb_unprotected_sysmem = NULL;
|
||||
channel->conf_computing.static_notifier_unprotected_sysmem = NULL;
|
||||
channel->conf_computing.push_crypto_bundle_auth_tags = NULL;
|
||||
channel->conf_computing.static_pb_protected_sysmem = NULL;
|
||||
channel->conf_computing.push_crypto_bundles = NULL;
|
||||
|
||||
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.encrypted_payload);
|
||||
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.notifier);
|
||||
uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.auth_tag);
|
||||
uvm_kvfree(channel->tracking_sem.semaphore.conf_computing.ivs);
|
||||
channel->tracking_sem.semaphore.conf_computing.encrypted_payload = NULL;
|
||||
channel->tracking_sem.semaphore.conf_computing.notifier = NULL;
|
||||
channel->tracking_sem.semaphore.conf_computing.auth_tag = NULL;
|
||||
channel->tracking_sem.semaphore.conf_computing.ivs = NULL;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel)
|
||||
static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore;
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
|
||||
semaphore->conf_computing.ivs =
|
||||
uvm_kvmalloc(sizeof(*semaphore->conf_computing.ivs) * channel->num_gpfifo_entries);
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(semaphore->conf_computing.last_pushed_notifier),
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&semaphore->conf_computing.notifier);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(*channel->tracking_sem.semaphore.payload),
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&semaphore->conf_computing.encrypted_payload);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&semaphore->conf_computing.auth_tag);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
semaphore->conf_computing.ivs = uvm_kvmalloc_zero(sizeof(*semaphore->conf_computing.ivs)
|
||||
* channel->num_gpfifo_entries);
|
||||
|
||||
if (!semaphore->conf_computing.ivs)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
if (uvm_channel_is_wlc(channel)) {
|
||||
channel->conf_computing.static_pb_protected_sysmem =
|
||||
uvm_kvmalloc(UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_PAGE_SIZE_4K));
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
size_t aligned_wlc_push_size = UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
|
||||
NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2,
|
||||
PAGE_SIZE,
|
||||
&channel->conf_computing.static_pb_unprotected_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// Both pushes will be targets for SEC2 decrypt operations and have to
|
||||
// be aligned for SEC2. The first push location will also be a target
|
||||
// for CE decrypt operation and has to be aligned for CE decrypt.
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.static_pb_protected_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
channel->conf_computing.static_pb_unprotected_sysmem_cpu =
|
||||
uvm_rm_mem_get_cpu_va(channel->conf_computing.static_pb_unprotected_sysmem);
|
||||
channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu =
|
||||
(char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size;
|
||||
|
||||
// The location below is only used for launch pushes but reuses
|
||||
// the same sysmem allocation
|
||||
channel->conf_computing.launch_auth_tag_cpu =
|
||||
(char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu +
|
||||
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
channel->conf_computing.launch_auth_tag_gpu_va =
|
||||
uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_unprotected_sysmem, gpu) +
|
||||
aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
|
||||
channel->conf_computing.static_pb_protected_sysmem = uvm_kvmalloc(UVM_MAX_WLC_PUSH_SIZE + UVM_PAGE_SIZE_4K);
|
||||
if (!channel->conf_computing.static_pb_protected_sysmem)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
else if (!uvm_channel_is_lcic(channel)) {
|
||||
channel->conf_computing.push_crypto_bundles =
|
||||
uvm_kvmalloc(sizeof(*channel->conf_computing.push_crypto_bundles) * channel->num_gpfifo_entries);
|
||||
|
||||
if (!channel->conf_computing.push_crypto_bundles)
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS alloc_conf_computing_buffers_lcic(uvm_channel_t *channel)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
|
||||
const size_t notifier_size = sizeof(*channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
|
||||
NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
notifier_size * 2,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.static_notifier_unprotected_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
UVM_LCIC_PUSH_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&channel->conf_computing.static_pb_protected_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu =
|
||||
uvm_rm_mem_get_cpu_va(channel->conf_computing.static_notifier_unprotected_sysmem);
|
||||
channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu =
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu + 1;
|
||||
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va =
|
||||
uvm_rm_mem_get_gpu_va(channel->conf_computing.static_notifier_unprotected_sysmem, gpu, false);
|
||||
channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va =
|
||||
channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va.address += notifier_size;
|
||||
|
||||
return status;
|
||||
}

static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel)
{
NV_STATUS status;

UVM_ASSERT(g_uvm_global.conf_computing_enabled);
UVM_ASSERT(uvm_channel_is_ce(channel));

status = alloc_conf_computing_buffers_semaphore(channel);
if (status != NV_OK)
return status;

if (uvm_channel_is_wlc(channel)) {
status = alloc_conf_computing_buffers_wlc(channel);
}
else if (uvm_channel_is_lcic(channel)) {
status = alloc_conf_computing_buffers_lcic(channel);
}
else {
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
void *push_crypto_bundles = uvm_kvmalloc_zero(sizeof(*channel->conf_computing.push_crypto_bundles) *
channel->num_gpfifo_entries);

if (push_crypto_bundles == NULL)
return NV_ERR_NO_MEMORY;

channel->conf_computing.push_crypto_bundles = push_crypto_bundles;

status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
channel->num_gpfifo_entries * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
&channel->conf_computing.push_crypto_bundle_auth_tags);
}

return NV_OK;
return status;
}
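
For the plain CE branch above, the allocations scale with the GPFIFO ring: one CPU-side crypto bundle and one authentication tag per GPFIFO entry, presumably because at most one push can be outstanding per entry. A sketch of that sizing with assumed stand-in constants (the real values live in the driver headers):

    #include <stddef.h>

    #define TOY_AUTH_TAG_SIZE      32u /* stand-in for UVM_CONF_COMPUTING_AUTH_TAG_SIZE */
    #define TOY_CRYPTO_BUNDLE_SIZE 48u /* assumed sizeof of a per-push crypto bundle */

    struct toy_ce_channel_bufs {
        size_t bundles_bytes;   /* CPU array of per-push crypto bundles */
        size_t auth_tags_bytes; /* unprotected sysmem area for per-push auth tags */
    };

    static struct toy_ce_channel_bufs toy_ce_sizes(unsigned num_gpfifo_entries)
    {
        struct toy_ce_channel_bufs b;

        b.bundles_bytes = (size_t)num_gpfifo_entries * TOY_CRYPTO_BUNDLE_SIZE;
        b.auth_tags_bytes = (size_t)num_gpfifo_entries * TOY_AUTH_TAG_SIZE;
        return b;
    }
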
static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
|
||||
@ -2117,6 +1925,36 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
|
||||
pool->num_channels--;
|
||||
}

static unsigned channel_pool_type_num_gpfifo_entries(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
{
switch (pool_type) {
case UVM_CHANNEL_POOL_TYPE_CE:
case UVM_CHANNEL_POOL_TYPE_CE_PROXY:
return manager->conf.num_gpfifo_entries;
case UVM_CHANNEL_POOL_TYPE_SEC2:
return manager->conf.num_gpfifo_entries;
case UVM_CHANNEL_POOL_TYPE_WLC: {
// WLC benefits from larger number of entries since more available
// entries result in less frequent calls to
// uvm_channel_update_progress 16 is the maximum size that can
// re-use static pb preallocated memory when uploading the WLC
// schedule.
return 16;
}
case UVM_CHANNEL_POOL_TYPE_LCIC: {
// Every channel needs at least 3 entries; 1 for sentinel and 2 more
// for submitting GPFIFO control entries. The number also has to be
// power of 2, as the HW stores the size as log2 value.
// LCIC does not accept external pushes, uvm_channel_update_progress
// is not a concern.
return 4;
}
default:
UVM_ASSERT_MSG(0, "Unhandled pool type: %d", pool_type);
return 0;
}
}
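
The LCIC value above is the smallest power of two that fits the stated minimum of three entries (one sentinel plus two GPFIFO control entries), since the hardware stores the ring size as a log2. A generic, illustrative helper capturing that rounding rule:

    #include <assert.h>
    #include <stdint.h>

    /* Smallest power of two >= n, for n >= 1. */
    static uint32_t toy_round_up_pow2(uint32_t n)
    {
        uint32_t p = 1;

        assert(n >= 1);
        while (p < n)
            p <<= 1;
        return p;
    }

    /* toy_round_up_pow2(3) == 4, matching the LCIC GPFIFO ring size above. */
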
// Returns the TSG for a given channel.
|
||||
static uvmGpuTsgHandle channel_get_tsg(uvm_channel_t *channel)
|
||||
{
|
||||
@ -2144,7 +1982,7 @@ static NV_STATUS internal_channel_create(uvm_channel_t *channel)
|
||||
uvm_channel_manager_t *manager = channel->pool->manager;
|
||||
|
||||
memset(&channel_alloc_params, 0, sizeof(channel_alloc_params));
|
||||
channel_alloc_params.numGpFifoEntries = channel_pool_num_gpfifo_entries(channel->pool);
|
||||
channel_alloc_params.numGpFifoEntries = channel_pool_type_num_gpfifo_entries(manager, channel->pool->pool_type);
|
||||
channel_alloc_params.gpFifoLoc = manager->conf.gpfifo_loc;
|
||||
channel_alloc_params.gpPutLoc = manager->conf.gpput_loc;
|
||||
|
||||
@ -2248,7 +2086,7 @@ static NV_STATUS channel_create(uvm_channel_pool_t *pool, uvm_channel_t *channel
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
channel->num_gpfifo_entries = channel_pool_num_gpfifo_entries(pool);
|
||||
channel->num_gpfifo_entries = channel_pool_type_num_gpfifo_entries(manager, pool->pool_type);
|
||||
channel->gpfifo_entries = uvm_kvmalloc_zero(sizeof(*channel->gpfifo_entries) * channel->num_gpfifo_entries);
|
||||
if (channel->gpfifo_entries == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
@ -2328,8 +2166,8 @@ static NV_STATUS channel_init(uvm_channel_t *channel)
|
||||
|
||||
if (uvm_channel_is_sec2(channel))
|
||||
pb_base = uvm_pushbuffer_get_sec2_gpu_va_base(pushbuffer);
|
||||
else if (uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel))
|
||||
pb_base = uvm_channel_get_static_pb_protected_vidmem_gpu_va(channel);
|
||||
else if (channel->conf_computing.static_pb_protected_vidmem)
|
||||
pb_base = uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
|
||||
gpu->parent->host_hal->set_gpfifo_pushbuffer_segment_base(&gpfifo_entry, pb_base);
|
||||
write_ctrl_gpfifo(channel, gpfifo_entry);
|
||||
@ -2369,68 +2207,34 @@ static bool channel_manager_uses_proxy_pool(uvm_channel_manager_t *manager)
|
||||
}
|
||||
|
||||
// Number of channels to create in a pool of the given type.
|
||||
static unsigned channel_manager_num_channels(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
|
||||
//
|
||||
// TODO: Bug 1764958: Tweak this function after benchmarking real workloads.
|
||||
static unsigned channel_pool_type_num_channels(uvm_channel_pool_type_t pool_type)
|
||||
{
|
||||
unsigned num_channels;
|
||||
|
||||
// In the common case, create two channels per pool.
|
||||
//
|
||||
// TODO: Bug 1764958: Tweak this number after benchmarking real workloads.
|
||||
const unsigned channel_pool_type_ce_num_channels = 2;
|
||||
|
||||
UVM_ASSERT(uvm_pool_type_is_valid(pool_type));
|
||||
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY) {
|
||||
|
||||
// TODO: Bug 3387454: The vGPU plugin implementation supports a single
|
||||
// proxy channel per GPU
|
||||
num_channels = 1;
|
||||
}
|
||||
else if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2) {
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY)
|
||||
return 1;
|
||||
|
||||
// Not all GPU architectures support more than 1 channel per TSG. Since
|
||||
// SEC2 is not in UVM critical path for performance, conservatively
|
||||
// create a pool/TSG with a single channel.
|
||||
num_channels = 1;
|
||||
}
|
||||
else if ((pool_type == UVM_CHANNEL_POOL_TYPE_WLC) || (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)) {
|
||||
unsigned max_concurrent_ce_pushes;
|
||||
unsigned num_used_ces = bitmap_weight(manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
|
||||
// Not all GPU architectures support more than 1 channel per TSG. Since SEC2
|
||||
// is not in UVM critical path for performance, we conservatively create a
|
||||
// pool/TSG with a single channel.
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2)
|
||||
return 1;
|
||||
|
||||
// CE selection should happen before this function is invoked.
|
||||
UVM_ASSERT(num_used_ces > 0);
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC || pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)
|
||||
return UVM_PUSH_MAX_CONCURRENT_PUSHES;

// Create as many WLC and LCIC channels as concurrent, ongoing, pushes
// of interest are allowed. In the general case, this number of pushes
// is capped by UVM_PUSH_MAX_CONCURRENT_PUSHES. But in Confidential
// Computing there is at most one ongoing push per channel, so the
// number of WLC/LCIC channels is also limited by the number of CE
// channels.
//
// The calculation only considers channels mapped to the
// UVM_CHANNEL_POOL_TYPE_CE type, because WLC and LCIC channels are
// created to enable work launch exclusively in those other channels.
max_concurrent_ce_pushes = num_used_ces * channel_pool_type_ce_num_channels;
num_channels = min(max_concurrent_ce_pushes, (unsigned) UVM_PUSH_MAX_CONCURRENT_PUSHES);
}
else {
UVM_ASSERT(pool_type == UVM_CHANNEL_POOL_TYPE_CE);

num_channels = channel_pool_type_ce_num_channels;
}

UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);

return num_channels;
return 2;
}
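
Put as one expression, the WLC/LCIC branch above sizes the pool as the number of usable CEs times the two channels each CE pool gets, capped by the maximum number of concurrent pushes. A standalone sketch with an assumed cap value (the real constant is UVM_PUSH_MAX_CONCURRENT_PUSHES, whose value is not shown here):

    #define TOY_MAX_CONCURRENT_PUSHES 128u /* assumed cap, stand-in only */
    #define TOY_CE_CHANNELS_PER_POOL  2u   /* matches channel_pool_type_ce_num_channels above */

    static unsigned toy_num_wlc_channels(unsigned num_used_ces)
    {
        unsigned max_concurrent_ce_pushes = num_used_ces * TOY_CE_CHANNELS_PER_POOL;

        return max_concurrent_ce_pushes < TOY_MAX_CONCURRENT_PUSHES ?
               max_concurrent_ce_pushes : TOY_MAX_CONCURRENT_PUSHES;
    }
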
// Number of TSGs to create in a pool of a given type.
|
||||
static unsigned channel_manager_num_tsgs(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type)
|
||||
static unsigned channel_pool_type_num_tsgs(uvm_channel_pool_type_t pool_type)
|
||||
{
|
||||
// For WLC and LCIC channels, we create one TSG per WLC/LCIC channel pair.
|
||||
// The TSG is stored in the WLC pool.
|
||||
if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC)
|
||||
return channel_manager_num_channels(manager, pool_type);
|
||||
return channel_pool_type_num_channels(pool_type);
|
||||
else if (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC)
|
||||
return 0;
|
||||
|
||||
@ -2486,164 +2290,17 @@ static void channel_pool_destroy(uvm_channel_pool_t *pool)
|
||||
|
||||
while (pool->num_channels > 0)
|
||||
channel_destroy(pool, pool->channels + pool->num_channels - 1);
|
||||
|
||||
uvm_kvfree(pool->channels);
|
||||
pool->channels = NULL;
|
||||
|
||||
while (pool->num_tsgs > 0)
|
||||
tsg_destroy(pool, *(pool->tsg_handles + pool->num_tsgs - 1));
|
||||
|
||||
uvm_kvfree(pool->tsg_handles);
|
||||
pool->tsg_handles = NULL;
|
||||
|
||||
uvm_kvfree(pool->conf_computing.key_rotation.csl_contexts);
|
||||
pool->conf_computing.key_rotation.csl_contexts = NULL;
|
||||
|
||||
uvm_rm_mem_free(pool->conf_computing.pool_sysmem);
|
||||
uvm_rm_mem_free(pool->conf_computing.pool_vidmem);
|
||||
|
||||
pool->manager->num_channel_pools--;
|
||||
}

static void channel_pool_initialize_locks(uvm_channel_pool_t *pool, unsigned num_channels)
{
uvm_lock_order_t order;

channel_pool_lock_init(pool);

if (!g_uvm_global.conf_computing_enabled)
return;

// Use different order lock for SEC2 and WLC channels.
// This allows reserving a SEC2 or WLC channel for indirect work
// submission while holding a reservation for a channel.
if (uvm_channel_pool_is_sec2(pool))
order = UVM_LOCK_ORDER_CSL_SEC2_PUSH;
else if (uvm_channel_pool_is_wlc(pool))
order = UVM_LOCK_ORDER_CSL_WLC_PUSH;
else
order = UVM_LOCK_ORDER_CSL_PUSH;

uvm_sema_init(&pool->conf_computing.push_sem, num_channels, order);

if (uvm_channel_pool_is_wlc(pool))
order = UVM_LOCK_ORDER_KEY_ROTATION_WLC;
else
order = UVM_LOCK_ORDER_KEY_ROTATION;

uvm_mutex_init(&pool->conf_computing.key_rotation.mutex, order);
}
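
The push semaphore above is a counting semaphore with one permit per channel in the pool, and its lock order depends on the pool type so that a SEC2 or WLC channel can still be reserved for indirect work submission while a regular CE channel reservation is already held. A reduced sketch of the order selection with invented enum values:

    /* Hypothetical lock orders; a higher order may be taken while a lower one is held. */
    enum toy_lock_order {
        TOY_ORDER_CSL_PUSH,      /* regular CE pools */
        TOY_ORDER_CSL_WLC_PUSH,  /* WLC pools, usable under a held CE reservation */
        TOY_ORDER_CSL_SEC2_PUSH, /* SEC2 pools, usable under a held CE reservation */
    };

    enum toy_pool_kind { TOY_POOL_CE, TOY_POOL_WLC, TOY_POOL_SEC2 };

    static enum toy_lock_order toy_push_sem_order(enum toy_pool_kind kind)
    {
        switch (kind) {
        case TOY_POOL_SEC2: return TOY_ORDER_CSL_SEC2_PUSH;
        case TOY_POOL_WLC:  return TOY_ORDER_CSL_WLC_PUSH;
        default:            return TOY_ORDER_CSL_PUSH;
        }
    }
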
static NV_STATUS channel_pool_alloc_key_rotation_data(uvm_channel_pool_t *pool, unsigned num_channels)
|
||||
{
|
||||
size_t csl_contexts_size;
|
||||
|
||||
// uvm_conf_computing_is_key_rotation_enabled_in_pool cannot be used to
|
||||
// skip key rotation data initialization, because during GPU initialization
|
||||
// the function always returns false.
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
// CSL contexts associated with LCIC channels are saved in the WLC context
|
||||
// array, not in the LCIC context array, so all the underlying engine
|
||||
// contexts are stored contiguously.
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
return NV_OK;
|
||||
|
||||
if (uvm_channel_pool_is_wlc(pool)) {
|
||||
UVM_ASSERT(channel_manager_num_channels(pool->manager, UVM_CHANNEL_POOL_TYPE_WLC) == num_channels);
|
||||
UVM_ASSERT(channel_manager_num_channels(pool->manager, UVM_CHANNEL_POOL_TYPE_LCIC) == num_channels);
|
||||
|
||||
num_channels *= 2;
|
||||
}
|
||||
|
||||
csl_contexts_size = sizeof(*pool->conf_computing.key_rotation.csl_contexts) * num_channels;
|
||||
pool->conf_computing.key_rotation.csl_contexts = uvm_kvmalloc_zero(csl_contexts_size);
|
||||
|
||||
if (pool->conf_computing.key_rotation.csl_contexts == NULL)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
pool->conf_computing.key_rotation.num_csl_contexts = num_channels;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_pool_alloc_conf_computing_buffers(uvm_channel_pool_t *pool, unsigned num_channels)
|
||||
{
|
||||
uvm_gpu_t *gpu = pool->manager->gpu;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
if (uvm_channel_pool_is_wlc(pool)) {
|
||||
|
||||
// Allocate unprotected sysmem buffers for WLC channels.
|
||||
// The use/substructures are described by WLC_SYSMEM_TOTAL_SIZE
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
WLC_SYSMEM_TOTAL_SIZE * num_channels,
|
||||
WLC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// WLC stores two pushbuffers used by its static schedule in vidmem.
|
||||
// See setup_wlc_schedule for the expected use of each of the static
|
||||
// pushbuffers.
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
WLC_ALIGNED_MAX_PUSH_SIZE * 2 * num_channels,
|
||||
WLC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
else if (uvm_channel_pool_is_lcic(pool)) {
|
||||
|
||||
// LCIC uses only static schedule so in order to use dynamic values
|
||||
// for entry/exit notifiers for its tracking semaphore they need
|
||||
// to be populated in a pre-defined sysmem location, before invoking
|
||||
// the LCIC schedule.
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
sizeof(uvm_gpu_semaphore_notifier_t) * 2 * num_channels,
|
||||
0,
|
||||
&pool->conf_computing.pool_sysmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
// LCIC static schedule pushbuffer is in vidmem
|
||||
status = uvm_rm_mem_alloc(gpu,
|
||||
UVM_RM_MEM_TYPE_GPU,
|
||||
LCIC_ALIGNED_PUSH_SIZE * num_channels,
|
||||
LCIC_PUSHBUFFER_ALIGNMENT,
|
||||
&pool->conf_computing.pool_vidmem);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
else if (uvm_channel_pool_is_ce(pool)) {

// General CE channels need to provide bi-directional communication
// using the pushbuffer. Encrypting an updated push from vidmem
// to sysmem still needs a place for auth tag in sysmem.
status = uvm_rm_mem_alloc_and_map_cpu(gpu,
UVM_RM_MEM_TYPE_SYS,
UVM_CONF_COMPUTING_AUTH_TAG_SIZE * num_channels *
channel_pool_num_gpfifo_entries(pool),
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
&pool->conf_computing.pool_sysmem);
if (status != NV_OK)
return status;
}

status = channel_pool_alloc_key_rotation_data(pool, num_channels);
if (status != NV_OK)
return status;

return NV_OK;
}
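
For WLC pools, the vidmem allocated earlier in this function reserves two aligned pushbuffer slots per channel, which lines up with the static schedule built later in setup_wlc_schedule(): the mutable run push at the channel's base and the constant decrypt push one aligned slot above it. A sketch of that layout arithmetic; the per-channel slicing helper is not shown in this hunk, so treat the offsets below as an assumption:

    #include <stdint.h>

    #define TOY_WLC_ALIGNED_MAX_PUSH_SIZE 4096u /* assumed stand-in for WLC_ALIGNED_MAX_PUSH_SIZE */

    /* Assumed base of the two-slot region owned by WLC channel 'index' in pool vidmem. */
    static uint64_t toy_wlc_vidmem_base(uint64_t pool_vidmem_gpu_va, unsigned index)
    {
        return pool_vidmem_gpu_va + (uint64_t)index * 2u * TOY_WLC_ALIGNED_MAX_PUSH_SIZE;
    }

    /* The decrypt push lives one aligned slot above the run push. */
    static uint64_t toy_wlc_decrypt_push_va(uint64_t channel_vidmem_base)
    {
        return channel_vidmem_base + TOY_WLC_ALIGNED_MAX_PUSH_SIZE;
    }
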
static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
uvm_channel_pool_type_t pool_type,
|
||||
unsigned engine_index,
|
||||
@ -2664,7 +2321,7 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
pool->engine_index = engine_index;
|
||||
pool->pool_type = pool_type;
|
||||
|
||||
num_tsgs = channel_manager_num_tsgs(channel_manager, pool_type);
|
||||
num_tsgs = channel_pool_type_num_tsgs(pool_type);
|
||||
if (num_tsgs != 0) {
|
||||
pool->tsg_handles = uvm_kvmalloc_zero(sizeof(*pool->tsg_handles) * num_tsgs);
|
||||
if (!pool->tsg_handles) {
|
||||
@ -2681,13 +2338,21 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
}
|
||||
}
|
||||
|
||||
num_channels = channel_manager_num_channels(channel_manager, pool_type);
|
||||
channel_pool_lock_init(pool);
|
||||
|
||||
channel_pool_initialize_locks(pool, num_channels);
|
||||
num_channels = channel_pool_type_num_channels(pool_type);
|
||||
UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
|
||||
status = channel_pool_alloc_conf_computing_buffers(pool, num_channels);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
// Use different order lock for SEC2 and WLC channels.
|
||||
// This allows reserving a SEC2 or WLC channel for indirect work
|
||||
// submission while holding a reservation for a channel.
|
||||
uvm_lock_order_t order = uvm_channel_pool_is_sec2(pool) ? UVM_LOCK_ORDER_CSL_SEC2_PUSH :
|
||||
(uvm_channel_pool_is_wlc(pool) ? UVM_LOCK_ORDER_CSL_WLC_PUSH :
|
||||
UVM_LOCK_ORDER_CSL_PUSH);
|
||||
|
||||
uvm_sema_init(&pool->push_sem, num_channels, order);
|
||||
}
|
||||
|
||||
pool->channels = uvm_kvmalloc_zero(sizeof(*pool->channels) * num_channels);
|
||||
if (!pool->channels) {
|
||||
@ -2715,41 +2380,24 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
|
||||
return status;
|
||||
}
|
||||
|
||||
static bool ce_is_usable(const UvmGpuCopyEngineCaps *cap)
|
||||
static bool ce_usable_for_channel_type(uvm_channel_type_t type, const UvmGpuCopyEngineCaps *cap)
|
||||
{
|
||||
return cap->supported && !cap->grce;
|
||||
}
|
||||
if (!cap->supported || cap->grce)
|
||||
return false;
|
||||
|
||||
// Check that all asynchronous CEs are usable, and that there is at least one
|
||||
// such CE.
|
||||
static NV_STATUS ces_validate(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ces_caps)
|
||||
{
|
||||
unsigned ce;
|
||||
bool found_usable_ce = false;
|
||||
|
||||
for (ce = 0; ce < UVM_COPY_ENGINE_COUNT_MAX; ++ce) {
|
||||
const UvmGpuCopyEngineCaps *ce_caps = ces_caps + ce;
|
||||
|
||||
if (!ce_is_usable(ce_caps))
|
||||
continue;
|
||||
|
||||
found_usable_ce = true;
|
||||
|
||||
// All channels may need to release their semaphore to sysmem.
|
||||
// All CEs are expected to have the sysmem flag set.
|
||||
if (!ce_caps->sysmem)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
// While P2P capabilities are only required for transfers between GPUs,
|
||||
// in practice all CEs are expected to have the corresponding flag set.
|
||||
if (!ce_caps->p2p)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
switch (type) {
|
||||
case UVM_CHANNEL_TYPE_CPU_TO_GPU:
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_CPU:
|
||||
return cap->sysmem;
|
||||
case UVM_CHANNEL_TYPE_GPU_INTERNAL:
|
||||
case UVM_CHANNEL_TYPE_MEMOPS:
|
||||
return true;
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_GPU:
|
||||
return cap->p2p;
|
||||
default:
|
||||
UVM_ASSERT_MSG(false, "Unexpected channel type 0x%x\n", type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!found_usable_ce)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static unsigned ce_usage_count(NvU32 ce, const unsigned *preferred_ce)
|
||||
@ -2778,13 +2426,15 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
const UvmGpuCopyEngineCaps *cap0 = ce_caps + ce_index0;
|
||||
const UvmGpuCopyEngineCaps *cap1 = ce_caps + ce_index1;
|
||||
|
||||
UVM_ASSERT(ce_usable_for_channel_type(type, cap0));
|
||||
UVM_ASSERT(ce_usable_for_channel_type(type, cap1));
|
||||
UVM_ASSERT(ce_index0 < UVM_COPY_ENGINE_COUNT_MAX);
|
||||
UVM_ASSERT(ce_index1 < UVM_COPY_ENGINE_COUNT_MAX);
|
||||
UVM_ASSERT(ce_index0 != ce_index1);
|
||||
|
||||
switch (type) {
|
||||
// For CPU to GPU fast sysmem read is the most important
|
||||
case UVM_CHANNEL_TYPE_CPU_TO_GPU:
|
||||
// For CPU to GPU fast sysmem read is the most important
|
||||
if (cap0->sysmemRead != cap1->sysmemRead)
|
||||
return cap1->sysmemRead - cap0->sysmemRead;
|
||||
|
||||
@ -2794,8 +2444,8 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||
|
||||
// For GPU to CPU fast sysmem write is the most important
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_CPU:
|
||||
// For GPU to CPU fast sysmem write is the most important
|
||||
if (cap0->sysmemWrite != cap1->sysmemWrite)
|
||||
return cap1->sysmemWrite - cap0->sysmemWrite;
|
||||
|
||||
@ -2805,8 +2455,8 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||
|
||||
// For GPU to GPU prefer the LCE with the most PCEs
|
||||
case UVM_CHANNEL_TYPE_GPU_TO_GPU:
|
||||
// Prefer the LCE with the most PCEs
|
||||
{
|
||||
int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask);
|
||||
|
||||
@ -2816,10 +2466,10 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||
|
||||
// For GPU_INTERNAL we want the max possible bandwidth for CEs. For now
|
||||
// assume that the number of PCEs is a good measure.
|
||||
// TODO: Bug 1735254: Add a direct CE query for local FB bandwidth
|
||||
case UVM_CHANNEL_TYPE_GPU_INTERNAL:
|
||||
// We want the max possible bandwidth for CEs used for GPU_INTERNAL,
|
||||
// for now assume that the number of PCEs is a good measure.
|
||||
// TODO: Bug 1735254: Add a direct CE query for local FB bandwidth
|
||||
{
|
||||
int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask);
|
||||
|
||||
@ -2833,15 +2483,11 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
|
||||
|
||||
break;
|
||||

// For MEMOPS we mostly care about latency which should be better with
// less used CEs (although we only know about our own usage and not
// system-wide) so just break out to get the default ordering which
// prioritizes usage count.
case UVM_CHANNEL_TYPE_MEMOPS:
// For WLC we only care about using a dedicated CE, which requires
// knowing the global CE mappings. For now just rely on the default
// ordering, which results on selecting an unused CE (if available).
case UVM_CHANNEL_TYPE_WLC:
// For MEMOPS we mostly care about latency which should be better
// with less used CEs (although we only know about our own usage and
// not system-wide) so just break out to get the default ordering
// which prioritizes usage count.
break;

default:
@@ -2864,104 +2510,54 @@ static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps,
return ce_index0 - ce_index1;
}
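
The comparator above orders two candidate CEs for a channel type in three stages: a type-specific primary metric (sysmem read bandwidth for CPU_TO_GPU, sysmem write bandwidth for GPU_TO_CPU, PCE count for GPU_TO_GPU and GPU_INTERNAL), then fewer existing assignments, then the lower engine index. A simplified standalone comparator showing the same ordering, with invented capability fields:

    /* Hypothetical subset of the capability data used for ordering. */
    struct toy_ce_caps {
        int primary_metric; /* e.g. sysmem read/write speed or PCE count */
        int usage_count;    /* how many channel types already picked this CE */
        int index;          /* engine index, final tie-breaker */
    };

    /* Negative result means 'a' is the better choice, mirroring the comparator above. */
    static int toy_compare_ce(const struct toy_ce_caps *a, const struct toy_ce_caps *b)
    {
        if (a->primary_metric != b->primary_metric)
            return b->primary_metric - a->primary_metric; /* higher metric wins */
        if (a->usage_count != b->usage_count)
            return a->usage_count - b->usage_count;       /* less used wins */
        return a->index - b->index;                       /* lower index wins */
    }
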
// Select the preferred CE for the given channel types.
|
||||
static void pick_ces_for_channel_types(uvm_channel_manager_t *manager,
|
||||
// Identify usable CEs, and select the preferred CE for a given channel type.
|
||||
static NV_STATUS pick_ce_for_channel_type(uvm_channel_manager_t *manager,
|
||||
const UvmGpuCopyEngineCaps *ce_caps,
|
||||
uvm_channel_type_t *channel_types,
|
||||
unsigned num_channel_types,
|
||||
uvm_channel_type_t type,
|
||||
unsigned *preferred_ce)
|
||||
{
|
||||
unsigned i;
|
||||
NvU32 i;
|
||||
NvU32 best_ce = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
|
||||
// In Confidential Computing, do not mark all usable CEs, only the preferred
|
||||
// ones, because non-preferred CE channels are guaranteed to not be used.
|
||||
bool mark_all_usable_ces = !g_uvm_global.conf_computing_enabled;
|
||||
UVM_ASSERT(type < UVM_CHANNEL_TYPE_CE_COUNT);
|
||||
|
||||
for (i = 0; i < num_channel_types; ++i) {
|
||||
unsigned ce;
|
||||
unsigned best_ce = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
uvm_channel_type_t type = channel_types[i];
|
||||
for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) {
|
||||
const UvmGpuCopyEngineCaps *cap = ce_caps + i;
|
||||
|
||||
for (ce = 0; ce < UVM_COPY_ENGINE_COUNT_MAX; ++ce) {
|
||||
if (!ce_is_usable(ce_caps + ce))
|
||||
if (!ce_usable_for_channel_type(type, cap))
|
||||
continue;
|
||||
|
||||
if (mark_all_usable_ces)
|
||||
__set_bit(ce, manager->ce_mask);
|
||||
__set_bit(i, manager->ce_mask);
|
||||
|
||||
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
|
||||
best_ce = ce;
|
||||
best_ce = i;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (compare_ce_for_channel_type(ce_caps, type, ce, best_ce, preferred_ce) < 0)
|
||||
best_ce = ce;
|
||||
if (compare_ce_for_channel_type(ce_caps, type, i, best_ce, preferred_ce) < 0)
|
||||
best_ce = i;
|
||||
}

UVM_ASSERT(best_ce != UVM_COPY_ENGINE_COUNT_MAX);
if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) {
UVM_ERR_PRINT("Failed to find a suitable CE for channel type %s\n", uvm_channel_type_to_string(type));
return NV_ERR_NOT_SUPPORTED;
}

preferred_ce[type] = best_ce;

// Preferred CEs are always marked as usable.
if (type < UVM_CHANNEL_TYPE_CE_COUNT)
__set_bit(best_ce, manager->ce_mask);
}
return NV_OK;
}
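
The "default ordering which prioritizes usage count" mentioned in the MEMOPS and WLC comments is driven by counting how many channel types have already been assigned to a given CE, so ties fall to an engine that is still unused. A small sketch of such a count over a preferred-CE table; slots not yet assigned are assumed to hold an out-of-range sentinel and therefore never match a real engine index:

    /* How many entries of preferred_ce[] currently point at 'ce'. */
    static unsigned toy_ce_usage_count(unsigned ce,
                                       const unsigned *preferred_ce,
                                       unsigned num_channel_types)
    {
        unsigned i;
        unsigned count = 0;

        for (i = 0; i < num_channel_types; i++) {
            if (preferred_ce[i] == ce)
                count++;
        }
        return count;
    }
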
static void pick_ces(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ce_caps, unsigned *preferred_ce)
|
||||
static NV_STATUS channel_manager_pick_copy_engines(uvm_channel_manager_t *manager, unsigned *preferred_ce)
|
||||
{
|
||||
// The order of picking CEs for each type matters as it's affected by
|
||||
// the usage count of each CE and it increases every time a CE
|
||||
// is selected. MEMOPS has the least priority as it only cares about
|
||||
// low usage of the CE to improve latency
|
||||
NV_STATUS status;
|
||||
unsigned i;
|
||||
UvmGpuCopyEnginesCaps *ces_caps;
|
||||
uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_CPU,
|
||||
UVM_CHANNEL_TYPE_GPU_INTERNAL,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_GPU,
|
||||
UVM_CHANNEL_TYPE_MEMOPS};
|
||||
|
||||
UVM_ASSERT(!g_uvm_global.conf_computing_enabled);
|
||||
|
||||
pick_ces_for_channel_types(manager, ce_caps, types, ARRAY_SIZE(types), preferred_ce);
|
||||
}
|
||||
|
||||
static void pick_ces_conf_computing(uvm_channel_manager_t *manager,
|
||||
const UvmGpuCopyEngineCaps *ce_caps,
|
||||
unsigned *preferred_ce)
|
||||
{
|
||||
unsigned best_wlc_ce;
|
||||
|
||||
// The WLC type must go last so an unused CE is chosen, if available
|
||||
uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU,
|
||||
UVM_CHANNEL_TYPE_GPU_TO_CPU,
|
||||
UVM_CHANNEL_TYPE_GPU_INTERNAL,
|
||||
UVM_CHANNEL_TYPE_MEMOPS,
|
||||
UVM_CHANNEL_TYPE_WLC};
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
pick_ces_for_channel_types(manager, ce_caps, types, ARRAY_SIZE(types), preferred_ce);
|
||||
|
||||
// Direct transfers between GPUs are disallowed in Confidential Computing,
|
||||
// but the preferred CE is still set to an arbitrary value for consistency.
|
||||
preferred_ce[UVM_CHANNEL_TYPE_GPU_TO_GPU] = preferred_ce[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
|
||||
best_wlc_ce = preferred_ce[UVM_CHANNEL_TYPE_WLC];
|
||||
|
||||
// TODO: Bug 4576908: in HCC, the WLC type should not share a CE with any
|
||||
// channel type other than LCIC. The assertion should be a check instead.
|
||||
UVM_ASSERT(ce_usage_count(best_wlc_ce, preferred_ce) == 0);
|
||||
}
|
||||
|
||||
static NV_STATUS channel_manager_pick_ces(uvm_channel_manager_t *manager, unsigned *preferred_ce)
|
||||
{
|
||||
NV_STATUS status;
|
||||
UvmGpuCopyEnginesCaps *ces_caps;
|
||||
uvm_channel_type_t type;
|
||||
|
||||
for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; type++)
|
||||
preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
|
||||
ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps));
|
||||
if (!ces_caps)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
@ -2970,14 +2566,16 @@ static NV_STATUS channel_manager_pick_ces(uvm_channel_manager_t *manager, unsign
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
status = ces_validate(manager, ces_caps->copyEngineCaps);
|
||||
// The order of picking CEs for each type matters as it's affected by the
|
||||
// usage count of each CE and it increases every time a CE is selected.
|
||||
// MEMOPS has the least priority as it only cares about low usage of the
|
||||
// CE to improve latency
|
||||
for (i = 0; i < ARRAY_SIZE(types); ++i) {
|
||||
status = pick_ce_for_channel_type(manager, ces_caps->copyEngineCaps, types[i], preferred_ce);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled)
|
||||
pick_ces_conf_computing(manager, ces_caps->copyEngineCaps, preferred_ce);
|
||||
else
|
||||
pick_ces(manager, ces_caps->copyEngineCaps, preferred_ce);
|
||||
out:
|
||||
uvm_kvfree(ces_caps);
|
||||
|
||||
@ -2986,16 +2584,18 @@ out:
|
||||
|
||||
// Return the pool corresponding to the given CE index
|
||||
//
|
||||
// This function cannot be used to access the proxy pool in SR-IOV heavy.
|
||||
// Used to retrieve pools of type UVM_CHANNEL_POOL_TYPE_CE only.
|
||||
static uvm_channel_pool_t *channel_manager_ce_pool(uvm_channel_manager_t *manager, NvU32 ce)
|
||||
{
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_channel_pool_t *pool = uvm_channel_pool_first(manager, UVM_CHANNEL_POOL_TYPE_CE);
|
||||
|
||||
UVM_ASSERT(pool != NULL);
|
||||
UVM_ASSERT(test_bit(ce, manager->ce_mask));
|
||||
|
||||
// The index of the pool associated with 'ce' is the number of usable CEs
|
||||
// in [0, ce)
|
||||
pool = manager->channel_pools + bitmap_weight(manager->ce_mask, ce);
|
||||
// Pools of type UVM_CHANNEL_POOL_TYPE_CE are stored contiguously. The
|
||||
// offset of the pool associated with 'ce' is the number of usable CEs in
|
||||
// [0, ce).
|
||||
pool += bitmap_weight(manager->ce_mask, ce);
|
||||
|
||||
UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE);
|
||||
UVM_ASSERT(pool->engine_index == ce);
|
||||
@ -3041,7 +2641,7 @@ static const char *buffer_location_to_string(UVM_BUFFER_LOCATION loc)
|
||||
else if (loc == UVM_BUFFER_LOCATION_DEFAULT)
|
||||
return "auto";
|
||||
|
||||
UVM_ASSERT_MSG(false, "Invalid buffer location value %d\n", loc);
|
||||
UVM_ASSERT_MSG(false, "Invalid buffer locationvalue %d\n", loc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -3213,27 +2813,28 @@ static unsigned channel_manager_get_max_pools(uvm_channel_manager_t *manager)
|
||||
static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce)
|
||||
{
|
||||
unsigned ce;
|
||||
unsigned type;
|
||||
|
||||
// A pool is created for each usable CE, even if it has not been selected as
|
||||
// the preferred CE for any type, because as more information is discovered
|
||||
// (for example, a pair of peer GPUs is added) we may start using the
|
||||
// previously idle pools. Configurations where non-preferred CEs are
|
||||
// guaranteed to remain unused are allowed to avoid marking those engines as
|
||||
// usable.
|
||||
// previously idle pools.
|
||||
for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) {
|
||||
NV_STATUS status;
|
||||
unsigned type;
|
||||
uvm_channel_pool_t *pool = NULL;
|
||||
|
||||
status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) {
|
||||
// Set pool type if it hasn't been set before.
|
||||
if (preferred_ce[type] == ce && manager->pool_to_use.default_for_type[type] == NULL)
|
||||
manager->pool_to_use.default_for_type[type] = pool;
|
||||
}
|
||||
// Avoid overwriting previously set defaults.
|
||||
if (manager->pool_to_use.default_for_type[type] != NULL)
|
||||
continue;
|
||||
|
||||
ce = preferred_ce[type];
|
||||
manager->pool_to_use.default_for_type[type] = channel_manager_ce_pool(manager, ce);
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
@ -3242,8 +2843,11 @@ static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager,
|
||||
static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(wlc);
|
||||
NvU64 protected_vidmem_gpu_va = uvm_channel_get_static_pb_protected_vidmem_gpu_va(wlc);
|
||||
NvU64 unprotected_sysmem_gpu_va = get_channel_unprotected_sysmem_gpu_va(wlc);
|
||||
NvU64 protected_vidmem = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
NvU64 unprotected_sysmem_gpu = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
|
||||
void *unprotected_sysmem_cpu = wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
NvU64 tag_offset = (uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu -
|
||||
(uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
|
||||
NvU64 *wlc_gpfifo_entries;
|
||||
uvm_push_t wlc_decrypt_push, sec2_push;
|
||||
@ -3251,30 +2855,21 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
int i;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
// "gpfifo" is the representation of GPFIFO copied to gpFifoGpuVa.
|
||||
// Resuse static pushbuffer sysmem location for uploading GPFIFO schedule
|
||||
// "gpfifo" is the representation of GPFIFO copied to gpFifoGpu
|
||||
const size_t gpfifo_size = wlc->num_gpfifo_entries * sizeof(*wlc_gpfifo_entries);
|
||||
NvU64 gpfifo_unprotected_gpu_va = unprotected_sysmem_gpu_va;
|
||||
void *gpfifo_unprotected_cpu = get_channel_unprotected_sysmem_cpu(wlc);
|
||||
void *gpfifo_unprotected_cpu = unprotected_sysmem_cpu;
|
||||
NvU64 gpfifo_unprotected_gpu = unprotected_sysmem_gpu;
|
||||
|
||||
// "run_push" represents mutable push location used by WLC. This is the
|
||||
// first part of the WLC schedule, commands are decrypted as part of the
|
||||
// launch sequence to protected_vidmem_gpu_va + 0.
|
||||
// These locations are used in the static part ("decrypt_push") of the WLC schedule.
|
||||
uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem_gpu_va);
|
||||
uvm_gpu_address_t run_push_unprotected_gpu =
|
||||
uvm_gpu_address_virtual_unprotected(unprotected_sysmem_gpu_va + WLC_SYSMEM_PUSHBUFFER_OFFSET);
|
||||
uvm_gpu_address_t run_push_unprotected_auth_tag_gpu =
|
||||
uvm_gpu_address_virtual_unprotected(unprotected_sysmem_gpu_va + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET);
|
||||
// "run_push" represents mutable push location used by WLC
|
||||
uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem);
|
||||
uvm_gpu_address_t run_push_unprotected_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu);
|
||||
uvm_gpu_address_t run_push_unprotected_auth_tag_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu + tag_offset);
|
||||
|
||||
// "decrypt_push" represents WLC decrypt push, constructed using fake_push.
|
||||
// Copied to protected_vidmem_gpu_va + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
|
||||
// Copied to wlc_pb_base + UVM_MAX_WLC_PUSH_SIZE, as the second of the two
|
||||
// pushes that make the WLC fixed schedule.
|
||||
NvU64 decrypt_push_protected_gpu_va = protected_vidmem_gpu_va + WLC_ALIGNED_MAX_PUSH_SIZE;
|
||||
|
||||
// Similar to gpfifo, uploading the "decrypt_push" reuses static sysmem
|
||||
// locations later used for "run_push" when the WLC/LCIC schedule is active
|
||||
NvU64 decrypt_push_unprotected_gpu_va = gpfifo_unprotected_gpu_va + gpfifo_size;
|
||||
NvU64 decrypt_push_protected_gpu = UVM_ALIGN_UP(protected_vidmem + UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT);
|
||||
NvU64 decrypt_push_unprotected_gpu = unprotected_sysmem_gpu + gpfifo_size;
|
||||
void *decrypt_push_unprotected_cpu = (char*)gpfifo_unprotected_cpu + gpfifo_size;
|
||||
|
||||
// Tags for upload via SEC2
|
||||
@ -3284,6 +2879,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
BUILD_BUG_ON(sizeof(*wlc_gpfifo_entries) != sizeof(*wlc->channel_info.gpFifoEntries));
|
||||
|
||||
UVM_ASSERT(uvm_channel_is_wlc(wlc));
|
||||
UVM_ASSERT(tag_offset == UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT));
|
||||
|
||||
// WLC schedule consists of two parts, the number of entries needs to be even.
|
||||
// This also guarantees that the size is 16B aligned
|
||||
@ -3330,7 +2926,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
for (i = 0; i < wlc->num_gpfifo_entries; ++i) {
|
||||
if (i % 2 == wlc->cpu_put % 2) {
|
||||
gpu->parent->host_hal->set_gpfifo_entry(wlc_gpfifo_entries + i,
|
||||
decrypt_push_protected_gpu_va,
|
||||
decrypt_push_protected_gpu,
|
||||
decrypt_push_size,
|
||||
UVM_GPFIFO_SYNC_PROCEED);
|
||||
}
|
||||
@ -3368,8 +2964,8 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
decrypt_push_size,
|
||||
decrypt_push_auth_tag);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
decrypt_push_protected_gpu_va,
|
||||
decrypt_push_unprotected_gpu_va,
|
||||
decrypt_push_protected_gpu,
|
||||
decrypt_push_unprotected_gpu,
|
||||
decrypt_push_size,
|
||||
decrypt_push_auth_tag_gpu.address);
|
||||
|
||||
@ -3382,7 +2978,7 @@ static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc)
|
||||
gpfifo_auth_tag);
|
||||
gpu->parent->sec2_hal->decrypt(&sec2_push,
|
||||
wlc->channel_info.gpFifoGpuVa,
|
||||
gpfifo_unprotected_gpu_va,
|
||||
gpfifo_unprotected_gpu,
|
||||
gpfifo_size,
|
||||
gpfifo_auth_tag_gpu.address);
|
||||
|
||||
@ -3404,22 +3000,23 @@ free_gpfifo_entries:
|
||||
static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *lcic)
|
||||
{
|
||||
uvm_gpu_t *gpu = uvm_channel_get_gpu(lcic);
|
||||
NvU64 lcic_pb_base = uvm_channel_get_static_pb_protected_vidmem_gpu_va(lcic);
|
||||
NvU64 lcic_pb_base = uvm_rm_mem_get_gpu_uvm_va(lcic->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
|
||||
// Reuse WLC sysmem allocation
|
||||
NvU64 gpu_unprotected = get_channel_unprotected_sysmem_gpu_va(paired_wlc);
|
||||
char *cpu_unprotected = get_channel_unprotected_sysmem_cpu(paired_wlc);
|
||||
|
||||
uvm_gpu_semaphore_t *lcic_semaphore = &lcic->tracking_sem.semaphore;
|
||||
|
||||
uvm_gpu_address_t notifier_src_entry_addr = lcic_static_entry_notifier_gpu_va(lcic);
|
||||
uvm_gpu_address_t notifier_src_exit_addr = lcic_static_exit_notifier_gpu_va(lcic);
|
||||
uvm_gpu_address_t notifier_dst_addr = uvm_gpu_semaphore_get_notifier_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t encrypted_payload_gpu_va = uvm_gpu_semaphore_get_encrypted_payload_gpu_va(lcic_semaphore);
|
||||
uvm_gpu_address_t auth_tag_gpu_va = uvm_gpu_semaphore_get_auth_tag_gpu_va(lcic_semaphore);
|
||||
NvU64 gpu_unprotected = uvm_rm_mem_get_gpu_uvm_va(paired_wlc->conf_computing.static_pb_unprotected_sysmem, gpu);
|
||||
char *cpu_unprotected = paired_wlc->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
uvm_gpu_semaphore_t *lcic_gpu_semaphore = &lcic->tracking_sem.semaphore;
|
||||
uvm_gpu_address_t notifier_src_entry_addr = lcic->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t notifier_src_exit_addr = lcic->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t notifier_dst_addr = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.notifier,
|
||||
gpu,
|
||||
false);
|
||||
uvm_gpu_address_t encrypted_payload_gpu_va =
|
||||
uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.encrypted_payload, gpu, false);
|
||||
uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(uvm_channel_tracking_semaphore_get_gpu_va(lcic));
|
||||
NvU32 payload_size = sizeof(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(lcic_semaphore));
|
||||
NvU32 notifier_size = sizeof(uvm_gpu_semaphore_notifier_t);
|
||||
uvm_gpu_address_t auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.auth_tag, gpu, false);
|
||||
NvU32 payload_size = sizeof(*lcic->tracking_sem.semaphore.payload);
|
||||
NvU32 notifier_size = sizeof(*lcic->conf_computing.static_notifier_entry_unprotected_sysmem_cpu);
|
||||
|
||||
NvU64 *lcic_gpfifo_entries;
|
||||
uvm_push_t lcic_push, sec2_push;
|
||||
@ -3475,11 +3072,7 @@ static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *l
|
||||
0xffffffff);
|
||||
|
||||
gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_entry_addr, notifier_size);
|
||||
|
||||
// This CE encryption does not need to be logged, it will be logged on every
|
||||
// push_end instead
|
||||
gpu->parent->ce_hal->encrypt(&lcic_push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va);
|
||||
|
||||
gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_exit_addr, notifier_size);
|
||||
|
||||
// End LCIC push
|
||||
@ -3553,7 +3146,6 @@ static NV_STATUS channel_manager_setup_wlc_lcic(uvm_channel_pool_t *wlc_pool, uv
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(wlc_pool->manager == lcic_pool->manager);
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(wlc_pool->manager));
|
||||
UVM_ASSERT(wlc_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL);
|
||||
UVM_ASSERT(lcic_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] == NULL);
|
||||
UVM_ASSERT(wlc_pool->num_channels == lcic_pool->num_channels);
|
||||
@ -3602,8 +3194,12 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
|
||||
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_SEC2] = sec2_pool;
|
||||
|
||||
// WLC and LCIC must use the same engine for the fixed schedule to work.
|
||||
wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_WLC];
|
||||
// Use the same CE as CPU TO GPU channels for WLC/LCIC
|
||||
// Both need to use the same engine for the fixed schedule to work.
|
||||
// TODO: Bug 3981928: [hcc][uvm] Optimize parameters of WLC/LCIC secure
|
||||
// work launch
|
||||
// Find a metric to select the best CE to use
|
||||
wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
|
||||
// Create WLC/LCIC pools. This should be done early, CE channels use
|
||||
// them for secure launch. The WLC pool must be created before the LCIC.
|
||||
@ -3626,19 +3222,20 @@ static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager
|
||||
// are ready to be used for secure work submission.
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = lcic_pool;
|
||||
|
||||
// WLC and LCIC pools are ready
|
||||
manager->conf_computing.wlc_ready = true;
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_manager_create_pools(uvm_channel_manager_t *manager)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_type_t type;
|
||||
unsigned max_channel_pools;
|
||||
unsigned preferred_ce[UVM_CHANNEL_TYPE_COUNT];
|
||||
unsigned preferred_ce[UVM_CHANNEL_TYPE_CE_COUNT];
|
||||
|
||||
status = channel_manager_pick_ces(manager, preferred_ce);
|
||||
for (type = 0; type < ARRAY_SIZE(preferred_ce); type++)
|
||||
preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX;
|
||||
|
||||
status = channel_manager_pick_copy_engines(manager, preferred_ce);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
@ -3681,8 +3278,6 @@ NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **cha
|
||||
if (!channel_manager)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
*channel_manager_out = channel_manager;
|
||||
|
||||
channel_manager->gpu = gpu;
|
||||
init_channel_manager_conf(channel_manager);
|
||||
status = uvm_pushbuffer_create(channel_manager, &channel_manager->pushbuffer);
|
||||
@ -3701,18 +3296,12 @@ NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **cha
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
// Key rotation is enabled only after all the channels have been created:
|
||||
// RM does not support channel allocation on an engine if key rotation is
|
||||
// pending on that engine. This can become a problem during testing if
|
||||
// key rotation thresholds are very low.
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
*channel_manager_out = channel_manager;
|
||||
|
||||
return NV_OK;
|
||||
return status;
|
||||
|
||||
error:
|
||||
*channel_manager_out = NULL;
|
||||
uvm_channel_manager_destroy(channel_manager);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -3763,7 +3352,8 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager)
|
||||
if (status != NV_OK)
|
||||
UVM_ERR_PRINT_NV_STATUS("Failed to end stop push for WLC", status);
|
||||
|
||||
manager->conf_computing.wlc_ready = false;
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] = NULL;
|
||||
manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = NULL;
|
||||
}
|
||||
|
||||
void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
|
||||
@ -3785,14 +3375,6 @@ void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
|
||||
uvm_kvfree(channel_manager);
|
||||
}
|
||||
|
||||
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
pool = get_paired_pool(pool);
|
||||
|
||||
return pool->conf_computing.key_rotation.version;
|
||||
}
|
||||
|
||||
bool uvm_channel_is_privileged(uvm_channel_t *channel)
|
||||
{
|
||||
if (uvm_parent_gpu_is_virt_mode_sriov_heavy(uvm_channel_get_gpu(channel)->parent))
|
||||
@ -3914,7 +3496,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "get %u\n", channel->gpu_get);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "put %u\n", channel->cpu_put);
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA 0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)uvm_gpu_semaphore_get_cpu_va(&channel->tracking_sem.semaphore));
|
||||
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);
|
||||
|
||||
channel_pool_unlock(channel->pool);
|
||||
}
|
||||
|
@ -228,65 +228,21 @@ typedef struct
|
||||
// variant is required when the thread holding the pool lock must sleep
|
||||
// (ex: acquire another mutex) deeper in the call stack, either in UVM or
|
||||
// RM.
|
||||
union
|
||||
{
|
||||
union {
|
||||
uvm_spinlock_t spinlock;
|
||||
uvm_mutex_t mutex;
|
||||
};
|
||||
|
||||
struct
|
||||
{
|
||||
// Secure operations require that uvm_push_begin order matches
|
||||
// uvm_push_end order, because the engine's state is used in its
|
||||
// internal operation and each push may modify this state.
|
||||
// push_locks is protected by the channel pool lock.
|
||||
// uvm_push_end order, because the engine's state is used in its internal
|
||||
// operation and each push may modify this state. push_locks is protected by
|
||||
// the channel pool lock.
|
||||
DECLARE_BITMAP(push_locks, UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
|
||||
|
||||
// Counting semaphore for available and unlocked channels, it must be
|
||||
// acquired before submitting work to a channel when the Confidential
|
||||
// Computing feature is enabled.
|
||||
uvm_semaphore_t push_sem;
|
||||
|
||||
// Per channel buffers in unprotected sysmem.
|
||||
uvm_rm_mem_t *pool_sysmem;
|
||||
|
||||
// Per channel buffers in protected vidmem.
|
||||
uvm_rm_mem_t *pool_vidmem;
|
||||
|
||||
struct
|
||||
{
|
||||
// Current encryption key version, incremented upon key rotation.
|
||||
// While there are separate keys for encryption and decryption, the
|
||||
// two keys are rotated at once, so the versioning applies to both.
|
||||
NvU32 version;
|
||||
|
||||
// Lock used to ensure mutual exclusion during key rotation.
|
||||
uvm_mutex_t mutex;
|
||||
|
||||
// CSL contexts passed to RM for key rotation. This is usually an
|
||||
// array containing the CSL contexts associated with the channels in
|
||||
// the pool. In the case of the WLC pool, the array also includes
|
||||
// CSL contexts associated with LCIC channels.
|
||||
UvmCslContext **csl_contexts;
|
||||
|
||||
// Number of elements in the CSL context array.
|
||||
unsigned num_csl_contexts;
|
||||
|
||||
// Number of bytes encrypted, or decrypted, on the engine associated
|
||||
// with the pool since the last key rotation. Only used during
|
||||
// testing, to force key rotations after a certain encryption size,
|
||||
// see UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD.
|
||||
//
|
||||
// Encryptions on a LCIC pool are accounted for in the paired WLC
|
||||
// pool.
|
||||
//
|
||||
// TODO: Bug 4612912: these accounting variables can be removed once
|
||||
// RM exposes an API to set the key rotation lower threshold.
|
||||
atomic64_t encrypted;
|
||||
atomic64_t decrypted;
|
||||
} key_rotation;
|
||||
|
||||
} conf_computing;
|
||||
} uvm_channel_pool_t;
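// Illustrative sketch, not part of this change: how the push_locks/push_sem
// pair described above could gate secure work submission when Confidential
// Computing is enabled. The helper names uvm_down() and channel_pool_lock()/
// channel_pool_unlock() are assumed here for illustration only.
static uvm_channel_t *sketch_reserve_secure_channel(uvm_channel_pool_t *pool)
{
    NvU32 i;

    // Block until at least one channel is both available and unlocked
    uvm_down(&pool->conf_computing.push_sem);

    channel_pool_lock(pool);

    // Take the first channel whose push lock is free, so that the
    // uvm_push_begin/uvm_push_end ordering required by the engine state is
    // preserved on that channel.
    for (i = 0; i < pool->num_channels; i++) {
        if (!test_bit(i, pool->conf_computing.push_locks)) {
            __set_bit(i, pool->conf_computing.push_locks);
            channel_pool_unlock(pool);
            return pool->channels + i;
        }
    }

    // Unreachable if the push_sem accounting is correct
    channel_pool_unlock(pool);
    return NULL;
}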
struct uvm_channel_struct
|
||||
@ -366,14 +322,43 @@ struct uvm_channel_struct
|
||||
// work launches to match the order of push end-s that triggered them.
|
||||
volatile NvU32 gpu_put;
|
||||
|
||||
// Protected sysmem location makes WLC independent from the pushbuffer
|
||||
// allocator. Unprotected sysmem and protected vidmem counterparts
|
||||
// are allocated from the channel pool (sysmem, vidmem).
|
||||
// Static pushbuffer for channels with static schedule (WLC/LCIC)
|
||||
uvm_rm_mem_t *static_pb_protected_vidmem;
|
||||
|
||||
// Static pushbuffer staging buffer for WLC
|
||||
uvm_rm_mem_t *static_pb_unprotected_sysmem;
|
||||
void *static_pb_unprotected_sysmem_cpu;
|
||||
void *static_pb_unprotected_sysmem_auth_tag_cpu;
|
||||
|
||||
// The above static locations are required by the WLC (and LCIC)
|
||||
// schedule. Protected sysmem location completes WLC's independence
|
||||
// from the pushbuffer allocator.
|
||||
void *static_pb_protected_sysmem;
|
||||
|
||||
// Static tracking semaphore notifier values
|
||||
// Because of LCIC's fixed schedule, the secure semaphore release
|
||||
// mechanism uses two additional static locations for incrementing the
|
||||
// notifier values. See:
|
||||
// . channel_semaphore_secure_release()
|
||||
// . setup_lcic_schedule()
|
||||
// . internal_channel_submit_work_wlc()
|
||||
uvm_rm_mem_t *static_notifier_unprotected_sysmem;
|
||||
NvU32 *static_notifier_entry_unprotected_sysmem_cpu;
|
||||
NvU32 *static_notifier_exit_unprotected_sysmem_cpu;
|
||||
uvm_gpu_address_t static_notifier_entry_unprotected_sysmem_gpu_va;
|
||||
uvm_gpu_address_t static_notifier_exit_unprotected_sysmem_gpu_va;
|
||||
|
||||
// Explicit location for push launch tag used by WLC.
|
||||
// Encryption auth tags have to be located in unprotected sysmem.
|
||||
void *launch_auth_tag_cpu;
|
||||
NvU64 launch_auth_tag_gpu_va;
|
||||
|
||||
// Used to decrypt the push back to protected sysmem.
|
||||
// This happens when profilers register callbacks for migration data.
|
||||
uvm_push_crypto_bundle_t *push_crypto_bundles;
|
||||
|
||||
// Accompanying authentication tags for the crypto bundles
|
||||
uvm_rm_mem_t *push_crypto_bundle_auth_tags;
|
||||
} conf_computing;
|
||||
|
||||
// RM channel information
|
||||
@ -433,7 +418,7 @@ struct uvm_channel_manager_struct
|
||||
unsigned num_channel_pools;
|
||||
|
||||
// Mask containing the indexes of the usable Copy Engines. Each usable CE
|
||||
// has at least one pool of type UVM_CHANNEL_POOL_TYPE_CE associated with it
|
||||
// has at least one pool associated with it.
|
||||
DECLARE_BITMAP(ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
|
||||
|
||||
struct
|
||||
@ -466,16 +451,6 @@ struct uvm_channel_manager_struct
|
||||
UVM_BUFFER_LOCATION gpput_loc;
|
||||
UVM_BUFFER_LOCATION pushbuffer_loc;
|
||||
} conf;
|
||||
|
||||
struct
|
||||
{
|
||||
// Flag indicating that the WLC/LCIC mechanism is ready/setup; should
|
||||
// only be false during (de)initialization.
|
||||
bool wlc_ready;
|
||||
|
||||
// True indicates that key rotation is enabled (UVM-wise).
|
||||
bool key_rotation_enabled;
|
||||
} conf_computing;
|
||||
};
|
||||
|
||||
// Create a channel manager for the GPU
|
||||
@ -526,14 +501,6 @@ uvm_channel_t *uvm_channel_lcic_get_paired_wlc(uvm_channel_t *lcic_channel);
|
||||
|
||||
uvm_channel_t *uvm_channel_wlc_get_paired_lcic(uvm_channel_t *wlc_channel);
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_protected_vidmem_gpu_va(uvm_channel_t *channel);
|
||||
|
||||
NvU64 uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(uvm_channel_t *channel);
|
||||
|
||||
char* uvm_channel_get_static_pb_unprotected_sysmem_cpu(uvm_channel_t *channel);
|
||||
|
||||
char *uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(uvm_channel_t *channel, unsigned tag_index);
|
||||
|
||||
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
|
||||
{
|
||||
UVM_ASSERT(uvm_pool_type_is_valid(pool->pool_type));
|
||||
@ -565,17 +532,6 @@ static uvm_channel_type_t uvm_channel_proxy_channel_type(void)
|
||||
return UVM_CHANNEL_TYPE_MEMOPS;
|
||||
}
|
||||
|
||||
// Force key rotation in the engine associated with the given channel pool.
|
||||
// Rotation may still not happen if RM cannot acquire the necessary locks (in
|
||||
// which case the function returns NV_ERR_STATE_IN_USE).
|
||||
//
|
||||
// This function should only be invoked in pools in which key rotation is
// enabled.
|
||||
NV_STATUS uvm_channel_pool_rotate_key(uvm_channel_pool_t *pool);
|
||||
|
||||
// Retrieve the current encryption key version associated with the channel pool.
|
||||
NvU32 uvm_channel_pool_key_version(uvm_channel_pool_t *pool);
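// Illustrative usage sketch, not part of this change, for the pool-level key
// rotation API declared above: force one rotation and observe the version
// bump. NV_ERR_STATE_IN_USE only means RM could not take the required locks
// right now, so callers are expected to retry later rather than fail.
static NV_STATUS sketch_rotate_pool_key_once(uvm_channel_pool_t *pool)
{
    NvU32 old_version = uvm_channel_pool_key_version(pool);
    NV_STATUS status = uvm_channel_pool_rotate_key(pool);

    if (status == NV_ERR_STATE_IN_USE)
        return NV_OK; // benign: rotation postponed, try again later

    if (status != NV_OK)
        return status;

    // A successful rotation is reflected in a newer key version for the pool
    UVM_ASSERT(uvm_channel_pool_key_version(pool) > old_version);

    return NV_OK;
}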
// Privileged channels support all the Host and engine methods, while
|
||||
// non-privileged channels don't support privileged methods.
|
||||
//
|
||||
@ -623,9 +579,12 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
|
||||
// beginning.
|
||||
NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager);
|
||||
|
||||
// Check if WLC/LCIC mechanism is ready/setup
|
||||
// Should only return false during initialization
|
||||
static bool uvm_channel_manager_is_wlc_ready(uvm_channel_manager_t *manager)
|
||||
{
|
||||
return manager->conf_computing.wlc_ready;
|
||||
return (manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL) &&
|
||||
(manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] != NULL);
|
||||
}
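// Illustrative sketch, not part of this change: callers that want the WLC/LCIC
// fixed-schedule launch path are expected to consult the query above; it is
// false only while the channel manager is being (de)initialized. The SEC2
// fallback below is an assumption made for illustration, not a statement about
// the driver's actual selection logic.
static uvm_channel_type_t sketch_pick_secure_launch_type(uvm_channel_manager_t *manager)
{
    if (uvm_channel_manager_is_wlc_ready(manager))
        return UVM_CHANNEL_TYPE_WLC;

    // During (de)initialization secure work has to be launched another way
    return UVM_CHANNEL_TYPE_SEC2;
}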
// Get the GPU VA of semaphore_channel's tracking semaphore within the VA space
|
||||
// associated with access_channel.
|
||||
|
@ -796,8 +796,11 @@ done:
|
||||
NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_push_t *pushes = NULL;
|
||||
uvm_gpu_t *gpu = NULL;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_push_t *pushes;
|
||||
uvm_gpu_t *gpu;
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
@ -807,19 +810,9 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
uvm_channel_type_t channel_type;
|
||||
|
||||
// Key rotation is disabled because this test relies on nested pushes,
|
||||
// which is illegal. If any push other than the first one triggers key
|
||||
// rotation, the test won't complete. This is because key rotation
|
||||
// depends on waiting for ongoing pushes to end, which doesn't happen
|
||||
// if those pushes are ended after the current one begins.
|
||||
uvm_conf_computing_disable_key_rotation(gpu);
|
||||
|
||||
for (channel_type = 0; channel_type < UVM_CHANNEL_TYPE_COUNT; channel_type++) {
|
||||
NvU32 i;
|
||||
NvU32 num_pushes;
|
||||
uvm_channel_pool_t *pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
|
||||
TEST_CHECK_GOTO(pool != NULL, error);
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[channel_type];
|
||||
TEST_CHECK_RET(pool != NULL);
|
||||
|
||||
// Skip LCIC channels as those can't accept any pushes
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
@ -831,7 +824,7 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
num_pushes = min(pool->num_channels, (NvU32)UVM_PUSH_MAX_CONCURRENT_PUSHES);
|
||||
|
||||
pushes = uvm_kvmalloc_zero(sizeof(*pushes) * num_pushes);
|
||||
TEST_CHECK_GOTO(pushes != NULL, error);
|
||||
TEST_CHECK_RET(pushes != NULL);
|
||||
|
||||
for (i = 0; i < num_pushes; i++) {
|
||||
uvm_push_t *push = &pushes[i];
|
||||
@ -848,18 +841,12 @@ NV_STATUS test_conf_computing_channel_selection(uvm_va_space_t *va_space)
|
||||
|
||||
uvm_kvfree(pushes);
|
||||
}
|
||||
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
}
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
|
||||
return status;
|
||||
|
||||
error:
|
||||
if (gpu != NULL)
|
||||
uvm_conf_computing_enable_key_rotation(gpu);
|
||||
|
||||
uvm_thread_context_lock_enable_tracking();
|
||||
uvm_kvfree(pushes);
|
||||
|
||||
@ -961,318 +948,6 @@ release:
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS force_key_rotations(uvm_channel_pool_t *pool, unsigned num_rotations)
|
||||
{
|
||||
unsigned num_tries;
|
||||
unsigned max_num_tries = 20;
|
||||
unsigned num_rotations_completed = 0;
|
||||
|
||||
if (num_rotations == 0)
|
||||
return NV_OK;
|
||||
|
||||
// The number of accepted rotations is kept low, so failed rotation
|
||||
// invocations due to RM not acquiring the necessary locks (which imply a
|
||||
// sleep in the test) do not balloon the test execution time.
|
||||
UVM_ASSERT(num_rotations <= 10);
|
||||
|
||||
for (num_tries = 0; (num_tries < max_num_tries) && (num_rotations_completed < num_rotations); num_tries++) {
|
||||
// Force key rotation, irrespective of encryption usage.
|
||||
NV_STATUS status = uvm_channel_pool_rotate_key(pool);
|
||||
|
||||
// Key rotation may not be able to complete due to RM failing to acquire
|
||||
// the necessary locks. Detect the situation, sleep for a bit, and then
|
||||
// try again
|
||||
//
|
||||
// The maximum time spent sleeping in a single rotation call is
|
||||
// (max_num_tries * max_sleep_us)
|
||||
if (status == NV_ERR_STATE_IN_USE) {
|
||||
NvU32 min_sleep_us = 1000;
|
||||
NvU32 max_sleep_us = 10000;
|
||||
|
||||
usleep_range(min_sleep_us, max_sleep_us);
|
||||
continue;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_RET(status);
|
||||
|
||||
num_rotations_completed++;
|
||||
}
|
||||
|
||||
// If not a single key rotation occurred, the dependent tests still pass,
// but there is not much value to them. Instead, return an error so the
// maximum number of tries, or the maximum sleep time, can be adjusted to
// ensure that at least one rotation completes.
|
||||
if (num_rotations_completed > 0)
|
||||
return NV_OK;
|
||||
else
|
||||
return NV_ERR_STATE_IN_USE;
|
||||
}
|
||||
|
||||
static NV_STATUS force_key_rotation(uvm_channel_pool_t *pool)
|
||||
{
|
||||
return force_key_rotations(pool, 1);
|
||||
}
|
||||
|
||||
// Test key rotation in all pools. This is useful because key rotation may not
|
||||
// happen otherwise on certain engines during UVM test execution. For example,
|
||||
// if the MEMOPS channel type is mapped to a CE not shared with any other
|
||||
// channel type, then the only encryption taking place in the engine is due to
|
||||
// semaphore releases (4 bytes each). This small encryption size makes it
|
||||
// unlikely to exceed even small rotation thresholds.
|
||||
static NV_STATUS test_channel_key_rotation_basic(uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
uvm_for_each_pool(pool, gpu->channel_manager) {
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
continue;
|
||||
|
||||
TEST_NV_CHECK_RET(force_key_rotation(pool));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
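// Illustrative arithmetic for the note above test_channel_key_rotation_basic():
// with the 8MB testing threshold (UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
// and 4-byte semaphore release encryptions, an engine would need about
// 8 * 1024 * 1024 / 4 = 2,097,152 releases before rotation triggered on its
// own, which is why these tests force rotations explicitly.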
// Interleave GPU encryptions and decryptions, and their CPU counterparts, with
|
||||
// key rotations.
|
||||
static NV_STATUS test_channel_key_rotation_interleave(uvm_gpu_t *gpu)
|
||||
{
|
||||
int i;
|
||||
uvm_channel_pool_t *gpu_to_cpu_pool;
|
||||
uvm_channel_pool_t *cpu_to_gpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
void *initial_plain_cpu = NULL;
|
||||
void *final_plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
|
||||
|
||||
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
|
||||
|
||||
initial_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (initial_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
final_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (final_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
memset(initial_plain_cpu, 1, size);
|
||||
|
||||
for (i = 0; i < 5; i++) {
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
plain_gpu_address,
|
||||
initial_plain_cpu,
|
||||
size,
|
||||
NULL,
|
||||
"CPU > GPU"),
|
||||
out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(gpu_to_cpu_pool), out);
|
||||
TEST_NV_CHECK_GOTO(force_key_rotation(cpu_to_gpu_pool), out);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
|
||||
final_plain_cpu,
|
||||
plain_gpu_address,
|
||||
size,
|
||||
NULL,
|
||||
"GPU > CPU"),
|
||||
out);
|
||||
|
||||
TEST_CHECK_GOTO(!memcmp(initial_plain_cpu, final_plain_cpu, size), out);
|
||||
|
||||
memset(final_plain_cpu, 0, size);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(final_plain_cpu);
|
||||
uvm_kvfree(initial_plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS memset_vidmem(uvm_mem_t *mem, NvU8 val)
|
||||
{
|
||||
uvm_push_t push;
|
||||
uvm_gpu_address_t gpu_address;
|
||||
uvm_gpu_t *gpu = mem->backing_gpu;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "zero vidmem"));
|
||||
|
||||
gpu_address = uvm_mem_gpu_address_virtual_kernel(mem, gpu);
|
||||
gpu->parent->ce_hal->memset_1(&push, gpu_address, val, mem->size);
|
||||
|
||||
TEST_NV_CHECK_RET(uvm_push_end_and_wait(&push));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Custom version of uvm_conf_computing_util_memcopy_gpu_to_cpu that allows
// testing to insert key rotations in between the push end and the CPU
// decryption.
|
||||
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
unsigned num_rotations_to_insert)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
uvm_channel_t *channel;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &push, "Small GPU > CPU encryption");
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
channel = push.channel;
|
||||
uvm_conf_computing_log_gpu_encryption(channel, size, dma_buffer->decrypt_iv);
|
||||
dma_buffer->key_version[0] = uvm_channel_pool_key_version(channel->pool);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
TEST_NV_CHECK_GOTO(force_key_rotations(channel->pool, num_rotations_to_insert), out);
|
||||
|
||||
// If num_rotations_to_insert is not zero, the current encryption key will
|
||||
// be different from the one used during CE encryption.
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
dma_buffer->decrypt_iv,
|
||||
dma_buffer->key_version[0],
|
||||
size,
|
||||
auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_channel_key_rotation_cpu_decryption(uvm_gpu_t *gpu,
|
||||
unsigned num_repetitions,
|
||||
unsigned num_rotations_to_insert)
|
||||
{
|
||||
unsigned i;
|
||||
uvm_channel_pool_t *gpu_to_cpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
NvU8 *plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
return NV_OK;
|
||||
|
||||
gpu_to_cpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(gpu_to_cpu_pool));
|
||||
|
||||
plain_cpu = (NvU8 *) uvm_kvmalloc_zero(size);
|
||||
if (plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
TEST_NV_CHECK_GOTO(memset_vidmem(plain_gpu, 1), out);
|
||||
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
for (i = 0; i < num_repetitions; i++) {
|
||||
unsigned j;
|
||||
|
||||
TEST_NV_CHECK_GOTO(encrypted_memcopy_gpu_to_cpu(gpu,
|
||||
plain_cpu,
|
||||
plain_gpu_address,
|
||||
size,
|
||||
num_rotations_to_insert),
|
||||
out);
|
||||
|
||||
for (j = 0; j < size; j++)
|
||||
TEST_CHECK_GOTO(plain_cpu[j] == 1, out);
|
||||
|
||||
memset(plain_cpu, 0, size);
|
||||
|
||||
}
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
// Test that CPU decryptions can use old keys, i.e., previous versions of the
// keys that are no longer the current key due to key rotation. Given that SEC2
// does not expose encryption capabilities, the "decrypt-after-rotation" problem
// is exclusive to CE encryptions.
|
||||
static NV_STATUS test_channel_key_rotation_decrypt_after_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
// Instruct encrypted_memcopy_gpu_to_cpu to insert several key rotations
// between the GPU encryption and the associated CPU decryption.
|
||||
unsigned num_rotations_to_insert = 8;
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_cpu_decryption(gpu, 1, num_rotations_to_insert));
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_channel_key_rotation(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
for_each_va_space_gpu(gpu, va_space) {
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
break;
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_basic(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_interleave(gpu));
|
||||
|
||||
TEST_NV_CHECK_RET(test_channel_key_rotation_decrypt_after_key_rotation(gpu));
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
NV_STATUS test_write_ctrl_gpfifo_noop(uvm_va_space_t *va_space)
|
||||
{
|
||||
uvm_gpu_t *gpu;
|
||||
@ -1528,10 +1203,6 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
status = test_channel_key_rotation(va_space);
|
||||
if (status != NV_OK)
|
||||
goto done;
|
||||
|
||||
// The following tests have side effects, they reset the GPU's
|
||||
// channel_manager.
|
||||
status = test_channel_pushbuffer_extension_base(va_space);
|
||||
@ -1667,126 +1338,6 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_cpu_encryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
int i;
|
||||
uvm_channel_pool_t *cpu_to_gpu_pool;
|
||||
NV_STATUS status = NV_OK;
|
||||
size_t size = UVM_CONF_COMPUTING_DMA_BUFFER_SIZE;
|
||||
void *initial_plain_cpu = NULL;
|
||||
uvm_mem_t *plain_gpu = NULL;
|
||||
uvm_gpu_address_t plain_gpu_address;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU);
|
||||
|
||||
cpu_to_gpu_pool = gpu->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_CPU_TO_GPU];
|
||||
TEST_CHECK_RET(uvm_conf_computing_is_key_rotation_enabled_in_pool(cpu_to_gpu_pool));
|
||||
|
||||
initial_plain_cpu = uvm_kvmalloc_zero(size);
|
||||
if (initial_plain_cpu == NULL) {
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_alloc_vidmem(size, gpu, &plain_gpu), out);
|
||||
TEST_NV_CHECK_GOTO(uvm_mem_map_gpu_kernel(plain_gpu, gpu), out);
|
||||
plain_gpu_address = uvm_mem_gpu_address_virtual_kernel(plain_gpu, gpu);
|
||||
|
||||
memset(initial_plain_cpu, 1, size);
|
||||
|
||||
for (i = 0; i < params->iterations; i++) {
|
||||
TEST_NV_CHECK_GOTO(uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
plain_gpu_address,
|
||||
initial_plain_cpu,
|
||||
size,
|
||||
NULL,
|
||||
"CPU > GPU"),
|
||||
out);
|
||||
}
|
||||
|
||||
out:
|
||||
uvm_mem_free(plain_gpu);
|
||||
uvm_kvfree(initial_plain_cpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_cpu_decryption(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
unsigned num_rotations_to_insert = 0;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU);
|
||||
|
||||
return test_channel_key_rotation_cpu_decryption(gpu, params->iterations, num_rotations_to_insert);
|
||||
}
|
||||
|
||||
static NV_STATUS channel_stress_key_rotation_rotate(uvm_gpu_t *gpu, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
NvU32 i;
|
||||
|
||||
UVM_ASSERT(params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE);
|
||||
|
||||
for (i = 0; i < params->iterations; ++i) {
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
uvm_channel_type_t type;
|
||||
|
||||
if ((i % 3) == 0)
|
||||
type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
|
||||
else if ((i % 3) == 1)
|
||||
type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
|
||||
else
|
||||
type = UVM_CHANNEL_TYPE_WLC;
|
||||
|
||||
pool = gpu->channel_manager->pool_to_use.default_for_type[type];
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
return NV_ERR_INVALID_STATE;
|
||||
|
||||
status = force_key_rotation(pool);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// The objective of this test is documented in the user-level function
|
||||
static NV_STATUS uvm_test_channel_stress_key_rotation(uvm_va_space_t *va_space, UVM_TEST_CHANNEL_STRESS_PARAMS *params)
|
||||
{
|
||||
uvm_test_rng_t rng;
|
||||
uvm_gpu_t *gpu;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return NV_OK;
|
||||
|
||||
uvm_test_rng_init(&rng, params->seed);
|
||||
|
||||
uvm_va_space_down_read(va_space);
|
||||
|
||||
// Key rotation should be enabled, or disabled, in all GPUs. Pick a random
|
||||
// one.
|
||||
gpu = random_va_space_gpu(&rng, va_space);
|
||||
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
|
||||
goto out;
|
||||
|
||||
if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU)
|
||||
status = channel_stress_key_rotation_cpu_encryption(gpu, params);
|
||||
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU)
|
||||
status = channel_stress_key_rotation_cpu_decryption(gpu, params);
|
||||
else if (params->key_rotation_operation == UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE)
|
||||
status = channel_stress_key_rotation_rotate(gpu, params);
|
||||
else
|
||||
status = NV_ERR_INVALID_PARAMETER;
|
||||
|
||||
out:
|
||||
uvm_va_space_up_read(va_space);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
@ -1798,8 +1349,6 @@ NV_STATUS uvm_test_channel_stress(UVM_TEST_CHANNEL_STRESS_PARAMS *params, struct
|
||||
return uvm_test_channel_stress_update_channels(va_space, params);
|
||||
case UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH:
|
||||
return uvm_test_channel_noop_push(va_space, params);
|
||||
case UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION:
|
||||
return uvm_test_channel_stress_key_rotation(va_space, params);
|
||||
default:
|
||||
return NV_ERR_INVALID_PARAMETER;
|
||||
}
|
||||
|
@ -33,15 +33,6 @@
|
||||
#include "nv_uvm_interface.h"
|
||||
#include "uvm_va_block.h"
|
||||
|
||||
// Amount of encrypted data on a given engine that triggers key rotation. This
// is a UVM internal threshold, different from that of RM, and used only during
// testing.
//
// Key rotation is triggered when the total encryption size, or the total
// decryption size (whichever comes first), reaches this lower threshold on the
// engine.
#define UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD (UVM_SIZE_1MB * 8)
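// Illustrative numbers, not part of this change: 8 * UVM_SIZE_1MB is 8,388,608
// bytes. Assuming a 4KB PAGE_SIZE and the per-PAGE_SIZE encryption granularity
// used by the Confidential Computing DMA buffers (one authentication tag per
// page), roughly 2048 page-sized encryptions (or decryptions) on an engine are
// enough for UVM's test-only accounting to consider key rotation pending.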
// The maximum number of secure operations per push is:
|
||||
// UVM_MAX_PUSH_SIZE / min(CE encryption size, CE decryption size)
|
||||
// + 1 (tracking semaphore) = 128 * 1024 / 56 + 1 = 2342
|
||||
@ -361,19 +352,6 @@ error:
|
||||
return status;
|
||||
}
|
||||
|
||||
// The production key rotation defaults are such that key rotations rarely
// happen. During UVM testing more frequent rotations are triggered by relying
// on internal encryption usage accounting. When key rotations are triggered by
// UVM, the driver does not rely on channel key rotation notifiers.
|
||||
//
|
||||
// TODO: Bug 4612912: UVM should be able to programmatically set the rotation
|
||||
// lower threshold. This function, and all the metadata associated with it
|
||||
// (per-pool encryption accounting, for example) can be removed at that point.
|
||||
static bool key_rotation_is_notifier_driven(void)
|
||||
{
|
||||
return !uvm_enable_builtin_tests;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu)
|
||||
{
|
||||
NV_STATUS status;
|
||||
@ -416,35 +394,17 @@ void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu)
|
||||
conf_computing_dma_buffer_pool_deinit(&gpu->conf_computing.dma_buffer_pool);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv)
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
|
||||
else
|
||||
pool = channel->pool;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
|
||||
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
|
||||
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (!key_rotation_is_notifier_driven())
|
||||
atomic64_add(size, &pool->conf_computing.key_rotation.encrypted);
|
||||
}
|
||||
|
||||
status = nvUvmInterfaceCslIncrementIv(&channel->csl.ctx, UVM_CSL_OPERATION_DECRYPT, 1, iv);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// IV rotation is done preemptively as needed, so the above
|
||||
// call cannot return failure.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv)
|
||||
@ -468,46 +428,27 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
void *auth_tag_buffer)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_channel_pool_t *pool;
|
||||
|
||||
UVM_ASSERT(size);
|
||||
|
||||
if (uvm_channel_is_lcic(channel))
|
||||
pool = uvm_channel_lcic_get_paired_wlc(channel)->pool;
|
||||
else
|
||||
pool = channel->pool;
|
||||
|
||||
uvm_mutex_lock(&channel->csl.ctx_lock);
|
||||
|
||||
status = nvUvmInterfaceCslEncrypt(&channel->csl.ctx,
|
||||
size,
|
||||
(NvU8 const *) src_plain,
|
||||
encrypt_iv,
|
||||
(NvU8 *) dst_cipher,
|
||||
(NvU8 *) auth_tag_buffer);
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
// IV rotation is done preemptively as needed, so the above
|
||||
// call cannot return failure.
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (uvm_conf_computing_is_key_rotation_enabled_in_pool(pool)) {
|
||||
status = nvUvmInterfaceCslLogEncryption(&channel->csl.ctx, UVM_CSL_OPERATION_ENCRYPT, size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
if (!key_rotation_is_notifier_driven())
|
||||
atomic64_add(size, &pool->conf_computing.key_rotation.decrypted);
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
NvU32 key_version,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer)
|
||||
{
|
||||
@ -528,19 +469,10 @@ NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
size,
|
||||
(const NvU8 *) src_cipher,
|
||||
src_iv,
|
||||
key_version,
|
||||
(NvU8 *) dst_plain,
|
||||
NULL,
|
||||
0,
|
||||
(const NvU8 *) auth_tag_buffer);
|
||||
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, channel %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
channel->name,
|
||||
uvm_gpu_name(uvm_channel_get_gpu(channel)));
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&channel->csl.ctx_lock);
|
||||
|
||||
return status;
|
||||
@ -553,8 +485,6 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU8 valid)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
|
||||
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
|
||||
|
||||
// There is no dedicated lock for the CSL context associated with replayable
|
||||
// faults. The mutual exclusion required by the RM CSL API is enforced by
|
||||
@ -564,48 +494,36 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
status = nvUvmInterfaceCslDecrypt(csl_context,
|
||||
fault_entry_size,
|
||||
status = nvUvmInterfaceCslDecrypt(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
|
||||
parent_gpu->fault_buffer_hal->entry_size(parent_gpu),
|
||||
(const NvU8 *) src_cipher,
|
||||
NULL,
|
||||
NV_U32_MAX,
|
||||
(NvU8 *) dst_plain,
|
||||
&valid,
|
||||
sizeof(valid),
|
||||
(const NvU8 *) auth_tag_buffer);
|
||||
|
||||
if (status != NV_OK) {
|
||||
if (status != NV_OK)
|
||||
UVM_ERR_PRINT("nvUvmInterfaceCslDecrypt() failed: %s, GPU %s\n",
|
||||
nvstatusToString(status),
|
||||
uvm_parent_gpu_name(parent_gpu));
|
||||
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu)
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 fault_entry_size = parent_gpu->fault_buffer_hal->entry_size(parent_gpu);
|
||||
UvmCslContext *csl_context = &parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx;
|
||||
|
||||
// See comment in uvm_conf_computing_fault_decrypt
|
||||
UVM_ASSERT(uvm_sem_is_locked(&parent_gpu->isr.replayable_faults.service_lock));
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
|
||||
status = nvUvmInterfaceCslLogEncryption(csl_context, UVM_CSL_OPERATION_DECRYPT, fault_entry_size);
|
||||
|
||||
// Informing RM of an encryption/decryption should not fail
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
|
||||
status = nvUvmInterfaceCslIncrementIv(csl_context, UVM_CSL_OPERATION_DECRYPT, 1, NULL);
|
||||
status = nvUvmInterfaceCslIncrementIv(&parent_gpu->fault_buffer_info.rm_info.replayable.cslCtx,
|
||||
UVM_CSL_OPERATION_DECRYPT,
|
||||
increment,
|
||||
NULL);
|
||||
|
||||
UVM_ASSERT(status == NV_OK);
|
||||
}
|
||||
@ -707,231 +625,3 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
|
||||
{
|
||||
return uvm_conf_computing_rotate_channel_ivs_below_limit(channel, uvm_conf_computing_channel_iv_rotation_limit, true);
|
||||
}
|
||||
|
||||
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
|
||||
|
||||
// Key rotation cannot be enabled on UVM if it is disabled on RM
|
||||
if (!gpu->parent->rm_info.gpuConfComputeCaps.bKeyRotationEnabled)
|
||||
return;
|
||||
|
||||
gpu->channel_manager->conf_computing.key_rotation_enabled = true;
|
||||
}
|
||||
|
||||
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu)
|
||||
{
|
||||
if (!g_uvm_global.conf_computing_enabled)
|
||||
return;
|
||||
|
||||
gpu->channel_manager->conf_computing.key_rotation_enabled = false;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu)
|
||||
{
|
||||
return gpu->channel_manager->conf_computing.key_rotation_enabled;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled(pool->manager->gpu))
|
||||
return false;
|
||||
|
||||
// TODO: Bug 4586447: key rotation must be disabled in the SEC2 engine,
|
||||
// because currently the encryption key is shared between UVM and RM, but
|
||||
// UVM is not able to idle SEC2 channels owned by RM.
|
||||
if (uvm_channel_pool_is_sec2(pool))
|
||||
return false;
|
||||
|
||||
// Key rotation happens as part of channel reservation, and LCIC channels
|
||||
// are never reserved directly. Rotation of keys in LCIC channels happens
|
||||
// as the result of key rotation in WLC channels.
|
||||
//
|
||||
// Return false even if there is nothing fundamentally prohibiting direct key
// rotation on LCIC pools.
|
||||
if (uvm_channel_pool_is_lcic(pool))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool conf_computing_is_key_rotation_pending_use_stats(uvm_channel_pool_t *pool)
|
||||
{
|
||||
NvU64 decrypted, encrypted;
|
||||
|
||||
UVM_ASSERT(!key_rotation_is_notifier_driven());
|
||||
|
||||
decrypted = atomic64_read(&pool->conf_computing.key_rotation.decrypted);
|
||||
|
||||
if (decrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
|
||||
return true;
|
||||
|
||||
encrypted = atomic64_read(&pool->conf_computing.key_rotation.encrypted);
|
||||
|
||||
if (encrypted > UVM_CONF_COMPUTING_KEY_ROTATION_LOWER_THRESHOLD)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool conf_computing_is_key_rotation_pending_use_notifier(uvm_channel_pool_t *pool)
|
||||
{
|
||||
// If key rotation is pending for the pool's engine, then the key rotation
|
||||
// notifier in any of the engine channels can be used by UVM to detect the
|
||||
// situation. Note that RM doesn't update all the notifiers in a single
|
||||
// atomic operation, so it is possible that the channel read by UVM (the
|
||||
// first one in the pool) indicates that a key rotation is pending, but
|
||||
// another channel in the pool (temporarily) indicates the opposite, or vice
|
||||
// versa.
|
||||
uvm_channel_t *first_channel = pool->channels;
|
||||
|
||||
UVM_ASSERT(key_rotation_is_notifier_driven());
|
||||
UVM_ASSERT(first_channel != NULL);
|
||||
|
||||
return first_channel->channel_info.keyRotationNotifier->status == UVM_KEY_ROTATION_STATUS_PENDING;
|
||||
}
|
||||
|
||||
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool)
|
||||
{
|
||||
if (!uvm_conf_computing_is_key_rotation_enabled_in_pool(pool))
|
||||
return false;
|
||||
|
||||
if (key_rotation_is_notifier_driven())
|
||||
return conf_computing_is_key_rotation_pending_use_notifier(pool);
|
||||
else
|
||||
return conf_computing_is_key_rotation_pending_use_stats(pool);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool)
|
||||
{
|
||||
NV_STATUS status;
|
||||
|
||||
UVM_ASSERT(uvm_conf_computing_is_key_rotation_enabled_in_pool(pool));
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.csl_contexts != NULL);
|
||||
UVM_ASSERT(pool->conf_computing.key_rotation.num_csl_contexts > 0);
|
||||
|
||||
// NV_ERR_STATE_IN_USE indicates that RM was not able to acquire the
|
||||
// required locks at this time. This status is not interpreted as an error,
|
||||
// but as a sign for UVM to try again later. This is the same "protocol"
|
||||
// used in IV rotation.
|
||||
status = nvUvmInterfaceCslRotateKey(pool->conf_computing.key_rotation.csl_contexts,
|
||||
pool->conf_computing.key_rotation.num_csl_contexts);
|
||||
|
||||
if (status == NV_OK) {
|
||||
pool->conf_computing.key_rotation.version++;
|
||||
|
||||
if (!key_rotation_is_notifier_driven()) {
|
||||
atomic64_set(&pool->conf_computing.key_rotation.decrypted, 0);
|
||||
atomic64_set(&pool->conf_computing.key_rotation.encrypted, 0);
|
||||
}
|
||||
}
|
||||
else if (status != NV_ERR_STATE_IN_USE) {
|
||||
UVM_DBG_PRINT("nvUvmInterfaceCslRotateKey() failed in engine %u: %s\n",
|
||||
pool->engine_index,
|
||||
nvstatusToString(status));
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
|
||||
void *dst_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
|
||||
|
||||
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
uvm_conf_computing_log_gpu_encryption(push.channel, size, dma_buffer->decrypt_iv);
|
||||
dma_buffer->key_version[0] = uvm_channel_pool_key_version(push.channel->pool);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(push.channel,
|
||||
dst_plain,
|
||||
src_cipher,
|
||||
dma_buffer->decrypt_iv,
|
||||
dma_buffer->key_version[0],
|
||||
size,
|
||||
auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
|
||||
|
@ -87,9 +87,9 @@ typedef struct
|
||||
// a free buffer.
|
||||
uvm_tracker_t tracker;
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, the
|
||||
// engine (CE or SEC2) writes the authentication tag here. When the buffer
|
||||
// is decrypted on the CPU the authentication tag is used by CSL to verify
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, SEC2
|
||||
// writes the authentication tag here. Later when the buffer is decrypted
|
||||
// on the CPU the authentication tag is used again (read) for CSL to verify
|
||||
// the authenticity. The allocation is big enough for one authentication
|
||||
// tag per PAGE_SIZE page in the alloc buffer.
|
||||
uvm_mem_t *auth_tag;
|
||||
@ -98,12 +98,7 @@ typedef struct
|
||||
// to the authentication tag. The allocation is big enough for one IV per
|
||||
// PAGE_SIZE page in the alloc buffer. The granularity between the decrypt
|
||||
// IV and authentication tag must match.
|
||||
UvmCslIv decrypt_iv[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
|
||||
|
||||
// When the DMA buffer is used as the destination of a GPU encryption, the
|
||||
// key version used during GPU encryption of each PAGE_SIZE page can be
|
||||
// saved here, so CPU decryption uses the correct decryption key.
|
||||
NvU32 key_version[UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE];
|
||||
UvmCslIv decrypt_iv[(UVM_CONF_COMPUTING_DMA_BUFFER_SIZE / PAGE_SIZE)];
|
||||
|
||||
// Bitmap of the encrypted pages in the backing allocation
|
||||
uvm_page_mask_t encrypted_page_mask;
|
||||
@ -152,7 +147,7 @@ NV_STATUS uvm_conf_computing_gpu_init(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_gpu_deinit(uvm_gpu_t *gpu);
|
||||
|
||||
// Logs encryption information from the GPU and returns the IV.
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, size_t size, UvmCslIv *iv);
|
||||
void uvm_conf_computing_log_gpu_encryption(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
|
||||
// Acquires next CPU encryption IV and returns it.
|
||||
void uvm_conf_computing_acquire_encryption_iv(uvm_channel_t *channel, UvmCslIv *iv);
|
||||
@ -172,14 +167,10 @@ void uvm_conf_computing_cpu_encrypt(uvm_channel_t *channel,
|
||||
// CPU side decryption helper. Decrypts data from src_cipher and writes the
// plain text in dst_plain. src_cipher and dst_plain can't overlap. IV obtained
// from uvm_conf_computing_log_gpu_encryption() needs to be passed to src_iv.
//
// The caller must indicate which key to use for decryption by passing the
// appropriate key version number.
|
||||
NV_STATUS uvm_conf_computing_cpu_decrypt(uvm_channel_t *channel,
|
||||
void *dst_plain,
|
||||
const void *src_cipher,
|
||||
const UvmCslIv *src_iv,
|
||||
NvU32 key_version,
|
||||
size_t size,
|
||||
const void *auth_tag_buffer);
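// Illustrative call sequence, not part of this change, mirroring the pattern
// used by uvm_conf_computing_util_memcopy_gpu_to_cpu(): log the GPU encryption
// to obtain the decrypt IV, remember the pool's key version, let the CE
// encrypt into unprotected sysmem, wait for the push, then decrypt on the CPU
// with that same IV and key version. Buffer names below are placeholders:
//
//     uvm_conf_computing_log_gpu_encryption(push.channel, size, decrypt_iv);
//     key_version = uvm_channel_pool_key_version(push.channel->pool);
//     gpu->parent->ce_hal->encrypt(&push, cipher_gpu_va, src_gpu_va, size, auth_tag_gpu_va);
//     status = uvm_push_end_and_wait(&push);
//     ...
//     status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher,
//                                             decrypt_iv, key_version, size, auth_tag);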
@ -200,12 +191,12 @@ NV_STATUS uvm_conf_computing_fault_decrypt(uvm_parent_gpu_t *parent_gpu,
|
||||
NvU8 valid);
|
||||
|
||||
// Increment the CPU-side decrypt IV of the CSL context associated with
|
||||
// replayable faults.
|
||||
// replayable faults. The function is a no-op if the given increment is zero.
|
||||
//
|
||||
// The IV associated with a fault CSL context is a 64-bit counter.
|
||||
//
|
||||
// Locking: this function must be invoked while holding the replayable ISR lock.
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_conf_computing_fault_increment_decrypt_iv(uvm_parent_gpu_t *parent_gpu, NvU64 increment);
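// Illustrative sketch, not part of this change: when several replayable fault
// entries are skipped without being decrypted, the CPU-side decrypt IV has to
// be advanced by the same count so it stays in sync with the encryption IV on
// the GSP-RM side. The batch-skip caller below is hypothetical; the caller in
// this change advances the IV one entry at a time.
//
//     // Called with the replayable fault service lock held
//     if (g_uvm_global.conf_computing_enabled && num_skipped_entries > 0)
//         uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, num_skipped_entries);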
// Query the number of remaining messages before IV needs to be rotated.
|
||||
void uvm_conf_computing_query_message_pools(uvm_channel_t *channel,
|
||||
@ -223,71 +214,4 @@ NV_STATUS uvm_conf_computing_maybe_rotate_channel_ivs_retry_busy(uvm_channel_t *
|
||||
// Check if there are fewer than 'limit' messages available in either direction
|
||||
// and rotate if not.
|
||||
NV_STATUS uvm_conf_computing_rotate_channel_ivs_below_limit(uvm_channel_t *channel, NvU64 limit, bool retry_if_busy);
|
||||
|
||||
// Rotate the engine key associated with the given channel pool.
|
||||
NV_STATUS uvm_conf_computing_rotate_pool_key(uvm_channel_pool_t *pool);
|
||||
|
||||
// Returns true if key rotation is allowed in the channel pool.
|
||||
bool uvm_conf_computing_is_key_rotation_enabled_in_pool(uvm_channel_pool_t *pool);
|
||||
|
||||
// Returns true if key rotation is pending in the channel pool.
|
||||
bool uvm_conf_computing_is_key_rotation_pending_in_pool(uvm_channel_pool_t *pool);
|
||||
|
||||
// Enable/disable key rotation in the passed GPU. Note that UVM enablement is
|
||||
// dependent on RM enablement: key rotation may still be disabled upon calling
|
||||
// this function, if it is disabled in RM. On the other hand, key rotation can
|
||||
// be disabled in UVM, even if it is enabled in RM.
|
||||
//
|
||||
// Enablement/Disablement affects only kernel key rotation in keys owned by UVM.
// It doesn't affect user key rotation (CUDA, Video...), nor does it affect RM
// kernel key rotation.
|
||||
void uvm_conf_computing_enable_key_rotation(uvm_gpu_t *gpu);
|
||||
void uvm_conf_computing_disable_key_rotation(uvm_gpu_t *gpu);
|
||||
|
||||
// Returns true if key rotation is enabled on UVM in the given GPU. Key rotation
// can be enabled on the GPU but disabled on some of the GPU engines (LCEs or
// SEC2), see uvm_conf_computing_is_key_rotation_enabled_in_pool.
|
||||
bool uvm_conf_computing_is_key_rotation_enabled(uvm_gpu_t *gpu);
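// Illustrative sketch, not part of this change, of the enablement contract
// described above: asking for enablement does not guarantee the feature is on,
// so callers re-check the query after the call.
static void sketch_try_enable_key_rotation(uvm_gpu_t *gpu)
{
    uvm_conf_computing_enable_key_rotation(gpu);

    // Still false if key rotation is disabled on the RM side for this GPU
    if (!uvm_conf_computing_is_key_rotation_enabled(gpu))
        return;

    // Even then, individual engines may opt out (SEC2, LCIC), so per-pool
    // checks via uvm_conf_computing_is_key_rotation_enabled_in_pool() are
    // still needed before forcing a rotation on a given pool.
}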
// Launch a synchronous, encrypted copy between CPU and GPU.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The source CPU buffer pointed by src_plain contains the unencrypted (plain
|
||||
// text) contents; the function internally performs a CPU-side encryption step
|
||||
// before launching the GPU-side CE decryption. The source buffer can be in
|
||||
// protected or unprotected sysmem, while the destination buffer must be in
|
||||
// protected vidmem.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...);
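// Illustrative usage sketch, not part of this change: stage a small plain-text
// buffer into protected vidmem through the helper declared above. The staging
// helpers (uvm_kvmalloc_zero/uvm_kvfree) appear elsewhere in this driver;
// dst_gpu_address is assumed to point at protected vidmem already mapped for
// the GPU.
static NV_STATUS sketch_stage_to_vidmem(uvm_gpu_t *gpu,
                                        uvm_gpu_address_t dst_gpu_address,
                                        const void *data,
                                        size_t size)
{
    NV_STATUS status;
    void *plain;

    // The helper cannot copy more than one DMA buffer's worth of data at once
    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);

    plain = uvm_kvmalloc_zero(size);
    if (plain == NULL)
        return NV_ERR_NO_MEMORY;

    memcpy(plain, data, size);

    // CPU-side encryption followed by a CE decryption into dst_gpu_address,
    // performed synchronously (no tracker dependencies).
    status = uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
                                                        dst_gpu_address,
                                                        plain,
                                                        size,
                                                        NULL,
                                                        "sketch CPU > GPU");

    uvm_kvfree(plain);

    return status;
}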
// Launch a synchronous, encrypted copy between GPU and CPU.
//
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
//
// The source GPU buffer pointed by src_gpu_address contains the unencrypted
// (plain text) contents; the function internally performs a GPU-side CE
// encryption into an unprotected staging buffer, followed by a CPU-side
// decryption that writes the plain text into the destination buffer pointed by
// dst_plain.
//
// The input tracker, if not NULL, is internally acquired by the push
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
NV_STATUS uvm_conf_computing_util_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...);
|
||||
#endif // __UVM_CONF_COMPUTING_H__
|
||||
|
@ -218,8 +218,9 @@ static NV_STATUS alloc_and_init_address_space(uvm_gpu_t *gpu)
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
|
||||
UVM_ASSERT(gpu_address_space_info.bigPageSize <= NV_U32_MAX);
|
||||
|
||||
gpu->big_page.internal_size = gpu_address_space_info.bigPageSize;
|
||||
gpu->time.time0_register = gpu_address_space_info.time0Offset;
|
||||
gpu->time.time1_register = gpu_address_space_info.time1Offset;
|
||||
|
||||
@ -458,6 +459,7 @@ static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
|
||||
|
||||
static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
|
||||
{
|
||||
|
||||
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7);
|
||||
|
||||
switch (link_type) {
|
||||
@ -1082,9 +1084,6 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu)
|
||||
gpu->parent->rm_va_size,
|
||||
va_per_entry);
|
||||
|
||||
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->big_page.internal_size));
|
||||
UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, gpu->mem_info.max_vidmem_page_size));
|
||||
|
||||
tree_alloc = uvm_page_tree_pdb(&gpu->address_space_tree);
|
||||
status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu->rm_address_space,
|
||||
tree_alloc->addr.address,
|
||||
@ -2364,9 +2363,7 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
|
||||
|
||||
// check for peer-to-peer compatibility (PCI-E or NvLink).
|
||||
peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
|
||||
if (peer_caps->link_type == UVM_GPU_LINK_INVALID
|
||||
|| peer_caps->link_type == UVM_GPU_LINK_C2C
|
||||
)
|
||||
if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_C2C)
|
||||
return NV_ERR_NOT_SUPPORTED;
|
||||
|
||||
peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps;
|
||||
@ -3296,7 +3293,10 @@ void uvm_parent_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64
|
||||
atomic64_sub(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out)
|
||||
NV_STATUS uvm_parent_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu,
|
||||
struct page *page,
|
||||
size_t size,
|
||||
NvU64 *dma_address_out)
|
||||
{
|
||||
NvU64 dma_addr;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
|
@ -591,7 +591,7 @@ static void fault_buffer_skip_replayable_entry(uvm_parent_gpu_t *parent_gpu, NvU
|
||||
// replayable faults still requires manual adjustment so it is kept in sync
|
||||
// with the encryption IV on the GSP-RM's side.
|
||||
if (g_uvm_global.conf_computing_enabled)
|
||||
uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu);
|
||||
uvm_conf_computing_fault_increment_decrypt_iv(parent_gpu, 1);
|
||||
|
||||
parent_gpu->fault_buffer_hal->entry_clear_valid(parent_gpu, index);
|
||||
}
|
||||
|
@ -60,17 +60,6 @@ struct uvm_gpu_semaphore_pool_page_struct
|
||||
// Allocation backing the page
|
||||
uvm_rm_mem_t *memory;
|
||||
|
||||
struct {
|
||||
// Unprotected sysmem storing encrypted value of semaphores
|
||||
uvm_rm_mem_t *encrypted_payload_memory;
|
||||
|
||||
// Unprotected sysmem storing encryption auth tags
|
||||
uvm_rm_mem_t *auth_tag_memory;
|
||||
|
||||
// Unprotected sysmem storing plain text notifier values
|
||||
uvm_rm_mem_t *notifier_memory;
|
||||
} conf_computing;
|
||||
|
||||
// Pool the page is part of
|
||||
uvm_gpu_semaphore_pool_t *pool;
|
||||
|
||||
@ -91,6 +80,26 @@ static bool gpu_semaphore_is_secure(uvm_gpu_semaphore_t *semaphore)
|
||||
return gpu_semaphore_pool_is_secure(semaphore->page->pool);
|
||||
}
|
||||
|
||||
static NvU32 get_index(uvm_gpu_semaphore_t *semaphore)
{
    NvU32 offset;
    NvU32 index;

    if (gpu_semaphore_is_secure(semaphore))
        return semaphore->conf_computing.index;

    UVM_ASSERT(semaphore->payload != NULL);
    UVM_ASSERT(semaphore->page != NULL);

    offset = (char*)semaphore->payload - (char*)uvm_rm_mem_get_cpu_va(semaphore->page->memory);
    UVM_ASSERT(offset % UVM_SEMAPHORE_SIZE == 0);

    index = offset / UVM_SEMAPHORE_SIZE;
    UVM_ASSERT(index < UVM_SEMAPHORE_COUNT_PER_PAGE);

    return index;
}
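The non-secure branch of get_index() recovers the slot of a semaphore purely from pointer arithmetic: the byte offset of its payload inside the pool page must be a multiple of UVM_SEMAPHORE_SIZE, and dividing by that stride gives the index. A stand-alone illustration of the same arithmetic; the stride and slot count below are stand-ins, not the driver's constants.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define SEM_SIZE  4u     /* stand-in for UVM_SEMAPHORE_SIZE */
#define SEM_COUNT 16u    /* stand-in for UVM_SEMAPHORE_COUNT_PER_PAGE */

static unsigned index_from_payload(const uint8_t *page_base, const uint32_t *payload)
{
    size_t offset = (const uint8_t *)payload - page_base;

    // The payload must sit on a slot boundary inside the page.
    assert(offset % SEM_SIZE == 0);

    unsigned index = (unsigned)(offset / SEM_SIZE);
    assert(index < SEM_COUNT);
    return index;
}

int main(void)
{
    uint32_t page[SEM_COUNT] = {0};
    const uint32_t *payload = &page[5];

    printf("slot %u\n", index_from_payload((const uint8_t *)page, payload)); // prints "slot 5"
    return 0;
}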
|
||||
|
||||
// Use canary values on debug builds to catch semaphore use-after-free. We can
|
||||
// catch release-after-free by simply setting the payload to a known value at
|
||||
// free then checking it on alloc or pool free, but catching acquire-after-free
|
||||
@ -141,83 +150,34 @@ static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
|
||||
return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
|
||||
}
|
||||
|
||||
static void pool_page_free_buffers(uvm_gpu_semaphore_pool_page_t *page)
|
||||
{
|
||||
uvm_rm_mem_free(page->memory);
|
||||
page->memory = NULL;
|
||||
|
||||
if (gpu_semaphore_pool_is_secure(page->pool)) {
|
||||
uvm_rm_mem_free(page->conf_computing.encrypted_payload_memory);
|
||||
uvm_rm_mem_free(page->conf_computing.auth_tag_memory);
|
||||
uvm_rm_mem_free(page->conf_computing.notifier_memory);
|
||||
|
||||
page->conf_computing.encrypted_payload_memory = NULL;
|
||||
page->conf_computing.auth_tag_memory = NULL;
|
||||
page->conf_computing.notifier_memory = NULL;
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(!page->conf_computing.encrypted_payload_memory);
|
||||
UVM_ASSERT(!page->conf_computing.auth_tag_memory);
|
||||
UVM_ASSERT(!page->conf_computing.notifier_memory);
|
||||
}
|
||||
}
|
||||
|
||||
static NV_STATUS pool_page_alloc_buffers(uvm_gpu_semaphore_pool_page_t *page)
|
||||
// Secure semaphore pools are allocated in the CPR of vidmem and only mapped to
// the owning GPU, as no other processor has access to it.
|
||||
static NV_STATUS pool_alloc_secure_page(uvm_gpu_semaphore_pool_t *pool,
|
||||
uvm_gpu_semaphore_pool_page_t *pool_page,
|
||||
uvm_rm_mem_type_t memory_type)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_gpu_semaphore_pool_t *pool = page->pool;
|
||||
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
|
||||
size_t align = 0;
|
||||
bool map_all = true;
|
||||
align = gpu_semaphore_pool_is_secure(pool) ? UVM_CONF_COMPUTING_BUF_ALIGNMENT : 0;
|
||||
map_all = gpu_semaphore_pool_is_secure(pool) ? false : true;
|
||||
|
||||
if (map_all)
|
||||
status = uvm_rm_mem_alloc_and_map_all(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
|
||||
else
|
||||
status = uvm_rm_mem_alloc(pool->gpu, memory_type, UVM_SEMAPHORE_PAGE_SIZE, align, &page->memory);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
if (!gpu_semaphore_pool_is_secure(pool))
|
||||
return NV_OK;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_ASSERT(gpu_semaphore_pool_is_secure(pool));
|
||||
status = uvm_rm_mem_alloc(pool->gpu,
|
||||
memory_type,
|
||||
UVM_SEMAPHORE_PAGE_SIZE,
|
||||
UVM_CONF_COMPUTING_BUF_ALIGNMENT,
|
||||
&page->conf_computing.encrypted_payload_memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
&pool_page->memory);
|
||||
|
||||
BUILD_BUG_ON(UVM_CONF_COMPUTING_AUTH_TAG_SIZE % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT);
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_SEMAPHORE_COUNT_PER_PAGE * UVM_CONF_COMPUTING_AUTH_TAG_SIZE,
|
||||
UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT,
|
||||
&page->conf_computing.auth_tag_memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
status = uvm_rm_mem_alloc_and_map_cpu(pool->gpu,
|
||||
UVM_RM_MEM_TYPE_SYS,
|
||||
UVM_SEMAPHORE_COUNT_PER_PAGE * sizeof(NvU32),
|
||||
0,
|
||||
&page->conf_computing.notifier_memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
return status;
|
||||
|
||||
return NV_OK;
|
||||
error:
|
||||
pool_page_free_buffers(page);
|
||||
return status;
|
||||
}
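The allocation paths above funnel every failure to a single error label and free whatever was set up so far, which works because the cleanup helper tolerates members that are still NULL. A self-contained sketch of that unwind idiom, with plain malloc/free as stand-ins for the uvm_rm_mem allocations.

#include <stdlib.h>

struct buffers {
    void *encrypted_payload;
    void *auth_tag;
    void *notifier;
};

static void buffers_free(struct buffers *b)
{
    // free(NULL) is a no-op, so partially initialized structs are fine here.
    free(b->encrypted_payload);
    free(b->auth_tag);
    free(b->notifier);
    *b = (struct buffers){0};
}

static int buffers_alloc(struct buffers *b, size_t payload_size, size_t tag_size)
{
    *b = (struct buffers){0};

    b->encrypted_payload = malloc(payload_size);
    if (!b->encrypted_payload)
        goto error;

    b->auth_tag = malloc(tag_size);
    if (!b->auth_tag)
        goto error;

    b->notifier = malloc(sizeof(unsigned));
    if (!b->notifier)
        goto error;

    return 0;

error:
    buffers_free(b);   // single unwind point, mirroring the goto error paths above
    return -1;
}

int main(void)
{
    struct buffers b;
    int ret = buffers_alloc(&b, 4, 16);

    if (ret == 0)
        buffers_free(&b);
    return ret;
}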
|
||||
|
||||
static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_gpu_semaphore_pool_page_t *pool_page;
|
||||
NvU32 *payloads;
|
||||
size_t i;
|
||||
uvm_rm_mem_type_t memory_type = (pool->aperture == UVM_APERTURE_SYS) ? UVM_RM_MEM_TYPE_SYS : UVM_RM_MEM_TYPE_GPU;
|
||||
|
||||
uvm_assert_mutex_locked(&pool->mutex);
|
||||
|
||||
@ -228,9 +188,24 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
|
||||
|
||||
pool_page->pool = pool;
|
||||
|
||||
status = pool_page_alloc_buffers(pool_page);
|
||||
// Whenever the Confidential Computing feature is enabled, engines can
|
||||
// access semaphores only in the CPR of vidmem. Mapping to other GPUs is
|
||||
// also disabled.
|
||||
if (gpu_semaphore_pool_is_secure(pool)) {
|
||||
status = pool_alloc_secure_page(pool, pool_page, memory_type);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
else {
|
||||
status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
|
||||
memory_type,
|
||||
UVM_SEMAPHORE_PAGE_SIZE,
|
||||
0,
|
||||
&pool_page->memory);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
}
|
||||
|
||||
// Verify the GPU can access the semaphore pool.
|
||||
UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
|
||||
@ -242,9 +217,7 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
|
||||
pool->free_semaphores_count += UVM_SEMAPHORE_COUNT_PER_PAGE;
|
||||
|
||||
if (semaphore_uses_canary(pool)) {
|
||||
size_t i;
|
||||
NvU32 *payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
|
||||
|
||||
payloads = uvm_rm_mem_get_cpu_va(pool_page->memory);
|
||||
for (i = 0; i < UVM_SEMAPHORE_COUNT_PER_PAGE; i++)
|
||||
payloads[i] = make_canary(0);
|
||||
}
|
||||
@ -280,7 +253,7 @@ static void pool_free_page(uvm_gpu_semaphore_pool_page_t *page)
|
||||
|
||||
pool->free_semaphores_count -= UVM_SEMAPHORE_COUNT_PER_PAGE;
|
||||
list_del(&page->all_pages_node);
|
||||
pool_page_free_buffers(page);
|
||||
uvm_rm_mem_free(page->memory);
|
||||
uvm_kvfree(page);
|
||||
}
|
||||
|
||||
@ -300,22 +273,19 @@ NV_STATUS uvm_gpu_semaphore_alloc(uvm_gpu_semaphore_pool_t *pool, uvm_gpu_semaph
|
||||
goto done;
|
||||
|
||||
list_for_each_entry(page, &pool->pages, all_pages_node) {
|
||||
const NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
|
||||
|
||||
UVM_ASSERT(semaphore_index <= UVM_SEMAPHORE_COUNT_PER_PAGE);
|
||||
|
||||
NvU32 semaphore_index = find_first_bit(page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);
|
||||
if (semaphore_index == UVM_SEMAPHORE_COUNT_PER_PAGE)
|
||||
continue;
|
||||
|
||||
semaphore->page = page;
|
||||
semaphore->index = semaphore_index;
|
||||
|
||||
if (gpu_semaphore_pool_is_secure(pool)) {
|
||||
|
||||
// Reset the notifier to prevent detection of a false attack when
// checking for an updated value
|
||||
*uvm_gpu_semaphore_get_notifier_cpu_va(semaphore) = semaphore->conf_computing.last_observed_notifier;
|
||||
semaphore->conf_computing.index = semaphore_index;
|
||||
}
|
||||
else {
|
||||
semaphore->payload = (NvU32*)((char*)uvm_rm_mem_get_cpu_va(page->memory) +
|
||||
semaphore_index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
semaphore->page = page;
|
||||
|
||||
if (semaphore_uses_canary(pool))
|
||||
UVM_ASSERT(is_canary(uvm_gpu_semaphore_get_payload(semaphore)));
|
||||
@ -341,6 +311,7 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
uvm_gpu_semaphore_pool_page_t *page;
|
||||
uvm_gpu_semaphore_pool_t *pool;
|
||||
NvU32 index;
|
||||
|
||||
UVM_ASSERT(semaphore);
|
||||
|
||||
@ -352,6 +323,7 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
|
||||
return;
|
||||
|
||||
pool = page->pool;
|
||||
index = get_index(semaphore);
|
||||
|
||||
// Write a known value lower than the current payload in an attempt to catch
// release-after-free and acquire-after-free.
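Writing a recognizable value into the payload at free time turns a later release-after-free or acquire-after-free into something an assertion can catch, which is what the make_canary()/is_canary() helpers used elsewhere in this file are for. A stand-alone sketch of the idea; the canary encoding and the single global slot are illustrative, not the driver's layout.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define CANARY_BASE 0xdeadbe00u

static inline uint32_t make_canary(uint32_t tag) { return CANARY_BASE | (tag & 0xff); }
static inline bool     is_canary(uint32_t v)     { return (v & 0xffffff00u) == CANARY_BASE; }

static uint32_t g_payload;          // stands in for the semaphore payload slot

static void sem_free(void)
{
    // Leave a marker behind so a stray release after this point is recognizable.
    g_payload = make_canary(0);
}

static void sem_alloc(void)
{
    // A freshly allocated slot should still hold the canary written at free time.
    assert(is_canary(g_payload));
    g_payload = 0;
}

int main(void)
{
    g_payload = make_canary(0);
    sem_alloc();
    sem_free();
    assert(is_canary(g_payload));
    return 0;
}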
|
||||
@ -361,9 +333,10 @@ void uvm_gpu_semaphore_free(uvm_gpu_semaphore_t *semaphore)
|
||||
uvm_mutex_lock(&pool->mutex);
|
||||
|
||||
semaphore->page = NULL;
|
||||
semaphore->payload = NULL;
|
||||
|
||||
++pool->free_semaphores_count;
|
||||
__set_bit(semaphore->index, page->free_semaphores);
|
||||
__set_bit(index, page->free_semaphores);
|
||||
|
||||
uvm_mutex_unlock(&pool->mutex);
|
||||
}
|
||||
@ -476,72 +449,18 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
|
||||
|
||||
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space)
|
||||
{
|
||||
NvU32 index = get_index(semaphore);
|
||||
NvU64 base_va = uvm_rm_mem_get_gpu_va(semaphore->page->memory, gpu, is_proxy_va_space).address;
|
||||
|
||||
return base_va + semaphore->index * UVM_SEMAPHORE_SIZE;
|
||||
}
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
char *base_va;
|
||||
|
||||
if (gpu_semaphore_is_secure(semaphore))
|
||||
return &semaphore->conf_computing.cached_payload;
|
||||
|
||||
base_va = uvm_rm_mem_get_cpu_va(semaphore->page->memory);
|
||||
return (NvU32*)(base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
char *encrypted_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.encrypted_payload_memory);
|
||||
|
||||
return (NvU32*)(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
NvU64 encrypted_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.encrypted_payload_memory,
|
||||
semaphore->page->pool->gpu);
|
||||
|
||||
return uvm_gpu_address_virtual_unprotected(encrypted_base_va + semaphore->index * UVM_SEMAPHORE_SIZE);
|
||||
}
|
||||
|
||||
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
uvm_gpu_semaphore_notifier_t *notifier_base_va =
|
||||
uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.notifier_memory);
|
||||
|
||||
return notifier_base_va + semaphore->index;
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
NvU64 notifier_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.notifier_memory,
|
||||
semaphore->page->pool->gpu);
|
||||
|
||||
return uvm_gpu_address_virtual_unprotected(notifier_base_va +
|
||||
semaphore->index * sizeof(uvm_gpu_semaphore_notifier_t));
|
||||
}
|
||||
|
||||
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
char *auth_tag_base_va = uvm_rm_mem_get_cpu_va(semaphore->page->conf_computing.auth_tag_memory);
|
||||
|
||||
return (void*)(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
|
||||
}
|
||||
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
NvU64 auth_tag_base_va = uvm_rm_mem_get_gpu_uvm_va(semaphore->page->conf_computing.auth_tag_memory,
|
||||
semaphore->page->pool->gpu);
|
||||
|
||||
return uvm_gpu_address_virtual_unprotected(auth_tag_base_va + semaphore->index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
|
||||
return base_va + UVM_SEMAPHORE_SIZE * index;
|
||||
}
|
||||
|
||||
NvU32 uvm_gpu_semaphore_get_payload(uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
return UVM_GPU_READ_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore));
|
||||
if (gpu_semaphore_is_secure(semaphore))
|
||||
return UVM_GPU_READ_ONCE(semaphore->conf_computing.cached_payload);
|
||||
|
||||
return UVM_GPU_READ_ONCE(*semaphore->payload);
|
||||
}
|
||||
|
||||
void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload)
|
||||
@ -558,7 +477,10 @@ void uvm_gpu_semaphore_set_payload(uvm_gpu_semaphore_t *semaphore, NvU32 payload
|
||||
// the GPU correctly even on non-SMP).
|
||||
mb();
|
||||
|
||||
UVM_GPU_WRITE_ONCE(*uvm_gpu_semaphore_get_cpu_va(semaphore), payload);
|
||||
if (gpu_semaphore_is_secure(semaphore))
|
||||
UVM_GPU_WRITE_ONCE(semaphore->conf_computing.cached_payload, payload);
|
||||
else
|
||||
UVM_GPU_WRITE_ONCE(*semaphore->payload, payload);
|
||||
}
|
||||
|
||||
// This function is intended to catch channels which have been left dangling in
|
||||
@ -624,11 +546,22 @@ void uvm_gpu_tracking_semaphore_free(uvm_gpu_tracking_semaphore_t *tracking_sem)
|
||||
uvm_gpu_semaphore_free(&tracking_sem->semaphore);
|
||||
}
|
||||
|
||||
static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
|
||||
static bool should_skip_secure_semaphore_update(NvU32 last_observed_notifier, NvU32 gpu_notifier)
|
||||
{
|
||||
// There is no new value, or the GPU is currently writing the new encrypted
// material, in which case reading it now would return corrupted data.
return (last_observed_notifier == gpu_notifier) || (gpu_notifier % 2);
|
||||
}
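should_skip_secure_semaphore_update() packs two conditions into one counter: an odd notifier means the GPU is still writing the new encrypted material, and a notifier equal to the last one consumed means there is nothing new to decrypt. A few worked cases under that same even/odd convention; the function name below is a stand-in.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool should_skip(uint32_t last_observed, uint32_t gpu_notifier)
{
    return (last_observed == gpu_notifier) || (gpu_notifier % 2);
}

int main(void)
{
    assert(should_skip(4, 4));   // no new value since the last decryption
    assert(should_skip(4, 5));   // odd: the GPU is still writing, skip for now
    assert(!should_skip(4, 6));  // even and newer: safe to decrypt the payload
    return 0;
}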
|
||||
|
||||
static void uvm_gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_gpu_semaphore_t *semaphore)
|
||||
{
|
||||
UvmCslIv local_iv;
|
||||
NvU32 local_payload;
|
||||
uvm_gpu_semaphore_notifier_t gpu_notifier;
|
||||
uvm_gpu_semaphore_notifier_t new_gpu_notifier = 0;
|
||||
NvU32 new_sem_value;
|
||||
NvU32 gpu_notifier;
|
||||
NvU32 last_observed_notifier;
|
||||
NvU32 new_gpu_notifier = 0;
|
||||
NvU32 iv_index = 0;
|
||||
|
||||
// A channel can have multiple entries pending and the tracking semaphore
|
||||
// update of each entry can race with this function. Since the semaphore
|
||||
@ -637,72 +570,64 @@ static void gpu_semaphore_encrypted_payload_update(uvm_channel_t *channel, uvm_g
|
||||
unsigned tries_left = channel->num_gpfifo_entries;
|
||||
NV_STATUS status = NV_OK;
|
||||
NvU8 local_auth_tag[UVM_CONF_COMPUTING_AUTH_TAG_SIZE];
|
||||
uvm_gpu_semaphore_notifier_t *semaphore_notifier_cpu_addr = uvm_gpu_semaphore_get_notifier_cpu_va(semaphore);
|
||||
UvmCslIv *ivs_cpu_addr = semaphore->conf_computing.ivs;
|
||||
void *auth_tag_cpu_addr = uvm_rm_mem_get_cpu_va(semaphore->conf_computing.auth_tag);
|
||||
NvU32 *gpu_notifier_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.notifier);
|
||||
NvU32 *payload_cpu_addr = (NvU32 *)uvm_rm_mem_get_cpu_va(semaphore->conf_computing.encrypted_payload);
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(uvm_channel_is_ce(channel));
|
||||
|
||||
do {
|
||||
gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
|
||||
last_observed_notifier = semaphore->conf_computing.last_observed_notifier;
|
||||
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
|
||||
UVM_ASSERT(last_observed_notifier <= gpu_notifier);
|
||||
|
||||
UVM_ASSERT(gpu_notifier >= semaphore->conf_computing.last_observed_notifier);
|
||||
if (should_skip_secure_semaphore_update(last_observed_notifier, gpu_notifier))
|
||||
return;
|
||||
|
||||
do {
|
||||
gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
|
||||
|
||||
// Odd notifier value means there's an update in progress.
|
||||
if (gpu_notifier % 2)
|
||||
continue;
|
||||
|
||||
// There's no change since last time
|
||||
if (gpu_notifier == semaphore->conf_computing.last_observed_notifier)
|
||||
return;
|
||||
|
||||
// Make sure no memory accesses happen before we read the notifier
|
||||
smp_mb__after_atomic();
|
||||
|
||||
memcpy(local_auth_tag, uvm_gpu_semaphore_get_auth_tag_cpu_va(semaphore), sizeof(local_auth_tag));
|
||||
local_payload = UVM_READ_ONCE(*uvm_gpu_semaphore_get_encrypted_payload_cpu_va(semaphore));
|
||||
iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
|
||||
memcpy(local_auth_tag, auth_tag_cpu_addr, sizeof(local_auth_tag));
|
||||
local_payload = UVM_READ_ONCE(*payload_cpu_addr);
|
||||
memcpy(&local_iv, &ivs_cpu_addr[iv_index], sizeof(local_iv));
|
||||
|
||||
// Make sure the second read of notifier happens after
|
||||
// all memory accesses.
|
||||
smp_mb__before_atomic();
|
||||
new_gpu_notifier = UVM_READ_ONCE(*semaphore_notifier_cpu_addr);
|
||||
new_gpu_notifier = UVM_READ_ONCE(*gpu_notifier_cpu_addr);
|
||||
tries_left--;
|
||||
} while ((tries_left > 0) && ((gpu_notifier != new_gpu_notifier) || (gpu_notifier % 2)));
|
||||
|
||||
if (!tries_left) {
|
||||
status = NV_ERR_INVALID_STATE;
|
||||
goto error;
|
||||
}
|
||||
else {
|
||||
NvU32 key_version;
|
||||
const NvU32 iv_index = (gpu_notifier / 2) % channel->num_gpfifo_entries;
|
||||
NvU32 new_semaphore_value;
|
||||
|
||||
UVM_ASSERT(gpu_notifier == new_gpu_notifier);
|
||||
UVM_ASSERT(gpu_notifier % 2 == 0);
|
||||
|
||||
// CPU decryption is guaranteed to use the same key version as the
|
||||
// associated GPU encryption, because if there was any key rotation in
|
||||
// between, then key rotation waited for all channels to complete before
|
||||
// proceeding. The wait implies that the semaphore value matches the
|
||||
// last one encrypted on the GPU, so this CPU decryption should happen
|
||||
// before the key is rotated.
|
||||
key_version = uvm_channel_pool_key_version(channel->pool);
|
||||
|
||||
if (gpu_notifier == new_gpu_notifier) {
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
&new_semaphore_value,
|
||||
&new_sem_value,
|
||||
&local_payload,
|
||||
&semaphore->conf_computing.ivs[iv_index],
|
||||
key_version,
|
||||
sizeof(new_semaphore_value),
|
||||
&local_iv,
|
||||
sizeof(new_sem_value),
|
||||
&local_auth_tag);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
uvm_gpu_semaphore_set_payload(semaphore, new_semaphore_value);
|
||||
uvm_gpu_semaphore_set_payload(semaphore, new_sem_value);
|
||||
UVM_WRITE_ONCE(semaphore->conf_computing.last_observed_notifier, new_gpu_notifier);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
error:
|
||||
// Decryption failure is a fatal error, as is running out of retry attempts.
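The update loop above is essentially a bounded sequence-counter read: snapshot the notifier, copy out the encrypted payload, auth tag and IV, then re-read the notifier and trust the copies only if both reads are equal and even, giving up after num_gpfifo_entries attempts. A stand-alone sketch of that retry shape; the payload layout and the omission of the memory barriers the kernel code inserts are simplifications.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct shared {
    volatile uint32_t notifier;   // even: stable, odd: writer in progress
    uint8_t payload[16];          // stands in for encrypted payload + auth tag + IV
};

// Copy a consistent snapshot of s->payload into out, trying at most max_tries times.
// The real code brackets the copies with smp_mb barriers; they are elided here.
static bool read_snapshot(const struct shared *s, uint8_t *out, unsigned max_tries)
{
    while (max_tries--) {
        uint32_t before = s->notifier;

        if (before % 2)
            continue;                       // writer in progress, try again

        memcpy(out, s->payload, sizeof(s->payload));

        uint32_t after = s->notifier;
        if (after == before)
            return true;                    // nothing changed under us: snapshot is usable
    }

    return false;                           // bounded retries exhausted, treat as fatal
}

int main(void)
{
    struct shared s = { .notifier = 6, .payload = { 1, 2, 3 } };
    uint8_t copy[sizeof(s.payload)];

    return read_snapshot(&s, copy, 4) ? 0 : 1;
}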
|
||||
@ -725,11 +650,11 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
|
||||
else
|
||||
uvm_assert_spinlock_locked(&tracking_semaphore->s_lock);
|
||||
|
||||
if (gpu_semaphore_is_secure(&tracking_semaphore->semaphore)) {
|
||||
if (tracking_semaphore->semaphore.conf_computing.encrypted_payload) {
|
||||
// TODO: Bug 4008734: [UVM][HCC] Extend secure tracking semaphore
|
||||
// mechanism to all semaphore
|
||||
uvm_channel_t *channel = container_of(tracking_semaphore, uvm_channel_t, tracking_sem);
|
||||
gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
|
||||
uvm_gpu_semaphore_encrypted_payload_update(channel, &tracking_semaphore->semaphore);
|
||||
}
|
||||
|
||||
new_sem_value = uvm_gpu_semaphore_get_payload(&tracking_semaphore->semaphore);
|
||||
@ -765,7 +690,7 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
|
||||
UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
|
||||
"GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
|
||||
uvm_gpu_name(tracking_semaphore->semaphore.page->pool->gpu),
|
||||
(NvU64)(uintptr_t)uvm_gpu_semaphore_get_cpu_va(&tracking_semaphore->semaphore),
|
||||
(NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
|
||||
old_value, new_value);
|
||||
|
||||
// Use an atomic write even though the lock is held so that the value can
|
||||
|
@ -29,8 +29,6 @@
|
||||
#include "uvm_rm_mem.h"
|
||||
#include "uvm_linux.h"
|
||||
|
||||
typedef NvU32 uvm_gpu_semaphore_notifier_t;
|
||||
|
||||
// A GPU semaphore is a memory location accessible by the GPUs and the CPU
|
||||
// that's used for synchronization among them.
|
||||
// The GPU has primitives to acquire (wait for) and release (set) 4-byte memory
|
||||
@ -47,15 +45,17 @@ struct uvm_gpu_semaphore_struct
|
||||
// The semaphore pool page the semaphore came from
|
||||
uvm_gpu_semaphore_pool_page_t *page;
|
||||
|
||||
// Index of the semaphore in semaphore page
|
||||
NvU16 index;
|
||||
|
||||
// Pointer to the memory location
|
||||
NvU32 *payload;
|
||||
struct {
|
||||
UvmCslIv *ivs;
|
||||
NvU16 index;
|
||||
NvU32 cached_payload;
|
||||
|
||||
uvm_gpu_semaphore_notifier_t last_pushed_notifier;
|
||||
uvm_gpu_semaphore_notifier_t last_observed_notifier;
|
||||
uvm_rm_mem_t *encrypted_payload;
|
||||
uvm_rm_mem_t *notifier;
|
||||
uvm_rm_mem_t *auth_tag;
|
||||
UvmCslIv *ivs;
|
||||
NvU32 last_pushed_notifier;
|
||||
NvU32 last_observed_notifier;
|
||||
} conf_computing;
|
||||
};
|
||||
|
||||
@ -151,17 +151,6 @@ NvU64 uvm_gpu_semaphore_get_gpu_proxy_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu
|
||||
|
||||
NvU64 uvm_gpu_semaphore_get_gpu_va(uvm_gpu_semaphore_t *semaphore, uvm_gpu_t *gpu, bool is_proxy_va_space);
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
NvU32 *uvm_gpu_semaphore_get_encrypted_payload_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_encrypted_payload_gpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
uvm_gpu_semaphore_notifier_t *uvm_gpu_semaphore_get_notifier_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_notifier_gpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
void *uvm_gpu_semaphore_get_auth_tag_cpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
uvm_gpu_address_t uvm_gpu_semaphore_get_auth_tag_gpu_va(uvm_gpu_semaphore_t *semaphore);
|
||||
|
||||
// Read the 32-bit payload of the semaphore
|
||||
// Notably doesn't provide any memory ordering guarantees and needs to be used with
|
||||
// care. For an example of what needs to be considered see
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -251,6 +251,9 @@ static uvm_hal_class_ops_t host_table[] =
|
||||
.semaphore_release = uvm_hal_turing_host_semaphore_release,
|
||||
.clear_faulted_channel_method = uvm_hal_turing_host_clear_faulted_channel_method,
|
||||
.set_gpfifo_entry = uvm_hal_turing_host_set_gpfifo_entry,
|
||||
.tlb_invalidate_all = uvm_hal_turing_host_tlb_invalidate_all,
|
||||
.tlb_invalidate_va = uvm_hal_turing_host_tlb_invalidate_va,
|
||||
.tlb_invalidate_test = uvm_hal_turing_host_tlb_invalidate_test,
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -632,13 +635,19 @@ NV_STATUS uvm_hal_init_table(void)
|
||||
return status;
|
||||
}
|
||||
|
||||
status = ops_init_from_parent(host_table, ARRAY_SIZE(host_table), HOST_OP_COUNT, offsetof(uvm_hal_class_ops_t, u.host_ops));
|
||||
status = ops_init_from_parent(host_table,
|
||||
ARRAY_SIZE(host_table),
|
||||
HOST_OP_COUNT,
|
||||
offsetof(uvm_hal_class_ops_t, u.host_ops));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("ops_init_from_parent(host_table) failed: %s\n", nvstatusToString(status));
|
||||
return status;
|
||||
}
|
||||
|
||||
status = ops_init_from_parent(arch_table, ARRAY_SIZE(arch_table), ARCH_OP_COUNT, offsetof(uvm_hal_class_ops_t, u.arch_ops));
|
||||
status = ops_init_from_parent(arch_table,
|
||||
ARRAY_SIZE(arch_table),
|
||||
ARCH_OP_COUNT,
|
||||
offsetof(uvm_hal_class_ops_t, u.arch_ops));
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("ops_init_from_parent(arch_table) failed: %s\n", nvstatusToString(status));
|
||||
return status;
|
||||
@ -932,14 +941,16 @@ const char *uvm_mmu_engine_type_string(uvm_mmu_engine_type_t mmu_engine_type)
|
||||
void uvm_hal_print_fault_entry(const uvm_fault_buffer_entry_t *entry)
|
||||
{
|
||||
UVM_DBG_PRINT("fault_address: 0x%llx\n", entry->fault_address);
|
||||
UVM_DBG_PRINT(" fault_instance_ptr: {0x%llx:%s}\n", entry->instance_ptr.address,
|
||||
UVM_DBG_PRINT(" fault_instance_ptr: {0x%llx:%s}\n",
|
||||
entry->instance_ptr.address,
|
||||
uvm_aperture_string(entry->instance_ptr.aperture));
|
||||
UVM_DBG_PRINT(" fault_type: %s\n", uvm_fault_type_string(entry->fault_type));
|
||||
UVM_DBG_PRINT(" fault_access_type: %s\n", uvm_fault_access_type_string(entry->fault_access_type));
|
||||
UVM_DBG_PRINT(" is_replayable: %s\n", entry->is_replayable? "true": "false");
|
||||
UVM_DBG_PRINT(" is_virtual: %s\n", entry->is_virtual? "true": "false");
|
||||
UVM_DBG_PRINT(" in_protected_mode: %s\n", entry->in_protected_mode? "true": "false");
|
||||
UVM_DBG_PRINT(" fault_source.client_type: %s\n", uvm_fault_client_type_string(entry->fault_source.client_type));
|
||||
UVM_DBG_PRINT(" fault_source.client_type: %s\n",
|
||||
uvm_fault_client_type_string(entry->fault_source.client_type));
|
||||
UVM_DBG_PRINT(" fault_source.client_id: %d\n", entry->fault_source.client_id);
|
||||
UVM_DBG_PRINT(" fault_source.gpc_id: %d\n", entry->fault_source.gpc_id);
|
||||
UVM_DBG_PRINT(" fault_source.mmu_engine_id: %d\n", entry->fault_source.mmu_engine_id);
|
||||
@ -962,12 +973,14 @@ const char *uvm_access_counter_type_string(uvm_access_counter_type_t access_coun
|
||||
void uvm_hal_print_access_counter_buffer_entry(const uvm_access_counter_buffer_entry_t *entry)
|
||||
{
|
||||
if (!entry->address.is_virtual) {
|
||||
UVM_DBG_PRINT("physical address: {0x%llx:%s}\n", entry->address.address,
|
||||
UVM_DBG_PRINT("physical address: {0x%llx:%s}\n",
|
||||
entry->address.address,
|
||||
uvm_aperture_string(entry->address.aperture));
|
||||
}
|
||||
else {
|
||||
UVM_DBG_PRINT("virtual address: 0x%llx\n", entry->address.address);
|
||||
UVM_DBG_PRINT(" instance_ptr {0x%llx:%s}\n", entry->virtual_info.instance_ptr.address,
|
||||
UVM_DBG_PRINT(" instance_ptr {0x%llx:%s}\n",
|
||||
entry->virtual_info.instance_ptr.address,
|
||||
uvm_aperture_string(entry->virtual_info.instance_ptr.aperture));
|
||||
UVM_DBG_PRINT(" mmu_engine_type %s\n", uvm_mmu_engine_type_string(entry->virtual_info.mmu_engine_type));
|
||||
UVM_DBG_PRINT(" mmu_engine_id %u\n", entry->virtual_info.mmu_engine_id);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -112,6 +112,10 @@ void uvm_hal_pascal_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_turing_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_ampere_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
@ -149,42 +153,49 @@ typedef void (*uvm_hal_host_tlb_invalidate_va_t)(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_turing_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_ampere_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar);
|
||||
|
||||
typedef void (*uvm_hal_host_tlb_invalidate_test_t)(uvm_push_t *push,
|
||||
@ -196,6 +207,9 @@ void uvm_hal_maxwell_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
void uvm_hal_pascal_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
|
||||
void uvm_hal_turing_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
|
||||
void uvm_hal_ampere_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params);
|
||||
@ -445,15 +459,15 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu);
|
||||
|
||||
// Retrieve the page-tree HAL for a given big page size
|
||||
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU32 big_page_size);
|
||||
typedef uvm_mmu_mode_hal_t *(*uvm_hal_lookup_mode_hal_t)(NvU64 big_page_size);
|
||||
typedef void (*uvm_hal_mmu_enable_prefetch_faults_t)(uvm_parent_gpu_t *parent_gpu);
|
||||
typedef void (*uvm_hal_mmu_disable_prefetch_faults_t)(uvm_parent_gpu_t *parent_gpu);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU32 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_ampere(NvU64 big_page_size);
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size);
|
||||
void uvm_hal_maxwell_mmu_enable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_maxwell_mmu_disable_prefetch_faults_unsupported(uvm_parent_gpu_t *parent_gpu);
|
||||
void uvm_hal_pascal_mmu_enable_prefetch_faults(uvm_parent_gpu_t *parent_gpu);
|
||||
|
@ -284,10 +284,8 @@ static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
|
||||
|
||||
// Reset preferred location and accessed-by of policy nodes if needed.
|
||||
uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
|
||||
if (uvm_va_policy_preferred_location_equal(&node->policy, gpu->id, NUMA_NO_NODE)) {
|
||||
if (uvm_id_equal(node->policy.preferred_location, gpu->id))
|
||||
node->policy.preferred_location = UVM_ID_INVALID;
|
||||
node->policy.preferred_nid = NUMA_NO_NODE;
|
||||
}
|
||||
|
||||
uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
|
||||
}
|
||||
@ -1601,7 +1599,7 @@ static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
|
||||
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
|
||||
|
||||
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2020-2022 NVIDIA Corporation
|
||||
Copyright (c) 2020-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -157,6 +157,7 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
|
||||
@ -183,7 +184,12 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
@ -196,7 +202,9 @@ void uvm_hal_hopper_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
MEM_OP_D, HWCONST(C86F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
uvm_hal_tlb_invalidate_membar(push, membar);
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
@ -204,7 +212,7 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
@ -212,6 +220,7 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 va_lo;
|
||||
NvU32 va_hi;
|
||||
NvU64 end;
|
||||
@ -221,9 +230,9 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 log2_invalidation_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
|
||||
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
|
||||
|
||||
// The invalidation size must be a power-of-two number of pages containing
|
||||
@ -277,8 +286,13 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
|
||||
HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C86F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
@ -292,7 +306,9 @@ void uvm_hal_hopper_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
MEM_OP_D, HWCONST(C86F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
uvm_hal_tlb_invalidate_membar(push, membar);
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
gpu->parent->host_hal->membar_gpu(push);
|
||||
}
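The targeted invalidate has to describe its range as log2 of a power-of-two number of pages whose naturally aligned region contains [base, base + size). One way to derive that value, sketched stand-alone; the driver's own computation may differ in detail, and base is assumed page-aligned as asserted above.

#include <assert.h>
#include <stdint.h>

// Smallest n such that the naturally aligned region of (1 << n) pages of
// page_size that contains base also contains base + size.
static uint32_t log2_invalidation_size(uint64_t base, uint64_t size, uint64_t page_size)
{
    uint32_t n = 0;

    while ((base % (((uint64_t)1 << n) * page_size)) + size > ((uint64_t)1 << n) * page_size)
        n++;

    return n;
}

int main(void)
{
    // 3 x 4KiB pages from offset 0 fit in a 4-page region (log2 = 2), but the
    // same 2-page range starting at 0x3000 straddles alignment boundaries and
    // needs an 8-page region (log2 = 3).
    assert(log2_invalidation_size(0x0000, 3 * 0x1000, 0x1000) == 2);
    assert(log2_invalidation_size(0x3000, 2 * 0x1000, 0x1000) == 3);
    return 0;
}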
|
||||
|
||||
void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
@ -300,12 +316,12 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
|
||||
{
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 invalidate_gpc_value = 0;
|
||||
NvU32 aperture_value = 0;
|
||||
NvU32 pdb_lo = 0;
|
||||
NvU32 pdb_hi = 0;
|
||||
NvU32 page_table_level = 0;
|
||||
uvm_membar_t membar;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
@ -332,6 +348,11 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
ack_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (params->membar == UvmInvalidateTlbMemBarSys)
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
if (params->disable_gpc_invalidate)
|
||||
invalidate_gpc_value = HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
|
||||
else
|
||||
@ -343,7 +364,7 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
NvU32 va_lo = va & HWMASK(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
NvU32 va_hi = va >> HWSIZE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS) |
|
||||
HWVALUE(C86F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C86F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
@ -358,7 +379,7 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
else {
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS) |
|
||||
NV_PUSH_4U(C86F, MEM_OP_A, sysmembar_value |
|
||||
HWCONST(C86F, MEM_OP_A, TLB_INVALIDATE_INVAL_SCOPE, NON_LINK_TLBS),
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C86F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
@ -372,14 +393,9 @@ void uvm_hal_hopper_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
HWVALUE(C86F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
|
||||
if (params->membar == UvmInvalidateTlbMemBarSys)
|
||||
membar = UVM_MEMBAR_SYS;
|
||||
else if (params->membar == UvmInvalidateTlbMemBarLocal)
|
||||
membar = UVM_MEMBAR_GPU;
|
||||
else
|
||||
membar = UVM_MEMBAR_NONE;
|
||||
|
||||
uvm_hal_tlb_invalidate_membar(push, membar);
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (params->membar == UvmInvalidateTlbMemBarLocal)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_hopper_host_set_gpfifo_pushbuffer_segment_base(NvU64 *fifo_entry, NvU64 pushbuffer_va)
|
||||
|
@ -61,7 +61,7 @@ uvm_mmu_engine_type_t uvm_hal_hopper_mmu_engine_id_to_type(NvU16 mmu_engine_id)
|
||||
return UVM_MMU_ENGINE_TYPE_GRAPHICS;
|
||||
}
|
||||
|
||||
static NvU32 page_table_depth_hopper(NvU32 page_size)
|
||||
static NvU32 page_table_depth_hopper(NvU64 page_size)
|
||||
{
|
||||
// The common-case is page_size == UVM_PAGE_SIZE_2M, hence the first check
|
||||
if (page_size == UVM_PAGE_SIZE_2M)
|
||||
@ -79,7 +79,7 @@ static NvU32 entries_per_index_hopper(NvU32 depth)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvLength entry_offset_hopper(NvU32 depth, NvU32 page_size)
|
||||
static NvLength entry_offset_hopper(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 6);
|
||||
if ((page_size == UVM_PAGE_SIZE_4K) && (depth == 4))
|
||||
@ -92,7 +92,7 @@ static NvLength entry_size_hopper(NvU32 depth)
|
||||
return entries_per_index_hopper(depth) * 8;
|
||||
}
|
||||
|
||||
static NvU32 index_bits_hopper(NvU32 depth, NvU32 page_size)
|
||||
static NvU32 index_bits_hopper(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
static const NvU32 bit_widths[] = {1, 9, 9, 9, 8};
|
||||
|
||||
@ -120,7 +120,7 @@ static NvU32 num_va_bits_hopper(void)
|
||||
return 57;
|
||||
}
|
||||
|
||||
static NvLength allocation_size_hopper(NvU32 depth, NvU32 page_size)
|
||||
static NvLength allocation_size_hopper(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 6);
|
||||
if (depth == 5 && page_size == UVM_PAGE_SIZE_64K)
|
||||
@ -233,7 +233,7 @@ static NvU64 make_sparse_pte_hopper(void)
|
||||
HWCONST64(_MMU_VER3, PTE, PCF, SPARSE);
|
||||
}
|
||||
|
||||
static NvU64 unmapped_pte_hopper(NvU32 page_size)
|
||||
static NvU64 unmapped_pte_hopper(NvU64 page_size)
|
||||
{
|
||||
// Setting PCF to NO_VALID_4KB_PAGE on an otherwise-zeroed big PTE causes
|
||||
// the corresponding 4k PTEs to be ignored. This allows the invalidation of
|
||||
@ -490,7 +490,7 @@ static void make_pde_hopper(void *entry,
|
||||
|
||||
static uvm_mmu_mode_hal_t hopper_mmu_mode_hal;
|
||||
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU32 big_page_size)
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_hopper(NvU64 big_page_size)
|
||||
{
|
||||
static bool initialized = false;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2013-2023 NVidia Corporation
|
||||
Copyright (c) 2013-2024 NVidia Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -494,7 +494,7 @@ typedef struct
|
||||
NvU64 base NV_ALIGN_BYTES(8); // IN
|
||||
NvU64 length NV_ALIGN_BYTES(8); // IN
|
||||
NvU64 offset NV_ALIGN_BYTES(8); // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
|
||||
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
|
||||
NvS32 rmCtrlFd; // IN
|
||||
NvU32 hClient; // IN
|
||||
@ -952,7 +952,6 @@ typedef struct
|
||||
NvU32 version; // OUT
|
||||
} UVM_TOOLS_GET_PROCESSOR_UUID_TABLE_PARAMS;
|
||||
|
||||
|
||||
//
|
||||
// UvmMapDynamicParallelismRegion
|
||||
//
|
||||
@ -995,7 +994,7 @@ typedef struct
|
||||
{
|
||||
NvU64 base NV_ALIGN_BYTES(8); // IN
|
||||
NvU64 length NV_ALIGN_BYTES(8); // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS_V2]; // IN
|
||||
UvmGpuMappingAttributes perGpuAttributes[UVM_MAX_GPUS]; // IN
|
||||
NvU64 gpuAttributesCount NV_ALIGN_BYTES(8); // IN
|
||||
NV_STATUS rmStatus; // OUT
|
||||
} UVM_ALLOC_SEMAPHORE_POOL_PARAMS;
|
||||
|
@ -27,7 +27,7 @@
|
||||
|
||||
const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
|
||||
{
|
||||
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 36);
|
||||
BUILD_BUG_ON(UVM_LOCK_ORDER_COUNT != 34);
|
||||
|
||||
switch (lock_order) {
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_INVALID);
|
||||
@ -48,9 +48,7 @@ const char *uvm_lock_order_to_string(uvm_lock_order_t lock_order)
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CHUNK_MAPPING);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PAGE_TREE);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_KEY_ROTATION_WLC);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_WLC_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_CSL_SEC2_PUSH);
|
||||
UVM_ENUM_STRING_CASE(UVM_LOCK_ORDER_PUSH);
|
||||
|
@ -322,15 +322,6 @@
|
||||
// Operations not allowed while holding this lock
|
||||
// - GPU memory allocation which can evict
|
||||
//
|
||||
// - Channel pool key rotation lock
|
||||
// Order: UVM_LOCK_ORDER_KEY_ROTATION
|
||||
// Condition: Confidential Computing is enabled
|
||||
// Mutex per channel pool
|
||||
//
|
||||
// The lock ensures mutual exclusion during key rotation affecting all the
|
||||
// channels in the associated pool. Key rotation in WLC pools is handled
|
||||
// using a separate lock order, see UVM_LOCK_ORDER_KEY_ROTATION_WLC below.
|
||||
//
|
||||
// - CE channel CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_CSL_PUSH
|
||||
// Condition: The Confidential Computing feature is enabled
|
||||
@ -347,15 +338,6 @@
|
||||
// Operations allowed while holding this lock
|
||||
// - Pushing work to CE channels (except for WLC channels)
|
||||
//
|
||||
// - WLC channel pool key rotation lock
|
||||
// Order: UVM_LOCK_ORDER_KEY_ROTATION_WLC
|
||||
// Condition: Confidential Computing is enabled
|
||||
// Mutex of WLC channel pool
|
||||
//
|
||||
// The lock has the same purpose as the regular channel pool key rotation
|
||||
// lock. Using a different order lock for WLC channels allows key rotation
|
||||
// on those channels during indirect work submission.
|
||||
//
|
||||
// - WLC CSL channel pool semaphore
|
||||
// Order: UVM_LOCK_ORDER_CSL_WLC_PUSH
|
||||
// Condition: The Confidential Computing feature is enabled
|
||||
@ -502,9 +484,7 @@ typedef enum
|
||||
UVM_LOCK_ORDER_CONF_COMPUTING_DMA_BUFFER_POOL,
|
||||
UVM_LOCK_ORDER_CHUNK_MAPPING,
|
||||
UVM_LOCK_ORDER_PAGE_TREE,
|
||||
UVM_LOCK_ORDER_KEY_ROTATION,
|
||||
UVM_LOCK_ORDER_CSL_PUSH,
|
||||
UVM_LOCK_ORDER_KEY_ROTATION_WLC,
|
||||
UVM_LOCK_ORDER_CSL_WLC_PUSH,
|
||||
UVM_LOCK_ORDER_CSL_SEC2_PUSH,
|
||||
UVM_LOCK_ORDER_PUSH,
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -61,7 +61,7 @@ typedef struct
|
||||
size_t buffer_size;
|
||||
|
||||
// Page size in bytes
|
||||
NvU32 page_size;
|
||||
NvU64 page_size;
|
||||
|
||||
// Size of a single PTE in bytes
|
||||
NvU32 pte_size;
|
||||
@ -91,7 +91,7 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
|
||||
uvm_gpu_t *gpu,
|
||||
const uvm_map_rm_params_t *map_rm_params,
|
||||
NvU64 length,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_pte_buffer_t *pte_buffer)
|
||||
{
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, gpu);
|
||||
@ -650,9 +650,7 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
return NV_OK;
|
||||
}
|
||||
// This is a local or peer allocation, so the owning GPU must have been
|
||||
// registered.
|
||||
// This also checks for if EGM owning GPU is registered.
|
||||
|
||||
// registered. This also checks whether the EGM owning GPU is registered.
|
||||
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
|
||||
if (!owning_gpu)
|
||||
return NV_ERR_INVALID_DEVICE;
|
||||
@ -665,7 +663,6 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
|
||||
// semantics of sysmem allocations.
|
||||
|
||||
// Check if peer access for peer memory is enabled.
|
||||
// This path also handles EGM allocations.
|
||||
if (owning_gpu != mapping_gpu && (!mem_info->sysmem || mem_info->egm)) {
|
||||
// TODO: Bug 1757136: In SLI, the returned UUID may be different but a
|
||||
// local mapping must be used. We need to query SLI groups to know
|
||||
@ -856,9 +853,10 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
|
||||
UvmGpuMemoryInfo mem_info;
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
|
||||
NvU32 mapping_page_size;
|
||||
NvU64 mapping_page_size;
|
||||
NvU64 biggest_mapping_page_size;
|
||||
NvU64 alignments;
|
||||
NvU32 smallest_alignment;
|
||||
NvU64 smallest_alignment;
|
||||
NV_STATUS status;
|
||||
|
||||
uvm_assert_rwsem_locked_read(&va_space->lock);
|
||||
@ -947,9 +945,11 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
|
||||
|
||||
// Check for the maximum page size for the mapping of vidmem allocations,
|
||||
// the vMMU segment size may limit the range of page sizes.
|
||||
biggest_mapping_page_size = uvm_mmu_biggest_page_size_up_to(&gpu_va_space->page_tables,
|
||||
mapping_gpu->mem_info.max_vidmem_page_size);
|
||||
if (!ext_gpu_map->is_sysmem && (ext_gpu_map->gpu == ext_gpu_map->owning_gpu) &&
|
||||
(mapping_page_size > mapping_gpu->mem_info.max_vidmem_page_size))
|
||||
mapping_page_size = mapping_gpu->mem_info.max_vidmem_page_size;
|
||||
(mapping_page_size > biggest_mapping_page_size))
|
||||
mapping_page_size = biggest_mapping_page_size;
|
||||
|
||||
mem_info.pageSize = mapping_page_size;
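uvm_mmu_biggest_page_size_up_to() is used above to cap the mapping page size at what the vMMU segment size allows. The underlying selection, the largest supported page size not exceeding a limit taken from a bitmask of power-of-two sizes, can be sketched stand-alone; the mask values are examples only.

#include <assert.h>
#include <stdint.h>

#define SZ_4K   0x1000ull
#define SZ_64K  0x10000ull
#define SZ_2M   0x200000ull

// Return the largest page size in 'mask' that is <= 'limit', or 0 if none fits.
// Every bit in 'mask' is assumed to be a power-of-two page size.
static uint64_t biggest_page_size_up_to(uint64_t mask, uint64_t limit)
{
    uint64_t best = 0;

    for (uint64_t candidate = mask; candidate; candidate &= candidate - 1) {
        uint64_t size = candidate & ~(candidate - 1);   // lowest set bit

        if (size <= limit && size > best)
            best = size;
    }

    return best;
}

int main(void)
{
    uint64_t mask = SZ_4K | SZ_64K | SZ_2M;

    assert(biggest_page_size_up_to(mask, SZ_2M) == SZ_2M);
    assert(biggest_page_size_up_to(mask, SZ_64K) == SZ_64K);
    assert(biggest_page_size_up_to(mask, SZ_4K - 1) == 0);
    return 0;
}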
|
||||
|
||||
@ -986,7 +986,7 @@ static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_E
|
||||
if (uvm_api_range_invalid_4k(params->base, params->length))
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
|
||||
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS_V2)
|
||||
if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS)
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
|
||||
mapped_gpus = uvm_processor_mask_cache_alloc();
|
||||
|
@ -108,7 +108,7 @@ void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
// No per VA invalidate on Maxwell, redirect to invalidate all.
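Maxwell has no targeted invalidate, so the per-VA entry point just forwards to the invalidate-all path. A hedged sketch of what that redirect amounts to; the exact callee (assumed here to be uvm_hal_maxwell_host_tlb_invalidate_all) and the body are illustrative, not the shipped implementation.

// Sketch of the redirect described above; the real function lives in the
// Maxwell host HAL and may differ in detail.
void uvm_hal_maxwell_host_tlb_invalidate_va(uvm_push_t *push,
                                            uvm_gpu_phys_address_t pdb,
                                            NvU32 depth,
                                            NvU64 base,
                                            NvU64 size,
                                            NvU64 page_size,
                                            uvm_membar_t membar)
{
    // The per-VA arguments cannot be honored, so invalidate everything under pdb.
    (void)base;
    (void)size;
    (void)page_size;

    uvm_hal_maxwell_host_tlb_invalidate_all(push, pdb, depth, membar);
}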
|
||||
|
@ -52,7 +52,7 @@ static NvU32 entries_per_index_maxwell(NvU32 depth)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvLength entry_offset_maxwell(NvU32 depth, NvU32 page_size)
|
||||
static NvLength entry_offset_maxwell(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 2);
|
||||
if (page_size == UVM_PAGE_SIZE_4K && depth == 0)
|
||||
@ -128,7 +128,7 @@ static NvLength entry_size_maxwell(NvU32 depth)
|
||||
return 8;
|
||||
}
|
||||
|
||||
static NvU32 index_bits_maxwell_64(NvU32 depth, NvU32 page_size)
|
||||
static NvU32 index_bits_maxwell_64(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 2);
|
||||
UVM_ASSERT(page_size == UVM_PAGE_SIZE_4K ||
|
||||
@ -146,7 +146,7 @@ static NvU32 index_bits_maxwell_64(NvU32 depth, NvU32 page_size)
|
||||
}
|
||||
}
|
||||
|
||||
static NvU32 index_bits_maxwell_128(NvU32 depth, NvU32 page_size)
|
||||
static NvU32 index_bits_maxwell_128(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 2);
|
||||
UVM_ASSERT(page_size == UVM_PAGE_SIZE_4K ||
|
||||
@ -169,32 +169,32 @@ static NvU32 num_va_bits_maxwell(void)
|
||||
return 40;
|
||||
}
|
||||
|
||||
static NvLength allocation_size_maxwell_64(NvU32 depth, NvU32 page_size)
|
||||
static NvLength allocation_size_maxwell_64(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
return entry_size_maxwell(depth) << index_bits_maxwell_64(depth, page_size);
|
||||
}
|
||||
|
||||
static NvLength allocation_size_maxwell_128(NvU32 depth, NvU32 page_size)
|
||||
static NvLength allocation_size_maxwell_128(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
return entry_size_maxwell(depth) << index_bits_maxwell_128(depth, page_size);
|
||||
}
|
||||
|
||||
static NvU32 page_table_depth_maxwell(NvU32 page_size)
|
||||
static NvU32 page_table_depth_maxwell(NvU64 page_size)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvU32 page_sizes_maxwell_128(void)
|
||||
static NvU64 page_sizes_maxwell_128(void)
|
||||
{
|
||||
return UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_4K;
|
||||
}
|
||||
|
||||
static NvU32 page_sizes_maxwell_64(void)
|
||||
static NvU64 page_sizes_maxwell_64(void)
|
||||
{
|
||||
return UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
|
||||
}
|
||||
|
||||
static NvU64 unmapped_pte_maxwell(NvU32 page_size)
|
||||
static NvU64 unmapped_pte_maxwell(NvU64 page_size)
|
||||
{
|
||||
// Setting the privilege bit on an otherwise-zeroed big PTE causes the
|
||||
// corresponding 4k PTEs to be ignored. This allows the invalidation of a
|
||||
@ -356,7 +356,7 @@ static uvm_mmu_mode_hal_t maxwell_128_mmu_mode_hal =
|
||||
.page_sizes = page_sizes_maxwell_128
|
||||
};
|
||||
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU32 big_page_size)
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_maxwell(NvU64 big_page_size)
|
||||
{
|
||||
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);
|
||||
if (big_page_size == UVM_PAGE_SIZE_64K)
|
||||
|
@ -290,15 +290,15 @@ uvm_chunk_sizes_mask_t uvm_mem_kernel_chunk_sizes(uvm_gpu_t *gpu)
|
||||
// Get the mmu mode hal directly as the internal address space tree has not
|
||||
// been created yet.
|
||||
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(gpu->big_page.internal_size);
|
||||
NvU32 page_sizes = hal->page_sizes();
|
||||
NvU64 page_sizes = hal->page_sizes();
|
||||
|
||||
return (uvm_chunk_sizes_mask_t)(page_sizes & UVM_CHUNK_SIZES_MASK);
|
||||
}
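uvm_mem_kernel_chunk_sizes() above reduces the HAL's 64-bit page-size bitmask to the chunk sizes the allocator can serve by intersecting it with UVM_CHUNK_SIZES_MASK. A small stand-alone illustration of that intersection; the mask values are examples, not the driver's.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_4K   0x1000ull
#define PAGE_64K  0x10000ull
#define PAGE_2M   0x200000ull
#define PAGE_512M 0x20000000ull

// Example stand-in for UVM_CHUNK_SIZES_MASK: chunks from 4K up to 2M.
#define CHUNK_SIZES_MASK (PAGE_4K | PAGE_64K | PAGE_2M)

int main(void)
{
    // Page sizes reported by a hypothetical MMU HAL, including one (512M)
    // that is not a valid chunk size and so gets masked out.
    uint64_t hal_page_sizes = PAGE_4K | PAGE_64K | PAGE_2M | PAGE_512M;

    uint64_t chunk_sizes = hal_page_sizes & CHUNK_SIZES_MASK;

    assert(chunk_sizes == (PAGE_4K | PAGE_64K | PAGE_2M));
    printf("chunk size mask: 0x%llx\n", (unsigned long long)chunk_sizes);
    return 0;
}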
|
||||
|
||||
static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
static NvU64 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
{
|
||||
NvU32 biggest_page_size;
|
||||
NvU32 chunk_size;
|
||||
NvU64 biggest_page_size;
|
||||
NvU64 chunk_size;
|
||||
|
||||
if (uvm_mem_is_sysmem(mem))
|
||||
return PAGE_SIZE;
|
||||
@ -315,12 +315,12 @@ static NvU32 mem_pick_chunk_size(uvm_mem_t *mem)
|
||||
// When UVM_PAGE_SIZE_DEFAULT is used on NUMA-enabled GPUs, we force
|
||||
// chunk_size to be PAGE_SIZE at least, to allow CPU mappings.
|
||||
if (mem->backing_gpu->mem_info.numa.enabled)
|
||||
chunk_size = max(chunk_size, (NvU32)PAGE_SIZE);
|
||||
chunk_size = max(chunk_size, (NvU64)PAGE_SIZE);
|
||||
|
||||
return chunk_size;
|
||||
}
|
||||
|
||||
static NvU32 mem_pick_gpu_page_size(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_tree_t *gpu_page_tree)
|
||||
static NvU64 mem_pick_gpu_page_size(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_tree_t *gpu_page_tree)
|
||||
{
|
||||
if (uvm_mem_is_vidmem(mem)) {
|
||||
// For vidmem allocations the chunk size is picked out of the supported
|
||||
@ -467,7 +467,7 @@ static NV_STATUS mem_alloc_sysmem_dma_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
|
||||
NvU64 *dma_addrs;
|
||||
|
||||
UVM_ASSERT_MSG(mem->chunk_size == PAGE_SIZE,
|
||||
"mem->chunk_size is 0x%x. PAGE_SIZE is only supported.",
|
||||
"mem->chunk_size is 0x%llx. PAGE_SIZE is only supported.",
|
||||
mem->chunk_size);
|
||||
UVM_ASSERT(uvm_mem_is_sysmem_dma(mem));
|
||||
|
||||
@ -528,10 +528,9 @@ static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
|
||||
|
||||
// In case of failure, the caller is required to handle cleanup by calling
|
||||
// uvm_mem_free
|
||||
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unprotected)
|
||||
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_pmm_gpu_memory_type_t mem_type;
|
||||
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
@ -548,23 +547,15 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unpr
|
||||
if (!mem->vidmem.chunks)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
// When CC is disabled the behavior is identical to that of PMM, and the
|
||||
// protection flag is ignored (squashed by PMM internally).
|
||||
if (is_unprotected)
|
||||
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_UNPROTECTED;
|
||||
else
|
||||
mem_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL_PROTECTED;
|
||||
|
||||
status = uvm_pmm_gpu_alloc(&mem->backing_gpu->pmm,
|
||||
status = uvm_pmm_gpu_alloc_kernel(&mem->backing_gpu->pmm,
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
mem_type,
|
||||
UVM_PMM_ALLOC_FLAGS_NONE,
|
||||
mem->vidmem.chunks,
|
||||
NULL);
|
||||
|
||||
if (status != NV_OK) {
|
||||
UVM_ERR_PRINT("uvm_pmm_gpu_alloc (count=%zd, size=0x%x) failed: %s\n",
|
||||
UVM_ERR_PRINT("uvm_pmm_gpu_alloc_kernel (count=%zd, size=0x%llx) failed: %s\n",
|
||||
mem->chunks_count,
|
||||
mem->chunk_size,
|
||||
nvstatusToString(status));
|
||||
@ -574,7 +565,7 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_unpr
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero, bool is_unprotected)
|
||||
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero)
|
||||
{
|
||||
if (uvm_mem_is_sysmem(mem)) {
|
||||
gfp_t gfp_flags;
|
||||
@ -596,7 +587,7 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
|
||||
return status;
|
||||
}
|
||||
|
||||
return mem_alloc_vidmem_chunks(mem, zero, is_unprotected);
|
||||
return mem_alloc_vidmem_chunks(mem, zero);
|
||||
}
|
||||
|
||||
NV_STATUS uvm_mem_map_kernel(uvm_mem_t *mem, const uvm_processor_mask_t *mask)
|
||||
@ -626,7 +617,6 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
|
||||
NV_STATUS status;
|
||||
NvU64 physical_size;
|
||||
uvm_mem_t *mem = NULL;
|
||||
bool is_unprotected = false;
|
||||
|
||||
UVM_ASSERT(params->size > 0);
|
||||
|
||||
@ -648,12 +638,7 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
|
||||
physical_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
|
||||
mem->chunks_count = physical_size / mem->chunk_size;
|
||||
|
||||
if (params->is_unprotected)
|
||||
UVM_ASSERT(uvm_mem_is_vidmem(mem));
|
||||
|
||||
is_unprotected = params->is_unprotected;
|
||||
|
||||
status = mem_alloc_chunks(mem, params->mm, params->zero, is_unprotected);
|
||||
status = mem_alloc_chunks(mem, params->mm, params->zero);
|
||||
if (status != NV_OK)
|
||||
goto error;
|
||||
|
||||
@ -1050,7 +1035,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
|
||||
uvm_page_table_range_vec_t **range_vec)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 page_size;
|
||||
NvU64 page_size;
|
||||
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
|
||||
|
||||
uvm_mem_pte_maker_data_t pte_maker_data = {
|
||||
@ -1059,7 +1044,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
|
||||
};
|
||||
|
||||
page_size = mem_pick_gpu_page_size(mem, gpu, tree);
|
||||
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x\n", page_size);
|
||||
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%llx\n", page_size);
|
||||
|
||||
// When the Confidential Computing feature is enabled, DMA allocations are
|
||||
// majoritarily allocated and managed by a per-GPU DMA buffer pool
|
||||
|
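As context for the uvm_mem_alloc() hunk above, the number of backing chunks comes from rounding the requested size up to the chunk size and dividing. A standalone sketch of that arithmetic, assuming a power-of-two chunk size and using local stand-ins for UVM_ALIGN_UP:

// Illustrative only: how a backing allocation is sized from a requested size
// and a chunk size, mirroring the physical_size/chunks_count computation above.
#include <stdint.h>
#include <stdio.h>

static uint64_t align_up(uint64_t x, uint64_t a)
{
    return (x + a - 1) & ~(a - 1);   // requires a to be a power of two
}

int main(void)
{
    uint64_t size = 5ULL << 20;          // 5 MiB requested
    uint64_t chunk_size = 2ULL << 20;    // 2 MiB chunks
    uint64_t physical_size = align_up(size, chunk_size);
    uint64_t chunks_count = physical_size / chunk_size;

    printf("physical_size=0x%llx chunks=%llu\n",
           (unsigned long long)physical_size,
           (unsigned long long)chunks_count);   // 0x600000, 3 chunks
    return 0;
}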
@ -126,12 +126,7 @@ typedef struct
//
// CPU mappings will always use PAGE_SIZE, so the physical allocation chunk
// has to be aligned to PAGE_SIZE.
NvU32 page_size;

// The protection flag is only observed for vidmem allocations when CC is
// enabled. If set to true, the allocation returns unprotected vidmem;
// otherwise, the allocation returns protected vidmem.
bool is_unprotected;
NvU64 page_size;

// If true, the allocation is zeroed (scrubbed).
bool zero;
@ -199,7 +194,7 @@ struct uvm_mem_struct
size_t chunks_count;

// Size of each physical chunk (vidmem) or CPU page (sysmem)
NvU32 chunk_size;
NvU64 chunk_size;

// Size of the allocation
NvU64 size;

@ -153,7 +153,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)

for (i = 0; i < verif_size / sizeof(*sys_verif); ++i) {
if (sys_verif[i] != mem->size + i) {
UVM_TEST_PRINT("Verif failed for %zd = 0x%llx instead of 0x%llx, verif_size=0x%llx mem(size=0x%llx, page_size=%u, processor=%u)\n",
UVM_TEST_PRINT("Verif failed for %zd = 0x%llx instead of 0x%llx, verif_size=0x%llx mem(size=0x%llx, page_size=%llu, processor=%u)\n",
i,
sys_verif[i],
(NvU64)(verif_size + i),
@ -241,7 +241,7 @@ static NV_STATUS test_map_cpu(uvm_mem_t *mem)
return NV_OK;
}

static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU32 page_size, size_t size, uvm_mem_t **mem_out)
static NV_STATUS test_alloc_sysmem(uvm_va_space_t *va_space, NvU64 page_size, size_t size, uvm_mem_t **mem_out)
{
NV_STATUS status;
uvm_mem_t *mem;
@ -299,7 +299,7 @@ error:
return status;
}

static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU32 page_size, size_t size, uvm_mem_t **mem_out)
static NV_STATUS test_alloc_vidmem(uvm_gpu_t *gpu, NvU64 page_size, size_t size, uvm_mem_t **mem_out)
{
NV_STATUS status;
uvm_mem_t *mem;
@ -334,7 +334,7 @@ error:
return status;
}

static bool should_test_page_size(size_t alloc_size, NvU32 page_size)
static bool should_test_page_size(size_t alloc_size, NvU64 page_size)
{
if (g_uvm_global.num_simulated_devices == 0)
return true;
@ -359,7 +359,7 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
// size on pre-Pascal GPUs with 128K big page size.
// Ampere+ also supports 512M PTEs, but since UVM's maximum chunk size is
// 2M, we don't test for this page size.
static const NvU32 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;
static const NvU64 cpu_chunk_sizes = PAGE_SIZE | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_128K | UVM_PAGE_SIZE_2M;

// All supported page sizes will be tested, CPU has the most with 4 and +1
// for the default.
@ -494,41 +494,6 @@ done:
return status;
}

static NV_STATUS test_basic_vidmem_unprotected(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_mem_t *mem = NULL;

uvm_mem_alloc_params_t params = { 0 };
params.size = UVM_PAGE_SIZE_4K;
params.backing_gpu = gpu;
params.page_size = UVM_PAGE_SIZE_4K;

// If CC is enabled, the protection flag is observed. Because currently all
// vidmem is in the protected region, the allocation should succeed.
//
// If CC is disabled, the protection flag is ignored.
params.is_unprotected = false;
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));

uvm_mem_free(mem);
mem = NULL;

// If CC is enabled, the allocation should fail because currently the
// unprotected region is empty.
//
// If CC is disabled, the behavior should be identical to that of a
// protected allocation.
params.is_unprotected = true;
if (g_uvm_global.conf_computing_enabled)
TEST_CHECK_RET(uvm_mem_alloc(&params, &mem) == NV_ERR_NO_MEMORY);
else
TEST_NV_CHECK_RET(uvm_mem_alloc(&params, &mem));

uvm_mem_free(mem);
return status;
}

static NV_STATUS test_basic_sysmem(void)
{
NV_STATUS status = NV_OK;
@ -613,7 +578,6 @@ static NV_STATUS test_basic(uvm_va_space_t *va_space)
for_each_va_space_gpu(gpu, va_space) {
TEST_NV_CHECK_RET(test_basic_vidmem(gpu));
TEST_NV_CHECK_RET(test_basic_sysmem_dma(gpu));
TEST_NV_CHECK_RET(test_basic_vidmem_unprotected(gpu));
TEST_NV_CHECK_RET(test_basic_dma_pool(gpu));
}

@ -589,7 +589,7 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
skipped_migrate = true;
}
else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
!uvm_va_policy_preferred_location_equal(policy, dest_id, NUMA_NO_NODE)) {
!uvm_id_equal(dest_id, policy->preferred_location)) {
// Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
// unless it's the preferred location
status = NV_ERR_INVALID_DEVICE;

@ -153,20 +153,17 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
// - UVM_APERTURE_VID biggest page size on vidmem mappings
// - UVM_APERTURE_SYS biggest page size on sysmem mappings
// - UVM_APERTURE_PEER_0-7 biggest page size on peer mappings
static NvU32 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
static NvU64 mmu_biggest_page_size(uvm_page_tree_t *tree, uvm_aperture_t aperture)
{
UVM_ASSERT(aperture < UVM_APERTURE_DEFAULT);

// There may be scenarios where the GMMU must use a subset of the supported
// page sizes, e.g., to comply with the vMMU supported page sizes due to
// segmentation sizes.
if (aperture == UVM_APERTURE_VID) {
UVM_ASSERT(tree->gpu->mem_info.max_vidmem_page_size <= NV_U32_MAX);
return (NvU32) tree->gpu->mem_info.max_vidmem_page_size;
}
else {
return 1 << __fls(tree->hal->page_sizes());
}
if (aperture == UVM_APERTURE_VID)
return uvm_mmu_biggest_page_size_up_to(tree, tree->gpu->mem_info.max_vidmem_page_size);

return 1ULL << __fls(tree->hal->page_sizes());
}

static NV_STATUS phys_mem_allocate_vidmem(uvm_page_tree_t *tree,
@ -254,7 +251,7 @@ static void phys_mem_deallocate(uvm_page_tree_t *tree, uvm_mmu_page_table_alloc_
}

static void page_table_range_init(uvm_page_table_range_t *range,
NvU32 page_size,
NvU64 page_size,
uvm_page_directory_t *dir,
NvU32 start_index,
NvU32 end_index)
@ -444,9 +441,9 @@ static void pde_fill(uvm_page_tree_t *tree,
pde_fill_cpu(tree, directory, start_index, pde_count, phys_addr);
}

static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
static void phys_mem_init(uvm_page_tree_t *tree, NvU64 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
{
NvU32 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU64 entries_count = uvm_mmu_page_tree_entries(tree, dir->depth, page_size);
NvU8 max_pde_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC) - 1;

// Passing in NULL for the phys_allocs will mark the child entries as
@ -497,7 +494,7 @@ static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_direc
}

static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU32 depth,
uvm_pmm_alloc_flags_t pmm_flags)
{
@ -546,7 +543,7 @@ static inline NvU32 entry_index_from_vaddr(NvU64 vaddr, NvU32 addr_bit_shift, Nv
return (NvU32)((vaddr >> addr_bit_shift) & mask);
}

static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU32 page_size)
static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU64 page_size)
{
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
}
@ -583,7 +580,7 @@ static void pde_write(uvm_page_tree_t *tree,
pde_fill(tree, dir, entry_index, 1, phys_allocs, push);
}

static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU64 page_size)
{
UVM_ASSERT(dir->ref_count > 0);

@ -594,35 +591,38 @@ static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU
static void pde_clear(uvm_page_tree_t *tree,
uvm_page_directory_t *dir,
NvU32 entry_index,
NvU32 page_size,
NvU64 page_size,
uvm_push_t *push)
{
host_pde_clear(tree, dir, entry_index, page_size);
pde_write(tree, dir, entry_index, false, push);
}

static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU64 big_page_size)
{
uvm_chunk_sizes_mask_t alloc_sizes = 0;
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);
unsigned long page_sizes, page_size_log2;
uvm_chunk_sizes_mask_t alloc_sizes;

if (hal == NULL)
return 0;

page_sizes = hal->page_sizes();
alloc_sizes = 0;

if (hal != NULL) {
unsigned long page_size_log2;
unsigned long page_sizes = hal->page_sizes();
BUILD_BUG_ON(sizeof(hal->page_sizes()) > sizeof(page_sizes));

for_each_set_bit(page_size_log2, &page_sizes, BITS_PER_LONG) {
NvU32 i;
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
NvU64 page_size = 1ULL << page_size_log2;
for (i = 0; i <= hal->page_table_depth(page_size); i++)
alloc_sizes |= hal->allocation_size(i, page_size);
}
}

return alloc_sizes;
}

static NvU32 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
static NvU64 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU64 big_page_size)
{
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);

@ -662,7 +662,7 @@ static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
}

static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -713,7 +713,7 @@ static NV_STATUS write_gpu_state_cpu(uvm_page_tree_t *tree,
}

static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -805,7 +805,7 @@ static NV_STATUS write_gpu_state_gpu(uvm_page_tree_t *tree,

// initialize new page tables and insert them into the tree
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvS32 invalidate_depth,
NvU32 used_count,
uvm_page_directory_t **dirs_used)
@ -842,7 +842,7 @@ static void free_unused_directories(uvm_page_tree_t *tree,
}
}

static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU64 page_size, uvm_mmu_page_table_alloc_t *out)
{
NvU32 depth = tree->hal->page_table_depth(page_size);
NvLength alloc_size = tree->hal->allocation_size(depth, page_size);
@ -871,7 +871,7 @@ static NV_STATUS page_tree_ats_init(uvm_page_tree_t *tree)
{
NV_STATUS status;
NvU64 min_va_upper, max_va_lower;
NvU32 page_size;
NvU64 page_size;

if (!page_tree_ats_init_required(tree))
return NV_OK;
@ -1090,7 +1090,7 @@ static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t locatio
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
uvm_gpu_va_space_t *gpu_va_space,
uvm_page_tree_type_t type,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_aperture_t location,
uvm_page_tree_t *tree)
{
@ -1110,7 +1110,7 @@ NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
tree->gpu_va_space = gpu_va_space;
tree->big_page_size = big_page_size;

UVM_ASSERT(gpu->mem_info.max_vidmem_page_size & tree->hal->page_sizes());
UVM_ASSERT(uvm_mmu_page_size_supported(tree, big_page_size));

page_tree_set_location(tree, location);

@ -1347,7 +1347,7 @@ NV_STATUS uvm_page_tree_wait(uvm_page_tree_t *tree)
}

static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range,
@ -1379,7 +1379,7 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
// This algorithm will work with unaligned ranges, but the caller's intent
// is unclear
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0,
"start 0x%llx size 0x%zx page_size 0x%x\n",
"start 0x%llx size 0x%zx page_size 0x%llx\n",
start,
(size_t)size,
page_size);
@ -1448,7 +1448,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
{
NV_STATUS status;
uvm_push_t push;
NvU32 page_sizes;
NvU64 page_sizes;
uvm_mmu_page_table_alloc_t *phys_alloc[1];

// TODO: Bug 2734399
@ -1460,7 +1460,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
status = page_tree_begin_acquire(tree,
&tree->tracker,
&push,
"map remap: [0x%llx, 0x%llx), page_size: %d",
"map remap: [0x%llx, 0x%llx), page_size: %lld",
start,
start + size,
range->page_size);
@ -1500,7 +1500,7 @@ static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uv
}

NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -1545,7 +1545,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
}

NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -1596,7 +1596,7 @@ void uvm_page_table_range_shrink(uvm_page_tree_t *tree, uvm_page_table_range_t *
}

NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single)
@ -1621,7 +1621,7 @@ void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *sing
static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
uvm_page_directory_t *pte_dir,
uvm_page_directory_t *parent,
NvU32 page_size)
NvU64 page_size)
{
NV_STATUS status;
uvm_push_t push;
@ -1633,7 +1633,7 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
// The flat mappings should always be set up when executing this path
UVM_ASSERT(!uvm_mmu_use_cpu(tree));

status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %llu", page_size);
if (status != NV_OK)
return status;

@ -1660,7 +1660,7 @@ static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
}

NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
@ -1768,7 +1768,7 @@ static size_t range_vec_calc_range_index(uvm_page_table_range_vec_t *range_vec,
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t *range_vec)
{
@ -1776,8 +1776,8 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
size_t i;

UVM_ASSERT(size != 0);
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%x\n", start, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%llx\n", start, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);

range_vec->tree = tree;
range_vec->page_size = page_size;
@ -1826,7 +1826,7 @@ out:
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec_out)
{
@ -1952,7 +1952,7 @@ static NV_STATUS uvm_page_table_range_vec_clear_ptes_gpu(uvm_page_table_range_ve
size_t i;
uvm_page_tree_t *tree = range_vec->tree;
uvm_gpu_t *gpu = tree->gpu;
NvU32 page_size = range_vec->page_size;
NvU64 page_size = range_vec->page_size;
NvU32 entry_size = uvm_mmu_pte_size(tree, page_size);
NvU64 invalid_pte = 0;
uvm_push_t push;
@ -2237,7 +2237,7 @@ static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
NvU64 size,
uvm_aperture_t aperture,
NvU64 phys_offset,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags)
{
NV_STATUS status;
@ -2312,7 +2312,7 @@ bool uvm_mmu_parent_gpu_needs_dynamic_sysmem_mapping(uvm_parent_gpu_t *parent_gp

NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
{
NvU32 page_size;
NvU64 page_size;
NvU64 size;
uvm_aperture_t aperture = UVM_APERTURE_VID;
NvU64 phys_offset = 0;
@ -2351,7 +2351,7 @@ static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)

NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
{
NvU32 page_size;
NvU64 page_size;
NvU64 size;
uvm_aperture_t aperture;
NvU64 phys_offset;
@ -2535,7 +2535,7 @@ static void root_chunk_mapping_destroy(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mappin
uvm_push_t push;
NvU32 entry_size;
uvm_pte_batch_t pte_batch;
NvU32 page_size;
NvU64 page_size;
NvU64 size;
NvU64 invalid_pte;
uvm_page_table_range_t *range = root_chunk_mapping->range;
@ -2585,7 +2585,7 @@ static NV_STATUS root_chunk_mapping_create(uvm_gpu_t *gpu, uvm_gpu_root_chunk_ma
uvm_push_t push;
NvU64 pte_bits;
NvU32 entry_size;
NvU32 page_size = UVM_CHUNK_SIZE_MAX;
NvU64 page_size = UVM_CHUNK_SIZE_MAX;
NvU64 size = UVM_CHUNK_SIZE_MAX;

range = uvm_kvmalloc_zero(sizeof(*range));
@ -2852,7 +2852,7 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
if (sysmem_mapping->range_vec == NULL) {
uvm_gpu_address_t virtual_address = uvm_parent_gpu_address_virtual_from_sysmem_phys(gpu->parent, curr_pa);
NvU64 phys_offset = curr_pa;
NvU32 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
NvU64 page_size = mmu_biggest_page_size(&gpu->address_space_tree, UVM_APERTURE_SYS);
uvm_pmm_alloc_flags_t pmm_flags;

// No eviction is requested when allocating the page tree storage,

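The rewritten allocation_sizes_for_big_page_size() above copies the 64-bit page_sizes() mask into an unsigned long so for_each_set_bit() can walk it, guarded by a BUILD_BUG_ON on the type width. A standalone sketch of walking each supported page size out of such a mask (illustrative constants, not driver code):

// Illustrative only: iterating the set bits of a 64-bit page-size mask,
// equivalent in effect to the for_each_set_bit() loop above.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t page_sizes = (1ULL << 12) | (1ULL << 16) | (1ULL << 21); // 4K, 64K, 2M

    for (unsigned bit = 0; bit < 64; bit++) {
        if (page_sizes & (1ULL << bit)) {
            uint64_t page_size = 1ULL << bit;
            printf("supported page size: 0x%llx\n", (unsigned long long)page_size);
        }
    }
    return 0;
}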
@ -208,7 +208,7 @@ struct uvm_mmu_mode_hal_struct
// This is an optimization which reduces TLB pressure, reduces the number of
// TLB invalidates we must issue, and means we don't have to initialize the
// 4k PTEs which are covered by big PTEs since the MMU will never read them.
NvU64 (*unmapped_pte)(NvU32 page_size);
NvU64 (*unmapped_pte)(NvU64 page_size);

// Bit pattern used for debug purposes to clobber PTEs which ought to be
// unused. In practice this will generate a PRIV violation or a physical
@ -234,23 +234,23 @@ struct uvm_mmu_mode_hal_struct
// For dual PDEs, this is ether 1 or 0, depending on the page size.
// This is used to index the host copy only. GPU PDEs are always entirely
// re-written using make_pde.
NvLength (*entry_offset)(NvU32 depth, NvU32 page_size);
NvLength (*entry_offset)(NvU32 depth, NvU64 page_size);

// number of virtual address bits used to index the directory/table at a
// given depth
NvU32 (*index_bits)(NvU32 depth, NvU32 page_size);
NvU32 (*index_bits)(NvU32 depth, NvU64 page_size);

// total number of bits that represent the virtual address space
NvU32 (*num_va_bits)(void);

// the size, in bytes, of a directory/table at a given depth.
NvLength (*allocation_size)(NvU32 depth, NvU32 page_size);
NvLength (*allocation_size)(NvU32 depth, NvU64 page_size);

// the depth which corresponds to the page tables
NvU32 (*page_table_depth)(NvU32 page_size);
NvU32 (*page_table_depth)(NvU64 page_size);

// bitwise-or of supported page sizes
NvU32 (*page_sizes)(void);
NvU64 (*page_sizes)(void);
};

struct uvm_page_table_range_struct
@ -258,7 +258,7 @@ struct uvm_page_table_range_struct
uvm_page_directory_t *table;
NvU32 start_index;
NvU32 entry_count;
NvU32 page_size;
NvU64 page_size;
};

typedef enum
@ -275,7 +275,7 @@ struct uvm_page_tree_struct
uvm_page_directory_t *root;
uvm_mmu_mode_hal_t *hal;
uvm_page_tree_type_t type;
NvU32 big_page_size;
NvU64 big_page_size;

// Pointer to the GPU VA space containing the page tree.
// This pointer is set only for page trees of type
@ -325,7 +325,7 @@ struct uvm_page_table_range_vec_struct
NvU64 size;

// Page size used for all the page table ranges
NvU32 page_size;
NvU64 page_size;

// Page table ranges covering the VA
uvm_page_table_range_t *ranges;
@ -352,7 +352,7 @@ void uvm_mmu_init_gpu_peer_addresses(uvm_gpu_t *gpu);
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
uvm_gpu_va_space_t *gpu_va_space,
uvm_page_tree_type_t type,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_aperture_t location,
uvm_page_tree_t *tree_out);

@ -374,7 +374,7 @@ void uvm_page_tree_deinit(uvm_page_tree_t *tree);
// an existing range or change the size of an existing range, use
// uvm_page_table_range_get_upper() and/or uvm_page_table_range_shrink().
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -384,7 +384,7 @@ NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
//
// All pending operations can be waited on with uvm_page_tree_wait().
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_pmm_alloc_flags_t pmm_flags,
@ -395,7 +395,7 @@ NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
// This is equivalent to calling uvm_page_tree_get_ptes() with size equal to
// page_size.
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single);
@ -426,7 +426,7 @@ void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *sing
// It is the caller's responsibility to initialize the returned table before
// calling uvm_page_tree_write_pde.
NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children);
@ -480,7 +480,7 @@ static uvm_mmu_page_table_alloc_t *uvm_page_tree_pdb(uvm_page_tree_t *tree)
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t *range_vec);

@ -489,7 +489,7 @@ NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_pmm_alloc_flags_t pmm_flags,
uvm_page_table_range_vec_t **range_vec_out);

@ -601,12 +601,12 @@ void uvm_mmu_chunk_unmap(uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker);
// uvm_parent_gpu_map_cpu_pages for the given GPU.
NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size);

static NvU64 uvm_mmu_page_tree_entries(uvm_page_tree_t *tree, NvU32 depth, NvU32 page_size)
static NvU64 uvm_mmu_page_tree_entries(uvm_page_tree_t *tree, NvU32 depth, NvU64 page_size)
{
return 1ull << tree->hal->index_bits(depth, page_size);
}

static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU64 page_size)
{
NvU32 depth = tree->hal->page_table_depth(page_size);
return uvm_mmu_page_tree_entries(tree, depth, page_size) * page_size;
@ -615,21 +615,21 @@ static NvU64 uvm_mmu_pde_coverage(uvm_page_tree_t *tree, NvU32 page_size)
// Page sizes supported by the GPU. Use uvm_mmu_biggest_page_size() to retrieve
// the largest page size supported in a given system, which considers the GMMU
// and vMMU page sizes and segment sizes.
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU32 page_size)
static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU64 page_size)
{
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%x\n", page_size);
UVM_ASSERT_MSG(is_power_of_2(page_size), "0x%llx\n", page_size);

return (tree->hal->page_sizes() & page_size) != 0;
}

static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_page_size)
static NvU64 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU64 max_page_size)
{
NvU32 gpu_page_sizes = tree->hal->page_sizes();
NvU32 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU32 page_sizes;
NvU32 page_size;
NvU64 gpu_page_sizes = tree->hal->page_sizes();
NvU64 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU64 page_sizes;
NvU64 page_size;

UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%x\n", max_page_size);
UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%llx\n", max_page_size);

if (max_page_size < smallest_gpu_page_size)
return 0;
@ -638,14 +638,14 @@ static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_pa
page_sizes = gpu_page_sizes & (max_page_size | (max_page_size - 1));

// And pick the biggest one of them
page_size = 1 << __fls(page_sizes);
page_size = 1ULL << __fls(page_sizes);

UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x", page_size);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%llx", page_size);

return page_size;
}

static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU32 page_size)
static NvU32 uvm_mmu_pte_size(uvm_page_tree_t *tree, NvU64 page_size)
{
return tree->hal->entry_size(tree->hal->page_table_depth(page_size));
}

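uvm_mmu_biggest_page_size_up_to() above clamps the supported-size mask to the limit with max | (max - 1) and then takes the highest remaining bit. A standalone sketch of the same selection, assuming a power-of-two limit as the original asserts (illustrative stand-ins, not driver code):

// Illustrative only: "biggest page size up to a limit" selection.
#include <stdint.h>
#include <stdio.h>

static uint64_t biggest_up_to(uint64_t supported_mask, uint64_t max_page_size)
{
    // Drop everything above the limit...
    uint64_t eligible = supported_mask & (max_page_size | (max_page_size - 1));
    if (eligible == 0)
        return 0;

    // ...then take the highest remaining bit (1ULL << __fls() in the driver).
    uint64_t bit = 1ULL << 63;
    while (!(eligible & bit))
        bit >>= 1;
    return bit;
}

int main(void)
{
    uint64_t mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 21);  // 4K, 64K, 2M
    // Limit of 128K: the biggest eligible size is 64K (prints 0x10000).
    printf("0x%llx\n", (unsigned long long)biggest_up_to(mask, 1ULL << 17));
    return 0;
}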
@ -96,7 +96,7 @@ typedef struct
{
NvU64 base;
NvU64 size;
NvU32 page_size;
NvU64 page_size;
NvU32 depth;
uvm_membar_t membar;
} fake_tlb_invalidate_t;
@ -153,7 +153,7 @@ static void fake_tlb_invalidate_va(uvm_push_t *push,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
@ -249,7 +249,11 @@ static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_memba
}

static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
NvU64 base, NvU64 size, NvU32 page_size, NvU32 expected_depth, bool expected_membar)
NvU64 base,
NvU64 size,
NvU64 page_size,
NvU32 expected_depth,
bool expected_membar)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);

@ -271,7 +275,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
return false;
}
if (inval->page_size != page_size && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected page size %u, got %u instead\n", page_size, inval->page_size);
UVM_TEST_PRINT("Expected page size %llu, got %llu instead\n", page_size, inval->page_size);
return false;
}

@ -280,7 +284,7 @@ static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,

static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
@ -325,7 +329,7 @@ static NV_STATUS test_page_tree_init_kernel(uvm_gpu_t *gpu, NvU32 big_page_size,
}

static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range)
@ -341,7 +345,7 @@ static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
}

static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
NvU64 start,
uvm_page_table_range_t *single)
{
@ -355,14 +359,14 @@ static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
}

static NV_STATUS test_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
{
return uvm_page_tree_alloc_table(tree, page_size, UVM_PMM_ALLOC_FLAGS_NONE, single, children);
}

static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start)
static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start)
{
uvm_page_table_range_t entry;
bool result = true;
@ -378,7 +382,7 @@ static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU32 page_size, N
return assert_no_invalidate() && result;
}

static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU32 page_size, NvU64 start, NvU32 depth, bool membar)
static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start, NvU32 depth, bool membar)
{
uvm_page_table_range_t entry;
bool result = true;
@ -932,8 +936,8 @@ static NV_STATUS split_and_free(uvm_gpu_t *gpu)

static NV_STATUS check_sizes(uvm_gpu_t *gpu)
{
NvU32 user_sizes = UVM_PAGE_SIZE_2M;
NvU32 kernel_sizes = UVM_PAGE_SIZE_4K | 256;
NvU64 user_sizes = UVM_PAGE_SIZE_2M;
NvU64 kernel_sizes = UVM_PAGE_SIZE_4K | 256;

if (UVM_PAGE_SIZE_64K >= PAGE_SIZE)
user_sizes |= UVM_PAGE_SIZE_64K;
@ -1161,7 +1165,7 @@ static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
return status;
}

static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU32 *page_sizes, const NvU32 page_sizes_count)
static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU64 *page_sizes, const NvU32 page_sizes_count)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@ -1177,8 +1181,8 @@ static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU32 *page_si
for (min_index = 0; min_index < page_sizes_count; ++min_index) {
for (max_index = min_index; max_index < page_sizes_count; ++max_index) {
for (size_index = 0; size_index < ARRAY_SIZE(sizes_in_max_pages); ++size_index) {
NvU32 min_page_size = page_sizes[min_index];
NvU32 max_page_size = page_sizes[max_index];
NvU64 min_page_size = page_sizes[min_index];
NvU64 max_page_size = page_sizes[max_index];
NvU64 size = (NvU64)sizes_in_max_pages[size_index] * max_page_size;

TEST_CHECK_GOTO(test_tlb_batch_invalidates_case(&tree,
@ -1282,7 +1286,7 @@ static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU32 page_size,
NvU64 page_size,
uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
@ -1303,7 +1307,7 @@ static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
// Test page table range vector APIs.
// Notably the test leaks the page_tree and range_vec on error as it's hard to
// clean up on failure and the destructors would likely assert.
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU64 page_size)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
@ -1511,7 +1515,7 @@ static uvm_mmu_page_table_alloc_t fake_table_alloc(uvm_aperture_t aperture, NvU6
// Queries the supported page sizes of the GPU(uvm_gpu_t) and fills the
// page_sizes array up to MAX_NUM_PAGE_SIZE. Returns the number of elements in
// page_sizes;
size_t get_page_sizes(uvm_gpu_t *gpu, NvU32 *page_sizes)
size_t get_page_sizes(uvm_gpu_t *gpu, NvU64 *page_sizes)
{
unsigned long page_size_log2;
unsigned long page_sizes_bitvec;
@ -1524,7 +1528,7 @@ size_t get_page_sizes(uvm_gpu_t *gpu, NvU32 *page_sizes)
page_sizes_bitvec = hal->page_sizes();

for_each_set_bit(page_size_log2, &page_sizes_bitvec, BITS_PER_LONG) {
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
NvU64 page_size = 1ULL << page_size_log2;
UVM_ASSERT(count < MAX_NUM_PAGE_SIZES);
page_sizes[count++] = page_size;
}
@ -1572,7 +1576,7 @@ typedef NV_STATUS (*entry_test_page_size_func)(uvm_gpu_t *gpu, size_t page_size)

static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
{
static const NvU32 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 pde_bits;
uvm_mmu_page_table_alloc_t *phys_allocs[2];
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
@ -1663,7 +1667,7 @@ static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)

static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
@ -1759,7 +1763,7 @@ static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func ent

static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
@ -1833,7 +1837,7 @@ static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entr

static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU32 i, num_page_sizes;

num_page_sizes = get_page_sizes(gpu, page_sizes);
@ -1847,7 +1851,7 @@ static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func ent
static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
@ -2290,8 +2294,8 @@ static NV_STATUS fake_gpu_init_hopper(uvm_gpu_t *fake_gpu)
static NV_STATUS maxwell_test_page_tree(uvm_gpu_t *maxwell)
{
// create a fake Maxwell GPU for this test.
static const NvU32 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU32 i, j, big_page_size, page_size;
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 i, j, big_page_size, page_size;

TEST_CHECK_RET(fake_gpu_init_maxwell(maxwell) == NV_OK);

@ -2320,7 +2324,7 @@ static NV_STATUS pascal_test_page_tree(uvm_gpu_t *pascal)
// create a fake Pascal GPU for this test.
NvU32 tlb_batch_saved_max_pages;
NvU32 i;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;

TEST_CHECK_RET(fake_gpu_init_pascal(pascal) == NV_OK);
@ -2381,7 +2385,7 @@ static NV_STATUS volta_test_page_tree(uvm_gpu_t *volta)
static NV_STATUS ampere_test_page_tree(uvm_gpu_t *ampere)
{
NvU32 i, tlb_batch_saved_max_pages;
NvU32 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;

TEST_CHECK_RET(fake_gpu_init_ampere(ampere) == NV_OK);

@ -92,7 +92,13 @@ void uvm_hal_pascal_host_tlb_invalidate_all(uvm_push_t *push, uvm_gpu_phys_addre
uvm_hal_tlb_invalidate_membar(push, membar);
}

void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_address_t pdb, NvU32 depth, NvU64 base, NvU64 size, NvU32 page_size, uvm_membar_t membar)
void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar)
{
NvU32 aperture_value;
NvU32 page_table_level;
@ -127,9 +133,9 @@ void uvm_hal_pascal_host_tlb_invalidate_va(uvm_push_t *push, uvm_gpu_phys_addres
ack_value = HWCONST(C06F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
}

UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);

base >>= 12;

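The invalidate path above checks that page_size, base, and size are mutually aligned and then shifts base by 12 bits before programming the invalidate. A standalone sketch of those checks with a local IS_ALIGNED stand-in and illustrative values:

// Illustrative only: alignment checks plus the base >>= 12 conversion to 4K
// units, mirroring the asserts in the hunk above.
#include <stdint.h>
#include <stdio.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

int main(void)
{
    uint64_t base = 0x200000;         // start of the range to invalidate
    uint64_t size = 0x40000;          // range size
    uint64_t page_size = 1ULL << 16;  // 64K pages

    if (!IS_ALIGNED(page_size, 1 << 12) ||
        !IS_ALIGNED(base, page_size) ||
        !IS_ALIGNED(size, page_size) ||
        size == 0) {
        fprintf(stderr, "unaligned invalidate parameters\n");
        return 1;
    }

    base >>= 12;  // same shift as the original: base expressed in 4K units
    printf("base in 4K units: 0x%llx\n", (unsigned long long)base); // 0x200
    return 0;
}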
@ -54,7 +54,7 @@ static NvU32 entries_per_index_pascal(NvU32 depth)
return 1;
}

static NvLength entry_offset_pascal(NvU32 depth, NvU32 page_size)
static NvLength entry_offset_pascal(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (page_size == UVM_PAGE_SIZE_4K && depth == 3)
@ -178,7 +178,7 @@ static NvLength entry_size_pascal(NvU32 depth)
return 8;
}

static NvU32 index_bits_pascal(NvU32 depth, NvU32 page_size)
static NvU32 index_bits_pascal(NvU32 depth, NvU64 page_size)
{
static const NvU32 bit_widths[] = {2, 9, 9, 8};
// some code paths keep on querying this until they get a 0, meaning only the page offset remains.
@ -204,7 +204,7 @@ static NvU32 num_va_bits_pascal(void)
return 49;
}

static NvLength allocation_size_pascal(NvU32 depth, NvU32 page_size)
static NvLength allocation_size_pascal(NvU32 depth, NvU64 page_size)
{
UVM_ASSERT(depth < 5);
if (depth == 4 && page_size == UVM_PAGE_SIZE_64K)
@ -213,7 +213,7 @@ static NvLength allocation_size_pascal(NvU32 depth, NvU32 page_size)
return 4096;
}

static NvU32 page_table_depth_pascal(NvU32 page_size)
static NvU32 page_table_depth_pascal(NvU64 page_size)
{
if (page_size == UVM_PAGE_SIZE_2M)
return 3;
@ -221,12 +221,12 @@ static NvU32 page_table_depth_pascal(NvU32 page_size)
return 4;
}

static NvU32 page_sizes_pascal(void)
static NvU64 page_sizes_pascal(void)
{
return UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K;
}

static NvU64 unmapped_pte_pascal(NvU32 page_size)
static NvU64 unmapped_pte_pascal(NvU64 page_size)
{
// Setting the privilege bit on an otherwise-zeroed big PTE causes the
// corresponding 4k PTEs to be ignored. This allows the invalidation of a
@ -362,7 +362,7 @@ static uvm_mmu_mode_hal_t pascal_mmu_mode_hal =
.page_sizes = page_sizes_pascal
};

uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_pascal(NvU64 big_page_size)
{
UVM_ASSERT(big_page_size == UVM_PAGE_SIZE_64K || big_page_size == UVM_PAGE_SIZE_128K);

@ -162,7 +162,7 @@ static void grow_fault_granularity_if_no_thrashing(uvm_perf_prefetch_bitmap_tree
}

static void grow_fault_granularity(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
NvU32 big_page_size,
NvU64 big_page_size,
uvm_va_block_region_t big_pages_region,
uvm_va_block_region_t max_prefetch_region,
const uvm_page_mask_t *faulted_pages,
@ -245,7 +245,7 @@ static void update_bitmap_tree_from_va_block(uvm_perf_prefetch_bitmap_tree_t *bi
uvm_va_block_region_t max_prefetch_region)

{
NvU32 big_page_size;
NvU64 big_page_size;
uvm_va_block_region_t big_pages_region;
uvm_va_space_t *va_space;
const uvm_page_mask_t *thrashing_pages;

@ -1987,21 +1987,12 @@ NV_STATUS uvm_perf_thrashing_init(void)
UVM_PERF_THRASHING_PIN_THRESHOLD_DEFAULT,
UVM_PERF_THRASHING_PIN_THRESHOLD_MAX);


// In Confidential Computing, the DMA path is slower due to cryptographic
// operations & other associated overhead. Enforce a larger window to allow
// the thrashing mitigation mechanisms to work properly.
if (g_uvm_global.conf_computing_enabled)
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT * 10);
else
INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_lapse_usec, UVM_PERF_THRASHING_LAPSE_USEC_DEFAULT);

INIT_THRASHING_PARAMETER_NONZERO_MAX(uvm_perf_thrashing_nap,
UVM_PERF_THRASHING_NAP_DEFAULT,
UVM_PERF_THRASHING_NAP_MAX);


INIT_THRASHING_PARAMETER_NONZERO(uvm_perf_thrashing_epoch, UVM_PERF_THRASHING_EPOCH_DEFAULT);

INIT_THRASHING_PARAMETER(uvm_perf_thrashing_pin, UVM_PERF_THRASHING_PIN_DEFAULT);

@ -1890,8 +1890,11 @@ static uvm_gpu_chunk_t *claim_free_chunk(uvm_pmm_gpu_t *pmm, uvm_pmm_gpu_memory_
if (!chunk)
goto out;

UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size, "chunk size %u expected %u\n",
uvm_gpu_chunk_get_size(chunk), chunk_size);
UVM_ASSERT_MSG(uvm_gpu_chunk_get_size(chunk) == chunk_size,
"chunk size %u expected %u\n",
uvm_gpu_chunk_get_size(chunk),
chunk_size);

UVM_ASSERT(chunk->type == type);
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_FREE);
UVM_ASSERT(!chunk_is_in_eviction(pmm, chunk));
@ -2756,7 +2759,7 @@ static bool uvm_pmm_should_inject_pma_eviction_error(uvm_pmm_gpu_t *pmm)
// See the documentation of pmaEvictPagesCb_t in pma.h for details of the
// expected semantics.
static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
NvU32 page_size,
NvU64 page_size,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,
@ -2861,7 +2864,7 @@ error:
}

static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
NvU32 page_size,
NvU64 page_size,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Copyright (c) 2015-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -65,30 +65,30 @@

typedef enum
{
UVM_CHUNK_SIZE_1 = 1ULL,
UVM_CHUNK_SIZE_2 = 2ULL,
UVM_CHUNK_SIZE_4 = 4ULL,
UVM_CHUNK_SIZE_8 = 8ULL,
UVM_CHUNK_SIZE_16 = 16ULL,
UVM_CHUNK_SIZE_32 = 32ULL,
UVM_CHUNK_SIZE_64 = 64ULL,
UVM_CHUNK_SIZE_128 = 128ULL,
UVM_CHUNK_SIZE_256 = 256ULL,
UVM_CHUNK_SIZE_512 = 512ULL,
UVM_CHUNK_SIZE_1K = 1024ULL,
UVM_CHUNK_SIZE_2K = 2*1024ULL,
UVM_CHUNK_SIZE_4K = 4*1024ULL,
UVM_CHUNK_SIZE_8K = 8*1024ULL,
UVM_CHUNK_SIZE_16K = 16*1024ULL,
UVM_CHUNK_SIZE_32K = 32*1024ULL,
UVM_CHUNK_SIZE_64K = 64*1024ULL,
UVM_CHUNK_SIZE_128K = 128*1024ULL,
UVM_CHUNK_SIZE_256K = 256*1024ULL,
UVM_CHUNK_SIZE_512K = 512*1024ULL,
UVM_CHUNK_SIZE_1M = 1024*1024ULL,
UVM_CHUNK_SIZE_2M = 2*1024*1024ULL,
UVM_CHUNK_SIZE_1 = 1,
UVM_CHUNK_SIZE_2 = 2,
UVM_CHUNK_SIZE_4 = 4,
UVM_CHUNK_SIZE_8 = 8,
UVM_CHUNK_SIZE_16 = 16,
UVM_CHUNK_SIZE_32 = 32,
UVM_CHUNK_SIZE_64 = 64,
UVM_CHUNK_SIZE_128 = 128,
UVM_CHUNK_SIZE_256 = 256,
UVM_CHUNK_SIZE_512 = 512,
UVM_CHUNK_SIZE_1K = 1024,
UVM_CHUNK_SIZE_2K = 2*1024,
UVM_CHUNK_SIZE_4K = 4*1024,
UVM_CHUNK_SIZE_8K = 8*1024,
UVM_CHUNK_SIZE_16K = 16*1024,
UVM_CHUNK_SIZE_32K = 32*1024,
UVM_CHUNK_SIZE_64K = 64*1024,
UVM_CHUNK_SIZE_128K = 128*1024,
UVM_CHUNK_SIZE_256K = 256*1024,
UVM_CHUNK_SIZE_512K = 512*1024,
UVM_CHUNK_SIZE_1M = 1024*1024,
UVM_CHUNK_SIZE_2M = 2*1024*1024,
UVM_CHUNK_SIZE_MAX = UVM_CHUNK_SIZE_2M,
UVM_CHUNK_SIZE_INVALID = UVM_CHUNK_SIZE_MAX * 2ULL
UVM_CHUNK_SIZE_INVALID = UVM_CHUNK_SIZE_MAX * 2
} uvm_chunk_size_t;

#define UVM_CHUNK_SIZES_MASK (uvm_chunk_sizes_mask_t)(UVM_CHUNK_SIZE_MAX | (UVM_CHUNK_SIZE_MAX-1))

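A small standalone check of the mask defined above: UVM_CHUNK_SIZE_MAX | (UVM_CHUNK_SIZE_MAX - 1) sets every bit at or below the 2M bit, so it admits exactly the 22 power-of-two chunk sizes enumerated in uvm_chunk_size_t (stand-alone arithmetic, not driver code):

// Illustrative only: evaluate the chunk-size mask and count the sizes it covers.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t chunk_size_max = 2ULL * 1024 * 1024;            // 2M, like UVM_CHUNK_SIZE_MAX
    uint64_t mask = chunk_size_max | (chunk_size_max - 1);   // 0x3fffff: every power of two <= 2M

    unsigned count = 0;
    for (unsigned bit = 0; bit < 64; bit++) {
        if (mask & (1ULL << bit))
            count++;
    }

    printf("mask=0x%llx, power-of-two sizes covered=%u\n",
           (unsigned long long)mask, count);                 // 0x3fffff, 22
    return 0;
}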
@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2023 NVIDIA Corporation
Copyright (c) 2017-2024 NVIDIA Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -43,7 +43,7 @@ NV_STATUS uvm_pmm_sysmem_init(void)
// Ensure that only supported CPU chunk sizes are enabled.
uvm_cpu_chunk_allocation_sizes &= UVM_CPU_CHUNK_SIZES;
if (!uvm_cpu_chunk_allocation_sizes || !(uvm_cpu_chunk_allocation_sizes & PAGE_SIZE)) {
pr_info("Invalid value for uvm_cpu_chunk_allocation_sizes = 0x%x, using 0x%lx instead\n",
pr_info("Invalid value for uvm_cpu_chunk_allocation_sizes = 0x%x, using 0x%llx instead\n",
uvm_cpu_chunk_allocation_sizes,
UVM_CPU_CHUNK_SIZES);
uvm_cpu_chunk_allocation_sizes = UVM_CPU_CHUNK_SIZES;
@ -126,7 +126,7 @@ NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sys
NvU64 remove_key;

for (remove_key = base_key; remove_key < key; ++remove_key)
(void)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);
(void *)radix_tree_delete(&sysmem_mappings->reverse_map_tree, remove_key);

kmem_cache_free(g_reverse_page_map_cache, new_reverse_map);
status = errno_to_nv_status(ret);
@ -461,69 +461,12 @@ static NvU32 compute_gpu_mappings_entry_index(uvm_parent_processor_mask_t *dma_a
return uvm_parent_processor_mask_get_gpu_count(&subset_mask);
}

static void cpu_chunk_release(nv_kref_t *kref)
{
uvm_cpu_chunk_t *chunk = container_of(kref, uvm_cpu_chunk_t, refcount);
uvm_parent_processor_mask_t *mapping_mask;
uvm_parent_processor_id_t id;
uvm_cpu_physical_chunk_t *phys_chunk = NULL;
uvm_cpu_logical_chunk_t *logical_chunk = NULL;

if (uvm_cpu_chunk_is_physical(chunk)) {
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
uvm_assert_mutex_unlocked(&phys_chunk->lock);
mapping_mask = &phys_chunk->gpu_mappings.dma_addrs_mask;
}
else {
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
mapping_mask = &logical_chunk->mapped_gpus;
}

for_each_parent_id_in_mask(id, mapping_mask) {
uvm_parent_gpu_t *parent_gpu = uvm_parent_gpu_get(id);
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, parent_gpu);
}

if (uvm_cpu_chunk_is_physical(chunk)) {
if (phys_chunk->gpu_mappings.max_entries > 1)
uvm_kvfree(phys_chunk->gpu_mappings.dynamic_entries);

if (uvm_cpu_chunk_get_size(chunk) > PAGE_SIZE &&
!bitmap_empty(phys_chunk->dirty_bitmap, uvm_cpu_chunk_num_pages(chunk)))
SetPageDirty(phys_chunk->common.page);

uvm_kvfree(phys_chunk->dirty_bitmap);

if (chunk->type != UVM_CPU_CHUNK_TYPE_HMM)
put_page(phys_chunk->common.page);
}
else {
uvm_cpu_chunk_free(logical_chunk->parent);
}

uvm_kvfree(chunk);
}

static void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
nv_kref_get(&chunk->refcount);
}

void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk)
{
if (!chunk)
return;

nv_kref_put(&chunk->refcount, cpu_chunk_release);
}

static uvm_cpu_physical_chunk_t *get_physical_parent(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);

while (!uvm_cpu_chunk_is_physical(chunk))
while (uvm_cpu_chunk_is_logical(chunk))
chunk = uvm_cpu_chunk_to_logical(chunk)->parent;

return uvm_cpu_chunk_to_physical(chunk);
@ -581,6 +524,7 @@ static uvm_cpu_phys_mapping_t *chunk_phys_mapping_alloc(uvm_cpu_physical_chunk_t
static uvm_cpu_phys_mapping_t *chunk_phys_mapping_get(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gpu_id_t id)
{
uvm_assert_mutex_locked(&chunk->lock);

if (uvm_parent_processor_mask_test(&chunk->gpu_mappings.dma_addrs_mask, id)) {
if (chunk->gpu_mappings.max_entries == 1) {
return &chunk->gpu_mappings.static_entry;
@ -598,7 +542,6 @@ static void chunk_inc_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
{
uvm_cpu_phys_mapping_t *mapping;

uvm_assert_mutex_locked(&chunk->lock);
mapping = chunk_phys_mapping_get(chunk, id);
UVM_ASSERT(mapping);
mapping->map_count++;
@ -608,7 +551,6 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
{
uvm_cpu_phys_mapping_t *mapping;

uvm_assert_mutex_locked(&chunk->lock);
mapping = chunk_phys_mapping_get(chunk, id);
UVM_ASSERT(mapping);
UVM_ASSERT(mapping->dma_addr && mapping->map_count);
@ -616,6 +558,8 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
if (mapping->map_count == 0) {
uvm_parent_gpu_t *parent_gpu = uvm_parent_gpu_get(id);

UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));

uvm_parent_gpu_unmap_cpu_pages(parent_gpu, mapping->dma_addr, uvm_cpu_chunk_get_size(&chunk->common));
mapping->dma_addr = 0;
if (chunk->gpu_mappings.max_entries > 1) {
@ -631,7 +575,7 @@ static void chunk_dec_gpu_mapping(uvm_cpu_physical_chunk_t *chunk, uvm_parent_gp
}
}

NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
uvm_cpu_phys_mapping_t *mapping;
@ -641,36 +585,41 @@ NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
uvm_cpu_logical_chunk_t *logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
|
||||
if (!uvm_parent_processor_mask_test(&logical_chunk->mapped_gpus, parent_gpu->id))
|
||||
if (!uvm_processor_mask_test(&logical_chunk->mapped_gpus, gpu->id))
|
||||
return 0;
|
||||
|
||||
parent_offset = cpu_chunk_get_phys_index(logical_chunk);
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
mapping = chunk_phys_mapping_get(phys_chunk, parent_gpu->id);
|
||||
if (mapping)
|
||||
mapping = chunk_phys_mapping_get(phys_chunk, gpu->parent->id);
|
||||
if (mapping &&
|
||||
(uvm_cpu_chunk_is_logical(chunk) ||
|
||||
uvm_sub_processor_mask_test(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id))))
|
||||
dma_addr = mapping->dma_addr + (parent_offset * PAGE_SIZE);
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
return dma_addr;
|
||||
}
|
||||
|
||||
// Create a DMA mapping for the chunk on the given parent GPU. This will map the
|
||||
// entire parent physical chunk on the GPU.
|
||||
// Create a DMA mapping for the chunk on the given GPU. This will map the
|
||||
// entire physical chunk on the parent GPU and record that a given MIG
|
||||
// partition is using the mapping.
|
||||
//
|
||||
// Returns NV_OK on success. On error, any of the errors returned by
|
||||
// uvm_parent_gpu_map_cpu_pages() can be returned. In the case that the DMA
|
||||
// mapping structure could not be allocated, NV_ERR_NO_MEMORY is returned.
|
||||
static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
|
||||
static NV_STATUS cpu_chunk_map_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_parent_gpu_t *parent_gpu = gpu->parent;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
|
||||
uvm_cpu_phys_mapping_t *mapping;
|
||||
NV_STATUS status = NV_OK;
|
||||
|
||||
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
if (uvm_parent_processor_mask_test(&logical_chunk->mapped_gpus, parent_gpu->id))
|
||||
if (uvm_processor_mask_test(&logical_chunk->mapped_gpus, gpu->id))
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -679,7 +628,6 @@ static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_paren
|
||||
|
||||
if (!uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id)) {
|
||||
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(&phys_chunk->common);
|
||||
uvm_cpu_phys_mapping_t *mapping;
|
||||
NvU64 dma_addr;
|
||||
|
||||
status = uvm_parent_gpu_map_cpu_pages(parent_gpu, phys_chunk->common.page, chunk_size, &dma_addr);
|
||||
@ -695,39 +643,59 @@ static NV_STATUS cpu_chunk_map_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_paren
|
||||
|
||||
mapping->dma_addr = dma_addr;
|
||||
mapping->map_count = 1;
|
||||
uvm_sub_processor_mask_zero(&mapping->sub_processors);
|
||||
if (!logical_chunk)
|
||||
uvm_sub_processor_mask_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id));
|
||||
|
||||
uvm_parent_processor_mask_set(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id);
|
||||
}
|
||||
else {
|
||||
// The mapping count on the physical chunk is only increased when
|
||||
// mapping logical chunks.
|
||||
if (uvm_cpu_chunk_is_logical(chunk))
|
||||
chunk_inc_gpu_mapping(phys_chunk, parent_gpu->id);
|
||||
mapping = chunk_phys_mapping_get(phys_chunk, parent_gpu->id);
|
||||
UVM_ASSERT(mapping);
|
||||
|
||||
// Increment the map_count for logical chunks or the first time a
|
||||
// MIG partition is sharing a physical chunk.
|
||||
if (logical_chunk ||
|
||||
!uvm_sub_processor_mask_test_and_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu->id)))
|
||||
mapping->map_count++;
|
||||
}
|
||||
|
||||
if (logical_chunk) {
|
||||
uvm_processor_mask_set(&logical_chunk->mapped_gpus, gpu->id);
|
||||
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
}
|
||||
else {
|
||||
UVM_ASSERT(!uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
UVM_ASSERT(uvm_sub_processor_mask_get_count(&mapping->sub_processors) == mapping->map_count);
|
||||
}
|
||||
|
||||
done:
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
if (status == NV_OK && uvm_cpu_chunk_is_logical(chunk))
|
||||
uvm_parent_processor_mask_set(&logical_chunk->mapped_gpus, parent_gpu->id);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_cpu_chunk_unmap_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu)
|
||||
static void cpu_chunk_unmap_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_gpu_id_t gpu_id)
|
||||
{
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
uvm_cpu_logical_chunk_t *logical_chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
|
||||
uvm_parent_gpu_id_t id = uvm_parent_gpu_id_from_gpu_id(gpu_id);
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
if (!uvm_parent_processor_mask_test_and_clear(&logical_chunk->mapped_gpus, parent_gpu->id))
|
||||
return;
|
||||
}
|
||||
uvm_processor_mask_t *mapping_mask = &uvm_cpu_chunk_to_logical(chunk)->mapped_gpus;
|
||||
|
||||
phys_chunk = get_physical_parent(chunk);
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
if (uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, parent_gpu->id))
|
||||
chunk_dec_gpu_mapping(phys_chunk, parent_gpu->id);
|
||||
if (uvm_processor_mask_test_and_clear(mapping_mask, gpu_id))
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
}
|
||||
else {
|
||||
if (uvm_parent_processor_mask_test(&phys_chunk->gpu_mappings.dma_addrs_mask, id)) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, id);
|
||||
|
||||
if (uvm_sub_processor_mask_test_and_clear(&mapping->sub_processors, uvm_id_sub_processor_index(gpu_id)))
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
}
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
}
|
||||
@ -737,17 +705,112 @@ NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
NV_STATUS status;
|
||||
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
|
||||
|
||||
status = cpu_chunk_map_parent_gpu_phys(chunk, gpu->parent);
|
||||
status = cpu_chunk_map_gpu_phys(chunk, gpu);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
status = uvm_mmu_sysmem_map(gpu, uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent), chunk_size);
|
||||
status = uvm_mmu_sysmem_map(gpu, uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu), chunk_size);
|
||||
if (status != NV_OK)
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
cpu_chunk_unmap_gpu_phys(chunk, gpu->id);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void uvm_cpu_chunk_unmap_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
cpu_chunk_unmap_gpu_phys(chunk, gpu->id);
|
||||
|
||||
// Note: there is no corresponding uvm_mmu_sysmem_unmap() for
|
||||
// uvm_mmu_sysmem_map().
|
||||
}
|
||||
|
||||
static void cpu_logical_chunk_release(uvm_cpu_logical_chunk_t *logical_chunk)
|
||||
{
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(logical_chunk->parent);
|
||||
uvm_processor_id_t gpu_id;
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus)
|
||||
chunk_dec_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
uvm_cpu_chunk_free(logical_chunk->parent);
|
||||
}
|
||||
|
||||
static void cpu_physical_chunk_release(uvm_cpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = uvm_cpu_chunk_to_physical(chunk);
|
||||
uvm_parent_processor_id_t id;
|
||||
|
||||
uvm_assert_mutex_unlocked(&phys_chunk->lock);
|
||||
|
||||
// There should be no other threads using this chunk but we lock it because
|
||||
// of assertions in chunk_phys_mapping_get() and chunk_dec_gpu_mapping().
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
for_each_parent_id_in_mask(id, &phys_chunk->gpu_mappings.dma_addrs_mask) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, id);
|
||||
NvU32 count;
|
||||
|
||||
UVM_ASSERT(mapping);
|
||||
UVM_ASSERT(!uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
|
||||
// Get a count of set bits in the sub_processors mask then clear it so
|
||||
// that chunk_dec_gpu_mapping() sees an empty mask when map_count == 0.
|
||||
// Using for_each_sub_processor_in_mask could try to dereference
|
||||
// mapping after map_count == 0 in the loop below.
|
||||
count = uvm_sub_processor_mask_get_count(&mapping->sub_processors);
|
||||
uvm_sub_processor_mask_zero(&mapping->sub_processors);
|
||||
|
||||
for (; count; count--)
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
UVM_ASSERT(uvm_parent_processor_mask_empty(&phys_chunk->gpu_mappings.dma_addrs_mask));
|
||||
|
||||
if (phys_chunk->gpu_mappings.max_entries > 1)
|
||||
uvm_kvfree(phys_chunk->gpu_mappings.dynamic_entries);
|
||||
|
||||
if (uvm_cpu_chunk_get_size(chunk) > PAGE_SIZE &&
|
||||
!bitmap_empty(phys_chunk->dirty_bitmap, uvm_cpu_chunk_num_pages(chunk)))
|
||||
SetPageDirty(chunk->page);
|
||||
|
||||
uvm_kvfree(phys_chunk->dirty_bitmap);
|
||||
|
||||
if (chunk->type != UVM_CPU_CHUNK_TYPE_HMM)
|
||||
put_page(chunk->page);
|
||||
}
|
||||
|
||||
static void cpu_chunk_release(nv_kref_t *kref)
{
    uvm_cpu_chunk_t *chunk = container_of(kref, uvm_cpu_chunk_t, refcount);

    if (uvm_cpu_chunk_is_logical(chunk))
        cpu_logical_chunk_release(uvm_cpu_chunk_to_logical(chunk));
    else
        cpu_physical_chunk_release(chunk);

    uvm_kvfree(chunk);
}

static void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(chunk);
    nv_kref_get(&chunk->refcount);
}

void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk)
{
    if (!chunk)
        return;

    nv_kref_put(&chunk->refcount, cpu_chunk_release);
}

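// Aside (minimal sketch, not driver code): nv_kref_t wraps the same pattern as
// the upstream kernel's struct kref used below, where the release callback runs
// exactly once when the last reference is dropped. The "demo_chunk" object is
// hypothetical.
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_chunk {
    struct kref refcount;
};

static void demo_chunk_release(struct kref *kref)
{
    // Reached only when the final reference goes away.
    struct demo_chunk *chunk = container_of(kref, struct demo_chunk, refcount);

    kfree(chunk);
}

static void demo_chunk_get(struct demo_chunk *chunk)
{
    kref_get(&chunk->refcount);
}

static void demo_chunk_put(struct demo_chunk *chunk)
{
    if (chunk)
        kref_put(&chunk->refcount, demo_chunk_release);
}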
static struct page *uvm_cpu_chunk_alloc_page(uvm_chunk_size_t alloc_size,
|
||||
int nid,
|
||||
uvm_cpu_chunk_alloc_flags_t alloc_flags)
|
||||
@ -876,14 +939,37 @@ int uvm_cpu_chunk_get_numa_node(uvm_cpu_chunk_t *chunk)
|
||||
return page_to_nid(chunk->page);
|
||||
}
|
||||
|
||||
// Convert the mask of DMA mapped parent GPUs and the sub-processor mask into
|
||||
// one uvm_processor_mask_t in 'dma_map_mask'.
|
||||
static void get_dma_map_mask(uvm_cpu_physical_chunk_t *chunk, uvm_processor_mask_t *dma_map_mask)
|
||||
{
|
||||
uvm_parent_processor_id_t id;
|
||||
NvU32 sub_index;
|
||||
|
||||
uvm_assert_mutex_locked(&chunk->lock);
|
||||
|
||||
for_each_parent_id_in_mask(id, &chunk->gpu_mappings.dma_addrs_mask) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(chunk, id);
|
||||
|
||||
for_each_sub_processor_index_in_mask(sub_index, &mapping->sub_processors) {
|
||||
uvm_processor_id_t gpu_id = uvm_gpu_id_from_sub_processor(id, sub_index);
|
||||
|
||||
uvm_sub_processor_mask_clear(&mapping->sub_processors, sub_index);
|
||||
uvm_processor_mask_set(dma_map_mask, gpu_id);
|
||||
}
|
||||
|
||||
UVM_ASSERT(uvm_sub_processor_mask_empty(&mapping->sub_processors));
|
||||
}
|
||||
}
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_cpu_logical_chunk_t *new_chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk = get_physical_parent(chunk);
|
||||
uvm_cpu_logical_chunk_t *logical_chunk = NULL;
|
||||
uvm_parent_processor_id_t id;
|
||||
uvm_parent_processor_mask_t *dma_map_mask;
|
||||
uvm_processor_id_t gpu_id;
|
||||
uvm_processor_mask_t *dma_map_mask = NULL;
|
||||
uvm_chunk_size_t new_size;
|
||||
size_t num_new_chunks;
|
||||
size_t num_subchunk_pages;
|
||||
@ -902,21 +988,20 @@ NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chun
|
||||
|
||||
// Get the largest size below the size of the input chunk.
|
||||
new_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), uvm_cpu_chunk_get_size(chunk));
|
||||
UVM_ASSERT(new_size);
|
||||
UVM_ASSERT(new_size != UVM_CHUNK_SIZE_INVALID);
|
||||
num_new_chunks = uvm_cpu_chunk_get_size(chunk) / new_size;
|
||||
num_subchunk_pages = new_size / PAGE_SIZE;
|
||||
|
||||
if (uvm_cpu_chunk_is_physical(chunk)) {
|
||||
dma_map_mask = &phys_chunk->gpu_mappings.dma_addrs_mask;
|
||||
}
|
||||
else {
|
||||
if (uvm_cpu_chunk_is_logical(chunk)) {
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunk);
|
||||
dma_map_mask = &logical_chunk->mapped_gpus;
|
||||
}
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
|
||||
for (i = 0; i < num_new_chunks; i++) {
|
||||
new_chunk = uvm_kvmalloc_zero(sizeof(*logical_chunk));
|
||||
new_chunk = uvm_kvmalloc_zero(sizeof(*new_chunk));
|
||||
if (!new_chunk) {
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
status = NV_ERR_NO_MEMORY;
|
||||
@ -929,19 +1014,25 @@ NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chun
|
||||
nv_kref_init(&new_chunk->common.refcount);
|
||||
new_chunk->parent = chunk;
|
||||
uvm_cpu_chunk_get(new_chunk->parent);
|
||||
for_each_parent_id_in_mask(id, dma_map_mask)
|
||||
chunk_inc_gpu_mapping(phys_chunk, id);
|
||||
uvm_parent_processor_mask_copy(&new_chunk->mapped_gpus, dma_map_mask);
|
||||
if (i == 0 && !logical_chunk) {
|
||||
dma_map_mask = &new_chunk->mapped_gpus;
|
||||
get_dma_map_mask(phys_chunk, dma_map_mask);
|
||||
}
|
||||
else {
|
||||
uvm_processor_mask_copy(&new_chunk->mapped_gpus, dma_map_mask);
|
||||
}
|
||||
for_each_id_in_mask(gpu_id, dma_map_mask)
|
||||
chunk_inc_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
new_chunks[i] = &new_chunk->common;
|
||||
}
|
||||
|
||||
// Release the references that are held by the chunk being split.
|
||||
for_each_parent_id_in_mask(id, dma_map_mask)
|
||||
chunk_dec_gpu_mapping(phys_chunk, id);
|
||||
for_each_id_in_mask(gpu_id, dma_map_mask)
|
||||
chunk_dec_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
// If the chunk being split is a logical chunk, clear its mapped_gpus mask.
|
||||
if (uvm_cpu_chunk_is_logical(chunk))
|
||||
uvm_parent_processor_mask_zero(&logical_chunk->mapped_gpus);
|
||||
if (logical_chunk)
|
||||
uvm_processor_mask_zero(&logical_chunk->mapped_gpus);
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
@ -963,7 +1054,7 @@ static bool verify_merging_chunks(uvm_cpu_chunk_t **chunks, size_t num_chunks)
|
||||
{
|
||||
uvm_cpu_logical_chunk_t *logical_chunk;
|
||||
uvm_cpu_chunk_t *first_chunk_parent;
|
||||
uvm_parent_processor_mask_t *first_chunk_mapped_gpus;
|
||||
uvm_processor_mask_t *first_chunk_mapped_gpus;
|
||||
uvm_chunk_size_t first_chunk_size;
|
||||
size_t i;
|
||||
|
||||
@ -994,7 +1085,7 @@ static bool verify_merging_chunks(uvm_cpu_chunk_t **chunks, size_t num_chunks)
|
||||
// 2.1 All mappings to GPUs in each of child chunks' masks that are
|
||||
// not also present in the parent chunk's mask are destroyed.
|
||||
// 2.2 mapped_gpus mask of the parent chunk remains unmodified.
|
||||
UVM_ASSERT(uvm_parent_processor_mask_equal(&logical_chunk->mapped_gpus, first_chunk_mapped_gpus));
|
||||
UVM_ASSERT(uvm_processor_mask_equal(&logical_chunk->mapped_gpus, first_chunk_mapped_gpus));
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -1005,14 +1096,14 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks)
|
||||
uvm_cpu_chunk_t *parent;
|
||||
uvm_cpu_logical_chunk_t *logical_chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
uvm_parent_processor_id_t id;
|
||||
uvm_processor_id_t gpu_id;
|
||||
uvm_chunk_size_t chunk_size;
|
||||
uvm_chunk_size_t parent_chunk_size;
|
||||
size_t num_merge_chunks;
|
||||
size_t i;
|
||||
|
||||
UVM_ASSERT(chunks);
|
||||
UVM_ASSERT(!uvm_cpu_chunk_is_physical(chunks[0]));
|
||||
UVM_ASSERT(uvm_cpu_chunk_is_logical(chunks[0]));
|
||||
|
||||
logical_chunk = uvm_cpu_chunk_to_logical(chunks[0]);
|
||||
parent = logical_chunk->parent;
|
||||
@ -1033,11 +1124,22 @@ uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks)
|
||||
phys_chunk = get_physical_parent(chunks[0]);
|
||||
|
||||
uvm_mutex_lock(&phys_chunk->lock);
|
||||
for_each_parent_id_in_mask(id, &logical_chunk->mapped_gpus)
|
||||
chunk_inc_gpu_mapping(phys_chunk, id);
|
||||
|
||||
if (!uvm_cpu_chunk_is_physical(parent))
|
||||
uvm_parent_processor_mask_copy(&uvm_cpu_chunk_to_logical(parent)->mapped_gpus, &logical_chunk->mapped_gpus);
|
||||
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus)
|
||||
chunk_inc_gpu_mapping(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
if (uvm_cpu_chunk_is_logical(parent)) {
|
||||
uvm_processor_mask_copy(&uvm_cpu_chunk_to_logical(parent)->mapped_gpus, &logical_chunk->mapped_gpus);
|
||||
}
|
||||
else {
|
||||
// Restore the mapping->sub_processors mask for each mapped GPU.
|
||||
for_each_id_in_mask(gpu_id, &logical_chunk->mapped_gpus) {
|
||||
uvm_cpu_phys_mapping_t *mapping = chunk_phys_mapping_get(phys_chunk, uvm_parent_gpu_id_from_gpu_id(gpu_id));
|
||||
|
||||
UVM_ASSERT(mapping);
|
||||
uvm_sub_processor_mask_set(&mapping->sub_processors, uvm_id_sub_processor_index(gpu_id));
|
||||
}
|
||||
}
|
||||
|
||||
uvm_mutex_unlock(&phys_chunk->lock);
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2017-2023 NVIDIA Corporation
|
||||
Copyright (c) 2017-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -246,8 +246,19 @@ struct uvm_cpu_chunk_struct

typedef struct
{
    // Physical GPU DMA address of the CPU chunk.
    NvU64 dma_addr;

    // Reference count of all sub_processors using this mapping across logical
    // and physical chunks.
    NvU32 map_count;

    // Mask of MIG instances or physical GPU.
    // This is only valid for physical CPU chunks that have not been split into
    // logical chunks. When the chunk is split, all the
    // uvm_cpu_logical_chunk_t::mapped_gpus masks have a bit set for each
    // count in map_count and sub_processors is set to zero.
    uvm_sub_processor_mask_t sub_processors;
} uvm_cpu_phys_mapping_t;

typedef struct
@ -304,7 +315,9 @@ typedef struct

    // Pointer to the parent chunk (which could also be a logical chunk).
    uvm_cpu_chunk_t *parent;
    uvm_parent_processor_mask_t mapped_gpus;

    // This is a reference per bit but also recorded in mapping->map_count.
    uvm_processor_mask_t mapped_gpus;
} uvm_cpu_logical_chunk_t;

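// Aside (toy model, not driver code): while a physical chunk is unsplit, each
// MIG instance that maps it sets one bit in sub_processors and adds one
// reference to map_count, so the bit count and map_count stay equal (the same
// invariant cpu_chunk_map_gpu_phys() asserts). Names here are hypothetical.
#include <assert.h>
#include <stdint.h>

struct toy_phys_mapping {
    uint32_t map_count;
    uint8_t sub_processors; // one bit per MIG instance of the parent GPU
};

static void toy_map_mig_instance(struct toy_phys_mapping *m, unsigned int sub_index)
{
    if (!(m->sub_processors & (1u << sub_index))) {
        m->sub_processors |= 1u << sub_index;
        m->map_count++;
    }

    assert(m->map_count == (uint32_t)__builtin_popcount(m->sub_processors));
}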
// Return the set of allowed CPU chunk allocation sizes.
|
||||
@ -417,15 +430,15 @@ void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk);
|
||||
// For more details see uvm_mmu_sysmem_map().
|
||||
NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
|
||||
|
||||
// Destroy a CPU chunk's DMA mapping for the parent GPU.
|
||||
// Destroy a CPU chunk's DMA mapping for the given GPU.
|
||||
// If chunk is a logical chunk, this call may not necessarily destroy the DMA
|
||||
// mapping of the parent physical chunk since all logical chunks share the
|
||||
// parent's DMA mapping.
|
||||
void uvm_cpu_chunk_unmap_parent_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
|
||||
// mapping of the parent physical chunk since all logical chunks and MIG
|
||||
// partitions share the parent's DMA mapping.
|
||||
void uvm_cpu_chunk_unmap_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
|
||||
|
||||
// Get the CPU chunk's DMA mapping address for the specified GPU ID.
|
||||
// If there is no mapping for the GPU, 0 is returned.
|
||||
NvU64 uvm_cpu_chunk_get_parent_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
|
||||
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
|
||||
|
||||
// Split a CPU chunk into a set of CPU chunks of the next size down from the set
|
||||
// of enabled CPU chunk sizes.
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2017-2023 NVIDIA Corporation
|
||||
Copyright (c) 2017-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -626,7 +626,7 @@ static NV_STATUS test_cpu_chunk_mapping_access(uvm_cpu_chunk_t *chunk, uvm_gpu_t
|
||||
TEST_NV_CHECK_RET(cpu_chunk_map_on_cpu(chunk, (void **)&cpu_addr));
|
||||
memset(cpu_addr, 0, chunk_size);
|
||||
|
||||
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
gpu_addr = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr));
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
|
||||
@ -733,21 +733,21 @@ static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
|
||||
// - no GPU mapping address.
|
||||
TEST_CHECK_GOTO(phys_chunk->gpu_mappings.max_entries == 1, done);
|
||||
TEST_CHECK_GOTO(uvm_parent_processor_mask_get_gpu_count(&phys_chunk->gpu_mappings.dma_addrs_mask) == 0, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0, done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
|
||||
|
||||
// Test basic access.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
|
||||
|
||||
// Test double map is harmless.
|
||||
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == dma_addr, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == dma_addr, done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
|
||||
|
||||
// Test unmap, remap.
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0, done);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0, done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
|
||||
|
||||
@ -768,6 +768,39 @@ static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_allo
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// TODO: Bug 4351121: This won't actually test anything until uvm_test
|
||||
// enumerates multiple MIG instances.
|
||||
static NV_STATUS test_cpu_chunk_mig(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_cpu_chunk_t *chunk;
|
||||
uvm_cpu_physical_chunk_t *phys_chunk;
|
||||
NvU64 dma_addr_gpu0;
|
||||
|
||||
UVM_ASSERT(gpu0->parent == gpu1->parent);
|
||||
|
||||
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
|
||||
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
|
||||
// MIG instances in the same physical GPU share the same DMA addresses.
|
||||
dma_addr_gpu0 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu0);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1) == dma_addr_gpu0, done);
|
||||
|
||||
// Unmapping one GPU shouldn't affect the other.
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu0);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu0) == 0, done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
|
||||
done:
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, uvm_gpu_t *gpu2)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
@ -783,8 +816,8 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1,
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
|
||||
dma_addr_gpu1 = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu1->parent);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu2->parent);
|
||||
dma_addr_gpu1 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu2);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done);
|
||||
@ -798,7 +831,9 @@ static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1,
|
||||
// GPU1. It's true that we may get a false negative if both addresses
|
||||
// happened to alias and we had a bug in how the addresses are shifted in
|
||||
// the dense array, but that's better than intermittent failure.
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu1->parent) == dma_addr_gpu1, done);
|
||||
// Also note that multiple MIG instances in the same physical GPU share the
|
||||
// parent's physical DMA mapping.
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu1) == dma_addr_gpu1, done);
|
||||
|
||||
done:
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
@ -828,7 +863,7 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done_free);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
|
||||
@ -845,13 +880,14 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
|
||||
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == 1, done_free);
|
||||
|
||||
// Since all logical chunks were mapped, the entire merged chunk should
|
||||
// be accessible without needing to map it.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu), done_free);
|
||||
|
||||
// Test that GPU mappings are transferred after a split
|
||||
phys_dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
phys_dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
|
||||
@ -859,9 +895,9 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
NvU64 dma_addr;
|
||||
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu), done);
|
||||
dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[i], gpu->parent);
|
||||
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu);
|
||||
TEST_CHECK_GOTO(dma_addr == phys_dma_addr + (i * split_size), done);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(split_chunks[i], gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu);
|
||||
}
|
||||
|
||||
// Test that mapping one logical chunk does not affect others.
|
||||
@ -871,7 +907,7 @@ static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_g
|
||||
|
||||
for (i = 0; i < num_split_chunks; i++) {
|
||||
if (i != map_chunk)
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[i], gpu->parent) == 0, done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu) == 0, done);
|
||||
}
|
||||
|
||||
if (split_size > PAGE_SIZE) {
|
||||
@ -927,6 +963,118 @@ static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS do_test_cpu_chunk_split_and_merge_2(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
|
||||
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
|
||||
size_t num_split_chunks;
|
||||
uvm_cpu_chunk_t **split_chunks;
|
||||
uvm_cpu_chunk_t *merged_chunk;
|
||||
uvm_chunk_size_t split_size;
|
||||
size_t i;
|
||||
|
||||
split_size = uvm_chunk_find_prev_size(alloc_sizes, size);
|
||||
UVM_ASSERT(split_size != UVM_CHUNK_SIZE_INVALID);
|
||||
num_split_chunks = size / split_size;
|
||||
split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
|
||||
|
||||
if (!split_chunks)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
// Map both GPUs.
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu0), done_free);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu0), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done_free);
|
||||
|
||||
// Then split.
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
|
||||
|
||||
// Unmap gpu0 from all split chunks.
|
||||
for (i = 0; i < num_split_chunks; i++) {
|
||||
TEST_CHECK_GOTO(split_chunks[i], done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_is_logical(split_chunks[i]), done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[i]) == split_size, done);
|
||||
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu0);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu0) == 0, done);
|
||||
|
||||
// Test that gpu1 still has access.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu1), done);
|
||||
}
|
||||
|
||||
// Test CPU chunk merging.
|
||||
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
|
||||
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
|
||||
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == 1, done_free);
|
||||
|
||||
// Since all logical chunks were mapped, the entire merged chunk should
|
||||
// be accessible without needing to map it.
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(merged_chunk, gpu0) == 0, done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu1), done_free);
|
||||
|
||||
// Unmap gpu1 so we start with a fully unmapped physical chunk.
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu1);
|
||||
|
||||
// Split the physical chunk.
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
|
||||
|
||||
// Now map everything.
|
||||
for (i = 0; i < num_split_chunks; i++) {
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu0), done);
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu1), done);
|
||||
}
|
||||
|
||||
// Test CPU chunk merging with everything mapped.
|
||||
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
|
||||
|
||||
// At this point, all split chunks have been merged.
|
||||
num_split_chunks = 0;
|
||||
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
|
||||
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
|
||||
|
||||
// Since all logical chunks were mapped, the entire merged chunk should
|
||||
// be accessible without needing to map it.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu0), done_free);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu1), done_free);
|
||||
|
||||
done:
|
||||
for (i = 0; i < num_split_chunks; i++)
|
||||
uvm_cpu_chunk_free(split_chunks[i]);
|
||||
|
||||
done_free:
|
||||
uvm_kvfree(split_chunks);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_chunk_split_and_merge_2(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
|
||||
{
|
||||
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
|
||||
uvm_chunk_size_t size;
|
||||
|
||||
size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
|
||||
for_each_chunk_size_from(size, alloc_sizes) {
|
||||
uvm_cpu_chunk_t *chunk;
|
||||
NV_STATUS status;
|
||||
|
||||
// It is possible that the allocation fails due to lack of large pages
|
||||
// rather than an API issue, which will result in a false negative.
|
||||
// However, that should be very rare.
|
||||
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, NUMA_NO_NODE, &chunk));
|
||||
status = do_test_cpu_chunk_split_and_merge_2(chunk, gpu0, gpu1);
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
}
|
||||
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static NV_STATUS test_cpu_chunk_dirty_split(uvm_cpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
|
||||
@ -1072,7 +1220,9 @@ done:
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
|
||||
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk,
|
||||
uvm_va_space_t *va_space,
|
||||
const uvm_processor_mask_t *test_gpus)
|
||||
{
|
||||
NV_STATUS status = NV_OK;
|
||||
uvm_cpu_chunk_t **split_chunks;
|
||||
@ -1099,7 +1249,7 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
|
||||
chunk = NULL;
|
||||
|
||||
// Map every other chunk.
|
||||
// The call to uvm_cpu_chunk_unmap_parent_gpu_phys() is here in case this
|
||||
// The call to uvm_cpu_chunk_unmap_gpu() is here in case this
|
||||
// is part of a double split (see below). In that case, the parent chunk
|
||||
// would be either mapped or unmapped.
|
||||
//
|
||||
@ -1111,7 +1261,7 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
|
||||
if (i & (1 << uvm_id_gpu_index(gpu->id)))
|
||||
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu), done);
|
||||
else
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(split_chunks[i], gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(split_chunks[i], gpu);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1147,9 +1297,9 @@ NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_spac
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[j]) == split_size, done);
|
||||
for_each_va_space_gpu_in_mask(gpu, va_space, test_gpus) {
|
||||
if (j & (1 << uvm_id_gpu_index(gpu->id)))
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[j], gpu->parent), done);
|
||||
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu), done);
|
||||
else
|
||||
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_parent_gpu_phys_addr(split_chunks[j], gpu->parent), done);
|
||||
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu), done);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1168,7 +1318,8 @@ done_free:
|
||||
return status;
|
||||
}
|
||||
|
||||
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
|
||||
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space,
|
||||
const uvm_processor_mask_t *test_gpus)
|
||||
{
|
||||
uvm_cpu_chunk_t *chunk;
|
||||
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
|
||||
@ -1204,6 +1355,50 @@ static NV_STATUS test_cpu_chunk_numa_alloc(uvm_va_space_t *va_space)
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
static uvm_gpu_t *find_first_parent_gpu(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_va_space_t *va_space)
|
||||
{
|
||||
return uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
|
||||
}
|
||||
|
||||
static uvm_gpu_t *find_next_parent_gpu(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_va_space_t *va_space,
|
||||
uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_gpu_t *next_gpu = gpu;
|
||||
|
||||
while (next_gpu) {
|
||||
next_gpu = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, next_gpu);
|
||||
if (!next_gpu || next_gpu->parent != gpu->parent)
|
||||
break;
|
||||
}
|
||||
|
||||
return next_gpu;
|
||||
}
|
||||
|
||||
static void find_shared_gpu_pair(const uvm_processor_mask_t *test_gpus,
|
||||
uvm_va_space_t *va_space,
|
||||
uvm_gpu_t **out_gpu0,
|
||||
uvm_gpu_t **out_gpu1)
|
||||
{
|
||||
uvm_gpu_t *gpu0 = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
|
||||
uvm_gpu_t *gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
|
||||
|
||||
while (gpu1) {
|
||||
if (gpu0->parent == gpu1->parent) {
|
||||
*out_gpu0 = gpu0;
|
||||
*out_gpu1 = gpu1;
|
||||
return;
|
||||
}
|
||||
|
||||
gpu0 = gpu1;
|
||||
gpu1 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu0);
|
||||
}
|
||||
|
||||
*out_gpu0 = NULL;
|
||||
*out_gpu1 = NULL;
|
||||
}
|
||||
|
||||
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_space_get(filp);
|
||||
@ -1228,15 +1423,31 @@ NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct f
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, test_gpus), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_numa_alloc(va_space), done);
|
||||
|
||||
if (uvm_processor_mask_get_gpu_count(test_gpus) >= 3) {
|
||||
uvm_gpu_t *gpu2, *gpu3;
|
||||
if (uvm_processor_mask_get_gpu_count(test_gpus) >= 2) {
|
||||
uvm_gpu_t *gpu2, *gpu3 = NULL;
|
||||
|
||||
gpu = uvm_processor_mask_find_first_va_space_gpu(test_gpus, va_space);
|
||||
gpu2 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu);
|
||||
gpu3 = uvm_processor_mask_find_next_va_space_gpu(test_gpus, va_space, gpu2);
|
||||
// Look for a pair of GPUs that don't share a common parent.
|
||||
gpu = find_first_parent_gpu(test_gpus, va_space);
|
||||
gpu2 = find_next_parent_gpu(test_gpus, va_space, gpu);
|
||||
if (gpu2) {
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge_2(gpu, gpu2), done);
|
||||
|
||||
// Look for a third physical GPU.
|
||||
gpu3 = find_next_parent_gpu(test_gpus, va_space, gpu2);
|
||||
|
||||
if (gpu3)
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_array(gpu, gpu2, gpu3), done);
|
||||
}
|
||||
|
||||
// Look for a pair of GPUs that share a common parent.
|
||||
find_shared_gpu_pair(test_gpus, va_space, &gpu, &gpu2);
|
||||
if (gpu) {
|
||||
// Test MIG instances within the same parent GPU.
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge_2(gpu, gpu2), done);
|
||||
TEST_NV_CHECK_GOTO(test_cpu_chunk_mig(gpu, gpu2), done);
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
uvm_va_space_up_read(va_space);
|
||||
uvm_processor_mask_cache_free(test_gpus);
|
||||
|
@ -671,9 +671,6 @@ static NV_STATUS va_block_set_read_duplication_locked(uvm_va_block_t *va_block,
|
||||
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
|
||||
// Force CPU page residency to be on the preferred NUMA node.
|
||||
va_block_context->make_resident.dest_nid = uvm_va_range_get_policy(va_block->va_range)->preferred_nid;
|
||||
|
||||
for_each_id_in_mask(src_id, &va_block->resident) {
|
||||
NV_STATUS status;
|
||||
uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2023 NVIDIA Corporation
|
||||
Copyright (c) 2023-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -30,6 +30,8 @@ const uvm_processor_mask_t g_uvm_processor_mask_empty = { };
|
||||
|
||||
NV_STATUS uvm_processor_mask_cache_init(void)
|
||||
{
|
||||
BUILD_BUG_ON((8 * sizeof(((uvm_sub_processor_mask_t *)0)->bitmap)) < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
|
||||
|
||||
g_uvm_processor_mask_cache = NV_KMEM_CACHE_CREATE("uvm_processor_mask_t", uvm_processor_mask_t);
|
||||
if (!g_uvm_processor_mask_cache)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
@ -100,8 +102,16 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
|
||||
|
||||
bool uvm_numa_id_eq(int nid0, int nid1)
{
    UVM_ASSERT(nid0 >= NUMA_NO_NODE && nid0 < MAX_NUMNODES);
    UVM_ASSERT(nid1 >= NUMA_NO_NODE && nid1 < MAX_NUMNODES);
    UVM_ASSERT(nid0 == -1 || nid0 < MAX_NUMNODES);
    UVM_ASSERT(nid1 == -1 || nid1 < MAX_NUMNODES);

    if ((nid0 == NUMA_NO_NODE || nid1 == NUMA_NO_NODE) && nodes_weight(node_possible_map) == 1) {
        if (nid0 == NUMA_NO_NODE)
            nid0 = first_node(node_possible_map);

        if (nid1 == NUMA_NO_NODE)
            nid1 = first_node(node_possible_map);
    }

    return nid0 == nid1;
}
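// Aside (illustration only): on a single-node system NUMA_NO_NODE compares
// equal to the one possible node, while on multi-node systems it only matches
// itself. The check below is hypothetical.
static void numa_id_eq_example(void)
{
    if (nodes_weight(node_possible_map) == 1)
        UVM_ASSERT(uvm_numa_id_eq(NUMA_NO_NODE, first_node(node_possible_map)));

    UVM_ASSERT(uvm_numa_id_eq(0, 0));
}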
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2016-2023 NVIDIA Corporation
|
||||
Copyright (c) 2016-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -277,8 +277,6 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
|
||||
#define UVM_PARENT_ID_MAX_GPUS NV_MAX_DEVICES
|
||||
#define UVM_PARENT_ID_MAX_PROCESSORS (UVM_PARENT_ID_MAX_GPUS + 1)
|
||||
|
||||
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8
|
||||
|
||||
#define UVM_ID_MAX_GPUS (UVM_PARENT_ID_MAX_GPUS * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
|
||||
#define UVM_ID_MAX_PROCESSORS (UVM_ID_MAX_GPUS + 1)
|
||||
#define UVM_MAX_UNIQUE_GPU_PAIRS SUM_FROM_0_TO_N(UVM_ID_MAX_GPUS - 1)
|
||||
@ -292,6 +290,9 @@ typedef uvm_processor_id_t uvm_gpu_id_t;
|
||||
|
||||
#define UVM_ID_CHECK_BOUNDS(id) UVM_ASSERT_MSG(id.val <= UVM_ID_MAX_PROCESSORS, "id %u\n", id.val)
|
||||
|
||||
#define UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index) \
|
||||
UVM_ASSERT_MSG((sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS, "sub_index %u\n", (sub_index))
|
||||
|
||||
static int uvm_parent_id_cmp(uvm_parent_processor_id_t id1, uvm_parent_processor_id_t id2)
|
||||
{
|
||||
UVM_PARENT_ID_CHECK_BOUNDS(id1);
|
||||
@ -493,11 +494,16 @@ static uvm_gpu_id_t uvm_gpu_id_from_parent_gpu_id(const uvm_parent_gpu_id_t id)
|
||||
static uvm_gpu_id_t uvm_gpu_id_from_sub_processor_index(NvU32 index, NvU32 sub_index)
|
||||
{
|
||||
UVM_ASSERT(index < UVM_PARENT_ID_MAX_GPUS);
|
||||
UVM_ASSERT(sub_index < UVM_PARENT_ID_MAX_SUB_PROCESSORS);
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
return uvm_gpu_id_from_index(index * UVM_PARENT_ID_MAX_SUB_PROCESSORS + sub_index);
|
||||
}
|
||||
|
||||
static uvm_gpu_id_t uvm_gpu_id_from_sub_processor(uvm_parent_gpu_id_t id, NvU32 sub_index)
|
||||
{
|
||||
return uvm_gpu_id_from_sub_processor_index(uvm_parent_id_gpu_index(id), sub_index);
|
||||
}
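// Aside (worked example, illustration only): with UVM_PARENT_ID_MAX_SUB_PROCESSORS
// equal to 8, parent GPU index 2 and sub-processor index 3 pack into GPU index
// 2 * 8 + 3 == 19, and division/modulo by 8 recover the pair.
static void gpu_id_packing_example(void)
{
    NvU32 index = 2 * UVM_PARENT_ID_MAX_SUB_PROCESSORS + 3;

    UVM_ASSERT(index == 19);
    UVM_ASSERT(index / UVM_PARENT_ID_MAX_SUB_PROCESSORS == 2);
    UVM_ASSERT(index % UVM_PARENT_ID_MAX_SUB_PROCESSORS == 3);
}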
static uvm_parent_gpu_id_t uvm_parent_gpu_id_from_gpu_id(const uvm_gpu_id_t id)
|
||||
{
|
||||
UVM_ASSERT(UVM_ID_IS_GPU(id));
|
||||
@ -525,6 +531,71 @@ UVM_PROCESSOR_MASK(uvm_processor_mask_t, \
|
||||
extern const uvm_processor_mask_t g_uvm_processor_mask_cpu;
|
||||
extern const uvm_processor_mask_t g_uvm_processor_mask_empty;
|
||||
|
||||
// This is similar to uvm_parent_processor_mask_t and uvm_processor_mask_t
// but defined as a NvU8 in order to save memory since DECLARE_BITMAP() uses
// unsigned long. It also means we need to define our own bitops.
// Note that these are not atomic operations.
typedef struct
{
    NvU8 bitmap;
} uvm_sub_processor_mask_t;

static bool uvm_sub_processor_mask_test(const uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
return mask->bitmap & (1 << sub_index);
|
||||
}
|
||||
|
||||
static void uvm_sub_processor_mask_set(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
mask->bitmap |= 1 << sub_index;
|
||||
}
|
||||
|
||||
static void uvm_sub_processor_mask_clear(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
UVM_SUB_PROCESSOR_INDEX_CHECK_BOUNDS(sub_index);
|
||||
|
||||
mask->bitmap &= ~(1 << sub_index);
|
||||
}
|
||||
|
||||
static bool uvm_sub_processor_mask_test_and_set(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
bool result = uvm_sub_processor_mask_test(mask, sub_index);
|
||||
|
||||
if (!result)
|
||||
uvm_sub_processor_mask_set(mask, sub_index);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static bool uvm_sub_processor_mask_test_and_clear(uvm_sub_processor_mask_t *mask, NvU32 sub_index)
|
||||
{
|
||||
bool result = uvm_sub_processor_mask_test(mask, sub_index);
|
||||
|
||||
if (result)
|
||||
uvm_sub_processor_mask_clear(mask, sub_index);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void uvm_sub_processor_mask_zero(uvm_sub_processor_mask_t *mask)
|
||||
{
|
||||
mask->bitmap = 0;
|
||||
}
|
||||
|
||||
static bool uvm_sub_processor_mask_empty(const uvm_sub_processor_mask_t *mask)
|
||||
{
|
||||
return mask->bitmap == 0;
|
||||
}
|
||||
|
||||
static NvU32 uvm_sub_processor_mask_get_count(const uvm_sub_processor_mask_t *mask)
|
||||
{
|
||||
return hweight8(mask->bitmap);
|
||||
}
|
||||
|
||||
// Like uvm_processor_mask_subset() but ignores the CPU in the subset mask.
|
||||
// Returns whether the GPUs in subset are a subset of the GPUs in mask.
|
||||
bool uvm_processor_mask_gpu_subset(const uvm_processor_mask_t *subset,
|
||||
@ -571,8 +642,28 @@ void uvm_parent_gpus_from_processor_mask(uvm_parent_processor_mask_t *parent_mas
|
||||
i = uvm_gpu_id_next(i))
|
||||
|
||||
// Helper to iterate over all sub processor indexes.
#define for_each_sub_processor_index(i) \
    for (i = 0; i < UVM_PARENT_ID_MAX_SUB_PROCESSORS; i++)
#define for_each_sub_processor_index(sub_index) \
    for ((sub_index) = 0; (sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS; (sub_index)++)

static NvU32 uvm_sub_processor_mask_find_first_index(const uvm_sub_processor_mask_t *mask)
{
    unsigned long bitmap = mask->bitmap;

    return find_first_bit(&bitmap, UVM_PARENT_ID_MAX_SUB_PROCESSORS);
}

static NvU32 uvm_sub_processor_mask_find_next_index(const uvm_sub_processor_mask_t *mask, NvU32 min_index)
{
    unsigned long bitmap = mask->bitmap;

    return find_next_bit(&bitmap, UVM_PARENT_ID_MAX_SUB_PROCESSORS, min_index);
}

// Helper to iterate over all sub processor indexes in a given mask.
#define for_each_sub_processor_index_in_mask(sub_index, sub_mask) \
    for ((sub_index) = uvm_sub_processor_mask_find_first_index((sub_mask)); \
         (sub_index) < UVM_PARENT_ID_MAX_SUB_PROCESSORS; \
         (sub_index) = uvm_sub_processor_mask_find_next_index((sub_mask), (sub_index) + 1))

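// Aside (usage sketch, illustration only): the helpers above make
// uvm_sub_processor_mask_t behave like a tiny processor mask.
static void sub_processor_mask_usage_example(void)
{
    uvm_sub_processor_mask_t mask;
    NvU32 sub_index;

    uvm_sub_processor_mask_zero(&mask);
    uvm_sub_processor_mask_set(&mask, 0);
    uvm_sub_processor_mask_set(&mask, 3);

    UVM_ASSERT(uvm_sub_processor_mask_get_count(&mask) == 2);

    // Visits sub_index 0, then 3.
    for_each_sub_processor_index_in_mask(sub_index, &mask)
        ;
}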
// Helper to iterate over all valid processor ids.
|
||||
#define for_each_id(i) for (i = UVM_ID_CPU; UVM_ID_IS_VALID(i); i = uvm_id_next(i))
|
||||
|
@ -65,12 +65,9 @@ typedef enum
|
||||
} uvm_push_flag_t;
|
||||
|
||||
struct uvm_push_crypto_bundle_struct {
|
||||
// Initialization vector used to decrypt the push on the CPU
|
||||
// Initialization vector used to decrypt the push
|
||||
UvmCslIv iv;
|
||||
|
||||
// Key version used to decrypt the push on the CPU
|
||||
NvU32 key_version;
|
||||
|
||||
// Size of the pushbuffer that is encrypted/decrypted
|
||||
NvU32 push_size;
|
||||
};
|
||||
|
@ -451,6 +451,7 @@ static uvm_pushbuffer_chunk_t *gpfifo_to_chunk(uvm_pushbuffer_t *pushbuffer, uvm
|
||||
static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
|
||||
{
|
||||
NV_STATUS status;
|
||||
NvU32 auth_tag_offset;
|
||||
void *auth_tag_cpu_va;
|
||||
void *push_protected_cpu_va;
|
||||
void *push_unprotected_cpu_va;
|
||||
@ -469,15 +470,16 @@ static void decrypt_push(uvm_channel_t *channel, uvm_gpfifo_entry_t *gpfifo)
|
||||
UVM_ASSERT(!uvm_channel_is_wlc(channel));
|
||||
UVM_ASSERT(!uvm_channel_is_lcic(channel));
|
||||
|
||||
push_protected_cpu_va = get_base_cpu_va(pushbuffer) + pushbuffer_offset;
|
||||
push_protected_cpu_va = (char *)get_base_cpu_va(pushbuffer) + pushbuffer_offset;
|
||||
push_unprotected_cpu_va = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem) + pushbuffer_offset;
|
||||
auth_tag_cpu_va = uvm_channel_get_push_crypto_bundle_auth_tags_cpu_va(channel, push_info_index);
|
||||
auth_tag_offset = push_info_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
|
||||
auth_tag_cpu_va = (char *)uvm_rm_mem_get_cpu_va(channel->conf_computing.push_crypto_bundle_auth_tags) +
|
||||
auth_tag_offset;
|
||||
|
||||
status = uvm_conf_computing_cpu_decrypt(channel,
|
||||
push_protected_cpu_va,
|
||||
push_unprotected_cpu_va,
|
||||
&crypto_bundle->iv,
|
||||
crypto_bundle->key_version,
|
||||
crypto_bundle->push_size,
|
||||
auth_tag_cpu_va);
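// Aside (minimal sketch): the (char *) casts above make the byte-offset
// arithmetic explicit instead of relying on the GCC extension that allows
// arithmetic on void pointers. Illustration only; the helper is hypothetical.
static void *offset_by_bytes(void *base, size_t offset)
{
    return (char *)base + offset;
}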
|
||||
|
||||
@ -556,7 +558,7 @@ NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_
|
||||
if (uvm_channel_is_wlc(push->channel) || uvm_channel_is_lcic(push->channel)) {
|
||||
// We need to use the same static locations for PB as the fixed
|
||||
// schedule because that's what the channels are initialized to use.
|
||||
return uvm_channel_get_static_pb_protected_vidmem_gpu_va(push->channel);
|
||||
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_protected_vidmem, gpu);
|
||||
}
|
||||
else if (uvm_channel_is_sec2(push->channel)) {
|
||||
// SEC2 PBs are in unprotected sysmem
|
||||
@ -573,7 +575,7 @@ void *uvm_pushbuffer_get_unprotected_cpu_va_for_push(uvm_pushbuffer_t *pushbuffe
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// Reuse existing WLC static pb for initialization
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
|
||||
return uvm_channel_get_static_pb_unprotected_sysmem_cpu(push->channel);
|
||||
return push->channel->conf_computing.static_pb_unprotected_sysmem_cpu;
|
||||
}
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_cpu_va(pushbuffer->memory_unprotected_sysmem);
|
||||
@ -588,8 +590,8 @@ NvU64 uvm_pushbuffer_get_unprotected_gpu_va_for_push(uvm_pushbuffer_t *pushbuffe
|
||||
if (uvm_channel_is_wlc(push->channel)) {
|
||||
// Reuse existing WLC static pb for initialization
|
||||
UVM_ASSERT(!uvm_channel_manager_is_wlc_ready(push->channel->pool->manager));
|
||||
|
||||
return uvm_channel_get_static_pb_unprotected_sysmem_gpu_va(push->channel);
|
||||
return uvm_rm_mem_get_gpu_uvm_va(push->channel->conf_computing.static_pb_unprotected_sysmem,
|
||||
uvm_push_get_gpu(push));
|
||||
}
|
||||
|
||||
pushbuffer_base = uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory_unprotected_sysmem, uvm_push_get_gpu(push));
|
||||
|
@@ -322,7 +322,6 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
                             uvm_mem_t *dst_mem,
                             uvm_mem_t *src_mem,
                             UvmCslIv *decrypt_iv,
                             NvU32 key_version,
                             uvm_mem_t *auth_tag_mem,
                             size_t size,
                             size_t copy_size)
@@ -339,7 +338,6 @@ static NV_STATUS cpu_decrypt(uvm_channel_t *channel,
                                                  dst_plain,
                                                  src_cipher,
                                                  &decrypt_iv[i],
                                                  key_version,
                                                  copy_size,
                                                  auth_tag_buffer));

@@ -370,7 +368,7 @@ static void gpu_encrypt(uvm_push_t *push,
    uvm_gpu_address_t auth_tag_address = uvm_mem_gpu_address_virtual_kernel(auth_tag_mem, gpu);

    for (i = 0; i < num_iterations; i++) {
        uvm_conf_computing_log_gpu_encryption(push->channel, copy_size, decrypt_iv);
        uvm_conf_computing_log_gpu_encryption(push->channel, decrypt_iv);

        if (i > 0)
            uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
@@ -429,7 +427,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
    size_t auth_tag_buffer_size = (size / copy_size) * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
    uvm_push_t push;
    UvmCslIv *decrypt_iv;
    NvU32 key_version;

    decrypt_iv = uvm_kvmalloc_zero((size / copy_size) * sizeof(UvmCslIv));
    if (!decrypt_iv)
@@ -459,11 +456,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz

    gpu_encrypt(&push, dst_cipher, dst_plain, decrypt_iv, auth_tag_mem, size, copy_size);

    // There shouldn't be any key rotation between the end of the push and the
    // CPU decryption(s), but it is more robust against test changes to force
    // decryption to use the saved key.
    key_version = uvm_channel_pool_key_version(push.channel->pool);

    TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), out);

    TEST_CHECK_GOTO(!mem_match(src_plain, src_cipher), out);
@@ -473,7 +465,6 @@ static NV_STATUS test_cpu_to_gpu_roundtrip(uvm_gpu_t *gpu, size_t copy_size, siz
                                   dst_plain_cpu,
                                   dst_cipher,
                                   decrypt_iv,
                                   key_version,
                                   auth_tag_mem,
                                   size,
                                   copy_size),
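The roundtrip test above decrypts the buffer chunk by chunk, indexing a per-chunk IV and auth tag while reusing one key version that was saved before the decryptions start. A minimal standalone sketch of that loop shape follows; decrypt_one_chunk(), chunk_iv_t and AUTH_TAG_SIZE are illustrative placeholders rather than UVM APIs, and the XOR "cipher" is only a stand-in so the sketch compiles on its own.

#include <stddef.h>
#include <stdint.h>

#define AUTH_TAG_SIZE 32                            /* illustrative per-chunk tag size */

typedef struct { uint8_t bytes[16]; } chunk_iv_t;   /* illustrative IV type */

/* Stand-in for a real authenticated-decryption primitive; returns 0 on success. */
static int decrypt_one_chunk(uint8_t *dst, const uint8_t *src, size_t len,
                             const chunk_iv_t *iv, uint32_t key_version,
                             const uint8_t *auth_tag)
{
    size_t i;
    (void)key_version;
    (void)auth_tag;
    for (i = 0; i < len; i++)
        dst[i] = src[i] ^ iv->bytes[i % sizeof(iv->bytes)];
    return 0;
}

/* Decrypt size bytes in copy_size chunks: chunk i uses ivs[i] and the i-th
 * auth tag, but every chunk uses the same saved key_version so a key rotation
 * between producing and consuming the data cannot change which key is used. */
static int decrypt_buffer(uint8_t *dst, const uint8_t *src, size_t size, size_t copy_size,
                          const chunk_iv_t *ivs, uint32_t key_version, const uint8_t *auth_tags)
{
    size_t i;
    for (i = 0; i < size / copy_size; i++) {
        int err = decrypt_one_chunk(dst + i * copy_size,
                                    src + i * copy_size,
                                    copy_size,
                                    &ivs[i],
                                    key_version,
                                    auth_tags + i * AUTH_TAG_SIZE);
        if (err)
            return err;
    }
    return 0;
}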
@@ -124,23 +124,24 @@ static NV_STATUS uvm_test_verify_bh_affinity(uvm_intr_handler_t *isr, int node)
static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS *params, struct file *filp)
{
    uvm_gpu_t *gpu;
    NV_STATUS status = NV_OK;
    NV_STATUS status;
    uvm_rm_user_object_t user_rm_va_space = {
        .rm_control_fd = -1,
        .user_client = params->client,
        .user_object = params->smc_part_ref
    };

    if (!UVM_THREAD_AFFINITY_SUPPORTED())
        return NV_ERR_NOT_SUPPORTED;

    uvm_mutex_lock(&g_uvm_global.global_lock);

    gpu = uvm_gpu_get_by_uuid(&params->gpu_uuid);
    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto unlock;
    }
    status = uvm_gpu_retain_by_uuid(&params->gpu_uuid, &user_rm_va_space, &gpu);
    if (status != NV_OK)
        return status;

    // If the GPU is not attached to a NUMA node, there is nothing to do.
    if (gpu->parent->closest_cpu_numa_node == NUMA_NO_NODE) {
        status = NV_ERR_NOT_SUPPORTED;
        goto unlock;
        goto release;
    }

    if (gpu->parent->replayable_faults_supported) {
@@ -149,7 +150,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
                                            gpu->parent->closest_cpu_numa_node);
        uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent);
        if (status != NV_OK)
            goto unlock;
            goto release;

    if (gpu->parent->non_replayable_faults_supported) {
        uvm_parent_gpu_non_replayable_faults_isr_lock(gpu->parent);
@@ -157,7 +158,7 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
                                            gpu->parent->closest_cpu_numa_node);
        uvm_parent_gpu_non_replayable_faults_isr_unlock(gpu->parent);
        if (status != NV_OK)
            goto unlock;
            goto release;
    }

    if (gpu->parent->access_counters_supported) {
@@ -167,9 +168,8 @@ static NV_STATUS uvm_test_numa_check_affinity(UVM_TEST_NUMA_CHECK_AFFINITY_PARAM
        uvm_parent_gpu_access_counters_isr_unlock(gpu->parent);
    }
}

unlock:
    uvm_mutex_unlock(&g_uvm_global.global_lock);
release:
    uvm_gpu_release(gpu);
    return status;
}

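The rewritten test above switches from a plain lookup under the global lock to retaining the GPU up front and unwinding through the added release label. The goto-based unwind is a common kernel C pattern; below is a small generic sketch of it under that assumption, with acquire_a()/acquire_b() as made-up placeholders rather than UVM functions.

#include <errno.h>

/* Made-up resource helpers standing in for lock/retain primitives. */
static int acquire_a(void) { return 0; }
static void release_a(void) { }
static int acquire_b(void) { return 0; }
static void release_b(void) { }

static int do_work(void) { return -EINVAL; }

/* Each error path jumps to the label that releases exactly what has been
 * acquired so far, so every exit runs the right subset of cleanups. */
static int run(void)
{
    int err = acquire_a();
    if (err)
        return err;

    err = acquire_b();
    if (err)
        goto out_release_a;

    err = do_work();
    /* Success and failure both fall through: release b, then a. */

    release_b();
out_release_a:
    release_a();
    return err;
}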
@@ -1,5 +1,5 @@
/*******************************************************************************
    Copyright (c) 2015-2022 NVidia Corporation
    Copyright (c) 2015-2024 NVidia Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -191,7 +191,7 @@ typedef struct
    NvU32 read_duplication;                             // Out (UVM_TEST_READ_DUPLICATION_POLICY)
    NvProcessorUuid preferred_location;                 // Out
    NvS32 preferred_cpu_nid;                            // Out
    NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS_V2]; // Out
    NvProcessorUuid accessed_by[UVM_MAX_PROCESSORS];    // Out
    NvU32 accessed_by_count;                            // Out
    NvU32 type;                                         // Out (UVM_TEST_VA_RANGE_TYPE)
    union
@@ -347,30 +347,20 @@ typedef enum
    UVM_TEST_CHANNEL_STRESS_MODE_NOOP_PUSH = 0,
    UVM_TEST_CHANNEL_STRESS_MODE_UPDATE_CHANNELS,
    UVM_TEST_CHANNEL_STRESS_MODE_STREAM,
    UVM_TEST_CHANNEL_STRESS_MODE_KEY_ROTATION,
} UVM_TEST_CHANNEL_STRESS_MODE;

typedef enum
{
    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_CPU_TO_GPU,
    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_GPU_TO_CPU,
    UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION_ROTATE,
} UVM_TEST_CHANNEL_STRESS_KEY_ROTATION_OPERATION;

#define UVM_TEST_CHANNEL_STRESS UVM_TEST_IOCTL_BASE(15)
typedef struct
{
    NvU32 mode;                   // In, one of UVM_TEST_CHANNEL_STRESS_MODE
    NvU32 mode;                   // In

    // Number of iterations:
    // mode == NOOP_PUSH: number of noop pushes
    // mode == UPDATE_CHANNELS: number of updates
    // mode == STREAM: number of iterations per stream
    // mode == ROTATION: number of operations
    NvU32 iterations;

    NvU32 num_streams;            // In, used only if mode == STREAM
    NvU32 key_rotation_operation; // In, used only if mode == ROTATION
    NvU32 num_streams;            // In, used only for mode == UVM_TEST_CHANNEL_STRESS_MODE_STREAM
    NvU32 seed;                   // In
    NvU32 verbose;                // In
    NV_STATUS rmStatus;           // Out
@@ -634,7 +624,7 @@ typedef struct

    // Array of processors which have a resident copy of the page containing
    // lookup_address.
    NvProcessorUuid resident_on[UVM_MAX_PROCESSORS_V2];                        // Out
    NvProcessorUuid resident_on[UVM_MAX_PROCESSORS];                           // Out
    NvU32 resident_on_count;                                                   // Out

    // If the memory is resident on the CPU, the NUMA node on which the page
@@ -645,24 +635,24 @@ typedef struct
    // system-page-sized portion of this allocation which contains
    // lookup_address is guaranteed to be resident on the corresponding
    // processor.
    NvU32 resident_physical_size[UVM_MAX_PROCESSORS_V2];                       // Out
    NvU32 resident_physical_size[UVM_MAX_PROCESSORS];                          // Out

    // The physical address of the physical allocation backing lookup_address.
    NvU64 resident_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8);  // Out
    NvU64 resident_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8);     // Out

    // Array of processors which have a virtual mapping covering lookup_address.
    NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS_V2];                          // Out
    NvU32 mapping_type[UVM_MAX_PROCESSORS_V2];                                 // Out
    NvU64 mapping_physical_address[UVM_MAX_PROCESSORS_V2] NV_ALIGN_BYTES(8);   // Out
    NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS];                             // Out
    NvU32 mapping_type[UVM_MAX_PROCESSORS];                                    // Out
    NvU64 mapping_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8);      // Out
    NvU32 mapped_on_count;                                                     // Out

    // The size of the virtual mapping covering lookup_address on each
    // mapped_on processor.
    NvU32 page_size[UVM_MAX_PROCESSORS_V2];                                    // Out
    NvU32 page_size[UVM_MAX_PROCESSORS];                                       // Out

    // Array of processors which have physical memory populated that would back
    // lookup_address if it was resident.
    NvProcessorUuid populated_on[UVM_MAX_PROCESSORS_V2];                       // Out
    NvProcessorUuid populated_on[UVM_MAX_PROCESSORS];                          // Out
    NvU32 populated_on_count;                                                  // Out

    NV_STATUS rmStatus;                                                        // Out
@@ -1220,6 +1210,8 @@ typedef struct
typedef struct
{
    NvProcessorUuid gpu_uuid;   // In
    NvHandle client;            // In
    NvHandle smc_part_ref;      // In

    NV_STATUS rmStatus;         // Out
} UVM_TEST_NUMA_CHECK_AFFINITY_PARAMS;

@@ -30,18 +30,18 @@ void uvm_tlb_batch_begin(uvm_page_tree_t *tree, uvm_tlb_batch_t *batch)
    batch->tree = tree;
}

static NvU32 smallest_page_size(NvU32 page_sizes)
static NvU64 smallest_page_size(NvU64 page_sizes)
{
    UVM_ASSERT(page_sizes != 0);

    return 1u << __ffs(page_sizes);
    return 1ULL << __ffs(page_sizes);
}

static NvU32 biggest_page_size(NvU32 page_sizes)
static NvU64 biggest_page_size(NvU64 page_sizes)
{
    UVM_ASSERT(page_sizes != 0);

    return 1u << __fls(page_sizes);
    return 1ULL << __fls(page_sizes);
}

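The two helpers above now take and return NvU64 and shift 1ULL instead of 1u. A small userspace sketch of the same bit tricks, using the GCC/Clang builtins in place of the kernel's __ffs()/__fls(), shows why the widening matters once page sizes of 4GB or larger can appear in the OR'd mask.

#include <stdint.h>
#include <stdio.h>

/* page_sizes is an OR of power-of-two page sizes, e.g. (4K | 64K | 2M). */
static uint64_t smallest_page_size_example(uint64_t page_sizes)
{
    return 1ULL << __builtin_ctzll(page_sizes);          /* lowest set bit */
}

static uint64_t biggest_page_size_example(uint64_t page_sizes)
{
    return 1ULL << (63 - __builtin_clzll(page_sizes));   /* highest set bit */
}

int main(void)
{
    uint64_t mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 21);  /* 4K | 64K | 2M */

    printf("smallest: 0x%llx\n", (unsigned long long)smallest_page_size_example(mask)); /* 0x1000 */
    printf("biggest:  0x%llx\n", (unsigned long long)biggest_page_size_example(mask));  /* 0x200000 */

    /* A 32-bit "1u << bit" would overflow for any page size of 4GB or more
     * (bit index >= 32), which is why the shift is widened to 1ULL. */
    return 0;
}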
static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t *push)
@@ -53,8 +53,8 @@ static void tlb_batch_flush_invalidate_per_va(uvm_tlb_batch_t *batch, uvm_push_t

    for (i = 0; i < batch->count; ++i) {
        uvm_tlb_batch_range_t *entry = &batch->ranges[i];
        NvU32 min_page_size = smallest_page_size(entry->page_sizes);
        NvU32 max_page_size = biggest_page_size(entry->page_sizes);
        NvU64 min_page_size = smallest_page_size(entry->page_sizes);
        NvU64 max_page_size = biggest_page_size(entry->page_sizes);

        // Use the depth of the max page size as it's the broadest
        NvU32 depth = tree->hal->page_table_depth(max_page_size);
@@ -113,7 +113,7 @@ void uvm_tlb_batch_end(uvm_tlb_batch_t *batch, uvm_push_t *push, uvm_membar_t tl
        tlb_batch_flush_invalidate_per_va(batch, push);
}

void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar)
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU64 page_sizes, uvm_membar_t tlb_membar)
{
    uvm_tlb_batch_range_t *new_entry;


@@ -41,7 +41,7 @@ typedef struct
    NvU64 size;

    // Min and max page size ored together
    NvU32 page_sizes;
    NvU64 page_sizes;
} uvm_tlb_batch_range_t;

struct uvm_tlb_batch_struct
@@ -63,7 +63,7 @@ struct uvm_tlb_batch_struct
    NvU32 count;

    // Biggest page size across all queued up invalidates
    NvU32 biggest_page_size;
    NvU64 biggest_page_size;

    // Max membar across all queued up invalidates
    uvm_membar_t membar;
@@ -81,7 +81,7 @@ void uvm_tlb_batch_begin(uvm_page_tree_t *tree, uvm_tlb_batch_t *batch);
// If the membar parameter is not UVM_MEMBAR_NONE, the specified membar will
// be performed logically after the TLB invalidate such that all physical memory
// accesses using the old translations are ordered to the scope of the membar.
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar);
void uvm_tlb_batch_invalidate(uvm_tlb_batch_t *batch, NvU64 start, NvU64 size, NvU64 page_sizes, uvm_membar_t tlb_membar);

// End a TLB invalidate batch
//
@@ -97,8 +97,12 @@ void uvm_tlb_batch_end(uvm_tlb_batch_t *batch, uvm_push_t *push, uvm_membar_t tl
// Helper for invalidating a single range immediately.
//
// Internally begins and ends a TLB batch.
static void uvm_tlb_batch_single_invalidate(uvm_page_tree_t *tree, uvm_push_t *push,
                                            NvU64 start, NvU64 size, NvU32 page_sizes, uvm_membar_t tlb_membar)
static void uvm_tlb_batch_single_invalidate(uvm_page_tree_t *tree,
                                            uvm_push_t *push,
                                            NvU64 start,
                                            NvU64 size,
                                            NvU64 page_sizes,
                                            uvm_membar_t tlb_membar)
{
    uvm_tlb_batch_t batch;

@@ -1,5 +1,5 @@
/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation
    Copyright (c) 2016-2024 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -57,20 +57,12 @@ typedef struct
    struct list_head queue_nodes[UvmEventNumTypesAll];

    struct page **queue_buffer_pages;
    union
    {
        UvmEventEntry_V1 *queue_v1;
        UvmEventEntry_V2 *queue_v2;
    };
    void *queue_buffer;
    NvU32 queue_buffer_count;
    NvU32 notification_threshold;

    struct page **control_buffer_pages;
    union
    {
        UvmToolsEventControlData_V1 *control_v1;
        UvmToolsEventControlData_V2 *control_v2;
    };
    UvmToolsEventControlData *control;

    wait_queue_head_t wait_queue;
    bool is_wakeup_get_valid;
@@ -398,16 +390,12 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)

    if (event_tracker->is_queue) {
        uvm_tools_queue_t *queue = &event_tracker->queue;
        NvU64 buffer_size, control_size;
        NvU64 buffer_size;

        if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
        if (event_tracker->version == UvmToolsEventQueueVersion_V1)
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V1);
            control_size = sizeof(UvmToolsEventControlData_V1);
        }
        else {
        else
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V2);
            control_size = sizeof(UvmToolsEventControlData_V2);
        }

        remove_event_tracker(va_space,
                             queue->queue_nodes,
@@ -415,16 +403,16 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
                             queue->subscribed_queues,
                             &queue->subscribed_queues);

        if (queue->queue_v2 != NULL) {
        if (queue->queue_buffer != NULL) {
            unmap_user_pages(queue->queue_buffer_pages,
                             queue->queue_v2,
                             queue->queue_buffer,
                             buffer_size);
        }

        if (queue->control_v2 != NULL) {
        if (queue->control != NULL) {
            unmap_user_pages(queue->control_buffer_pages,
                             queue->control_v2,
                             control_size);
                             queue->control,
                             sizeof(UvmToolsEventControlData));
        }
    }
    else {
@@ -456,9 +444,9 @@ static void destroy_event_tracker(uvm_tools_event_tracker_t *event_tracker)
    kmem_cache_free(g_tools_event_tracker_cache, event_tracker);
}

static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *queue)
static void enqueue_event(const void *entry, size_t entry_size, NvU8 eventType, uvm_tools_queue_t *queue)
{
    UvmToolsEventControlData_V1 *ctrl = queue->control_v1;
    UvmToolsEventControlData *ctrl = queue->control;
    uvm_tools_queue_snapshot_t sn;
    NvU32 queue_size = queue->queue_buffer_count;
    NvU32 queue_mask = queue_size - 1;
@@ -481,11 +469,11 @@ static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *q

    // one free element means that the queue is full
    if (((queue_size + sn.get_behind - sn.put_behind) & queue_mask) == 1) {
        atomic64_inc((atomic64_t *)&ctrl->dropped + entry->eventData.eventType);
        atomic64_inc((atomic64_t *)&ctrl->dropped + eventType);
        goto unlock;
    }

    memcpy(queue->queue_v1 + sn.put_behind, entry, sizeof(*entry));
    memcpy((char *)queue->queue_buffer + sn.put_behind * entry_size, entry, entry_size);

    sn.put_behind = sn.put_ahead;

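enqueue_event() above treats the queue as a power-of-two ring and deliberately leaves one slot unused so that a full queue can be told apart from an empty one. A tiny standalone illustration of that index arithmetic, not tied to any UVM type:

#include <stdint.h>
#include <stdio.h>

#define QUEUE_SIZE 8u                      /* must be a power of two */
#define QUEUE_MASK (QUEUE_SIZE - 1u)

static uint32_t used_slots(uint32_t get, uint32_t put)
{
    return (put - get) & QUEUE_MASK;
}

static int queue_is_full(uint32_t get, uint32_t put)
{
    /* one free element means that the queue is full */
    return ((QUEUE_SIZE + get - put) & QUEUE_MASK) == 1u;
}

int main(void)
{
    uint32_t get = 3, put = 3;

    printf("empty: used=%u full=%d\n", used_slots(get, put), queue_is_full(get, put));

    put = (get + QUEUE_SIZE - 1u) & QUEUE_MASK;   /* 7 entries written, 1 slot left */
    printf("full:  used=%u full=%d\n", used_slots(get, put), queue_is_full(get, put));
    return 0;
}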
@@ -509,79 +497,45 @@ unlock:
    uvm_spin_unlock(&queue->lock);
}

static void enqueue_event_v1(const UvmEventEntry_V1 *entry, uvm_tools_queue_t *queue)
{
    enqueue_event(entry, sizeof(*entry), entry->eventData.eventType, queue);
}

static void enqueue_event_v2(const UvmEventEntry_V2 *entry, uvm_tools_queue_t *queue)
{
    UvmToolsEventControlData_V2 *ctrl = queue->control_v2;
    uvm_tools_queue_snapshot_t sn;
    NvU32 queue_size = queue->queue_buffer_count;
    NvU32 queue_mask = queue_size - 1;
    enqueue_event(entry, sizeof(*entry), entry->eventData.eventType, queue);
}

    // Prevent processor speculation prior to accessing user-mapped memory to
    // avoid leaking information from side-channel attacks. There are many
    // possible paths leading to this point and it would be difficult and error-
    // prone to audit all of them to determine whether user mode could guide
    // this access to kernel memory under speculative execution, so to be on the
    // safe side we'll just always block speculation.
    nv_speculation_barrier();
static void uvm_tools_record_event(struct list_head *head,
                                   const void *entry,
                                   size_t entry_size,
                                   NvU8 eventType)
{
    uvm_tools_queue_t *queue;

    uvm_spin_lock(&queue->lock);
    UVM_ASSERT(eventType < UvmEventNumTypesAll);

    // ctrl is mapped into user space with read and write permissions,
    // so its values cannot be trusted.
    sn.get_behind = atomic_read((atomic_t *)&ctrl->get_behind) & queue_mask;
    sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind) & queue_mask;
    sn.put_ahead = (sn.put_behind + 1) & queue_mask;

    // one free element means that the queue is full
    if (((queue_size + sn.get_behind - sn.put_behind) & queue_mask) == 1) {
        atomic64_inc((atomic64_t *)&ctrl->dropped + entry->eventData.eventType);
        goto unlock;
    }

    memcpy(queue->queue_v2 + sn.put_behind, entry, sizeof(*entry));

    sn.put_behind = sn.put_ahead;
    // put_ahead and put_behind will always be the same outside of queue->lock
    // this allows the user-space consumer to choose either a 2 or 4 pointer synchronization approach
    atomic_set((atomic_t *)&ctrl->put_ahead, sn.put_behind);
    atomic_set((atomic_t *)&ctrl->put_behind, sn.put_behind);

    sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    // if the queue needs to be woken up, only signal if we haven't signaled before for this value of get_ahead
    if (queue_needs_wakeup(queue, &sn) && !(queue->is_wakeup_get_valid && queue->wakeup_get == sn.get_ahead)) {
        queue->is_wakeup_get_valid = true;
        queue->wakeup_get = sn.get_ahead;
        wake_up_all(&queue->wait_queue);
    }

unlock:
    uvm_spin_unlock(&queue->lock);
    list_for_each_entry(queue, head + eventType, queue_nodes[eventType])
        enqueue_event(entry, entry_size, eventType, queue);
}

static void uvm_tools_record_event_v1(uvm_va_space_t *va_space, const UvmEventEntry_V1 *entry)
{
    NvU8 eventType = entry->eventData.eventType;
    uvm_tools_queue_t *queue;

    UVM_ASSERT(eventType < UvmEventNumTypesAll);

    uvm_assert_rwsem_locked(&va_space->tools.lock);

    list_for_each_entry(queue, va_space->tools.queues_v1 + eventType, queue_nodes[eventType])
        enqueue_event_v1(entry, queue);
    uvm_tools_record_event(va_space->tools.queues_v1, entry, sizeof(*entry), eventType);
}

static void uvm_tools_record_event_v2(uvm_va_space_t *va_space, const UvmEventEntry_V2 *entry)
{
    NvU8 eventType = entry->eventData.eventType;
    uvm_tools_queue_t *queue;

    UVM_ASSERT(eventType < UvmEventNumTypesAll);

    uvm_assert_rwsem_locked(&va_space->tools.lock);

    list_for_each_entry(queue, va_space->tools.queues_v2 + eventType, queue_nodes[eventType])
        enqueue_event_v2(entry, queue);
    uvm_tools_record_event(va_space->tools.queues_v2, entry, sizeof(*entry), eventType);
}

static bool counter_matches_processor(UvmCounterName counter, const NvProcessorUuid *processor)
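The consolidation above works because the queue stores raw bytes and the caller passes the entry size, so the V1 and V2 entry layouts can share one enqueue path. A compact sketch of that size-generic pattern follows; the names are illustrative, not the UVM ones.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    uint8_t *buffer;        /* queue_count * entry_size bytes of storage */
    uint32_t entry_size;    /* size of one record; differs per queue version */
    uint32_t queue_count;   /* power of two */
    uint32_t put;           /* free-running write cursor */
} byte_queue_t;

/* Copy one fixed-size record into the next slot, regardless of its layout. */
static void byte_queue_put(byte_queue_t *q, const void *entry)
{
    memcpy(q->buffer + (size_t)(q->put & (q->queue_count - 1)) * q->entry_size,
           entry,
           q->entry_size);
    q->put++;
}

typedef struct { uint8_t type; uint64_t payload; uint64_t timestamp; } record_t;

int main(void)
{
    static uint8_t storage[4 * sizeof(record_t)];
    byte_queue_t q = { storage, sizeof(record_t), 4, 0 };
    record_t rec = { 1, 42, 0 };

    byte_queue_put(&q, &rec);
    printf("first byte of slot 0: %u\n", (unsigned)storage[0]);
    return 0;
}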
@@ -751,7 +705,7 @@ static unsigned uvm_tools_poll(struct file *filp, poll_table *wait)
    int flags = 0;
    uvm_tools_queue_snapshot_t sn;
    uvm_tools_event_tracker_t *event_tracker;
    UvmToolsEventControlData_V2 *ctrl;
    UvmToolsEventControlData *ctrl;

    if (uvm_global_get_status() != NV_OK)
        return POLLERR;
@@ -763,7 +717,7 @@ static unsigned uvm_tools_poll(struct file *filp, poll_table *wait)
    uvm_spin_lock(&event_tracker->queue.lock);

    event_tracker->queue.is_wakeup_get_valid = false;
    ctrl = event_tracker->queue.control_v2;
    ctrl = event_tracker->queue.control;
    sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);

@@ -878,6 +832,24 @@ static void record_gpu_fault_instance(uvm_gpu_t *gpu,
    }
}

static void record_cpu_fault(UvmEventCpuFaultInfo *info, uvm_perf_event_data_t *event_data)
{
    info->eventType = UvmEventTypeCpuFault;
    if (event_data->fault.cpu.is_write)
        info->accessType = UvmEventMemoryAccessTypeWrite;
    else
        info->accessType = UvmEventMemoryAccessTypeRead;

    info->address = event_data->fault.cpu.fault_va;
    info->timeStamp = NV_GETTIME();
    // assume that current owns va_space
    info->pid = uvm_get_stale_process_id();
    info->threadId = uvm_get_stale_thread_id();
    info->pc = event_data->fault.cpu.pc;
    // TODO: Bug 4515381: set info->nid when we decide if it's NUMA node ID or
    // CPU ID.
}

static void uvm_tools_record_fault(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
{
    uvm_va_space_t *va_space = event_data->fault.space;
@@ -895,41 +867,17 @@ static void uvm_tools_record_fault(uvm_perf_event_t event_id, uvm_perf_event_dat
    if (UVM_ID_IS_CPU(event_data->fault.proc_id)) {
        if (tools_is_event_enabled_version(va_space, UvmEventTypeCpuFault, UvmToolsEventQueueVersion_V1)) {
            UvmEventEntry_V1 entry;
            UvmEventCpuFaultInfo_V1 *info = &entry.eventData.cpuFault;
            memset(&entry, 0, sizeof(entry));

            info->eventType = UvmEventTypeCpuFault;
            if (event_data->fault.cpu.is_write)
                info->accessType = UvmEventMemoryAccessTypeWrite;
            else
                info->accessType = UvmEventMemoryAccessTypeRead;

            info->address = event_data->fault.cpu.fault_va;
            info->timeStamp = NV_GETTIME();
            // assume that current owns va_space
            info->pid = uvm_get_stale_process_id();
            info->threadId = uvm_get_stale_thread_id();
            info->pc = event_data->fault.cpu.pc;
            record_cpu_fault(&entry.eventData.cpuFault, event_data);

            uvm_tools_record_event_v1(va_space, &entry);
        }
        if (tools_is_event_enabled_version(va_space, UvmEventTypeCpuFault, UvmToolsEventQueueVersion_V2)) {
            UvmEventEntry_V2 entry;
            UvmEventCpuFaultInfo_V2 *info = &entry.eventData.cpuFault;
            memset(&entry, 0, sizeof(entry));

            info->eventType = UvmEventTypeCpuFault;
            if (event_data->fault.cpu.is_write)
                info->accessType = UvmEventMemoryAccessTypeWrite;
            else
                info->accessType = UvmEventMemoryAccessTypeRead;

            info->address = event_data->fault.cpu.fault_va;
            info->timeStamp = NV_GETTIME();
            // assume that current owns va_space
            info->pid = uvm_get_stale_process_id();
            info->threadId = uvm_get_stale_thread_id();
            info->pc = event_data->fault.cpu.pc;
            record_cpu_fault(&entry.eventData.cpuFault, event_data);

            uvm_tools_record_event_v2(va_space, &entry);
        }
@@ -1834,7 +1782,7 @@ void uvm_tools_record_thrashing(uvm_va_space_t *va_space,
    info->size = region_size;
    info->timeStamp = NV_GETTIME();

    BUILD_BUG_ON(UVM_MAX_PROCESSORS_V2 < UVM_ID_MAX_PROCESSORS);
    BUILD_BUG_ON(UVM_MAX_PROCESSORS < UVM_ID_MAX_PROCESSORS);
    bitmap_copy((long unsigned *)&info->processors, processors->bitmap, UVM_ID_MAX_PROCESSORS);

    uvm_tools_record_event_v2(va_space, &entry);
@@ -2151,7 +2099,7 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
    event_tracker->is_queue = params->queueBufferSize != 0;
    if (event_tracker->is_queue) {
        uvm_tools_queue_t *queue = &event_tracker->queue;
        NvU64 buffer_size, control_size;
        NvU64 buffer_size;

        uvm_spin_lock_init(&queue->lock, UVM_LOCK_ORDER_LEAF);
        init_waitqueue_head(&queue->wait_queue);
@@ -2170,25 +2118,21 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
            goto fail;
        }

        if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
        if (event_tracker->version == UvmToolsEventQueueVersion_V1)
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V1);
            control_size = sizeof(UvmToolsEventControlData_V1);
        }
        else {
        else
            buffer_size = queue->queue_buffer_count * sizeof(UvmEventEntry_V2);
            control_size = sizeof(UvmToolsEventControlData_V2);
        }

        status = map_user_pages(params->queueBuffer,
                                buffer_size,
                                (void **)&queue->queue_v2,
                                &queue->queue_buffer,
                                &queue->queue_buffer_pages);
        if (status != NV_OK)
            goto fail;

        status = map_user_pages(params->controlBuffer,
                                control_size,
                                (void **)&queue->control_v2,
                                sizeof(UvmToolsEventControlData),
                                (void **)&queue->control,
                                &queue->control_buffer_pages);

        if (status != NV_OK)
@@ -2224,6 +2168,7 @@ NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_TH
{
    uvm_tools_queue_snapshot_t sn;
    uvm_tools_event_tracker_t *event_tracker = tools_event_tracker(filp);
    UvmToolsEventControlData *ctrl;

    if (!tracker_is_queue(event_tracker))
        return NV_ERR_INVALID_ARGUMENT;
@@ -2232,18 +2177,9 @@ NV_STATUS uvm_api_tools_set_notification_threshold(UVM_TOOLS_SET_NOTIFICATION_TH

    event_tracker->queue.notification_threshold = params->notificationThreshold;

    if (event_tracker->version == UvmToolsEventQueueVersion_V1) {
        UvmToolsEventControlData_V1 *ctrl = event_tracker->queue.control_v1;

    ctrl = event_tracker->queue.control;
    sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
    sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    }
    else {
        UvmToolsEventControlData_V2 *ctrl = event_tracker->queue.control_v2;

        sn.put_behind = atomic_read((atomic_t *)&ctrl->put_behind);
        sn.get_ahead = atomic_read((atomic_t *)&ctrl->get_ahead);
    }

    if (queue_needs_wakeup(&event_tracker->queue, &sn))
        wake_up_all(&event_tracker->queue.wait_queue);
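The threshold path above now reads the shared cursors through the single UvmToolsEventControlData type and wakes waiters when enough entries are pending. One plausible reading of that threshold check, written as a hedged sketch (the real queue_needs_wakeup() may differ in detail, and the names below are not the UVM ones):

#include <stdint.h>

typedef struct {
    uint32_t queue_count;             /* power of two */
    uint32_t notification_threshold;  /* wake readers at this many pending entries */
} queue_cfg_t;

/* Returns nonzero when the number of unread entries between the reader's
 * get cursor and the writer's put cursor has reached the threshold. */
static int needs_wakeup(const queue_cfg_t *cfg, uint32_t get_ahead, uint32_t put_behind)
{
    uint32_t mask = cfg->queue_count - 1;
    uint32_t pending = (put_behind - get_ahead) & mask;

    return pending >= cfg->notification_threshold;
}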
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2017-2021 NVIDIA Corporation
|
||||
Copyright (c) 2017-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -104,3 +104,248 @@ void uvm_hal_turing_host_set_gpfifo_entry(NvU64 *fifo_entry,
|
||||
*fifo_entry = fifo_entry_value;
|
||||
}
|
||||
|
||||
void uvm_hal_turing_host_tlb_invalidate_all(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
NvU32 page_table_level;
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
|
||||
else
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
|
||||
pdb.address >>= 12;
|
||||
|
||||
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
|
||||
// PDE3 is the highest level on Pascal-Turing, see the comment in
|
||||
// uvm_pascal_mmu.c for details.
|
||||
UVM_ASSERT_MSG(depth < NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
|
||||
page_table_level = NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
|
||||
|
||||
if (membar != UVM_MEMBAR_NONE) {
|
||||
// If a GPU or SYS membar is needed, ACK_TYPE needs to be set to
|
||||
// GLOBALLY to make sure all the pending accesses can be picked up by
|
||||
// the membar.
|
||||
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value,
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_turing_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
NvU32 page_table_level;
|
||||
NvU32 pdb_lo;
|
||||
NvU32 pdb_hi;
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 va_lo;
|
||||
NvU32 va_hi;
|
||||
NvU64 end;
|
||||
NvU64 actual_base;
|
||||
NvU64 actual_size;
|
||||
NvU64 actual_end;
|
||||
NvU32 log2_invalidation_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
|
||||
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
|
||||
|
||||
// The invalidation size must be a power-of-two number of pages containing
|
||||
// the passed interval
|
||||
end = base + size - 1;
|
||||
log2_invalidation_size = __fls((unsigned long)(end ^ base)) + 1;
|
||||
|
||||
if (log2_invalidation_size == 64) {
|
||||
// Invalidate everything
|
||||
gpu->parent->host_hal->tlb_invalidate_all(push, pdb, depth, membar);
|
||||
return;
|
||||
}
|
||||
|
||||
// The hardware aligns the target address down to the invalidation size.
|
||||
actual_size = 1ULL << log2_invalidation_size;
|
||||
actual_base = UVM_ALIGN_DOWN(base, actual_size);
|
||||
actual_end = actual_base + actual_size - 1;
|
||||
UVM_ASSERT(actual_end >= end);
|
||||
|
||||
// The invalidation size field expects log2(invalidation size in 4K), not
|
||||
// log2(invalidation size in bytes)
|
||||
log2_invalidation_size -= 12;
|
||||
|
||||
// Address to invalidate, as a multiple of 4K.
|
||||
base >>= 12;
|
||||
va_lo = base & HWMASK(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
va_hi = base >> HWSIZE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
|
||||
else
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
|
||||
pdb.address >>= 12;
|
||||
|
||||
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
|
||||
// PDE3 is the highest level on Pascal-Turing, see the comment in
|
||||
// uvm_pascal_mmu.c for details.
|
||||
UVM_ASSERT_MSG(depth < NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3, "depth %u", depth);
|
||||
page_table_level = NVC46F_MEM_OP_C_TLB_INVALIDATE_PAGE_TABLE_LEVEL_UP_TO_PDE3 - depth;
|
||||
|
||||
if (membar != UVM_MEMBAR_NONE) {
|
||||
// If a GPU or SYS membar is needed, ACK_TYPE needs to be set to
|
||||
// GLOBALLY to make sure all the pending accesses can be picked up by
|
||||
// the membar.
|
||||
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (membar == UVM_MEMBAR_SYS)
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_INVALIDATION_SIZE, log2_invalidation_size) |
|
||||
sysmembar_value |
|
||||
HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C46F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (membar == UVM_MEMBAR_GPU)
|
||||
gpu->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
||||
void uvm_hal_turing_host_tlb_invalidate_test(uvm_push_t *push,
|
||||
uvm_gpu_phys_address_t pdb,
|
||||
UVM_TEST_INVALIDATE_TLB_PARAMS *params)
|
||||
{
|
||||
NvU32 ack_value = 0;
|
||||
NvU32 sysmembar_value = 0;
|
||||
NvU32 invalidate_gpc_value = 0;
|
||||
NvU32 aperture_value = 0;
|
||||
NvU32 pdb_lo = 0;
|
||||
NvU32 pdb_hi = 0;
|
||||
NvU32 page_table_level = 0;
|
||||
|
||||
UVM_ASSERT_MSG(pdb.aperture == UVM_APERTURE_VID || pdb.aperture == UVM_APERTURE_SYS, "aperture: %u", pdb.aperture);
|
||||
if (pdb.aperture == UVM_APERTURE_VID)
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, VID_MEM);
|
||||
else
|
||||
aperture_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_APERTURE, SYS_MEM_COHERENT);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(pdb.address, 1 << 12), "pdb 0x%llx\n", pdb.address);
|
||||
pdb.address >>= 12;
|
||||
|
||||
pdb_lo = pdb.address & HWMASK(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
pdb_hi = pdb.address >> HWSIZE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO);
|
||||
|
||||
if (params->page_table_level != UvmInvalidatePageTableLevelAll) {
|
||||
// PDE3 is the highest level on Pascal-Turing, see the comment in
|
||||
// uvm_pascal_mmu.c for details.
|
||||
page_table_level = min((NvU32)UvmInvalidatePageTableLevelPde3, params->page_table_level) - 1;
|
||||
}
|
||||
|
||||
if (params->membar != UvmInvalidateTlbMemBarNone) {
|
||||
// If a GPU or SYS membar is needed, ack_value needs to be set to
|
||||
// GLOBALLY to make sure all the pending accesses can be picked up by
|
||||
// the membar.
|
||||
ack_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_ACK_TYPE, GLOBALLY);
|
||||
}
|
||||
|
||||
if (params->membar == UvmInvalidateTlbMemBarSys)
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, EN);
|
||||
else
|
||||
sysmembar_value = HWCONST(C46F, MEM_OP_A, TLB_INVALIDATE_SYSMEMBAR, DIS);
|
||||
|
||||
if (params->disable_gpc_invalidate)
|
||||
invalidate_gpc_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, DISABLE);
|
||||
else
|
||||
invalidate_gpc_value = HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_GPC, ENABLE);
|
||||
|
||||
if (params->target_va_mode == UvmTargetVaModeTargeted) {
|
||||
NvU64 va = params->va >> 12;
|
||||
|
||||
NvU32 va_lo = va & HWMASK(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
NvU32 va_hi = va >> HWSIZE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO);
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value |
|
||||
HWVALUE(C46F, MEM_OP_A, TLB_INVALIDATE_TARGET_ADDR_LO, va_lo),
|
||||
MEM_OP_B, HWVALUE(C46F, MEM_OP_B, TLB_INVALIDATE_TARGET_ADDR_HI, va_hi),
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
invalidate_gpc_value |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE_TARGETED) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
else {
|
||||
NV_PUSH_4U(C46F, MEM_OP_A, sysmembar_value,
|
||||
MEM_OP_B, 0,
|
||||
MEM_OP_C, HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_REPLAY, NONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PAGE_TABLE_LEVEL, page_table_level) |
|
||||
HWCONST(C46F, MEM_OP_C, TLB_INVALIDATE_PDB, ONE) |
|
||||
HWVALUE(C46F, MEM_OP_C, TLB_INVALIDATE_PDB_ADDR_LO, pdb_lo) |
|
||||
invalidate_gpc_value |
|
||||
aperture_value |
|
||||
ack_value,
|
||||
MEM_OP_D, HWCONST(C46F, MEM_OP_D, OPERATION, MMU_TLB_INVALIDATE) |
|
||||
HWVALUE(C46F, MEM_OP_D, TLB_INVALIDATE_PDB_ADDR_HI, pdb_hi));
|
||||
}
|
||||
|
||||
// GPU membar still requires an explicit membar method.
|
||||
if (params->membar == UvmInvalidateTlbMemBarLocal)
|
||||
uvm_push_get_gpu(push)->parent->host_hal->membar_gpu(push);
|
||||
}
|
||||
|
@@ -138,7 +138,7 @@ static NvU64 poisoned_pte_turing(void)

static uvm_mmu_mode_hal_t turing_mmu_mode_hal;

uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU32 big_page_size)
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_turing(NvU64 big_page_size)
{
    static bool initialized = false;

@@ -1,5 +1,5 @@
/*******************************************************************************
    Copyright (c) 2013-2023 NVidia Corporation
    Copyright (c) 2013-2024 NVidia Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
@@ -52,19 +52,18 @@ typedef enum

typedef unsigned long long UvmStream;

// The maximum number of sub-processors per parent GPU.
#define UVM_PARENT_ID_MAX_SUB_PROCESSORS 8

// The maximum number of GPUs changed when multiple MIG instances per
// uvm_parent_gpu_t were added. See UvmEventQueueCreate().
// uvm_parent_gpu_t were added. The old version is kept as a convenience
// for code that needs to maintain forward compatibility.
#define UVM_MAX_GPUS_V1       NV_MAX_DEVICES
#define UVM_MAX_PROCESSORS_V1 (UVM_MAX_GPUS_V1 + 1)
#define UVM_MAX_GPUS_V2       (NV_MAX_DEVICES * NV_MAX_SUBDEVICES)
#define UVM_MAX_PROCESSORS_V2 (UVM_MAX_GPUS_V2 + 1)
#define UVM_MAX_GPUS          (NV_MAX_DEVICES * UVM_PARENT_ID_MAX_SUB_PROCESSORS)
#define UVM_MAX_PROCESSORS    (UVM_MAX_GPUS + 1)

// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
#define UVM_MAX_GPUS       UVM_MAX_GPUS_V1
#define UVM_MAX_PROCESSORS UVM_MAX_PROCESSORS_V1

#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS_V2 + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))
#define UVM_PROCESSOR_MASK_SIZE ((UVM_MAX_PROCESSORS + (sizeof(NvU64) * 8) - 1) / (sizeof(NvU64) * 8))

#define UVM_INIT_FLAGS_DISABLE_HMM                  ((NvU64)0x1)
#define UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE   ((NvU64)0x2)
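UVM_PROCESSOR_MASK_SIZE above is a ceiling division of the processor count into 64-bit words. The arithmetic below assumes, purely for illustration, that NV_MAX_DEVICES is 32; combined with the 8 sub-processors defined in this hunk that gives 257 processors and a 5-word mask. Check the headers for the authoritative values.

#include <stdio.h>

#define ILLUSTRATIVE_MAX_DEVICES     32   /* assumed value, for the worked example only */
#define ILLUSTRATIVE_SUB_PROCESSORS  8
#define ILLUSTRATIVE_MAX_GPUS        (ILLUSTRATIVE_MAX_DEVICES * ILLUSTRATIVE_SUB_PROCESSORS)
#define ILLUSTRATIVE_MAX_PROCESSORS  (ILLUSTRATIVE_MAX_GPUS + 1)   /* all GPUs plus the CPU */

/* Ceiling division into 64-bit bitmap words, mirroring the macro above. */
#define ILLUSTRATIVE_MASK_SIZE \
    ((ILLUSTRATIVE_MAX_PROCESSORS + (sizeof(unsigned long long) * 8) - 1) / \
     (sizeof(unsigned long long) * 8))

int main(void)
{
    /* 256 GPUs + 1 CPU = 257 processors -> ceil(257 / 64) = 5 words. */
    printf("processors=%d words=%zu\n", ILLUSTRATIVE_MAX_PROCESSORS, ILLUSTRATIVE_MASK_SIZE);
    return 0;
}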
@@ -423,29 +422,7 @@ typedef struct
    NvU32 pid;               // process id causing the fault
    NvU32 threadId;          // thread id causing the fault
    NvU64 pc;                // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V1;

typedef struct
{
    //
    // eventType has to be 1st argument of this structure. Setting eventType to
    // UvmEventTypeMemoryViolation helps to identify event data in a queue.
    //
    NvU8 eventType;
    NvU8 accessType;         // read/write violation (UvmEventMemoryAccessType)
    //
    // This structure is shared between UVM kernel and tools.
    // Manually padding the structure so that compiler options like pragma pack
    // or malign-double will have no effect on the field offsets.
    //
    NvU16 padding16Bits;
    NvS32 nid;               // NUMA node ID of faulting CPU
    NvU64 address;           // faulting address
    NvU64 timeStamp;         // cpu time when the fault occurred
    NvU32 pid;               // process id causing the fault
    NvU32 threadId;          // thread id causing the fault
    NvU64 pc;                // address of the instruction causing the fault
} UvmEventCpuFaultInfo_V2;
} UvmEventCpuFaultInfo;

typedef enum
{
@@ -721,13 +698,7 @@ typedef struct
    //
    NvU8 eventType;
    NvU8 faultType;          // type of gpu fault, refer UvmEventFaultType
    NvU8 accessType;         // memory access type, refer UvmEventMemoryAccessType
    //
    // This structure is shared between UVM kernel and tools.
    // Manually padding the structure so that compiler options like pragma pack
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8Bits_1;
    NvU16 gpuIndex;          // GPU that experienced the fault
    union
    {
        NvU16 gpcId;         // If this is a replayable fault, this field contains
@@ -759,14 +730,13 @@ typedef struct
                             // UvmEventFaultClientTypeGpc indicates replayable
                             // fault, while UvmEventFaultClientTypeHub indicates
                             // non-replayable fault.

    NvU8 accessType;         // memory access type, refer UvmEventMemoryAccessType
    //
    // This structure is shared between UVM kernel and tools.
    // Manually padding the structure so that compiler options like pragma pack
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8Bits_2;
    NvU16 gpuIndex;          // GPU that experienced the fault
    NvU16 padding16bits;
} UvmEventGpuFaultInfo_V2;

//------------------------------------------------------------------------------
@@ -1108,8 +1078,8 @@ typedef struct
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8bits;
    NvU16 padding16bits[2];
    NvU16 processorIndex;    // index of the cpu/gpu that was throttled
    NvU32 padding32bits;
    NvU64 address;           // address of the page whose servicing is being
                             // throttled
    NvU64 timeStamp;         // cpu start time stamp for the throttling operation
@@ -1150,8 +1120,8 @@ typedef struct
    // or malign-double will have no effect on the field offsets
    //
    NvU8 padding8bits;
    NvU16 padding16bits[2];
    NvU16 processorIndex;    // index of the cpu/gpu that was throttled
    NvU32 padding32bits;
    NvU64 address;           // address of the page whose servicing is being
                             // throttled
    NvU64 timeStamp;         // cpu end time stamp for the throttling operation
@@ -1409,7 +1379,7 @@ typedef struct
    NvU8 eventType;
    UvmEventMigrationInfo_Lite migration_Lite;

    UvmEventCpuFaultInfo_V1 cpuFault;
    UvmEventCpuFaultInfo cpuFault;
    UvmEventMigrationInfo_V1 migration;
    UvmEventGpuFaultInfo_V1 gpuFault;
    UvmEventGpuFaultReplayInfo_V1 gpuFaultReplay;
@@ -1443,7 +1413,7 @@ typedef struct
    NvU8 eventType;
    UvmEventMigrationInfo_Lite migration_Lite;

    UvmEventCpuFaultInfo_V2 cpuFault;
    UvmEventCpuFaultInfo cpuFault;
    UvmEventMigrationInfo_V2 migration;
    UvmEventGpuFaultInfo_V2 gpuFault;
    UvmEventGpuFaultReplayInfo_V2 gpuFaultReplay;
@@ -1510,19 +1480,7 @@ typedef enum {
    UvmToolsEventQueueVersion_V2 = 2,
} UvmToolsEventQueueVersion;

typedef struct UvmEventControlData_V1_tag {
    // entries between get_ahead and get_behind are currently being read
    volatile NvU32 get_ahead;
    volatile NvU32 get_behind;
    // entries between put_ahead and put_behind are currently being written
    volatile NvU32 put_ahead;
    volatile NvU32 put_behind;

    // counter of dropped events
    NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V1;

typedef struct UvmEventControlData_V2_tag {
typedef struct UvmEventControlData_tag {
    // entries between get_ahead and get_behind are currently being read
    volatile NvU32 get_ahead;
    volatile NvU32 get_behind;
@@ -1531,19 +1489,12 @@ typedef struct UvmEventControlData_V2_tag {
    volatile NvU32 put_ahead;
    volatile NvU32 put_behind;

    // The version values are limited to UvmToolsEventQueueVersion and
    // initialized by UvmToolsCreateEventQueue().
    NvU32 version;
    NvU32 padding32Bits;

    // counter of dropped events
    NvU64 dropped[UvmEventNumTypesAll];
} UvmToolsEventControlData_V2;
} UvmToolsEventControlData;

// For backward compatibility:
// TODO: Bug 4465348: remove these after replacing old references.
typedef UvmToolsEventControlData_V1 UvmToolsEventControlData;
typedef UvmEventEntry_V1 UvmEventEntry;
// TODO: Bug 4465348: remove this after replacing old references.
typedef UvmToolsEventControlData UvmToolsEventControlData_V1;

//------------------------------------------------------------------------------
// UVM Tools forward types (handles) definitions
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -725,9 +725,8 @@ bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, u
|
||||
}
|
||||
|
||||
// Return the preferred NUMA node ID for the block's policy.
|
||||
// If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
|
||||
// with memory is returned. In most cases, this should be the current
|
||||
// NUMA node.
|
||||
// If the preferred node ID is NUMA_NO_NODE, the current NUMA node ID
|
||||
// is returned.
|
||||
static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
|
||||
{
|
||||
if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
|
||||
@ -1329,12 +1328,12 @@ error_block_free:
|
||||
|
||||
static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
|
||||
{
|
||||
NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
if (gpu_mapping_addr == 0)
|
||||
return;
|
||||
|
||||
uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
|
||||
uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
}
|
||||
|
||||
static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
|
||||
@ -1357,17 +1356,14 @@ static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
|
||||
|
||||
chunk_size = uvm_cpu_chunk_get_size(chunk);
|
||||
|
||||
// TODO: Bug 3744779: Handle benign assertion in
|
||||
// pmm_sysmem_mappings_remove_gpu_mapping() in case of a
|
||||
// failure.
|
||||
status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent),
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
|
||||
uvm_va_block_cpu_page_address(block, page_index),
|
||||
chunk_size,
|
||||
block,
|
||||
UVM_ID_CPU);
|
||||
if (status != NV_OK)
|
||||
cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
|
||||
uvm_cpu_chunk_unmap_gpu(chunk, gpu);
|
||||
|
||||
return status;
|
||||
}
|
||||
@ -1396,10 +1392,10 @@ static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu
|
||||
|
||||
for_each_possible_uvm_node(nid) {
|
||||
for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
|
||||
UVM_ASSERT_MSG(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0,
|
||||
UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu) == 0,
|
||||
"GPU%u DMA address 0x%llx\n",
|
||||
uvm_id_value(gpu->id),
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent));
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu));
|
||||
|
||||
status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
|
||||
if (status != NV_OK)
|
||||
@ -1562,8 +1558,7 @@ NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
|
||||
}
|
||||
|
||||
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
|
||||
uvm_cpu_chunk_t *chunk,
|
||||
uvm_page_index_t page_index)
|
||||
uvm_cpu_chunk_t *chunk)
|
||||
{
|
||||
uvm_gpu_id_t id;
|
||||
|
||||
@ -1602,7 +1597,7 @@ NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
|
||||
return NV_OK;
|
||||
|
||||
error:
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk);
|
||||
return status;
|
||||
}
|
||||
|
||||
@ -1621,7 +1616,7 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
|
||||
uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
|
||||
uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
|
||||
uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
|
||||
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk);
|
||||
uvm_cpu_chunk_free(chunk);
|
||||
}
|
||||
}
|
||||
@ -2071,7 +2066,6 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
|
||||
uvm_page_mask_t *allocated_mask;
|
||||
uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
|
||||
const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
|
||||
uvm_page_index_t page_index;
|
||||
uvm_gpu_id_t id;
|
||||
int preferred_nid = block_context->make_resident.dest_nid;
|
||||
@ -2079,10 +2073,6 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
|
||||
if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
|
||||
preferred_nid = block_test->cpu_chunk_allocation_target_id;
|
||||
|
||||
// If the VA range has a preferred NUMA node, use it.
|
||||
if (preferred_nid == NUMA_NO_NODE)
|
||||
preferred_nid = policy->preferred_nid;
|
||||
|
||||
// TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
|
||||
if (preferred_nid != NUMA_NO_NODE) {
|
||||
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
|
||||
@ -2133,12 +2123,13 @@ static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
|
||||
uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
|
||||
uvm_chunk_sizes_mask_t allocation_sizes;
|
||||
|
||||
if (uvm_page_mask_test(allocated_mask, page_index) ||
|
||||
uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
|
||||
if (uvm_page_mask_test(allocated_mask, page_index)) {
|
||||
page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
UVM_ASSERT(!uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index));
|
||||
|
||||
allocation_sizes = block_calculate_largest_alloc_size(block,
|
||||
page_index,
|
||||
allocated_mask,
|
||||
@ -2313,7 +2304,7 @@ static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
|
||||
return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
|
||||
}
|
||||
|
||||
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||||
NvU64 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||||
{
|
||||
uvm_gpu_va_space_t *gpu_va_space;
|
||||
|
||||
@ -2321,7 +2312,7 @@ NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||||
return gpu_va_space->page_tables.big_page_size;
|
||||
}
|
||||
|
||||
static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
|
||||
static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU64 big_page_size)
|
||||
{
|
||||
NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
|
||||
NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
|
||||
@ -2335,20 +2326,20 @@ static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, N
|
||||
return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
|
||||
}
|
||||
|
||||
static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
|
||||
static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU64 big_page_size)
|
||||
{
|
||||
uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
|
||||
return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU64 big_page_size)
|
||||
{
|
||||
return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
|
||||
uvm_va_block_region_t region,
|
||||
NvU32 big_page_size)
|
||||
NvU64 big_page_size)
|
||||
{
|
||||
NvU64 start = uvm_va_block_region_start(va_block, region);
|
||||
NvU64 end = uvm_va_block_region_end(va_block, region);
|
||||
@ -2366,12 +2357,12 @@ uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_blo
|
||||
return big_region;
|
||||
}
|
||||
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU64 big_page_size)
|
||||
{
|
||||
return range_num_big_pages(va_block->start, va_block->end, big_page_size);
|
||||
}
|
||||
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size)
|
||||
{
|
||||
NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
|
||||
UVM_ASSERT(addr >= va_block->start);
|
||||
@ -2379,7 +2370,7 @@ NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index
|
||||
return addr;
|
||||
}
|
||||
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size)
|
||||
{
|
||||
NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
|
||||
|
||||
@ -2395,7 +2386,7 @@ uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, siz
|
||||
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
|
||||
// page_index cannot be covered by a big PTE due to alignment or block size,
|
||||
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU64 big_page_size)
|
||||
{
|
||||
uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
|
||||
size_t big_index;
|
||||
@ -2420,7 +2411,7 @@ static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_region_t big_region;
|
||||
size_t big_page_index;
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
|
||||
uvm_page_mask_zero(mask_out);
|
||||
|
||||
@ -2430,7 +2421,7 @@ static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
|
||||
}
|
||||
}
|
||||
|
||||
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
|
||||
NvU64 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
|
||||
{
|
||||
if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
|
||||
return 0;
|
||||
@ -2444,7 +2435,7 @@ NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page
|
||||
return PAGE_SIZE;
|
||||
}
|
||||
|
||||
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
|
||||
NvU64 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
|
||||
size_t big_page_size, big_page_index;
|
||||
@ -2472,7 +2463,7 @@ NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id,
|
||||
// resident. Note that this is different from uvm_va_block_page_size_* because
|
||||
// those return the size of the PTE which maps the page index, which may be
|
||||
// smaller than the physical allocation.
|
||||
static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
static NvU64 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state;
|
||||
uvm_chunk_size_t chunk_size;
|
||||
@ -2485,7 +2476,7 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
return 0;
|
||||
|
||||
UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
|
||||
return (NvU32)uvm_cpu_chunk_get_size(chunk);
|
||||
return uvm_cpu_chunk_get_size(chunk);
|
||||
}
|
||||
|
||||
gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
|
||||
@ -2494,10 +2485,10 @@ static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
|
||||
|
||||
UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
|
||||
block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
|
||||
return (NvU32)chunk_size;
|
||||
return chunk_size;
|
||||
}
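To make the distinction above concrete, a hypothetical caller could compare the two queries for the same page. The sizes in the comment are invented; the only expectation is that, when the page is mapped, the PTE size does not exceed the size of the backing allocation:

    /* Hypothetical check (sizes invented): a page backed by a 2 MiB GPU chunk
     * may still be mapped with only a 64 KiB big PTE, so the two values can
     * legitimately differ. */
    NvU64 phys_size = uvm_va_block_get_physical_size(block, gpu->id, page_index);
    NvU64 pte_size  = uvm_va_block_page_size_gpu(block, gpu->id, page_index);
    UVM_ASSERT(pte_size == 0 || pte_size <= phys_size);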
|
||||
|
||||
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
NvU64 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
uvm_processor_id_t processor,
|
||||
uvm_page_index_t page_index)
|
||||
{
|
||||
@ -3349,7 +3340,7 @@ static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
|
||||
|
||||
if (UVM_ID_IS_CPU(block_page.processor)) {
|
||||
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.nid, block_page.page_index);
|
||||
NvU64 dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
|
||||
uvm_cpu_chunk_get_size(chunk),
|
||||
block_page.page_index);
|
||||
@ -3848,7 +3839,6 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
|
||||
uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
uvm_gpu_address_t src_address = block_copy_get_address(block, ©_state->src, page_index, gpu);
|
||||
NvU32 key_version = uvm_channel_pool_key_version(push->channel->pool);
|
||||
|
||||
UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
|
||||
UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
|
||||
@ -3866,8 +3856,7 @@ static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
|
||||
// crypto-operations and it only guarantees PAGE_SIZE contiguity, all
|
||||
// encryptions and decryptions must happen on a PAGE_SIZE basis.
|
||||
for_each_va_block_page_in_region(page_index, region) {
|
||||
uvm_conf_computing_log_gpu_encryption(push->channel, PAGE_SIZE, &dma_buffer->decrypt_iv[page_index]);
|
||||
dma_buffer->key_version[page_index] = key_version;
|
||||
uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
|
||||
|
||||
// All but the first encryption can be pipelined. The first encryption
|
||||
// uses the caller's pipelining settings.
|
||||
@ -3926,8 +3915,7 @@ static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
|
||||
status = uvm_conf_computing_cpu_decrypt(push->channel,
|
||||
cpu_page_address,
|
||||
staging_buffer,
|
||||
dma_buffer->decrypt_iv + page_index,
|
||||
dma_buffer->key_version[page_index],
|
||||
&dma_buffer->decrypt_iv[page_index],
|
||||
PAGE_SIZE,
|
||||
auth_tag_buffer);
|
||||
kunmap(dst_page);
|
||||
@ -4045,7 +4033,7 @@ static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,
|
||||
|
||||
UVM_ASSERT(dst_chunk);
|
||||
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
|
||||
UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));
|
||||
UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) <= uvm_cpu_chunk_get_size(dst_chunk));
|
||||
|
||||
// CPU-to-CPU copies using memcpy() don't have any inherent ordering with
|
||||
// copies using GPU CEs. So, we have to make sure that all previously
|
||||
@ -5140,7 +5128,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
uvm_page_mask_t *dst_resident_mask;
|
||||
uvm_page_mask_t *migrated_pages;
|
||||
uvm_page_mask_t *staged_pages;
|
||||
uvm_page_mask_t *scratch_residency_mask;
|
||||
uvm_page_mask_t *first_touch_mask;
|
||||
|
||||
// TODO: Bug 3660922: need to implement HMM read duplication support.
|
||||
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
|
||||
@ -5159,10 +5147,6 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
uvm_assert_mutex_locked(&va_block->lock);
|
||||
UVM_ASSERT(!uvm_va_block_is_dead(va_block));
|
||||
|
||||
scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
|
||||
if (!scratch_residency_mask)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
// For pages that are entering read-duplication we need to unmap remote
|
||||
// mappings and revoke RW and higher access permissions.
|
||||
//
|
||||
@ -5189,12 +5173,12 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
|
||||
status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
return status;
|
||||
}
|
||||
|
||||
status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
return status;
|
||||
|
||||
status = block_copy_resident_pages(va_block,
|
||||
va_block_context,
|
||||
@ -5204,17 +5188,22 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
prefetch_page_mask,
|
||||
UVM_VA_BLOCK_TRANSFER_MODE_COPY);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
return status;
|
||||
|
||||
// Pages that weren't resident anywhere else were populated at the
|
||||
// destination directly. Mark them as resident now, since there were no
|
||||
// errors from block_copy_resident_pages() above.
|
||||
// Note that va_block_context->scratch_page_mask is passed to
|
||||
// block_copy_set_first_touch_residency() which is generally unsafe but in
|
||||
// this case, block_copy_set_first_touch_residency() copies page_mask
|
||||
// before scratch_page_mask could be clobbered.
|
||||
migrated_pages = &va_block_context->make_resident.pages_migrated;
|
||||
uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
|
||||
uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);
|
||||
first_touch_mask = &va_block_context->scratch_page_mask;
|
||||
uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
|
||||
uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
|
||||
|
||||
if (!uvm_page_mask_empty(scratch_residency_mask))
|
||||
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);
|
||||
if (!uvm_page_mask_empty(first_touch_mask))
|
||||
block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
|
||||
|
||||
staged_pages = &va_block_context->make_resident.pages_staged;
|
||||
if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
|
||||
@ -5226,18 +5215,6 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
|
||||
if (!uvm_page_mask_empty(migrated_pages)) {
|
||||
if (UVM_ID_IS_CPU(dest_id)) {
|
||||
// Check if the CPU is already in the resident set of processors.
|
||||
// We need to do this since we can't have multiple NUMA nodes with
|
||||
// resident pages.
|
||||
// If any of the migrated pages were already resident on the CPU, the
|
||||
// residency has to be switched to the destination NUMA node.
|
||||
if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
|
||||
uvm_page_mask_and(scratch_residency_mask,
|
||||
uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
|
||||
migrated_pages)) {
|
||||
uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
|
||||
}
|
||||
|
||||
uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
|
||||
}
|
||||
else {
|
||||
@ -5266,9 +5243,7 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||||
// Check state of all chunks after residency change.
|
||||
// TODO: Bug 4207783: Check both CPU and GPU chunks.
|
||||
UVM_ASSERT(block_check_cpu_chunks(va_block));
|
||||
out:
|
||||
kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
|
||||
return status;
|
||||
return NV_OK;
|
||||
}
|
||||
|
||||
// Looks up the current CPU mapping state of page from the
|
||||
@ -5408,7 +5383,7 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
|
||||
|
||||
if (chunk) {
|
||||
if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
|
||||
UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
|
||||
UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %lu\n",
|
||||
chunk_size,
|
||||
uvm_gpu_chunk_get_size(chunk),
|
||||
block->start,
|
||||
@ -5420,7 +5395,7 @@ static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
|
||||
}
|
||||
|
||||
if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
|
||||
UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
|
||||
UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %lu chunk_size: llu\n",
|
||||
uvm_pmm_gpu_chunk_state_string(chunk->state),
|
||||
block->start,
|
||||
block->end + 1,
|
||||
@ -5553,15 +5528,13 @@ static bool block_check_mappings_page(uvm_va_block_t *block,
|
||||
*block->read_duplicated_pages.bitmap);
|
||||
|
||||
// Test read_duplicated_pages mask
|
||||
UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
|
||||
uvm_processor_mask_get_count(resident_processors) <= 1) ||
|
||||
(uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
|
||||
uvm_processor_mask_get_count(resident_processors) >= 1),
|
||||
UVM_ASSERT_MSG((uvm_processor_mask_get_count(resident_processors) <= 1 &&
|
||||
!uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
|
||||
(uvm_processor_mask_get_count(resident_processors) > 1 &&
|
||||
uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
|
||||
"Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
|
||||
*resident_processors->bitmap,
|
||||
*read_mappings->bitmap,
|
||||
*write_mappings->bitmap,
|
||||
*atomic_mappings->bitmap,
|
||||
*read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
|
||||
*va_space->system_wide_atomics_enabled_processors.bitmap,
|
||||
*block->read_duplicated_pages.bitmap);
|
||||
|
||||
@ -5741,7 +5714,7 @@ static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_va_block_contex
|
||||
uvm_pte_bits_gpu_t pte_bit;
|
||||
uvm_processor_id_t resident_id;
|
||||
uvm_prot_t prot;
|
||||
NvU32 big_page_size;
|
||||
NvU64 big_page_size;
|
||||
size_t num_big_pages, big_page_index;
|
||||
uvm_va_block_region_t big_region, chunk_region;
|
||||
uvm_gpu_chunk_t *chunk;
|
||||
@ -6045,7 +6018,7 @@ static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
|
||||
if (uvm_page_mask_empty(mapped_pages))
|
||||
return false;
|
||||
|
||||
return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
|
||||
return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
|
||||
}
|
||||
|
||||
// Remote pages are pages which are mapped but not resident locally
|
||||
@ -6193,7 +6166,7 @@ static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
|
||||
size_t big_page_index;
|
||||
uvm_processor_id_t curr_resident_id;
|
||||
uvm_prot_t curr_prot;
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
|
||||
|
||||
if (UVM_ID_IS_INVALID(resident_id))
|
||||
UVM_ASSERT(new_prot == UVM_PROT_NONE);
|
||||
@ -6275,7 +6248,7 @@ static void block_gpu_pte_clear_big(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
|
||||
NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
|
||||
NvU64 big_page_size = gpu_va_space->page_tables.big_page_size;
|
||||
uvm_gpu_phys_address_t pte_addr;
|
||||
NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
|
||||
size_t big_page_index;
|
||||
@ -6321,7 +6294,7 @@ static void block_gpu_pte_write_big(uvm_va_block_t *block,
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
|
||||
uvm_page_tree_t *tree = &gpu_va_space->page_tables;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
|
||||
size_t big_page_index;
|
||||
uvm_va_block_region_t contig_region = {0};
|
||||
@ -6399,7 +6372,7 @@ static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
|
||||
size_t big_page_index;
|
||||
DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
@ -6960,7 +6933,7 @@ static void block_gpu_split_big(uvm_va_block_t *block,
|
||||
uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
|
||||
uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
|
||||
uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
uvm_va_block_region_t big_region;
|
||||
uvm_processor_id_t resident_id;
|
||||
size_t big_page_index;
|
||||
@ -7062,7 +7035,7 @@ static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
|
||||
DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
uvm_va_block_region_t big_region;
|
||||
size_t big_page_index;
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
|
||||
|
||||
UVM_ASSERT(!gpu_state->pte_is_2m);
|
||||
@ -7364,7 +7337,7 @@ static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
|
||||
DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
NvU32 big_page_size = tree->big_page_size;
|
||||
NvU64 big_page_size = tree->big_page_size;
|
||||
NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
|
||||
|
||||
UVM_ASSERT(!gpu_state->pte_is_2m);
|
||||
@ -7510,7 +7483,7 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
|
||||
uvm_va_block_region_t big_region_all, big_page_region, region;
|
||||
NvU32 big_page_size;
|
||||
NvU64 big_page_size;
|
||||
uvm_page_index_t page_index;
|
||||
size_t big_page_index;
|
||||
DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
@ -7663,7 +7636,7 @@ static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
|
||||
// happens, the pending tracker is added to the block's tracker.
|
||||
static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
|
||||
uvm_gpu_t *gpu,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_page_table_range_t *page_table_range,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
@ -7786,13 +7759,13 @@ allocated:
|
||||
// sizes. See block_alloc_pt_range_with_retry.
|
||||
static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
|
||||
uvm_gpu_t *gpu,
|
||||
NvU32 page_sizes,
|
||||
NvU64 page_sizes,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
|
||||
uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
|
||||
uvm_page_table_range_t *range;
|
||||
NvU32 page_size;
|
||||
NvU64 page_size;
|
||||
NV_STATUS status, final_status = NV_OK;
|
||||
|
||||
UVM_ASSERT(gpu_state);
|
||||
@ -7844,7 +7817,7 @@ static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
|
||||
uvm_va_block_new_pte_state_t *new_pte_state,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
NvU32 page_sizes = 0;
|
||||
NvU64 page_sizes = 0;
|
||||
|
||||
if (new_pte_state->pte_is_2m) {
|
||||
page_sizes |= UVM_PAGE_SIZE_2M;
|
||||
@ -7876,8 +7849,8 @@ static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
|
||||
uvm_gpu_va_space_t *gpu_va_space,
|
||||
uvm_tracker_t *pending_tracker)
|
||||
{
|
||||
NvU32 page_sizes;
|
||||
NvU32 big_page_size;
|
||||
NvU64 page_sizes;
|
||||
NvU64 big_page_size;
|
||||
uvm_gpu_t *gpu;
|
||||
uvm_va_block_gpu_state_t *gpu_state;
|
||||
|
||||
@ -8388,7 +8361,6 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
uvm_va_block_context_t *block_context,
|
||||
uvm_gpu_t *gpu,
|
||||
uvm_processor_id_t resident_id,
|
||||
int resident_nid,
|
||||
uvm_page_mask_t *map_page_mask,
|
||||
uvm_prot_t new_prot,
|
||||
uvm_tracker_t *out_tracker)
|
||||
@ -8398,7 +8370,7 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
uvm_push_t push;
|
||||
NV_STATUS status;
|
||||
uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
|
||||
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
|
||||
const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
|
||||
uvm_pte_bits_gpu_t pte_bit;
|
||||
uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
|
||||
uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
|
||||
@ -8407,10 +8379,8 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
UVM_ASSERT(map_page_mask);
|
||||
UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
|
||||
|
||||
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
|
||||
uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
|
||||
UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
|
||||
}
|
||||
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
|
||||
UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
|
||||
|
||||
UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
|
||||
map_page_mask,
|
||||
@ -8512,27 +8482,18 @@ static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
|
||||
return uvm_tracker_add_push_safe(out_tracker, &push);
|
||||
}
|
||||
|
||||
// allowed_nid_mask is only valid if the CPU is set in allowed_mask.
|
||||
static void map_get_allowed_destinations(uvm_va_block_t *block,
|
||||
uvm_va_block_context_t *va_block_context,
|
||||
const uvm_va_policy_t *policy,
|
||||
uvm_processor_id_t id,
|
||||
uvm_processor_mask_t *allowed_mask,
|
||||
nodemask_t *allowed_nid_mask)
|
||||
uvm_processor_mask_t *allowed_mask)
|
||||
{
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
|
||||
|
||||
*allowed_nid_mask = node_possible_map;
|
||||
|
||||
if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
|
||||
// UVM-Lite can only map resident pages on the preferred location
|
||||
uvm_processor_mask_zero(allowed_mask);
|
||||
uvm_processor_mask_set(allowed_mask, policy->preferred_location);
|
||||
if (UVM_ID_IS_CPU(policy->preferred_location) &&
|
||||
!uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
|
||||
nodes_clear(*allowed_nid_mask);
|
||||
node_set(policy->preferred_nid, *allowed_nid_mask);
|
||||
}
|
||||
}
|
||||
else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
|
||||
(uvm_id_equal(policy->preferred_location, id) &&
|
||||
@ -8575,7 +8536,6 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
|
||||
NV_STATUS status = NV_OK;
|
||||
const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
|
||||
nodemask_t *allowed_nid_destinations;
|
||||
|
||||
va_block_context->mapping.cause = cause;
|
||||
|
||||
@ -8625,20 +8585,10 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
if (!allowed_destinations)
|
||||
return NV_ERR_NO_MEMORY;
|
||||
|
||||
allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
|
||||
if (!allowed_nid_destinations) {
|
||||
uvm_processor_mask_cache_free(allowed_destinations);
|
||||
return NV_ERR_NO_MEMORY;
|
||||
}
|
||||
|
||||
// Map per resident location so we can more easily detect physically-
|
||||
// contiguous mappings.
|
||||
map_get_allowed_destinations(va_block,
|
||||
va_block_context,
|
||||
policy,
|
||||
id,
|
||||
allowed_destinations,
|
||||
allowed_nid_destinations);
|
||||
map_get_allowed_destinations(va_block, va_block_context, policy, id, allowed_destinations);
|
||||
|
||||
for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
|
||||
if (UVM_ID_IS_CPU(id)) {
|
||||
status = block_map_cpu_to(va_block,
|
||||
@ -8649,30 +8599,11 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
new_prot,
|
||||
out_tracker);
|
||||
}
|
||||
else if (UVM_ID_IS_CPU(resident_id)) {
|
||||
int nid;
|
||||
|
||||
// map_get_allowed_destinations() will set the mask of CPU NUMA
|
||||
// nodes that should be mapped.
|
||||
for_each_node_mask(nid, *allowed_nid_destinations) {
|
||||
status = block_map_gpu_to(va_block,
|
||||
va_block_context,
|
||||
gpu,
|
||||
resident_id,
|
||||
nid,
|
||||
running_page_mask,
|
||||
new_prot,
|
||||
out_tracker);
|
||||
if (status != NV_OK)
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
status = block_map_gpu_to(va_block,
|
||||
va_block_context,
|
||||
gpu,
|
||||
resident_id,
|
||||
NUMA_NO_NODE,
|
||||
running_page_mask,
|
||||
new_prot,
|
||||
out_tracker);
|
||||
@ -8687,7 +8618,6 @@ NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||||
}
|
||||
|
||||
uvm_processor_mask_cache_free(allowed_destinations);
|
||||
uvm_kvfree(allowed_nid_destinations);
|
||||
|
||||
return status;
|
||||
}
|
||||
@ -9575,7 +9505,6 @@ static void block_kill(uvm_va_block_t *block)
|
||||
// Free CPU pages
|
||||
for_each_possible_uvm_node(nid) {
|
||||
uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
|
||||
size_t index = node_to_index(nid);
|
||||
|
||||
for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block, nid) {
|
||||
// be conservative.
|
||||
@ -9590,9 +9519,20 @@ static void block_kill(uvm_va_block_t *block)
|
||||
|
||||
UVM_ASSERT(uvm_page_mask_empty(&node_state->allocated));
|
||||
UVM_ASSERT(node_state->chunks == 0);
|
||||
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
|
||||
}
|
||||
|
||||
// While a per-NUMA node_state array is in use, all of its elements are
|
||||
// expected to be valid. Therefore the teardown of these elements must occur
|
||||
// as a single "transaction". This teardown must take place after freeing
|
||||
// the CPU pages (see the "Free CPU pages" loop above). This is because as
|
||||
// part of removing chunks from VA blocks, the per-page allocated bitmap is
|
||||
// recomputed using the per-NUMA node_state array elements.
|
||||
for_each_possible_uvm_node(nid) {
|
||||
uvm_va_block_cpu_node_state_t *node_state;
|
||||
|
||||
node_state = block_node_state_get(block, nid);
|
||||
kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, node_state);
|
||||
}
|
||||
uvm_kvfree((void *)block->cpu.node_state);
|
||||
block->cpu.node_state = NULL;
|
||||
|
||||
@ -9708,8 +9648,8 @@ static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_
|
||||
uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
|
||||
uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
|
||||
uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
NvU32 alloc_sizes;
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
NvU64 alloc_sizes;
|
||||
DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||||
uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
|
||||
size_t big_page_index;
|
||||
@ -10052,7 +9992,7 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
|
||||
gpu = block_get_gpu(block, id);
|
||||
|
||||
// If the parent chunk has not been mapped, there is nothing to split.
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
if (gpu_mapping_addr == 0)
|
||||
continue;
|
||||
|
||||
@ -10074,7 +10014,7 @@ static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index
|
||||
merge:
|
||||
for_each_gpu_id_in_mask(id, gpu_split_mask) {
|
||||
gpu = block_get_gpu(block, id);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
|
||||
gpu_mapping_addr,
|
||||
chunk_size);
|
||||
@ -10260,7 +10200,7 @@ static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t p
|
||||
continue;
|
||||
|
||||
gpu = block_get_gpu(block, id);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
|
||||
gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu);
|
||||
if (gpu_mapping_addr == 0)
|
||||
continue;
|
||||
|
||||
@ -10712,8 +10652,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
|
||||
for_each_possible_uvm_node(nid) {
|
||||
for_each_cpu_chunk_in_block(cpu_chunk, page_index, new, nid) {
|
||||
uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(cpu_chunk,
|
||||
gpu->parent),
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu),
|
||||
new);
|
||||
}
|
||||
}
|
||||
@ -10751,7 +10690,7 @@ static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_g
|
||||
gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
|
||||
if (gpu_va_space) {
|
||||
if (existing_gpu_state->page_table_range_big.table) {
|
||||
NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
NvU64 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
|
||||
|
||||
// existing's end has not been adjusted yet
|
||||
existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
|
||||
@ -11241,8 +11180,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
|
||||
// so uvm_va_block_map will be a no-op.
|
||||
uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
|
||||
if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
|
||||
uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
|
||||
for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
|
||||
uvm_id_equal(new_residency, preferred_location)) {
|
||||
for_each_id_in_mask(map_processor_id, map_uvm_lite_gpus) {
|
||||
status = uvm_va_block_map(va_block,
|
||||
va_block_context,
|
||||
map_processor_id,
|
||||
@ -11703,10 +11642,6 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
|
||||
// For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
|
||||
// Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
|
||||
// ID of the faulting processor.
|
||||
// Note that numa_mem_id() returns the nearest node with memory. In most
|
||||
// cases, this will be the current NUMA node. However, in the case that the
|
||||
// current node does not have any memory, we probably want the nearest node
|
||||
// with memory, anyway.
|
||||
int current_nid = numa_mem_id();
|
||||
bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
|
||||
|
||||
@ -11730,12 +11665,7 @@ static int block_select_node_residency(uvm_va_block_t *va_block,
|
||||
// If read duplication is enabled and the page is also resident on the CPU,
|
||||
// keep its current NUMA node residency.
|
||||
if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
// The new_residency processor is the CPU and the preferred location is not
|
||||
// the CPU. If the page is resident on the CPU, keep its current residency.
|
||||
if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
|
||||
return NUMA_NO_NODE;
|
||||
return block_get_page_node_residency(va_block, page_index);
|
||||
|
||||
return current_nid;
|
||||
}
|
||||
@ -12639,6 +12569,125 @@ NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
|
||||
return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
|
||||
}
|
||||
|
||||
// Launch a synchronous, encrypted copy between GPU and CPU.
|
||||
//
|
||||
// The copy entails a GPU-side encryption (relying on the Copy Engine), and a
|
||||
// CPU-side decryption step, such that the destination CPU buffer pointed to by
|
||||
// dst_plain will contain the unencrypted (plain text) contents. The destination
|
||||
// buffer can be in protected or unprotected sysmem, while the source buffer
|
||||
// must be in protected vidmem.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
|
||||
void *dst_plain,
|
||||
uvm_gpu_address_t src_gpu_address,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
UvmCslIv decrypt_iv;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
|
||||
void *src_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
|
||||
|
||||
dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
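As a usage illustration only (the wrapper name and the push description string are invented, and the source address is assumed to point at protected vidmem), a caller might pull a single page back into an unprotected CPU buffer like this:

static NV_STATUS debug_read_protected_page(uvm_gpu_t *gpu,
                                           uvm_gpu_address_t src_gpu_address,
                                           void *dst_plain)
{
    /* PAGE_SIZE is well below UVM_CONF_COMPUTING_DMA_BUFFER_SIZE, and a NULL
     * tracker means the push has no prior work to acquire. */
    return encrypted_memcopy_gpu_to_cpu(gpu,
                                        dst_plain,
                                        src_gpu_address,
                                        PAGE_SIZE,
                                        NULL,
                                        "Debug read of protected vidmem page");
}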
|
||||
|
||||
// Launch a synchronous, encrypted copy between CPU and GPU.
|
||||
//
|
||||
// The source CPU buffer pointed to by src_plain contains the unencrypted (plain
|
||||
// text) contents; the function internally performs a CPU-side encryption step
|
||||
// before launching the GPU-side CE decryption. The source buffer can be in
|
||||
// protected or unprotected sysmem, while the destination buffer must be in
|
||||
// protected vidmem.
|
||||
//
|
||||
// The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
|
||||
//
|
||||
// The input tracker, if not NULL, is internally acquired by the push
|
||||
// responsible for the encrypted copy.
|
||||
__attribute__ ((format(printf, 6, 7)))
|
||||
static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
void *src_plain,
|
||||
size_t size,
|
||||
uvm_tracker_t *tracker,
|
||||
const char *format,
|
||||
...)
|
||||
{
|
||||
NV_STATUS status;
|
||||
uvm_push_t push;
|
||||
uvm_conf_computing_dma_buffer_t *dma_buffer;
|
||||
uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
|
||||
void *dst_cipher, *auth_tag;
|
||||
va_list args;
|
||||
|
||||
UVM_ASSERT(g_uvm_global.conf_computing_enabled);
|
||||
UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
|
||||
|
||||
status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
|
||||
if (status != NV_OK)
|
||||
return status;
|
||||
|
||||
va_start(args, format);
|
||||
status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (status != NV_OK)
|
||||
goto out;
|
||||
|
||||
dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
|
||||
auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
|
||||
uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
|
||||
|
||||
src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
|
||||
auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
|
||||
gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
|
||||
|
||||
status = uvm_push_end_and_wait(&push);
|
||||
|
||||
out:
|
||||
uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
|
||||
return status;
|
||||
}
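The CPU-to-GPU direction is used symmetrically; a hedged sketch (wrapper name invented) that stages a small plain-text sysmem buffer into protected vidmem:

static NV_STATUS debug_write_protected_vidmem(uvm_gpu_t *gpu,
                                              uvm_gpu_address_t dst_gpu_address,
                                              void *src_plain,
                                              size_t size)
{
    /* The helper performs the CPU-side encryption and the GPU-side CE
     * decryption internally; the caller only bounds the copy size. */
    UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
    return encrypted_memcopy_cpu_to_gpu(gpu,
                                        dst_gpu_address,
                                        src_plain,
                                        size,
                                        NULL,
                                        "Debug write to protected vidmem");
}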
|
||||
|
||||
static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
|
||||
uvm_gpu_t *gpu,
|
||||
uvm_gpu_address_t dst_gpu_address,
|
||||
@ -12651,7 +12700,7 @@ static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
|
||||
uvm_gpu_address_t src_gpu_address;
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
return uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
|
||||
return encrypted_memcopy_cpu_to_gpu(gpu,
|
||||
dst_gpu_address,
|
||||
uvm_mem_get_cpu_addr_kernel(src_mem),
|
||||
size,
|
||||
@ -12755,7 +12804,7 @@ static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
|
||||
uvm_gpu_address_t dst_gpu_address;
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled) {
|
||||
return uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
|
||||
return encrypted_memcopy_gpu_to_cpu(gpu,
|
||||
uvm_mem_get_cpu_addr_kernel(dst_mem),
|
||||
src_gpu_address,
|
||||
size,
|
||||
@ -13570,7 +13619,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
|
||||
for_each_id_in_mask(id, &block->mapped) {
|
||||
uvm_processor_id_t processor_to_map;
|
||||
block_phys_page_t block_page;
|
||||
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
int nid = NUMA_NO_NODE;
|
||||
|
||||
if (page_size == 0)
|
||||
@ -13606,7 +13655,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
|
||||
if (uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
|
||||
if (uvm_pmm_sysmem_mappings_indirect_supported()) {
|
||||
for_each_gpu_id(id) {
|
||||
NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
NvU64 page_size = uvm_va_block_page_size_processor(block, id, page_index);
|
||||
uvm_reverse_map_t sysmem_page;
|
||||
uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
|
||||
size_t num_pages;
|
||||
@ -13621,8 +13670,7 @@ NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params,
|
||||
continue;
|
||||
|
||||
num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
|
||||
uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk,
|
||||
gpu->parent),
|
||||
uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu),
|
||||
uvm_cpu_chunk_get_size(chunk),
|
||||
&sysmem_page,
|
||||
1);
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -111,8 +111,6 @@ typedef struct
|
||||
// Pages that have been evicted to sysmem
|
||||
uvm_page_mask_t evicted;
|
||||
|
||||
NvU64 *cpu_chunks_dma_addrs;
|
||||
|
||||
// Array of naturally-aligned chunks. Each chunk has the largest possible
|
||||
// size which can fit within the block, so they are not uniform size.
|
||||
//
|
||||
@ -2155,8 +2153,7 @@ NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
|
||||
// Physically unmap a CPU chunk from all registered GPUs.
|
||||
// Locking: The va_block lock must be held.
|
||||
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
|
||||
uvm_cpu_chunk_t *chunk,
|
||||
uvm_page_index_t page_index);
|
||||
uvm_cpu_chunk_t *chunk);
|
||||
|
||||
// Remove any CPU chunks in the given region.
|
||||
// Locking: The va_block lock must be held.
|
||||
@ -2166,19 +2163,19 @@ void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_regio
|
||||
// specified processor in the block. Returns 0 if the address is not resident on
|
||||
// the specified processor.
|
||||
// Locking: The va_block lock must be held.
|
||||
NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
NvU64 uvm_va_block_get_physical_size(uvm_va_block_t *block,
|
||||
uvm_processor_id_t processor,
|
||||
uvm_page_index_t page_index);
|
||||
|
||||
// Get CPU page size or 0 if it is not mapped
|
||||
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
|
||||
NvU64 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
|
||||
uvm_page_index_t page_index);
|
||||
|
||||
// Get GPU page size or 0 if it is not mapped on the given GPU
|
||||
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
|
||||
NvU64 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
|
||||
|
||||
// Get page size or 0 if it is not mapped on the given processor
|
||||
static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||||
static NvU64 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||||
uvm_processor_id_t processor_id,
|
||||
uvm_page_index_t page_index)
|
||||
{
|
||||
@ -2189,10 +2186,10 @@ static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||||
}
|
||||
|
||||
// Returns the big page size for the GPU VA space of the block
|
||||
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
|
||||
NvU64 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
|
||||
|
||||
// Returns the number of big pages in the VA block for the given size
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);
|
||||
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU64 big_page_size);
|
||||
|
||||
// Returns the number of big pages in the VA block for the big page size on the
|
||||
// given GPU
|
||||
@ -2202,29 +2199,29 @@ static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t
|
||||
}
|
||||
|
||||
// Returns the start address of the given big page index and big page size
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);
|
||||
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU64 big_page_size);
|
||||
|
||||
// Returns the region [start, end] of the given big page index and big page size
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
|
||||
size_t big_page_index,
|
||||
NvU32 big_page_size);
|
||||
NvU64 big_page_size);
|
||||
|
||||
// Returns the largest sub-region of [start, end] which can fit big
|
||||
// pages. If the region cannot fit any big pages, an invalid region (0, 0) is
|
||||
// returned.
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU64 big_page_size);
|
||||
|
||||
// Returns the largest sub-region of 'region' which can fit big pages.
|
||||
// If the region cannot fit any big pages, an invalid region (0, 0) is returned.
|
||||
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
|
||||
uvm_va_block_region_t region,
|
||||
NvU32 big_page_size);
|
||||
NvU64 big_page_size);
|
||||
|
||||
// Returns the big page index (the bit index within
|
||||
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
|
||||
// page_index cannot be covered by a big PTE due to alignment or block size,
|
||||
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);
|
||||
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU64 big_page_size);
|
||||
|
||||
// Returns the new residency for a page that faulted or triggered access counter
|
||||
// notifications. The read_duplicate output parameter indicates if the page
|
||||
|
@ -105,12 +105,6 @@ bool uvm_va_policy_preferred_location_equal(const uvm_va_policy_t *policy, uvm_p
|
||||
{
|
||||
bool equal = uvm_id_equal(policy->preferred_location, proc);
|
||||
|
||||
if (!UVM_ID_IS_CPU(policy->preferred_location))
|
||||
UVM_ASSERT(policy->preferred_nid == NUMA_NO_NODE);
|
||||
|
||||
if (!UVM_ID_IS_CPU(proc))
|
||||
UVM_ASSERT(cpu_numa_id == NUMA_NO_NODE);
|
||||
|
||||
if (equal && UVM_ID_IS_CPU(policy->preferred_location))
|
||||
equal = uvm_numa_id_eq(policy->preferred_nid, cpu_numa_id);
|
||||
|
||||
@ -662,7 +656,7 @@ const uvm_va_policy_t *uvm_va_policy_set_preferred_location(uvm_va_block_t *va_b
|
||||
// and that the policy is changing.
|
||||
UVM_ASSERT(node->node.start >= start);
|
||||
UVM_ASSERT(node->node.end <= end);
|
||||
UVM_ASSERT(!uvm_va_policy_preferred_location_equal(&node->policy, processor_id, cpu_node_id));
|
||||
UVM_ASSERT(!uvm_id_equal(node->policy.preferred_location, processor_id));
|
||||
}
|
||||
|
||||
node->policy.preferred_location = processor_id;
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2023 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -868,9 +868,9 @@ static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_
|
||||
// preferred location. If peer mappings are being disabled to the
|
||||
// preferred location, then unmap the other GPU.
|
||||
// Nothing to do otherwise.
|
||||
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu0->id, NUMA_NO_NODE))
|
||||
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
|
||||
uvm_lite_gpu_to_unmap = gpu1;
|
||||
else if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu1->id, NUMA_NO_NODE))
|
||||
else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
|
||||
uvm_lite_gpu_to_unmap = gpu0;
|
||||
else
|
||||
return;
|
||||
@ -951,7 +951,7 @@ static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t
|
||||
// Reset preferred location and accessed-by of VA ranges if needed
|
||||
// Note: ignoring the return code of uvm_va_range_set_preferred_location since this
|
||||
// will only return on error when setting a preferred location, not on a reset
|
||||
if (uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), gpu->id, NUMA_NO_NODE))
|
||||
if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
|
||||
(void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);
|
||||
|
||||
uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);
|
||||
@ -1683,7 +1683,7 @@ void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
|
||||
// If a UVM-Lite GPU is being removed from the accessed_by mask, it will
|
||||
// also stop being a UVM-Lite GPU unless it's also the preferred location.
|
||||
if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
|
||||
!uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), processor_id, NUMA_NO_NODE)) {
|
||||
!uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
|
||||
range_unmap(va_range, processor_id, out_tracker);
|
||||
}
|
||||
|
||||
@ -1853,7 +1853,7 @@ NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params,
|
||||
|
||||
if (uvm_api_range_invalid(params->base, params->length))
|
||||
return NV_ERR_INVALID_ADDRESS;
|
||||
if (params->gpuAttributesCount > UVM_MAX_GPUS_V2)
|
||||
if (params->gpuAttributesCount > UVM_MAX_GPUS)
|
||||
return NV_ERR_INVALID_ARGUMENT;
|
||||
|
||||
if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2015-2022 NVIDIA Corporation
|
||||
Copyright (c) 2015-2024 NVIDIA Corporation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to
|
||||
@ -188,8 +188,7 @@ typedef struct
|
||||
// GPU which owns the allocation. For sysmem, this is the GPU that the
|
||||
// sysmem was originally allocated under. For the allocation to remain valid
|
||||
// we need to prevent the GPU from going away, similarly to P2P mapped
|
||||
// memory.
|
||||
// Similarly for EGM memory.
|
||||
// memory and to EGM memory.
|
||||
//
|
||||
// This field is not used for sparse mappings as they don't have an
|
||||
// allocation and, hence, owning GPU.
|
||||
@ -212,6 +211,7 @@ typedef struct
|
||||
// EGM memory. If true is_sysmem also has to be true and owning_gpu
|
||||
// has to be valid.
|
||||
bool is_egm;
|
||||
|
||||
// GPU page tables mapping the allocation
|
||||
uvm_page_table_range_vec_t pt_range_vec;
|
||||
|
||||
|
@ -199,7 +199,7 @@ void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 depth,
|
||||
NvU64 base,
|
||||
NvU64 size,
|
||||
NvU32 page_size,
|
||||
NvU64 page_size,
|
||||
uvm_membar_t membar)
|
||||
{
|
||||
NvU32 aperture_value;
|
||||
@ -216,9 +216,9 @@ void uvm_hal_volta_host_tlb_invalidate_va(uvm_push_t *push,
|
||||
NvU32 log2_invalidation_size;
|
||||
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
|
||||
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%x\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%x\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(page_size, 1 << 12), "page_size 0x%llx\n", page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(base, page_size), "base 0x%llx page_size 0x%llx\n", base, page_size);
|
||||
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%llx\n", size, page_size);
|
||||
UVM_ASSERT_MSG(size > 0, "size 0x%llx\n", size);
|
||||
|
||||
// The invalidation size must be a power-of-two number of pages containing
|
||||
|
@ -42,7 +42,7 @@ static NvU32 entries_per_index_volta(NvU32 depth)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static NvLength entry_offset_volta(NvU32 depth, NvU32 page_size)
|
||||
static NvLength entry_offset_volta(NvU32 depth, NvU64 page_size)
|
||||
{
|
||||
UVM_ASSERT(depth < 5);
|
||||
if (page_size == UVM_PAGE_SIZE_4K && depth == 3)
|
||||
@ -252,7 +252,7 @@ static NvU64 make_pte_volta(uvm_aperture_t aperture, NvU64 address, uvm_prot_t p
|
||||
|
||||
static uvm_mmu_mode_hal_t volta_mmu_mode_hal;
|
||||
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU32 big_page_size)
|
||||
uvm_mmu_mode_hal_t *uvm_hal_mmu_mode_volta(NvU64 big_page_size)
|
||||
{
|
||||
static bool initialized = false;
|
||||
|
||||
|
@ -159,14 +159,7 @@ static int lkca_aead_internal(struct crypto_aead *aead,
|
||||
}
|
||||
|
||||
if (rc != 0) {
|
||||
if (enc) {
|
||||
pr_info("aead.c: Encryption failed with error %i\n", rc);
|
||||
} else {
|
||||
pr_info("aead.c: Decryption failed with error %i\n", rc);
|
||||
if (rc == -EBADMSG) {
|
||||
pr_info("aead.c: Authentication tag mismatch!\n");
|
||||
}
|
||||
}
|
||||
pr_info("Encryption FAILED\n");
|
||||
}
|
||||
|
||||
*data_out_size = data_in_size;
|
||||
|
@ -1,42 +0,0 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* Comments, prototypes and checks taken from DMTF: Copyright 2021-2022 DMTF. All rights reserved.
|
||||
* License: BSD 3-Clause License. For full text see link: https://github.com/DMTF/libspdm/blob/main/LICENSE.md
|
||||
*/
|
||||
|
||||
#include "os-interface.h"
|
||||
#include "internal_crypt_lib.h"
|
||||
#include "library/cryptlib.h"
|
||||
|
||||
bool libspdm_check_crypto_backend(void)
|
||||
{
|
||||
#ifdef USE_LKCA
|
||||
nv_printf(NV_DBG_INFO, "libspdm_check_crypto_backend: LKCA wrappers found.\n");
|
||||
nv_printf(NV_DBG_INFO, "libspdm_check_crypto_backend: LKCA calls may still fail if modules have not been loaded!\n");
|
||||
return true;
|
||||
#else
|
||||
nv_printf(NV_DBG_ERRORS, "libspdm_check_crypto_backend: Error - libspdm expects LKCA but found stubs!\n");
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
@ -39,7 +39,9 @@
|
||||
#define RSA_PSS_PADDING_ZEROS_SIZE_BYTE (8)
|
||||
#define RSA_PSS_TRAILER_FIELD (0xbc)
|
||||
#define SHIFT_RIGHT_AND_GET_BYTE(val, x) ((val >> x) & 0xFF)
|
||||
#ifndef BITS_TO_BYTES
|
||||
#define BITS_TO_BYTES(b) (b >> 3)
|
||||
#endif
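The two helpers above are plain bit manipulations; a couple of sanity checks (values picked arbitrarily, assert.h assumed available in a host-side test harness) show the intent:

    /* Unit-test style checks, not driver code. */
    assert(BITS_TO_BYTES(2048) == 256);                       /* RSA-2048 modulus size */
    assert(SHIFT_RIGHT_AND_GET_BYTE(0x12345678u, 8) == 0x56); /* second-lowest byte    */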
|
||||
|
||||
static const unsigned char zeroes[RSA_PSS_PADDING_ZEROS_SIZE_BYTE] = { 0 };
|
||||
|
||||
|
@ -66,6 +66,9 @@ static NvBool battery_present = NV_FALSE;
|
||||
#define ACPI_VIDEO_CLASS "video"
|
||||
#endif
|
||||
|
||||
/* Maximum size of ACPI _DSM method's 4th argument */
|
||||
#define NV_MAX_ACPI_DSM_PARAM_SIZE 1024
|
||||
|
||||
// Used for NVPCF event handling
|
||||
static acpi_handle nvpcf_handle = NULL;
|
||||
static acpi_handle nvpcf_device_handle = NULL;
|
||||
@ -73,21 +76,6 @@ static nv_acpi_t *nvpcf_nv_acpi_object = NULL;
|
||||
|
||||
#define ACPI_NVPCF_EVENT_CHANGE 0xC0
|
||||
|
||||
static int nv_acpi_get_device_handle(nv_state_t *nv, acpi_handle *dev_handle)
|
||||
{
|
||||
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
|
||||
|
||||
#if defined(DEVICE_ACPI_HANDLE)
|
||||
*dev_handle = DEVICE_ACPI_HANDLE(nvl->dev);
|
||||
return NV_TRUE;
|
||||
#elif defined (ACPI_HANDLE)
|
||||
*dev_handle = ACPI_HANDLE(nvl->dev);
|
||||
return NV_TRUE;
|
||||
#else
|
||||
return NV_FALSE;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* This callback will be invoked by the acpi_notifier_call_chain()
|
||||
*/
|
||||
@ -174,7 +162,7 @@ static void nv_acpi_nvpcf_event(acpi_handle handle, u32 event_type, void *data)
|
||||
}
|
||||
else
|
||||
{
|
||||
nv_printf(NV_DBG_INFO,"NVRM: %s: NVPCF event 0x%x is not supported\n", event_type, __FUNCTION__);
|
||||
nv_printf(NV_DBG_INFO,"NVRM: %s: NVPCF event 0x%x is not supported\n", __FUNCTION__, event_type);
|
||||
}
|
||||
}
|
||||
|
||||
@ -267,11 +255,10 @@ static void nv_acpi_notify_event(acpi_handle handle, u32 event_type, void *data)

void nv_acpi_register_notifier(nv_linux_state_t *nvl)
{
    acpi_handle dev_handle = NULL;
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);

    /* Install the ACPI notifier corresponding to dGPU ACPI device. */
    if ((nvl->nv_acpi_object == NULL) &&
        nv_acpi_get_device_handle(NV_STATE_PTR(nvl), &dev_handle) &&
        (dev_handle != NULL))
    {
        nvl->nv_acpi_object = nv_install_notifier(dev_handle, nv_acpi_notify_event, nvl);
@ -657,7 +644,100 @@ static NV_STATUS nv_acpi_nvif_method(
    return NV_OK;
}

#define MAX_INPUT_PARAM_SIZE 1024
static NV_STATUS nv_acpi_evaluate_dsm_method(
    acpi_handle dev_handle,
    NvU8 *pathname,
    NvU8 *pAcpiDsmGuid,
    NvU32 acpiDsmRev,
    NvU32 acpiDsmSubFunction,
    void *arg3,
    NvU16 arg3Size,
    NvBool bArg3Integer,
    NvU32 *outStatus,
    void *pOutData,
    NvU16 *pSize
)
{
    NV_STATUS rmStatus = NV_OK;
    acpi_status status;
    struct acpi_object_list input;
    union acpi_object *dsm = NULL;
    struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object dsm_params[4];
    NvU32 data_size;

    if (!NV_MAY_SLEEP())
    {
#if defined(DEBUG)
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid context!\n", __FUNCTION__);
#endif
        return NV_ERR_NOT_SUPPORTED;
    }

    dsm_params[0].buffer.type = ACPI_TYPE_BUFFER;
    dsm_params[0].buffer.length = 0x10;
    dsm_params[0].buffer.pointer = pAcpiDsmGuid;

    dsm_params[1].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[1].integer.value = acpiDsmRev;

    dsm_params[2].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[2].integer.value = acpiDsmSubFunction;

    if (bArg3Integer)
    {
        dsm_params[3].integer.type = ACPI_TYPE_INTEGER;
        dsm_params[3].integer.value = *((NvU32 *)arg3);
    }
    else
    {
        dsm_params[3].buffer.type = ACPI_TYPE_BUFFER;
        dsm_params[3].buffer.length = arg3Size;
        dsm_params[3].buffer.pointer = arg3;
    }

    // parameters for dsm calls (GUID, rev, subfunction, data)
    input.count = 4;
    input.pointer = dsm_params;

    status = acpi_evaluate_object(dev_handle, pathname, &input, &output);
    if (ACPI_FAILURE(status))
    {
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: failed to evaluate _DSM method!\n", __FUNCTION__);
        return NV_ERR_OPERATING_SYSTEM;
    }

    dsm = output.pointer;
    if (dsm != NULL)
    {
        if (outStatus)
        {
            *outStatus = dsm->buffer.pointer[3] << 24 |
                         dsm->buffer.pointer[2] << 16 |
                         dsm->buffer.pointer[1] << 8 |
                         dsm->buffer.pointer[0];
        }

        rmStatus = nv_acpi_extract_object(dsm, pOutData, *pSize, &data_size);
        *pSize = data_size;

        kfree(output.pointer);
    }
    else
    {
        *pSize = 0;
    }

    if (rmStatus != NV_OK)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: %s: DSM data invalid!\n", __FUNCTION__);
    }

    return rmStatus;
}

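To make the new helper's contract easier to follow, here is a minimal hedged usage sketch. The caller name, buffer names, and the revision/subfunction values are placeholders and not taken from the driver; the real call sites appear later in this diff.

/* Hypothetical caller of the shared helper above. */
static NV_STATUS example_call_dsm(acpi_handle handle, NvU8 *guid,
                                  NvU8 *inBuf, NvU16 inSize)
{
    NvU8  outBuf[NV_MAX_ACPI_DSM_PARAM_SIZE];
    NvU16 outSize = sizeof(outBuf);
    NvU32 dsmStatus = 0;

    /*
     * arg3 is passed as a buffer here (bArg3Integer == NV_FALSE), so arg3Size
     * must describe inBuf. outSize is in/out: buffer capacity on entry,
     * extracted byte count on return.
     */
    return nv_acpi_evaluate_dsm_method(handle, (NvU8 *)"_DSM", guid,
                                       0x2 /* rev, placeholder */,
                                       0x0 /* subfunction, placeholder */,
                                       inBuf, inSize, NV_FALSE,
                                       &dsmStatus, outBuf, &outSize);
}
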
/*
 * This function executes a _DSM ACPI method.
 */
@ -674,65 +754,27 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
    NvU16 *pSize
)
{
    NV_STATUS status = NV_ERR_OPERATING_SYSTEM;
    acpi_status acpi_status;
    struct acpi_object_list input;
    union acpi_object *dsm = NULL;
    struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object dsm_params[4];
    NV_STATUS rmStatus = NV_ERR_OPERATING_SYSTEM;
    NvU8 *argument3 = NULL;
    NvU32 data_size;
    acpi_handle dev_handle = NULL;

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    NvU8 *pathname = "_DSM";

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

    if ((!pInParams) || (inParamSize > MAX_INPUT_PARAM_SIZE) || (!pOutData) || (!pSize))
    if ((!pInParams) || (inParamSize > NV_MAX_ACPI_DSM_PARAM_SIZE) || (!pOutData) || (!pSize))
    {
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
        return NV_ERR_INVALID_ARGUMENT;
    }

    if (!NV_MAY_SLEEP())
    {
#if defined(DEBUG)
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: invalid argument(s)!\n", __FUNCTION__);
#endif
        return NV_ERR_NOT_SUPPORTED;
    }
    rmStatus = os_alloc_mem((void **)&argument3, inParamSize);
    if (rmStatus != NV_OK)
        return rmStatus;

    status = os_alloc_mem((void **)&argument3, inParamSize);
    if (status != NV_OK)
        return status;

    //
    // dsm_params[0].buffer.pointer and dsm_params[1].integer.value set in
    // switch below based on acpiDsmFunction
    //

    dsm_params[0].buffer.type = ACPI_TYPE_BUFFER;
    dsm_params[0].buffer.length = 0x10;
    dsm_params[0].buffer.pointer = pAcpiDsmGuid;

    dsm_params[1].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[1].integer.value = acpiDsmRev;

    dsm_params[2].integer.type = ACPI_TYPE_INTEGER;
    dsm_params[2].integer.value = acpiDsmSubFunction;

    dsm_params[3].buffer.type = ACPI_TYPE_BUFFER;
    dsm_params[3].buffer.length = inParamSize;
    memcpy(argument3, pInParams, dsm_params[3].buffer.length);
    dsm_params[3].buffer.pointer = argument3;

    // parameters for dsm calls (GUID, rev, subfunction, data)
    input.count = 4;
    input.pointer = dsm_params;
    memcpy(argument3, pInParams, inParamSize);

    if (acpiNvpcfDsmFunction)
    {
@ -742,45 +784,15 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
        // not have device handle for NVPCF device
        //
        dev_handle = NULL;
        acpi_status = acpi_evaluate_object(dev_handle, "\\_SB.NPCF._DSM", &input, &output);
    }
    else
    {
        acpi_status = acpi_evaluate_object(dev_handle, "_DSM", &input, &output);
        pathname = "\\_SB.NPCF._DSM";
    }

    if (ACPI_FAILURE(acpi_status))
    {
        nv_printf(NV_DBG_INFO,
                  "NVRM: %s: failed to evaluate _DSM method!\n", __FUNCTION__);
        goto exit;
    }
    rmStatus = nv_acpi_evaluate_dsm_method(dev_handle, pathname, pAcpiDsmGuid, acpiDsmRev,
                                           acpiDsmSubFunction, argument3, inParamSize,
                                           NV_FALSE, NULL, pOutData, pSize);

    dsm = output.pointer;
    if (dsm != NULL)
    {
        if (outStatus)
        {
            *outStatus = dsm->buffer.pointer[3] << 24 |
                         dsm->buffer.pointer[2] << 16 |
                         dsm->buffer.pointer[1] << 8 |
                         dsm->buffer.pointer[0];
        }

        status = nv_acpi_extract_object(dsm, pOutData, *pSize, &data_size);
        *pSize = data_size;

        kfree(output.pointer);
    }
    if (status != NV_OK)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: %s: DSM data invalid!\n", __FUNCTION__);
    }

exit:
    os_free_mem(argument3);
    return status;
    return rmStatus;
}

/*
@ -796,13 +808,11 @@ NV_STATUS NV_API_CALL nv_acpi_ddc_method(
    acpi_status status;
    union acpi_object *ddc = NULL;
    NvU32 i, largestEdidSize;
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    acpi_handle lcd_dev_handle = NULL;
    acpi_handle handle = NULL;

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -836,7 +846,7 @@ NV_STATUS NV_API_CALL nv_acpi_ddc_method(
            case 0x0400:
            case 0xA420:
                lcd_dev_handle = handle;
                nv_printf(NV_DBG_INFO, "NVRM: %s Found LCD: %x\n",
                nv_printf(NV_DBG_INFO, "NVRM: %s Found LCD: %llx\n",
                          __FUNCTION__, device_id);
                break;
            default:
@ -915,12 +925,10 @@ NV_STATUS NV_API_CALL nv_acpi_rom_method(
    union acpi_object *rom;
    union acpi_object rom_arg[2];
    struct acpi_object_list input = { 2, rom_arg };
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    uint32_t offset, length;

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -982,12 +990,10 @@ NV_STATUS NV_API_CALL nv_acpi_dod_method(
    acpi_status status;
    struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object *dod;
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    NvU32 i, count = (*pSize / sizeof(NvU32));

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -1129,17 +1135,11 @@ NvBool nv_acpi_power_resource_method_present(
    struct pci_dev *pdev
)
{
    acpi_handle handle = NULL;
    acpi_handle handle = ACPI_HANDLE(&pdev->dev);
    struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
    union acpi_object *object_package, *object_reference;
    acpi_status status;

#if defined(DEVICE_ACPI_HANDLE)
    handle = DEVICE_ACPI_HANDLE(&pdev->dev);
#elif defined (ACPI_HANDLE)
    handle = ACPI_HANDLE(&pdev->dev);
#endif

    if (!handle)
        return NV_FALSE;

@ -1198,7 +1198,8 @@ NV_STATUS NV_API_CALL nv_acpi_mux_method(
    union acpi_object *mux = NULL;
    union acpi_object mux_arg = { ACPI_TYPE_INTEGER };
    struct acpi_object_list input = { 1, &mux_arg };
    acpi_handle dev_handle = NULL;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev);
    acpi_handle mux_dev_handle = NULL;
    acpi_handle handle = NULL;
    unsigned long long device_id = 0;
@ -1216,9 +1217,6 @@ NV_STATUS NV_API_CALL nv_acpi_mux_method(
                  __FUNCTION__, pMethodName);
    }

    if (!nv_acpi_get_device_handle(nv, &dev_handle))
        return NV_ERR_NOT_SUPPORTED;

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

@ -1384,6 +1382,34 @@ NvBool NV_API_CALL nv_acpi_is_battery_present(void)
    return NV_FALSE;
}

NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port(
    nv_state_t *nv,
    NvU8 *pAcpiDsmGuid,
    NvU32 acpiDsmRev,
    NvU32 acpiDsmSubFunction,
    NvU32 *data
)
{
    NV_STATUS rmStatus = NV_ERR_OPERATING_SYSTEM;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    acpi_handle dev_handle = ACPI_HANDLE(nvl->dev->parent);
    NvU32 outData = 0;
    NvU16 outDatasize = sizeof(NvU32);
    NvU16 inParamSize = sizeof(NvU32);

    if (!dev_handle)
        return NV_ERR_INVALID_ARGUMENT;

    rmStatus = nv_acpi_evaluate_dsm_method(dev_handle, "_DSM", pAcpiDsmGuid, acpiDsmRev,
                                           acpiDsmSubFunction, data, inParamSize, NV_TRUE,
                                           NULL, &outData, &outDatasize);

    if (rmStatus == NV_OK)
        *data = outData;

    return rmStatus;
}

#else // NV_LINUX_ACPI_EVENTS_SUPPORTED

void NV_API_CALL nv_acpi_methods_init(NvU32 *handlePresent)
@ -1426,6 +1452,17 @@ NV_STATUS NV_API_CALL nv_acpi_dsm_method(
    return NV_ERR_NOT_SUPPORTED;
}

NV_STATUS NV_API_CALL nv_acpi_d3cold_dsm_for_upstream_port(
    nv_state_t *nv,
    NvU8 *pAcpiDsmGuid,
    NvU32 acpiDsmRev,
    NvU32 acpiDsmSubFunction,
    NvU32 *data
)
{
    return NV_ERR_NOT_SUPPORTED;
}

NV_STATUS NV_API_CALL nv_acpi_ddc_method(
    nv_state_t *nv,
    void *pEdidBuffer,

@ -1,5 +1,5 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -24,6 +24,7 @@
#include "nv-linux.h"

extern int NVreg_ImexChannelCount;
extern int NVreg_CreateImexChannel0;

static int nv_caps_imex_open(struct inode *inode, struct file *file)
{
@ -104,6 +105,10 @@ int NV_API_CALL nv_caps_imex_init(void)
    if (NVreg_ImexChannelCount == 0)
    {
        nv_printf(NV_DBG_INFO, "nv-caps-imex is disabled.\n");

        // Disable channel creation as well
        NVreg_CreateImexChannel0 = 0;

        return 0;
    }

@ -1,5 +1,5 @@
/*
 * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
@ -26,6 +26,8 @@
#include "nv-procfs.h"
#include "nv-hash.h"

#include "nvmisc.h"

extern int NVreg_ModifyDeviceFiles;

/* sys_close() or __close_fd() */
@ -49,7 +51,7 @@ typedef struct nv_cap_table_entry
    struct hlist_node hlist;
} nv_cap_table_entry_t;

#define NV_CAP_NUM_ENTRIES(_table) (sizeof(_table) / sizeof(_table[0]))
#define NV_CAP_NUM_ENTRIES(_table) (NV_ARRAY_ELEMENTS(_table))

static nv_cap_table_entry_t g_nv_cap_nvlink_table[] =
{
@ -361,18 +363,28 @@ static ssize_t nv_cap_procfs_write(struct file *file,
    nv_cap_file_private_t *private = NULL;
    unsigned long bytes_left;
    char *proc_buffer;
    int status;

    status = nv_down_read_interruptible(&nv_system_pm_lock);
    if (status < 0)
    {
        nv_printf(NV_DBG_ERRORS, "nv-caps: failed to lock the nv_system_pm_lock!\n");
        return status;
    }

    private = ((struct seq_file *)file->private_data)->private;
    bytes_left = (sizeof(private->buffer) - private->offset - 1);

    if (count == 0)
    {
        return -EINVAL;
        count = -EINVAL;
        goto done;
    }

    if ((bytes_left == 0) || (count > bytes_left))
    {
        return -ENOSPC;
        count = -ENOSPC;
        goto done;
    }

    proc_buffer = &private->buffer[private->offset];
@ -380,7 +392,8 @@ static ssize_t nv_cap_procfs_write(struct file *file,
    if (copy_from_user(proc_buffer, buffer, count))
    {
        nv_printf(NV_DBG_ERRORS, "nv-caps: failed to copy in proc data!\n");
        return -EFAULT;
        count = -EFAULT;
        goto done;
    }

    private->offset += count;
@ -388,17 +401,28 @@ static ssize_t nv_cap_procfs_write(struct file *file,

    *pos = private->offset;

done:
    up_read(&nv_system_pm_lock);

    return count;
}

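The rework above replaces early returns with goto done so that the nv_system_pm_lock taken at the top of the write handler is always released on every exit path. A stripped-down sketch of that shape follows, illustrative only: it uses the stock kernel rwsem API rather than the driver's nv_down_read_interruptible() wrapper, and assumes a kernel that provides down_read_interruptible().

#include <linux/errno.h>
#include <linux/rwsem.h>

/* Illustrative only: one unlock point for every post-lock exit path. */
static ssize_t example_locked_write(struct rw_semaphore *lock, size_t count)
{
    ssize_t ret = count;
    int status = down_read_interruptible(lock);

    if (status < 0)
        return status;       /* lock not taken, nothing to undo */

    if (count == 0)
    {
        ret = -EINVAL;       /* record the error ...           */
        goto done;           /* ... but still release the lock */
    }

    /* ... copy and store the payload while the lock is held ... */

done:
    up_read(lock);
    return ret;
}
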
static int nv_cap_procfs_read(struct seq_file *s, void *v)
{
    int status;
    nv_cap_file_private_t *private = s->private;

    status = nv_down_read_interruptible(&nv_system_pm_lock);
    if (status < 0)
    {
        return status;
    }

    seq_printf(s, "%s: %d\n", "DeviceFileMinor", private->minor);
    seq_printf(s, "%s: %d\n", "DeviceFileMode", private->permissions);
    seq_printf(s, "%s: %d\n", "DeviceFileModify", private->modify);

    up_read(&nv_system_pm_lock);
    return 0;
}

@ -423,14 +447,6 @@ static int nv_cap_procfs_open(struct inode *inode, struct file *file)
    if (rc < 0)
    {
        NV_KFREE(private, sizeof(nv_cap_file_private_t));
        return rc;
    }

    rc = nv_down_read_interruptible(&nv_system_pm_lock);
    if (rc < 0)
    {
        single_release(inode, file);
        NV_KFREE(private, sizeof(nv_cap_file_private_t));
    }

    return rc;
@ -449,8 +465,6 @@ static int nv_cap_procfs_release(struct inode *inode, struct file *file)
        private = s->private;
    }

    up_read(&nv_system_pm_lock);

    single_release(inode, file);

    if (private != NULL)