/*******************************************************************************
|
|
Copyright (c) 2015-2022 NVIDIA Corporation
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to
|
|
deal in the Software without restriction, including without limitation the
|
|
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
sell copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be
|
|
included in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
DEALINGS IN THE SOFTWARE.
|
|
|
|
*******************************************************************************/
|
|
|
|
#include "uvm_common.h"
|
|
#include "uvm_types.h"
|
|
#include "uvm_forward_decl.h"
|
|
#include "uvm_gpu.h"
|
|
#include "uvm_mmu.h"
|
|
#include "uvm_hal.h"
|
|
#include "uvm_kvmalloc.h"
|
|
#include "uvm_pte_batch.h"
|
|
#include "uvm_tlb_batch.h"
|
|
#include "uvm_push.h"
|
|
#include "uvm_mem.h"
|
|
#include "uvm_va_space.h"
// The page tree has 5 levels on Pascal, and the root is never freed by a
// normal 'put' operation, which leaves a maximum of 4 levels.
#define MAX_OPERATION_DEPTH 4
|
|
|
|
|
|
// Wrappers for push begin that handle the channel_manager not being present
// when running the page tree unit test.
//
// When in SR-IOV heavy, the push needs to be associated with the proxy channel
// (UVM_CHANNEL_TYPE_MEMOPS), because it is used to manipulate the page tree.
// In any other scenario using UVM_CHANNEL_TYPE_GPU_INTERNAL is preferred,
// because that type is normally associated with the LCE mapped to the most
// PCEs. The higher bandwidth is beneficial when doing bulk operations such as
// clearing PTEs, or initializing a page directory/table.
|
|
#define page_tree_begin_acquire(tree, tracker, push, format, ...) ({ \
|
|
NV_STATUS status; \
|
|
uvm_channel_manager_t *manager = (tree)->gpu->channel_manager; \
|
|
\
|
|
if (manager == NULL) \
|
|
status = uvm_push_begin_fake((tree)->gpu, (push)); \
|
|
else if (uvm_gpu_is_virt_mode_sriov_heavy((tree)->gpu)) \
|
|
status = uvm_push_begin_acquire(manager, UVM_CHANNEL_TYPE_MEMOPS, (tracker), (push), (format), ##__VA_ARGS__); \
|
|
else \
|
|
status = uvm_push_begin_acquire(manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, (tracker), (push), (format), ##__VA_ARGS__);\
|
|
\
|
|
status; \
|
|
})
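
// Illustrative sketch (not compiled into the driver): the typical pairing of
// the wrapper above with page_tree_end(), as used by the helpers later in this
// file. The format string and the omitted CE/host methods are placeholders.
//
//     uvm_push_t push;
//     NV_STATUS status = page_tree_begin_acquire(tree, &tree->tracker, &push, "example op");
//     if (status != NV_OK)
//         return status;
//     // ... queue CE/host methods on &push ...
//     page_tree_end(tree, &push);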
|
|
|
|
// Default location of page table allocations
|
|
static uvm_aperture_t page_table_aperture = UVM_APERTURE_DEFAULT;
|
|
|
|
static char *uvm_page_table_location;
|
|
module_param(uvm_page_table_location, charp, S_IRUGO);
|
|
MODULE_PARM_DESC(uvm_page_table_location,
|
|
"Set the location for UVM-allocated page tables. Choices are: vid, sys.");
|
|
|
|
NV_STATUS uvm_mmu_init(void)
|
|
{
|
|
UVM_ASSERT((page_table_aperture == UVM_APERTURE_VID) ||
|
|
(page_table_aperture == UVM_APERTURE_SYS) ||
|
|
(page_table_aperture == UVM_APERTURE_DEFAULT));
|
|
|
|
if (!uvm_page_table_location)
|
|
return NV_OK;
|
|
|
|
// TODO: Bug 1766651: Add modes for testing, e.g. alternating vidmem and
|
|
// sysmem etc.
|
|
if (strcmp(uvm_page_table_location, "vid") == 0) {
|
|
page_table_aperture = UVM_APERTURE_VID;
|
|
}
|
|
else if (strcmp(uvm_page_table_location, "sys") == 0) {
|
|
page_table_aperture = UVM_APERTURE_SYS;
|
|
}
|
|
else {
|
|
pr_info("Invalid uvm_page_table_location %s. Using %s instead.\n",
|
|
uvm_page_table_location,
|
|
uvm_aperture_string(page_table_aperture));
|
|
}
|
|
|
|
return NV_OK;
|
|
}
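
// Example (hypothetical invocation, not part of this file): the parameter
// above can be supplied at module load time, e.g.
//
//     modprobe nvidia-uvm uvm_page_table_location=sys
//
// and, since it is registered with S_IRUGO, read back under
// /sys/module/nvidia_uvm/parameters/. Invalid values are ignored with a
// pr_info() message and the default aperture is kept, as implemented in
// uvm_mmu_init() above.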
|
|
|
|
static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size, uvm_mmu_page_table_alloc_t *out)
|
|
{
|
|
NV_STATUS status = NV_OK;
|
|
NvU64 dma_addr;
|
|
unsigned long flags = __GFP_ZERO;
|
|
uvm_memcg_context_t memcg_context;
|
|
uvm_va_space_t *va_space;
|
|
struct mm_struct *mm = NULL;
|
|
|
|
if (tree->type == UVM_PAGE_TREE_TYPE_USER && tree->gpu_va_space && UVM_CGROUP_ACCOUNTING_SUPPORTED()) {
|
|
va_space = tree->gpu_va_space->va_space;
|
|
mm = uvm_va_space_mm_retain(va_space);
|
|
if (mm)
|
|
uvm_memcg_context_start(&memcg_context, mm);
|
|
}
|
|
|
|
// If mm is not NULL, memcg context has been started and we can use
|
|
// the account flags.
|
|
if (mm)
|
|
flags |= NV_UVM_GFP_FLAGS_ACCOUNT;
|
|
else
|
|
flags |= NV_UVM_GFP_FLAGS;
|
|
|
|
out->handle.page = alloc_pages(flags, get_order(size));
|
|
|
|
// va_space and mm will be set only if the memcg context has been started.
|
|
if (mm) {
|
|
uvm_memcg_context_end(&memcg_context);
|
|
uvm_va_space_mm_release(va_space);
|
|
}
|
|
|
|
if (out->handle.page == NULL)
|
|
return NV_ERR_NO_MEMORY;
|
|
|
|
// Check for fake GPUs from the unit test
|
|
if (tree->gpu->parent->pci_dev)
|
|
status = uvm_gpu_map_cpu_pages(tree->gpu, out->handle.page, UVM_PAGE_ALIGN_UP(size), &dma_addr);
|
|
else
|
|
dma_addr = page_to_phys(out->handle.page);
|
|
|
|
if (status != NV_OK) {
|
|
__free_pages(out->handle.page, get_order(size));
|
|
return status;
|
|
}
|
|
|
|
out->addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
|
|
out->size = size;
|
|
|
|
return NV_OK;
|
|
}
|
|
|
|
static NV_STATUS phys_mem_allocate_vidmem(uvm_page_tree_t *tree,
|
|
NvLength size,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_mmu_page_table_alloc_t *out)
|
|
{
|
|
NV_STATUS status;
|
|
uvm_gpu_t *gpu = tree->gpu;
|
|
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
|
|
|
|
status = uvm_pmm_gpu_alloc_kernel(&gpu->pmm, 1, size, pmm_flags, &out->handle.chunk, &local_tracker);
|
|
if (status != NV_OK)
|
|
return status;
|
|
|
|
if (!uvm_tracker_is_empty(&local_tracker)) {
|
|
uvm_mutex_lock(&tree->lock);
|
|
status = uvm_tracker_add_tracker_safe(&tree->tracker, &local_tracker);
|
|
uvm_mutex_unlock(&tree->lock);
|
|
}
|
|
|
|
uvm_tracker_deinit(&local_tracker);
|
|
|
|
if (status != NV_OK) {
|
|
uvm_pmm_gpu_free(&tree->gpu->pmm, out->handle.chunk, NULL);
|
|
return status;
|
|
}
|
|
|
|
out->addr = uvm_gpu_phys_address(UVM_APERTURE_VID, out->handle.chunk->address);
|
|
out->size = size;
|
|
|
|
return status;
|
|
}
|
|
|
|
static NV_STATUS phys_mem_allocate(uvm_page_tree_t *tree,
|
|
NvLength size,
|
|
uvm_aperture_t location,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_mmu_page_table_alloc_t *out)
|
|
{
|
|
UVM_ASSERT((location == UVM_APERTURE_VID) || (location == UVM_APERTURE_SYS));
|
|
|
|
memset(out, 0, sizeof(*out));
|
|
|
|
if (location == UVM_APERTURE_SYS)
|
|
return phys_mem_allocate_sysmem(tree, size, out);
|
|
else
|
|
return phys_mem_allocate_vidmem(tree, size, pmm_flags, out);
|
|
}
|
|
|
|
static void phys_mem_deallocate_vidmem(uvm_page_tree_t *tree, uvm_mmu_page_table_alloc_t *ptr)
|
|
{
|
|
uvm_assert_mutex_locked(&tree->lock);
|
|
UVM_ASSERT(ptr->addr.aperture == UVM_APERTURE_VID);
|
|
|
|
uvm_pmm_gpu_free(&tree->gpu->pmm, ptr->handle.chunk, &tree->tracker);
|
|
}
|
|
|
|
static void phys_mem_deallocate_sysmem(uvm_page_tree_t *tree, uvm_mmu_page_table_alloc_t *ptr)
|
|
{
|
|
NV_STATUS status;
|
|
|
|
uvm_assert_mutex_locked(&tree->lock);
|
|
|
|
// Synchronize any pending operations before freeing the memory that might
|
|
// be used by them.
|
|
status = uvm_tracker_wait(&tree->tracker);
|
|
if (status != NV_OK)
|
|
UVM_ASSERT(status == uvm_global_get_status());
|
|
|
|
UVM_ASSERT(ptr->addr.aperture == UVM_APERTURE_SYS);
|
|
if (tree->gpu->parent->pci_dev)
|
|
uvm_gpu_unmap_cpu_pages(tree->gpu, ptr->addr.address, UVM_PAGE_ALIGN_UP(ptr->size));
|
|
__free_pages(ptr->handle.page, get_order(ptr->size));
|
|
}
|
|
|
|
static void phys_mem_deallocate(uvm_page_tree_t *tree, uvm_mmu_page_table_alloc_t *ptr)
|
|
{
|
|
if (ptr->addr.aperture == UVM_APERTURE_SYS)
|
|
phys_mem_deallocate_sysmem(tree, ptr);
|
|
else
|
|
phys_mem_deallocate_vidmem(tree, ptr);
|
|
|
|
memset(ptr, 0, sizeof(*ptr));
|
|
}
|
|
|
|
static void page_table_range_init(uvm_page_table_range_t *range,
|
|
NvU32 page_size,
|
|
uvm_page_directory_t *dir,
|
|
NvU32 start_index,
|
|
NvU32 end_index)
|
|
{
|
|
range->table = dir;
|
|
range->start_index = start_index;
|
|
range->entry_count = 1 + end_index - start_index;
|
|
range->page_size = page_size;
|
|
dir->ref_count += range->entry_count;
|
|
}
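
// For example, a range covering entries 3..7 of a single table gets
// start_index == 3 and entry_count == 1 + 7 - 3 == 5, and the init above adds
// 5 to that directory's ref_count.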
|
|
|
|
static void phys_mem_init(uvm_page_tree_t *tree, NvU32 page_size, uvm_page_directory_t *dir, uvm_push_t *push)
|
|
{
|
|
NvU64 clear_bits[2];
|
|
uvm_mmu_mode_hal_t *hal = tree->hal;
|
|
|
|
if (dir->depth == tree->hal->page_table_depth(page_size)) {
|
|
*clear_bits = 0; // Invalid PTE
|
|
}
|
|
else {
|
|
// passing in NULL for the phys_allocs will mark the child entries as invalid
|
|
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
|
|
hal->make_pde(clear_bits, phys_allocs, dir->depth);
|
|
|
|
// Make sure that using only clear_bits[0] will work
|
|
UVM_ASSERT(hal->entry_size(dir->depth) == sizeof(clear_bits[0]) || clear_bits[0] == clear_bits[1]);
|
|
}
|
|
|
|
// initialize the memory to a reasonable value
|
|
tree->gpu->parent->ce_hal->memset_8(push,
|
|
uvm_gpu_address_from_phys(dir->phys_alloc.addr),
|
|
*clear_bits,
|
|
dir->phys_alloc.size);
|
|
}
|
|
|
|
static uvm_page_directory_t *allocate_directory(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
NvU32 depth,
|
|
uvm_pmm_alloc_flags_t pmm_flags)
|
|
{
|
|
NV_STATUS status;
|
|
uvm_mmu_mode_hal_t *hal = tree->hal;
|
|
NvU32 entry_count;
|
|
NvLength phys_alloc_size = hal->allocation_size(depth, page_size);
|
|
uvm_page_directory_t *dir;
|
|
|
|
// The page tree doesn't cache PTEs so space is not allocated for entries that are always PTEs.
|
|
// 2M PTEs may later become PDEs so pass UVM_PAGE_SIZE_AGNOSTIC, not page_size.
|
|
if (depth == hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC))
|
|
entry_count = 0;
|
|
else
|
|
entry_count = hal->entries_per_index(depth) << hal->index_bits(depth, page_size);
|
|
|
|
dir = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dir->entries[0]) * entry_count);
|
|
if (dir == NULL)
|
|
return NULL;
|
|
|
|
status = phys_mem_allocate(tree, phys_alloc_size, tree->location, pmm_flags, &dir->phys_alloc);
|
|
|
|
// Fall back to sysmem if allocating page tables in vidmem with eviction
|
|
// fails, and the fallback is allowed.
|
|
if ((status == NV_ERR_NO_MEMORY) &&
|
|
(tree->location == UVM_APERTURE_VID) &&
|
|
(tree->location_sys_fallback) &&
|
|
((pmm_flags & UVM_PMM_ALLOC_FLAGS_EVICT) != 0)) {
|
|
status = phys_mem_allocate(tree, phys_alloc_size, UVM_APERTURE_SYS, pmm_flags, &dir->phys_alloc);
|
|
}
|
|
|
|
if (status != NV_OK) {
|
|
uvm_kvfree(dir);
|
|
return NULL;
|
|
}
|
|
dir->depth = depth;
|
|
|
|
return dir;
|
|
}
|
|
|
|
static inline NvU32 entry_index_from_vaddr(NvU64 vaddr, NvU32 addr_bit_shift, NvU32 bits)
|
|
{
|
|
NvU64 mask = ((NvU64)1 << bits) - 1;
|
|
return (NvU32)((vaddr >> addr_bit_shift) & mask);
|
|
}
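
// Worked example (index width and shift chosen purely for illustration): for a
// level indexed by 9 bits starting at VA bit 21,
// entry_index_from_vaddr(0x40200000, 21, 9) computes
// (0x40200000 >> 21) & 0x1ff == 0x201 & 0x1ff == 1.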
|
|
|
|
static inline NvU32 index_to_entry(uvm_mmu_mode_hal_t *hal, NvU32 entry_index, NvU32 depth, NvU32 page_size)
|
|
{
|
|
return hal->entries_per_index(depth) * entry_index + hal->entry_offset(depth, page_size);
|
|
}
|
|
|
|
// pde_fill() populates pde_count PDE entries (starting at start_index) with
|
|
// the same mapping, i.e., with the same physical address (phys_addr).
|
|
static void pde_fill(uvm_page_tree_t *tree,
|
|
NvU32 depth,
|
|
uvm_mmu_page_table_alloc_t *directory,
|
|
NvU32 start_index,
|
|
NvU32 pde_count,
|
|
uvm_mmu_page_table_alloc_t **phys_addr,
|
|
uvm_push_t *push)
|
|
{
|
|
NvU64 pde_data[2], entry_size;
|
|
uvm_gpu_address_t pde_entry_addr;
|
|
|
|
UVM_ASSERT(start_index + pde_count <= uvm_mmu_page_tree_entries(tree, depth, UVM_PAGE_SIZE_AGNOSTIC));
|
|
entry_size = tree->hal->entry_size(depth);
|
|
UVM_ASSERT(sizeof(pde_data) >= entry_size);
|
|
|
|
tree->hal->make_pde(pde_data, phys_addr, depth);
|
|
pde_entry_addr = uvm_gpu_address_from_phys(directory->addr);
|
|
pde_entry_addr.address += start_index * entry_size;
|
|
|
|
if (entry_size == sizeof(pde_data[0])) {
|
|
tree->gpu->parent->ce_hal->memset_8(push, pde_entry_addr, pde_data[0], sizeof(pde_data[0]) * pde_count);
|
|
}
|
|
else {
|
|
NvU32 max_inline_entries = UVM_PUSH_INLINE_DATA_MAX_SIZE / sizeof(pde_data);
|
|
uvm_gpu_address_t inline_data_addr;
|
|
uvm_push_inline_data_t inline_data;
|
|
NvU32 membar_flag = 0;
|
|
NvU32 i;
|
|
|
|
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
|
|
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
|
|
else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
|
|
membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
|
|
|
|
for (i = 0; i < pde_count;) {
|
|
NvU32 j;
|
|
NvU32 entry_count = min(pde_count - i, max_inline_entries);
|
|
|
|
uvm_push_inline_data_begin(push, &inline_data);
|
|
for (j = 0; j < entry_count; j++)
|
|
uvm_push_inline_data_add(&inline_data, pde_data, sizeof(pde_data));
|
|
inline_data_addr = uvm_push_inline_data_end(&inline_data);
|
|
|
|
// All but the first memcopy can be pipelined. We respect the
|
|
// caller's pipelining settings for the first push.
|
|
if (i != 0)
|
|
uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
|
|
|
// No membar is needed until the last copy. Otherwise, use
|
|
// caller's membar flag.
|
|
if (i + entry_count < pde_count)
|
|
uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
|
else if (membar_flag)
|
|
uvm_push_set_flag(push, membar_flag);
|
|
|
|
tree->gpu->parent->ce_hal->memcopy(push, pde_entry_addr, inline_data_addr, entry_count * sizeof(pde_data));
|
|
|
|
i += entry_count;
|
|
pde_entry_addr.address += sizeof(pde_data) * entry_count;
|
|
}
|
|
}
|
|
}
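
// Sizing note for the inline-data path above (the dual-PDE case): each entry
// written there is sizeof(pde_data) == 16 bytes, so a single memcopy iteration
// covers at most UVM_PUSH_INLINE_DATA_MAX_SIZE / 16 PDEs and larger fills are
// split into several pipelined copies.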
|
|
|
|
static uvm_page_directory_t *host_pde_write(uvm_page_directory_t *dir,
|
|
uvm_page_directory_t *parent,
|
|
NvU32 index_in_parent)
|
|
{
|
|
dir->host_parent = parent;
|
|
dir->index_in_parent = index_in_parent;
|
|
parent->ref_count++;
|
|
return dir;
|
|
}
|
|
|
|
static void pde_write(uvm_page_tree_t *tree,
|
|
uvm_page_directory_t *dir,
|
|
NvU32 entry_index,
|
|
bool force_clear,
|
|
uvm_push_t *push)
|
|
{
|
|
NvU32 i;
|
|
uvm_mmu_page_table_alloc_t *phys_allocs[2];
|
|
NvU32 entries_per_index = tree->hal->entries_per_index(dir->depth);
|
|
|
|
// extract physical allocs from non-null entries.
|
|
for (i = 0; i < entries_per_index; i++) {
|
|
uvm_page_directory_t *entry = dir->entries[entries_per_index * entry_index + i];
|
|
if (entry == NULL || force_clear)
|
|
phys_allocs[i] = NULL;
|
|
else
|
|
phys_allocs[i] = &entry->phys_alloc;
|
|
}
|
|
|
|
pde_fill(tree, dir->depth, &dir->phys_alloc, entry_index, 1, phys_allocs, push);
|
|
}
|
|
|
|
static void host_pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size)
|
|
{
|
|
UVM_ASSERT(dir->ref_count > 0);
|
|
|
|
dir->entries[index_to_entry(tree->hal, entry_index, dir->depth, page_size)] = NULL;
|
|
dir->ref_count--;
|
|
}
|
|
|
|
static void pde_clear(uvm_page_tree_t *tree, uvm_page_directory_t *dir, NvU32 entry_index, NvU32 page_size, void *push)
|
|
{
|
|
host_pde_clear(tree, dir, entry_index, page_size);
|
|
pde_write(tree, dir, entry_index, false, push);
|
|
}
|
|
|
|
static uvm_chunk_sizes_mask_t allocation_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
|
|
{
|
|
uvm_chunk_sizes_mask_t alloc_sizes = 0;
|
|
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);
|
|
|
|
if (hal != NULL) {
|
|
unsigned long page_size_log2;
|
|
unsigned long page_sizes = hal->page_sizes();
|
|
BUILD_BUG_ON(sizeof(hal->page_sizes()) > sizeof(page_sizes));
|
|
|
|
for_each_set_bit(page_size_log2, &page_sizes, BITS_PER_LONG) {
|
|
NvU32 i;
|
|
NvU32 page_size = (NvU32)(1ULL << page_size_log2);
|
|
for (i = 0; i <= hal->page_table_depth(page_size); i++)
|
|
alloc_sizes |= hal->allocation_size(i, page_size);
|
|
}
|
|
}
|
|
|
|
return alloc_sizes;
|
|
}
|
|
|
|
static NvU32 page_sizes_for_big_page_size(uvm_parent_gpu_t *parent_gpu, NvU32 big_page_size)
|
|
{
|
|
uvm_mmu_mode_hal_t *hal = parent_gpu->arch_hal->mmu_mode_hal(big_page_size);
|
|
|
|
if (hal != NULL)
|
|
return hal->page_sizes();
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void page_tree_end(uvm_page_tree_t *tree, uvm_push_t *push)
|
|
{
|
|
if (tree->gpu->channel_manager != NULL)
|
|
uvm_push_end(push);
|
|
else
|
|
uvm_push_end_fake(push);
|
|
}
|
|
|
|
static void page_tree_tracker_overwrite_with_push(uvm_page_tree_t *tree, uvm_push_t *push)
|
|
{
|
|
uvm_assert_mutex_locked(&tree->lock);
|
|
|
|
// No GPU work to track for fake GPU testing
|
|
if (tree->gpu->channel_manager == NULL)
|
|
return;
|
|
|
|
uvm_tracker_overwrite_with_push(&tree->tracker, push);
|
|
}
|
|
|
|
static NV_STATUS page_tree_end_and_wait(uvm_page_tree_t *tree, uvm_push_t *push)
|
|
{
|
|
if (tree->gpu->channel_manager != NULL)
|
|
return uvm_push_end_and_wait(push);
|
|
else
|
|
uvm_push_end_fake(push);
|
|
|
|
return NV_OK;
|
|
}
|
|
|
|
// initialize new page tables and insert them into the tree
|
|
static NV_STATUS write_gpu_state(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
NvS32 invalidate_depth,
|
|
NvU32 used_count,
|
|
uvm_page_directory_t **dirs_used)
|
|
{
|
|
NvS32 i;
|
|
uvm_push_t push;
|
|
NV_STATUS status;
|
|
|
|
// The logic of which membar is needed, and when, is pretty subtle; please
// refer to the UVM Functional Spec (section 5.1) for all the details.
uvm_membar_t membar_after_writes = UVM_MEMBAR_GPU;
|
|
|
|
uvm_assert_mutex_locked(&tree->lock);
|
|
|
|
if (used_count == 0)
|
|
return NV_OK;
|
|
|
|
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "write_gpu_state: %u dirs", used_count);
|
|
if (status != NV_OK)
|
|
return status;
|
|
|
|
// only do GPU work once all the allocations have succeeded
|
|
// first, zero-out the new allocations
|
|
for (i = 0; i < used_count; i++) {
|
|
// Appropriate membar will be done after all the writes. Pipelining can
|
|
// be enabled as they are all initializing newly allocated memory that
|
|
// cannot have any writes pending.
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
|
|
|
phys_mem_init(tree, page_size, dirs_used[i], &push);
|
|
|
|
if (dirs_used[i]->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
|
|
membar_after_writes = UVM_MEMBAR_SYS;
|
|
}
|
|
|
|
// Only a single membar is needed between the memsets of the page tables
|
|
// and the writes of the PDEs pointing to those page tables.
|
|
// The membar can be local if all of the page tables and PDEs are in GPU memory,
|
|
// but must be a sysmembar if any of them are in sysmem.
|
|
tree->gpu->parent->host_hal->wait_for_idle(&push);
|
|
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
|
|
|
|
// Reset back to a local membar by default
|
|
membar_after_writes = UVM_MEMBAR_GPU;
|
|
|
|
// write entries bottom up, so that they are valid once they're inserted into the tree
|
|
for (i = used_count - 1; i >= 0; i--) {
|
|
uvm_page_directory_t *dir = dirs_used[i];
|
|
|
|
// Appropriate membar will be done after all the writes. Pipelining can
|
|
// be enabled as they are all independent and we just did a WFI above.
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
|
pde_write(tree, dir->host_parent, dir->index_in_parent, false, &push);
|
|
|
|
// If any of the written PDEs is in sysmem, a sysmembar is needed before
|
|
// the TLB invalidate.
|
|
// Notably sysmembar is needed even though the writer (CE) and reader (MMU) are
|
|
// on the same GPU, because CE physical writes take the L2 bypass path.
|
|
if (dir->host_parent->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
|
|
membar_after_writes = UVM_MEMBAR_SYS;
|
|
}
|
|
|
|
tree->gpu->parent->host_hal->wait_for_idle(&push);
|
|
uvm_hal_membar(tree->gpu, &push, membar_after_writes);
|
|
|
|
UVM_ASSERT(invalidate_depth >= 0);
|
|
|
|
// Upgrades don't have to flush out accesses, so no membar is needed on the TLB invalidate.
|
|
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
|
|
uvm_page_tree_pdb(tree)->addr,
|
|
invalidate_depth,
|
|
UVM_MEMBAR_NONE);
|
|
|
|
// We just did the appropriate membar after the WFI, so no need for another
// one in push_end(). Note that this only holds for now: if the L2 bypass path
// changes to only require a GPU membar between the PDE writes and the TLB
// invalidate, we'll need to push a sysmembar here so the end-of-push semaphore
// is ordered behind the PDE writes.
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
|
page_tree_end(tree, &push);
|
|
page_tree_tracker_overwrite_with_push(tree, &push);
|
|
|
|
return NV_OK;
|
|
}
|
|
|
|
static void free_unused_directories(uvm_page_tree_t *tree,
|
|
NvU32 used_count,
|
|
uvm_page_directory_t **dirs_used,
|
|
uvm_page_directory_t **dir_cache)
|
|
{
|
|
NvU32 i;
|
|
|
|
// free unused entries
|
|
for (i = 0; i < MAX_OPERATION_DEPTH; i++) {
|
|
uvm_page_directory_t *dir = dir_cache[i];
|
|
if (dir != NULL) {
|
|
NvU32 j;
|
|
|
|
for (j = 0; j < used_count; j++) {
|
|
if (dir == dirs_used[j])
|
|
break;
|
|
}
|
|
|
|
if (j == used_count) {
|
|
phys_mem_deallocate(tree, &dir->phys_alloc);
|
|
uvm_kvfree(dir);
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
static NV_STATUS allocate_page_table(uvm_page_tree_t *tree, NvU32 page_size, uvm_mmu_page_table_alloc_t *out)
|
|
{
|
|
NvU32 depth = tree->hal->page_table_depth(page_size);
|
|
NvLength alloc_size = tree->hal->allocation_size(depth, page_size);
|
|
|
|
return phys_mem_allocate(tree, alloc_size, tree->location, UVM_PMM_ALLOC_FLAGS_EVICT, out);
|
|
}
|
|
|
|
static void map_remap_deinit(uvm_page_tree_t *tree)
|
|
{
|
|
if (tree->map_remap.pde0.size)
|
|
phys_mem_deallocate(tree, &tree->map_remap.pde0);
|
|
|
|
if (tree->map_remap.ptes_invalid_4k.size)
|
|
phys_mem_deallocate(tree, &tree->map_remap.ptes_invalid_4k);
|
|
}
|
|
|
|
static NV_STATUS map_remap_init(uvm_page_tree_t *tree)
|
|
{
|
|
NV_STATUS status;
|
|
uvm_push_t push;
|
|
uvm_pte_batch_t batch;
|
|
NvU32 entry_size;
|
|
|
|
// Allocate the ptes_invalid_4k.
|
|
status = allocate_page_table(tree, UVM_PAGE_SIZE_4K, &tree->map_remap.ptes_invalid_4k);
|
|
if (status != NV_OK)
|
|
goto error;
|
|
|
|
// Allocate the pde0 struct (pde0 depth == depth(UVM_PAGE_SIZE_2M)).
// This is only needed for the 512M page size, since it requires a
// PDE1-depth(512M) PTE. We first map it to the pde0 directory, then we
// return the PTE to the caller of get_ptes().
|
|
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
|
|
status = allocate_page_table(tree, UVM_PAGE_SIZE_2M, &tree->map_remap.pde0);
|
|
if (status != NV_OK)
|
|
goto error;
|
|
}
|
|
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "map remap init");
|
|
if (status != NV_OK)
|
|
goto error;
|
|
|
|
uvm_pte_batch_begin(&push, &batch);
|
|
entry_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
|
|
|
|
// Invalidate all entries in the ptes_invalid_4k page table.
|
|
uvm_pte_batch_clear_ptes(&batch,
|
|
tree->map_remap.ptes_invalid_4k.addr,
|
|
0,
|
|
entry_size,
|
|
tree->map_remap.ptes_invalid_4k.size / entry_size);
|
|
|
|
uvm_pte_batch_end(&batch);
|
|
|
|
// Set pde0 entries to ptes_invalid_4k.
|
|
if (tree->hal->page_sizes() & UVM_PAGE_SIZE_512M) {
|
|
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
|
|
NvU32 depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_4K) - 1;
|
|
size_t index_4k = tree->hal->entry_offset(depth, UVM_PAGE_SIZE_4K);
|
|
|
|
// pde0 depth equals UVM_PAGE_SIZE_2M.
|
|
NvU32 pde0_depth = tree->hal->page_table_depth(UVM_PAGE_SIZE_2M);
|
|
NvU32 pde0_entries = tree->map_remap.pde0.size / tree->hal->entry_size(pde0_depth);
|
|
|
|
// The big-page entry is NULL which makes it an invalid entry.
|
|
phys_allocs[index_4k] = &tree->map_remap.ptes_invalid_4k;
|
|
|
|
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
|
|
// sufficient when pde0 is allocated in VIDMEM.
|
|
if (tree->map_remap.pde0.addr.aperture == UVM_APERTURE_VID)
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
|
|
|
|
pde_fill(tree,
|
|
pde0_depth,
|
|
&tree->map_remap.pde0,
|
|
0,
|
|
pde0_entries,
|
|
(uvm_mmu_page_table_alloc_t **)&phys_allocs,
|
|
&push);
|
|
}
|
|
|
|
return page_tree_end_and_wait(tree, &push);
|
|
error:
|
|
map_remap_deinit(tree);
|
|
|
|
return status;
|
|
}
|
|
|
|
// The location of the page tree backing storage depends on the aperture passed
// to the page tree initialization ("init location" in the table below), and on
// the uvm_page_table_location module parameter. Only when the initialization
// aperture is UVM_APERTURE_DEFAULT and the module parameter is not set can the
// pages be in more than one location: vidmem is given priority, but if the
// allocation fails it will fall back to sysmem.
//
// Behavior outside of SR-IOV heavy (bare metal, SR-IOV standard, etc.):
//
//   Inputs                                  Outputs
//   init location | uvm_page_table_location || tree->location | tree->location_sys_fallback
//   --------------|-------------------------||----------------|-----------------------------
//   vidmem        | -                       || vidmem         | false
//   sysmem        | -                       || sysmem         | false
//   default       | <not set>               || vidmem         | true (1)
//   default       | vidmem                  || vidmem         | false
//   default       | sysmem                  || sysmem         | false
//
// (1) The fallback to sysmem is always enabled.
//
// In SR-IOV heavy the page tree must be in vidmem, to prevent guest drivers
// from updating GPU page tables without hypervisor knowledge.
//
//   Inputs                                  Outputs
//   init location | uvm_page_table_location || tree->location | tree->location_sys_fallback
//   --------------|-------------------------||----------------|-----------------------------
//   vidmem        | -                       || vidmem         | false
//   sysmem        | -                       || <disallowed>
//   default       | -                       || <disallowed>
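
// For example, with uvm_page_table_location left unset, a tree initialized
// with UVM_APERTURE_DEFAULT ends up with tree->location == UVM_APERTURE_VID
// and tree->location_sys_fallback == true, matching the third data row of the
// first table above.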
static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t location)
|
|
{
|
|
UVM_ASSERT(tree->gpu != NULL);
|
|
UVM_ASSERT_MSG((location == UVM_APERTURE_VID) ||
|
|
(location == UVM_APERTURE_SYS) ||
|
|
(location == UVM_APERTURE_DEFAULT),
|
|
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
|
|
|
|
// The tree must be explicitly initialized in vidmem when in SR-IOV heavy.
|
|
// The only exception are "fake" GPUs used during page tree testing, which
|
|
// can be identified by having no channel manager.
|
|
if ((tree->gpu->channel_manager != NULL) && uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu))
|
|
UVM_ASSERT(location == UVM_APERTURE_VID);
|
|
|
|
if (location == UVM_APERTURE_DEFAULT) {
|
|
if (page_table_aperture == UVM_APERTURE_DEFAULT) {
|
|
tree->location = UVM_APERTURE_VID;
tree->location_sys_fallback = true;
|
|
|
|
}
|
|
else {
|
|
tree->location = page_table_aperture;
|
|
tree->location_sys_fallback = false;
|
|
}
|
|
}
|
|
else {
|
|
tree->location = location;
|
|
tree->location_sys_fallback = false;
|
|
}
|
|
}
|
|
|
|
NV_STATUS uvm_page_tree_init(uvm_gpu_t *gpu,
|
|
uvm_gpu_va_space_t *gpu_va_space,
|
|
uvm_page_tree_type_t type,
|
|
NvU32 big_page_size,
|
|
uvm_aperture_t location,
|
|
uvm_page_tree_t *tree)
|
|
{
|
|
uvm_push_t push;
|
|
NV_STATUS status;
|
|
BUILD_BUG_ON(sizeof(uvm_page_directory_t) != offsetof(uvm_page_directory_t, entries));
|
|
|
|
UVM_ASSERT(type < UVM_PAGE_TREE_TYPE_COUNT);
|
|
|
|
memset(tree, 0, sizeof(*tree));
|
|
uvm_mutex_init(&tree->lock, UVM_LOCK_ORDER_PAGE_TREE);
|
|
tree->hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);
|
|
UVM_ASSERT(tree->hal != NULL);
|
|
UVM_ASSERT(MAX_OPERATION_DEPTH >= tree->hal->page_table_depth(UVM_PAGE_SIZE_AGNOSTIC));
|
|
tree->gpu = gpu;
|
|
tree->type = type;
|
|
tree->gpu_va_space = gpu_va_space;
|
|
tree->big_page_size = big_page_size;
|
|
|
|
page_tree_set_location(tree, location);
|
|
|
|
uvm_tracker_init(&tree->tracker);
|
|
|
|
tree->root = allocate_directory(tree, UVM_PAGE_SIZE_AGNOSTIC, 0, UVM_PMM_ALLOC_FLAGS_EVICT);
|
|
|
|
if (tree->root == NULL)
|
|
return NV_ERR_NO_MEMORY;
|
|
|
|
if (gpu->parent->map_remap_larger_page_promotion) {
|
|
status = map_remap_init(tree);
|
|
if (status != NV_OK)
|
|
return status;
|
|
}
|
|
|
|
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "init page tree");
|
|
if (status != NV_OK)
|
|
return status;
|
|
|
|
phys_mem_init(tree, UVM_PAGE_SIZE_AGNOSTIC, tree->root, &push);
|
|
return page_tree_end_and_wait(tree, &push);
|
|
}
|
|
|
|
void uvm_page_tree_deinit(uvm_page_tree_t *tree)
|
|
{
|
|
UVM_ASSERT(tree->root->ref_count == 0);
|
|
|
|
// Take the tree lock only to avoid assertions. It is not required for
|
|
// thread safety during deinit.
|
|
uvm_mutex_lock(&tree->lock);
|
|
|
|
// Invalidate the entire PDB before destroying it. This is only required for
|
|
// ATS-enabled PDBs, because we have already invalidated all GMMU entries
|
|
// under this PDB. Stale ATS entries however might be present, and we need
|
|
// to invalidate them to handle the unlikely event that this PASID gets
|
|
// reused before those entries are evicted from the TLBs.
|
|
//
|
|
// While this is only required for ATS, we don't know at this point whether
|
|
// the PDB had ATS enabled, so just do it for all user PDBs.
|
|
if (g_uvm_global.ats.enabled && tree->type == UVM_PAGE_TREE_TYPE_USER) {
|
|
uvm_push_t push;
|
|
NV_STATUS status = page_tree_begin_acquire(tree, &tree->tracker, &push, "deinit inval all");
|
|
|
|
// Failure to get a push can only happen if we've hit a fatal UVM channel
|
|
// error. We can't perform the unmap, so just leave things in place for
|
|
// debug.
|
|
if (status == NV_OK) {
|
|
tree->gpu->parent->host_hal->tlb_invalidate_all(&push, uvm_page_tree_pdb(tree)->addr, 0, UVM_MEMBAR_NONE);
|
|
page_tree_end(tree, &push);
|
|
page_tree_tracker_overwrite_with_push(tree, &push);
|
|
}
|
|
else {
|
|
UVM_ASSERT(status == uvm_global_get_status());
|
|
}
|
|
}
|
|
|
|
(void)uvm_tracker_wait(&tree->tracker);
|
|
phys_mem_deallocate(tree, &tree->root->phys_alloc);
|
|
|
|
if (tree->gpu->parent->map_remap_larger_page_promotion)
|
|
map_remap_deinit(tree);
|
|
|
|
uvm_mutex_unlock(&tree->lock);
|
|
|
|
uvm_tracker_deinit(&tree->tracker);
|
|
uvm_kvfree(tree->root);
|
|
}
|
|
|
|
void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
|
|
{
|
|
NvU32 free_count = 0;
|
|
NvU32 i;
|
|
uvm_page_directory_t *free_queue[MAX_OPERATION_DEPTH];
|
|
uvm_page_directory_t *dir = range->table;
|
|
uvm_push_t push;
|
|
NV_STATUS status;
|
|
NvU32 invalidate_depth = 0;
|
|
|
|
// The logic of which membar is needed, and when, is pretty subtle; please
// refer to the UVM Functional Spec (section 5.1) for all the details.
uvm_membar_t membar_after_pde_clears = UVM_MEMBAR_GPU;
uvm_membar_t membar_after_invalidate = UVM_MEMBAR_GPU;
|
|
|
|
UVM_ASSERT(tree->hal->page_table_depth(range->page_size) <= MAX_OPERATION_DEPTH);
|
|
|
|
uvm_mutex_lock(&tree->lock);
|
|
|
|
// release the range
|
|
UVM_ASSERT(dir->ref_count >= range->entry_count);
|
|
dir->ref_count -= range->entry_count;
|
|
|
|
// traverse until we hit an in-use page, or the root
|
|
while (dir->host_parent != NULL && dir->ref_count == 0) {
|
|
uvm_page_directory_t *parent = dir->host_parent;
|
|
|
|
if (free_count == 0) {
|
|
|
|
// begin a push which will be submitted before the memory gets freed
|
|
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "put ptes: start: %u, count: %u",
|
|
range->start_index, range->entry_count);
|
|
// Failure to get a push can only happen if we've hit a fatal UVM
|
|
// channel error. We can't perform the unmap, so just leave things
|
|
// in place for debug.
|
|
if (status != NV_OK) {
|
|
UVM_ASSERT(status == uvm_global_get_status());
|
|
dir->ref_count += range->entry_count;
|
|
uvm_mutex_unlock(&tree->lock);
|
|
return;
|
|
}
|
|
}
|
|
|
|
// All writes can be pipelined as put_ptes() cannot be called with any
|
|
// operations pending on the affected PTEs and PDEs.
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
|
|
|
|
// Don't issue any membars as part of the clear, a single membar will be
|
|
// done below before the invalidate.
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
|
pde_clear(tree, dir->host_parent, dir->index_in_parent, range->page_size, &push);
|
|
|
|
invalidate_depth = dir->host_parent->depth;
|
|
|
|
// If any of the pointed to PDEs were in sysmem then a SYS membar is
|
|
// required after the TLB invalidate.
|
|
if (dir->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
|
|
membar_after_invalidate = UVM_MEMBAR_SYS;
|
|
|
|
// If any of the cleared PDEs were in sysmem then a SYS membar is
|
|
// required after the clears and before the TLB invalidate.
|
|
if (dir->host_parent->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
|
|
membar_after_pde_clears = UVM_MEMBAR_SYS;
|
|
|
|
// Add this dir to the queue of directories that should be freed once
|
|
// the tracker value of the associated PDE writes is known.
|
|
UVM_ASSERT(free_count < tree->hal->page_table_depth(range->page_size));
|
|
free_queue[free_count++] = dir;
|
|
|
|
dir = parent;
|
|
}
|
|
|
|
if (free_count == 0) {
|
|
uvm_mutex_unlock(&tree->lock);
|
|
return;
|
|
}
|
|
|
|
tree->gpu->parent->host_hal->wait_for_idle(&push);
|
|
uvm_hal_membar(tree->gpu, &push, membar_after_pde_clears);
|
|
tree->gpu->parent->host_hal->tlb_invalidate_all(&push,
|
|
uvm_page_tree_pdb(tree)->addr,
|
|
invalidate_depth,
|
|
membar_after_invalidate);
|
|
|
|
// We just did the appropriate membar above, so no need for another one in
// push_end(). Note that this only holds for now: if the L2 bypass path changes
// to only require a GPU membar between the PDE writes and the TLB invalidate,
// we'll need to push a sysmembar here so the end-of-push semaphore is ordered
// behind the PDE writes.
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
|
|
page_tree_end(tree, &push);
|
|
page_tree_tracker_overwrite_with_push(tree, &push);
|
|
|
|
// now that we've traversed all the way up the tree, free everything
|
|
for (i = 0; i < free_count; i++) {
|
|
phys_mem_deallocate(tree, &free_queue[i]->phys_alloc);
|
|
uvm_kvfree(free_queue[i]);
|
|
}
|
|
|
|
uvm_mutex_unlock(&tree->lock);
|
|
}
|
|
|
|
void uvm_page_tree_put_ptes(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
|
|
{
|
|
uvm_page_tree_put_ptes_async(tree, range);
|
|
(void)uvm_page_tree_wait(tree);
|
|
}
|
|
|
|
NV_STATUS uvm_page_tree_wait(uvm_page_tree_t *tree)
|
|
{
|
|
NV_STATUS status;
|
|
|
|
uvm_mutex_lock(&tree->lock);
|
|
|
|
status = uvm_tracker_wait(&tree->tracker);
|
|
|
|
uvm_mutex_unlock(&tree->lock);
|
|
|
|
return status;
|
|
}
|
|
|
|
static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
NvU64 start,
|
|
NvLength size,
|
|
uvm_page_table_range_t *range,
|
|
NvU32 *cur_depth,
|
|
uvm_page_directory_t **dir_cache)
|
|
{
|
|
uvm_mmu_mode_hal_t *hal = tree->hal;
|
|
|
|
// bit index just beyond the most significant bit used to index the current entry
|
|
NvU32 addr_bit_shift = hal->num_va_bits();
|
|
|
|
// track the depth at which the invalidate occurred
|
|
NvS32 invalidate_depth = -1;
|
|
uvm_page_directory_t *dir = tree->root;
|
|
|
|
// directories used in attempt
|
|
NvU32 used_count = 0;
|
|
NvU32 i;
|
|
uvm_page_directory_t *dirs_used[MAX_OPERATION_DEPTH];
|
|
|
|
uvm_assert_mutex_locked(&tree->lock);
|
|
|
|
UVM_ASSERT(is_power_of_2(page_size));
|
|
|
|
// ensure that the caller has specified a valid page size
|
|
UVM_ASSERT((page_size & hal->page_sizes()) != 0);
|
|
|
|
// This algorithm will work with unaligned ranges, but the caller's intent is unclear
|
|
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0, "start 0x%llx size 0x%zx page_size 0x%x",
|
|
start, (size_t)size, page_size);
|
|
|
|
// The GPU should be capable of addressing the passed range
|
|
UVM_ASSERT(uvm_gpu_can_address(tree->gpu, start, size));
|
|
|
|
while (true) {
|
|
|
|
// index of the entry, for the first byte of the range, within its containing directory
|
|
NvU32 start_index;
|
|
|
|
// index of the entry, for the last byte of the range, within its containing directory
|
|
NvU32 end_index;
|
|
|
|
// pointer to PDE/PTE
|
|
uvm_page_directory_t **entry;
|
|
NvU32 index_bits = hal->index_bits(dir->depth, page_size);
|
|
|
|
addr_bit_shift -= index_bits;
|
|
start_index = entry_index_from_vaddr(start, addr_bit_shift, index_bits);
|
|
end_index = entry_index_from_vaddr(start + size - 1, addr_bit_shift, index_bits);
|
|
|
|
UVM_ASSERT(start_index <= end_index && end_index < (1 << index_bits));
|
|
|
|
entry = dir->entries + index_to_entry(hal, start_index, dir->depth, page_size);
|
|
|
|
if (dir->depth == hal->page_table_depth(page_size)) {
|
|
page_table_range_init(range, page_size, dir, start_index, end_index);
|
|
break;
|
|
}
|
|
else {
|
|
UVM_ASSERT(start_index == end_index);
|
|
|
|
if (*entry == NULL) {
|
|
if (dir_cache[dir->depth] == NULL) {
|
|
*cur_depth = dir->depth;
|
|
|
|
// Undo the changes to the tree so that the dir cache remains private to the thread
|
|
for (i = 0; i < used_count; i++)
|
|
host_pde_clear(tree, dirs_used[i]->host_parent, dirs_used[i]->index_in_parent, page_size);
|
|
|
|
return NV_ERR_MORE_PROCESSING_REQUIRED;
|
|
}
|
|
|
|
*entry = host_pde_write(dir_cache[dir->depth], dir, start_index);
|
|
dirs_used[used_count++] = *entry;
|
|
|
|
if (invalidate_depth == -1)
|
|
invalidate_depth = dir->depth;
|
|
}
|
|
}
|
|
dir = *entry;
|
|
}
|
|
|
|
free_unused_directories(tree, used_count, dirs_used, dir_cache);
|
|
return write_gpu_state(tree, page_size, invalidate_depth, used_count, dirs_used);
|
|
}
|
|
|
|
static NV_STATUS map_remap(uvm_page_tree_t *tree, NvU64 start, NvLength size, uvm_page_table_range_t *range)
|
|
{
|
|
NV_STATUS status;
|
|
uvm_push_t push;
|
|
NvU32 page_sizes;
|
|
uvm_mmu_page_table_alloc_t *phys_alloc[1];
|
|
|
|
// TODO: Bug 2734399
|
|
if (range->page_size != UVM_PAGE_SIZE_512M)
|
|
return NV_OK;
|
|
|
|
UVM_ASSERT(tree->hal->entries_per_index(range->table->depth) == 1);
|
|
|
|
status = page_tree_begin_acquire(tree,
|
|
&tree->tracker,
|
|
&push,
|
|
"map remap: [0x%llx, 0x%llx), page_size: %d",
|
|
start,
|
|
start + size,
|
|
range->page_size);
|
|
if (status != NV_OK)
|
|
return status;
|
|
|
|
// By default CE operations include a MEMBAR_SYS. MEMBAR_GPU is
|
|
// sufficient when the range is allocated in VIDMEM. We must enforce the
|
|
// following ordering between operations:
|
|
// PDE write -> TLB invalidate -> MMU fills.
|
|
if (uvm_page_table_range_aperture(range) == UVM_APERTURE_VID)
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
|
|
|
|
phys_alloc[0] = &tree->map_remap.pde0;
|
|
pde_fill(tree,
|
|
range->table->depth,
|
|
&range->table->phys_alloc,
|
|
range->start_index,
|
|
range->entry_count,
|
|
(uvm_mmu_page_table_alloc_t **)&phys_alloc,
|
|
&push);
|
|
|
|
tree->gpu->parent->host_hal->wait_for_idle(&push);
|
|
|
|
// Invalidate all the supported page sizes smaller than or equal to
|
|
// range->page_size, because the GPU TLBs may cache invalid entries using
|
|
// any page size they decide, including the smallest one.
|
|
page_sizes = (range->page_size | (range->page_size - 1)) & tree->hal->page_sizes();
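
// For example, with range->page_size == UVM_PAGE_SIZE_512M the expression
// above sets every page-size bit at or below 512M, so the invalidate below
// also covers 4K, 64K and 2M entries (whichever of those the HAL reports as
// supported).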
|
|
|
|
// No membar is needed, any in-flight access to this range may fault and a
|
|
// lazy or delayed invalidate will evict the potential stale/invalid TLB
|
|
// entry.
|
|
uvm_tlb_batch_single_invalidate(tree, &push, start, size, page_sizes, UVM_MEMBAR_NONE);
|
|
|
|
page_tree_end(tree, &push);
|
|
page_tree_tracker_overwrite_with_push(tree, &push);
|
|
return NV_OK;
|
|
}
|
|
|
|
NV_STATUS uvm_page_tree_get_ptes_async(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
NvU64 start,
|
|
NvLength size,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_page_table_range_t *range)
|
|
{
|
|
NV_STATUS status;
|
|
NvU32 cur_depth = 0;
|
|
uvm_page_directory_t *dir_cache[MAX_OPERATION_DEPTH];
|
|
memset(dir_cache, 0, sizeof(dir_cache));
|
|
|
|
uvm_mutex_lock(&tree->lock);
|
|
while ((status = try_get_ptes(tree,
|
|
page_size,
|
|
start,
|
|
size,
|
|
range,
|
|
&cur_depth,
|
|
dir_cache)) == NV_ERR_MORE_PROCESSING_REQUIRED) {
|
|
uvm_mutex_unlock(&tree->lock);
|
|
|
|
// try_get_ptes never needs depth 0, so store a directory at its parent's depth
|
|
// TODO: Bug 1766655: Allocate everything below cur_depth instead of
|
|
// retrying for every level.
|
|
dir_cache[cur_depth] = allocate_directory(tree, page_size, cur_depth + 1, pmm_flags);
|
|
if (dir_cache[cur_depth] == NULL) {
|
|
uvm_mutex_lock(&tree->lock);
|
|
free_unused_directories(tree, 0, NULL, dir_cache);
|
|
uvm_mutex_unlock(&tree->lock);
|
|
return NV_ERR_NO_MEMORY;
|
|
}
|
|
|
|
uvm_mutex_lock(&tree->lock);
|
|
}
|
|
|
|
if ((status == NV_OK) && tree->gpu->parent->map_remap_larger_page_promotion)
|
|
status = map_remap(tree, start, size, range);
|
|
|
|
uvm_mutex_unlock(&tree->lock);
|
|
|
|
return status;
|
|
}
|
|
|
|
NV_STATUS uvm_page_tree_get_ptes(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
NvU64 start,
|
|
NvLength size,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_page_table_range_t *range)
|
|
{
|
|
NV_STATUS status = uvm_page_tree_get_ptes_async(tree, page_size, start, size, pmm_flags, range);
|
|
if (status != NV_OK)
|
|
return status;
|
|
|
|
return uvm_page_tree_wait(tree);
|
|
}
|
|
|
|
void uvm_page_table_range_get_upper(uvm_page_tree_t *tree,
|
|
uvm_page_table_range_t *existing,
|
|
uvm_page_table_range_t *upper,
|
|
NvU32 num_upper_pages)
|
|
{
|
|
NvU32 upper_start_index = existing->start_index + (existing->entry_count - num_upper_pages);
|
|
NvU32 upper_end_index = upper_start_index + num_upper_pages - 1;
|
|
|
|
UVM_ASSERT(num_upper_pages);
|
|
UVM_ASSERT(num_upper_pages <= existing->entry_count);
|
|
|
|
uvm_mutex_lock(&tree->lock);
|
|
page_table_range_init(upper, existing->page_size, existing->table, upper_start_index, upper_end_index);
|
|
uvm_mutex_unlock(&tree->lock);
|
|
}
|
|
|
|
void uvm_page_table_range_shrink(uvm_page_tree_t *tree, uvm_page_table_range_t *range, NvU32 new_page_count)
|
|
{
|
|
UVM_ASSERT(range->entry_count >= new_page_count);
|
|
|
|
if (new_page_count > 0) {
|
|
// Take a ref count on the smaller portion of the PTEs, then drop the
|
|
// entire old range.
|
|
uvm_mutex_lock(&tree->lock);
|
|
|
|
UVM_ASSERT(range->table->ref_count >= range->entry_count);
|
|
range->table->ref_count -= (range->entry_count - new_page_count);
|
|
|
|
uvm_mutex_unlock(&tree->lock);
|
|
|
|
range->entry_count = new_page_count;
|
|
}
|
|
else {
|
|
uvm_page_tree_put_ptes(tree, range);
|
|
}
|
|
}
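
// For example, shrinking a 10-entry range to 4 entries drops 6 references on
// the backing table and leaves entry_count == 4, while shrinking to 0 simply
// puts the whole range.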
|
|
|
|
NV_STATUS uvm_page_tree_get_entry(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
NvU64 start,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_page_table_range_t *single)
|
|
{
|
|
NV_STATUS status = uvm_page_tree_get_ptes(tree, page_size, start, page_size, pmm_flags, single);
|
|
UVM_ASSERT(single->entry_count == 1);
|
|
return status;
|
|
}
|
|
|
|
void uvm_page_tree_write_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *single, uvm_push_t *push)
|
|
{
|
|
UVM_ASSERT(single->entry_count == 1);
|
|
pde_write(tree, single->table, single->start_index, false, push);
|
|
}
|
|
|
|
void uvm_page_tree_clear_pde(uvm_page_tree_t *tree, uvm_page_table_range_t *single, uvm_push_t *push)
|
|
{
|
|
UVM_ASSERT(single->entry_count == 1);
|
|
pde_write(tree, single->table, single->start_index, true, push);
|
|
}
|
|
|
|
static NV_STATUS poison_ptes(uvm_page_tree_t *tree,
|
|
uvm_page_directory_t *pte_dir,
|
|
uvm_page_directory_t *parent,
|
|
NvU32 page_size)
|
|
{
|
|
NV_STATUS status;
|
|
uvm_push_t push;
|
|
|
|
uvm_assert_mutex_locked(&tree->lock);
|
|
|
|
UVM_ASSERT(pte_dir->depth == tree->hal->page_table_depth(page_size));
|
|
|
|
status = page_tree_begin_acquire(tree, &tree->tracker, &push, "Poisoning child table of page size %u", page_size);
|
|
if (status != NV_OK)
|
|
return status;
|
|
|
|
tree->gpu->parent->ce_hal->memset_8(&push,
|
|
uvm_gpu_address_from_phys(pte_dir->phys_alloc.addr),
|
|
tree->hal->poisoned_pte(),
|
|
pte_dir->phys_alloc.size);
|
|
|
|
// If both the new PTEs and the parent PDE are in vidmem, then a GPU-
|
|
// local membar is enough to keep the memset of the PTEs ordered with
|
|
// any later write of the PDE. Otherwise we need a sysmembar. See the
|
|
// comments in write_gpu_state.
|
|
if (pte_dir->phys_alloc.addr.aperture == UVM_APERTURE_VID &&
|
|
parent->phys_alloc.addr.aperture == UVM_APERTURE_VID)
|
|
uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
|
|
|
|
page_tree_end(tree, &push);
|
|
|
|
// The push acquired the tracker so it's ok to just overwrite it with
|
|
// the entry tracking the push.
|
|
page_tree_tracker_overwrite_with_push(tree, &push);
|
|
|
|
return NV_OK;
|
|
}
|
|
|
|
NV_STATUS uvm_page_tree_alloc_table(uvm_page_tree_t *tree,
|
|
NvU32 page_size,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_page_table_range_t *single,
|
|
uvm_page_table_range_t *children)
|
|
{
|
|
bool should_free = false;
|
|
uvm_page_directory_t **entry;
|
|
uvm_page_directory_t *dir;
|
|
NV_STATUS status = NV_OK;
|
|
|
|
UVM_ASSERT(single->entry_count == 1);
|
|
|
|
entry = single->table->entries + index_to_entry(tree->hal,
|
|
single->start_index,
|
|
single->table->depth,
|
|
page_size);
|
|
|
|
dir = allocate_directory(tree, page_size, single->table->depth + 1, pmm_flags);
|
|
if (dir == NULL)
|
|
return NV_ERR_NO_MEMORY;
|
|
|
|
uvm_mutex_lock(&tree->lock);
|
|
|
|
// The caller is responsible for initializing this table, so enforce that on
|
|
// debug builds.
|
|
if (UVM_IS_DEBUG()) {
|
|
status = poison_ptes(tree, dir, single->table, page_size);
|
|
if (status != NV_OK)
|
|
goto out;
|
|
}
|
|
|
|
status = uvm_tracker_wait(&tree->tracker);
|
|
if (status != NV_OK)
|
|
goto out;
|
|
|
|
// the range always refers to the entire page table
|
|
children->start_index = 0;
|
|
children->entry_count = 1 << tree->hal->index_bits(dir->depth, page_size);
|
|
children->page_size = page_size;
|
|
|
|
// is this entry currently unassigned?
|
|
if (*entry == NULL) {
|
|
children->table = dir;
|
|
*entry = dir;
|
|
host_pde_write(dir, single->table, single->start_index);
|
|
}
|
|
else {
|
|
should_free = true;
|
|
children->table = *entry;
|
|
}
|
|
children->table->ref_count += children->entry_count;
|
|
|
|
out:
|
|
if (should_free || status != NV_OK) {
|
|
phys_mem_deallocate(tree, &dir->phys_alloc);
|
|
uvm_kvfree(dir);
|
|
}
|
|
uvm_mutex_unlock(&tree->lock);
|
|
|
|
return status;
|
|
}
|
|
|
|
static size_t range_vec_calc_range_count(uvm_page_table_range_vec_t *range_vec)
|
|
{
|
|
NvU64 pde_coverage = uvm_mmu_pde_coverage(range_vec->tree, range_vec->page_size);
|
|
NvU64 aligned_start = UVM_ALIGN_DOWN(range_vec->start, pde_coverage);
|
|
NvU64 aligned_end = UVM_ALIGN_UP(range_vec->start + range_vec->size, pde_coverage);
|
|
size_t count = uvm_div_pow2_64(aligned_end - aligned_start, pde_coverage);
|
|
|
|
UVM_ASSERT(count != 0);
|
|
|
|
return count;
|
|
}
|
|
|
|
static NvU64 range_vec_calc_range_start(uvm_page_table_range_vec_t *range_vec, size_t i)
|
|
{
|
|
NvU64 pde_coverage = uvm_mmu_pde_coverage(range_vec->tree, range_vec->page_size);
|
|
NvU64 aligned_start = UVM_ALIGN_DOWN(range_vec->start, pde_coverage);
|
|
NvU64 range_start = aligned_start + i * pde_coverage;
|
|
return max(range_vec->start, range_start);
|
|
}
|
|
|
|
static NvU64 range_vec_calc_range_end(uvm_page_table_range_vec_t *range_vec, size_t i)
|
|
{
|
|
NvU64 pde_coverage = uvm_mmu_pde_coverage(range_vec->tree, range_vec->page_size);
|
|
NvU64 range_start = range_vec_calc_range_start(range_vec, i);
|
|
NvU64 max_range_end = UVM_ALIGN_UP(range_start + 1, pde_coverage);
|
|
return min(range_vec->start + range_vec->size, max_range_end);
|
|
}
|
|
|
|
static NvU64 range_vec_calc_range_size(uvm_page_table_range_vec_t *range_vec, size_t i)
|
|
{
|
|
return range_vec_calc_range_end(range_vec, i) - range_vec_calc_range_start(range_vec, i);
|
|
}
|
|
|
|
static size_t range_vec_calc_range_index(uvm_page_table_range_vec_t *range_vec, NvU64 addr)
|
|
{
|
|
NvU64 pde_coverage = uvm_mmu_pde_coverage(range_vec->tree, range_vec->page_size);
|
|
NvU64 aligned_start = UVM_ALIGN_DOWN(range_vec->start, pde_coverage);
|
|
NvU64 aligned_addr = UVM_ALIGN_DOWN(addr, pde_coverage);
|
|
UVM_ASSERT(addr >= range_vec->start);
|
|
UVM_ASSERT(addr < range_vec->start + range_vec->size);
|
|
return (size_t)uvm_div_pow2_64(aligned_addr - aligned_start, pde_coverage);
|
|
}
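
// Worked example (assuming a PDE coverage of 1GB purely for illustration): a
// range vec spanning [0x3C000000, 0x44000000) is split at the 1GB boundary
// into [0x3C000000, 0x40000000) and [0x40000000, 0x44000000), so
// range_vec_calc_range_count() returns 2 and
// range_vec_calc_range_index(range_vec, 0x40000000) returns 1.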
|
|
|
|
NV_STATUS uvm_page_table_range_vec_init(uvm_page_tree_t *tree,
|
|
NvU64 start,
|
|
NvU64 size,
|
|
NvU32 page_size,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_page_table_range_vec_t *range_vec)
|
|
{
|
|
NV_STATUS status;
|
|
size_t i;
|
|
|
|
UVM_ASSERT(size != 0);
|
|
UVM_ASSERT_MSG(IS_ALIGNED(start, page_size), "start 0x%llx page_size 0x%x\n", start, page_size);
|
|
UVM_ASSERT_MSG(IS_ALIGNED(size, page_size), "size 0x%llx page_size 0x%x\n", size, page_size);
|
|
|
|
range_vec->tree = tree;
|
|
range_vec->page_size = page_size;
|
|
range_vec->start = start;
|
|
range_vec->size = size;
|
|
range_vec->range_count = range_vec_calc_range_count(range_vec);
|
|
|
|
range_vec->ranges = uvm_kvmalloc_zero(sizeof(*range_vec->ranges) * range_vec->range_count);
|
|
if (!range_vec->ranges) {
|
|
status = NV_ERR_NO_MEMORY;
|
|
goto out;
|
|
}
|
|
|
|
for (i = 0; i < range_vec->range_count; ++i) {
|
|
uvm_page_table_range_t *range = &range_vec->ranges[i];
|
|
|
|
NvU64 range_start = range_vec_calc_range_start(range_vec, i);
|
|
NvU64 range_size = range_vec_calc_range_size(range_vec, i);
|
|
|
|
status = uvm_page_tree_get_ptes_async(tree,
|
|
page_size,
|
|
range_start,
|
|
range_size,
|
|
pmm_flags,
|
|
range);
|
|
if (status != NV_OK) {
|
|
UVM_ERR_PRINT("Failed to get PTEs for subrange %zd [0x%llx, 0x%llx) size 0x%llx, part of [0x%llx, 0x%llx)\n",
|
|
i, range_start, range_start + range_size, range_size,
|
|
start, size);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
status = uvm_page_tree_wait(tree);
|
|
|
|
out:
|
|
if (status != NV_OK)
|
|
uvm_page_table_range_vec_deinit(range_vec);
|
|
return status;
|
|
}
|
|
|
|
NV_STATUS uvm_page_table_range_vec_create(uvm_page_tree_t *tree,
|
|
NvU64 start,
|
|
NvU64 size,
|
|
NvU32 page_size,
|
|
uvm_pmm_alloc_flags_t pmm_flags,
|
|
uvm_page_table_range_vec_t **range_vec_out)
|
|
{
|
|
NV_STATUS status;
|
|
uvm_page_table_range_vec_t *range_vec;
|
|
|
|
range_vec = uvm_kvmalloc(sizeof(*range_vec));
|
|
if (!range_vec)
|
|
return NV_ERR_NO_MEMORY;
|
|
|
|
status = uvm_page_table_range_vec_init(tree, start, size, page_size, pmm_flags, range_vec);
|
|
if (status != NV_OK)
|
|
goto error;
|
|
|
|
*range_vec_out = range_vec;
|
|
|
|
return NV_OK;
|
|
|
|
error:
|
|
uvm_kvfree(range_vec);
|
|
return status;
|
|
}
|
|
|
|
NV_STATUS uvm_page_table_range_vec_split_upper(uvm_page_table_range_vec_t *range_vec,
|
|
NvU64 new_end,
|
|
uvm_page_table_range_vec_t *new_range_vec)
|
|
{
|
|
size_t split_index;
|
|
size_t num_remaining_pages = 0;
|
|
uvm_page_table_range_t *range = NULL;
|
|
NvU64 pde_coverage = uvm_mmu_pde_coverage(range_vec->tree, range_vec->page_size);
|
|
NvU64 new_start = new_end + 1;
|
|
NvU64 range_base_va = UVM_ALIGN_DOWN(new_start, pde_coverage);
|
|
|
|
UVM_ASSERT(new_range_vec);
|
|
UVM_ASSERT(new_end != 0);
|
|
UVM_ASSERT(new_end > range_vec->start && new_end < range_vec->start + range_vec->size);
|
|
UVM_ASSERT(IS_ALIGNED(new_start, range_vec->page_size));
|
|
UVM_ASSERT(range_vec->range_count > 0);
|
|
|
|
split_index = range_vec_calc_range_index(range_vec, new_start);
|
|
range = &range_vec->ranges[split_index];
|
|
if (range_vec->start > range_base_va)
|
|
num_remaining_pages = (new_start - range_vec->start) / range_vec->page_size;
|
|
else
|
|
num_remaining_pages = (new_start - range_base_va) / range_vec->page_size;
|
|
|
|
new_range_vec->tree = range_vec->tree;
|
|
new_range_vec->page_size = range_vec->page_size;
|
|
new_range_vec->start = new_start;
|
|
new_range_vec->size = range_vec->size - (new_range_vec->start - range_vec->start);
|
|
new_range_vec->range_count = range_vec->range_count - split_index;
|
|
|
|
new_range_vec->ranges = uvm_kvmalloc_zero(sizeof(*new_range_vec->ranges) * new_range_vec->range_count);
|
|
if (!new_range_vec->ranges)
|
|
return NV_ERR_NO_MEMORY;
|
|
|
|
// Handle splitting a range
|
|
if (num_remaining_pages) {
|
|
uvm_page_table_range_get_upper(range_vec->tree,
|
|
range,
|
|
&new_range_vec->ranges[0],
|
|
range->entry_count - num_remaining_pages);
|
|
uvm_page_table_range_shrink(range_vec->tree, range, num_remaining_pages);
|
|
}
|
|
|
|
// Copy the remainder of the ranges from the existing vector to the new one.
|
|
memcpy(new_range_vec->ranges + !!num_remaining_pages,
|
|
range_vec->ranges + split_index + !!num_remaining_pages,
|
|
(new_range_vec->range_count - !!num_remaining_pages) * sizeof(*range));
|
|
|
|
// Adjust the coverage of range_vec. If the re-allocation of the ranges array
|
|
// fails, the old array is left untouched.
|
|
range_vec->size -= new_range_vec->size;
|
|
range_vec->range_count -= new_range_vec->range_count - !!num_remaining_pages;
|
|
range = uvm_kvrealloc(range_vec->ranges, range_vec->range_count * sizeof(*range));
|
|
if (range)
|
|
range_vec->ranges = range;
|
|
|
|
return NV_OK;
|
|
}
|
|
|
|
NV_STATUS uvm_page_table_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t tlb_membar)
|
|
{
|
|
NV_STATUS status = NV_OK;
|
|
NV_STATUS tracker_status;
|
|
size_t i;
|
|
uvm_page_tree_t *tree = range_vec->tree;
|
|
uvm_gpu_t *gpu = tree->gpu;
|
|
NvU32 page_size = range_vec->page_size;
|
|
NvU32 entry_size = uvm_mmu_pte_size(tree, page_size);
|
|
NvU64 invalid_pte = 0;
|
|
uvm_push_t push;
|
|
uvm_tracker_t tracker = UVM_TRACKER_INIT();
|
|
|
|
uvm_pte_batch_t pte_batch;
|
|
|
|
UVM_ASSERT(range_vec);
|
|
UVM_ASSERT(tree);
|
|
UVM_ASSERT(gpu);
|
|
|
|
i = 0;
|
|
while (i < range_vec->range_count) {
|
|
// Acquiring the previous push is not necessary for correctness as all
|
|
// the memsets can be done independently, but scheduling a lot of
|
|
// independent work for a big range could end up hogging the GPU
|
|
// for a long time while not providing much improvement.
|
|
status = page_tree_begin_acquire(tree, &tracker, &push, "Clearing PTEs for [0x%llx, 0x%llx)",
|
|
range_vec->start, range_vec->start + range_vec->size);
|
|
|
|
if (status != NV_OK)
|
|
goto done;
|
|
|
|
uvm_pte_batch_begin(&push, &pte_batch);
|
|
|
|
for (; i < range_vec->range_count; ++i) {
|
|
uvm_page_table_range_t *range = &range_vec->ranges[i];
|
|
uvm_gpu_phys_address_t first_entry_pa = uvm_page_table_range_entry_address(tree, range, 0);
|
|
uvm_pte_batch_clear_ptes(&pte_batch, first_entry_pa, invalid_pte, entry_size, range->entry_count);
|
|
|
|
if (!uvm_push_has_space(&push, 512)) {
|
|
// Stop pushing the clears once we get close to a full push
|
|
break;
|
|
}
|
|
}
|
|
|
|
uvm_pte_batch_end(&pte_batch);
|
|
|
|
if (i == range_vec->range_count)
|
|
uvm_tlb_batch_single_invalidate(tree, &push, range_vec->start, range_vec->size, page_size, tlb_membar);
|
|
|
|
page_tree_end(tree, &push);
|
|
|
|
// Skip the tracking if in unit test mode
|
|
if (!tree->gpu->channel_manager)
|
|
continue;
|
|
|
|
// The push acquired the tracker so it's ok to just overwrite it with
|
|
// the entry tracking the push.
|
|
uvm_tracker_overwrite_with_push(&tracker, &push);
|
|
}
|
|
|
|
done:
|
|
tracker_status = uvm_tracker_wait_deinit(&tracker);
|
|
if (status == NV_OK)
|
|
status = tracker_status;
|
|
|
|
return status;
|
|
}
|
|
|
|
void uvm_page_table_range_vec_deinit(uvm_page_table_range_vec_t *range_vec)
|
|
{
|
|
size_t i;
|
|
if (!range_vec)
|
|
return;
|
|
|
|
if (range_vec->ranges) {
|
|
for (i = 0; i < range_vec->range_count; ++i) {
|
|
uvm_page_table_range_t *range = &range_vec->ranges[i];
|
|
if (!range->entry_count)
|
|
break;
|
|
uvm_page_tree_put_ptes_async(range_vec->tree, range);
|
|
}
|
|
(void)uvm_page_tree_wait(range_vec->tree);
|
|
|
|
uvm_kvfree(range_vec->ranges);
|
|
}
|
|
|
|
memset(range_vec, 0, sizeof(*range_vec));
|
|
}
|
|
|
|
void uvm_page_table_range_vec_destroy(uvm_page_table_range_vec_t *range_vec)
|
|
{
|
|
if (!range_vec)
|
|
return;
|
|
|
|
uvm_page_table_range_vec_deinit(range_vec);
|
|
|
|
uvm_kvfree(range_vec);
|
|
}
|
|
|
|
NV_STATUS uvm_page_table_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec,
                                              uvm_membar_t tlb_membar,
                                              uvm_page_table_range_pte_maker_t pte_maker,
                                              void *caller_data)
{
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status;
    NvU32 entry;
    size_t i;
    uvm_page_tree_t *tree = range_vec->tree;
    uvm_gpu_t *gpu = tree->gpu;
    NvU32 entry_size = uvm_mmu_pte_size(tree, range_vec->page_size);

    uvm_push_t push;
    uvm_pte_batch_t pte_batch;
    NvU64 offset = 0;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    bool last_push = false;

    // Use as much push space as possible leaving 1K of margin
    static const NvU32 max_total_entry_size_per_push = UVM_MAX_PUSH_SIZE - 1024;

    NvU32 max_entries_per_push = max_total_entry_size_per_push / entry_size;

    for (i = 0; i < range_vec->range_count; ++i) {
        uvm_page_table_range_t *range = &range_vec->ranges[i];
        NvU64 range_start = range_vec_calc_range_start(range_vec, i);
        NvU64 range_size = range_vec_calc_range_size(range_vec, i);
        uvm_gpu_phys_address_t entry_addr = uvm_page_table_range_entry_address(tree, range, 0);
        entry = 0;

        while (entry < range->entry_count) {
            NvU32 entry_limit_this_push = min(range->entry_count, entry + max_entries_per_push);

            // Acquiring the previous push is not necessary for correctness as all
            // the PTE writes can be done independently, but scheduling a lot of
            // independent work for a big range could end up hogging the GPU
            // for a long time while not providing much improvement.
            status = page_tree_begin_acquire(tree,
                                             &tracker,
                                             &push,
                                             "Writing PTEs for range at [0x%llx, 0x%llx), subrange of range vec at [0x%llx, 0x%llx)",
                                             range_start,
                                             range_start + range_size,
                                             range_vec->start,
                                             range_vec->start + range_vec->size);
            if (status != NV_OK) {
                UVM_ERR_PRINT("Failed to begin push for writing PTEs: %s GPU %s\n",
                              nvstatusToString(status),
                              uvm_gpu_name(gpu));
                goto done;
            }

            uvm_pte_batch_begin(&push, &pte_batch);

            for (; entry < entry_limit_this_push; ++entry) {
                NvU64 pte_bits = pte_maker(range_vec, offset, caller_data);
                uvm_pte_batch_write_pte(&pte_batch, entry_addr, pte_bits, entry_size);
                offset += range_vec->page_size;
                entry_addr.address += entry_size;
            }

            last_push = (i == range_vec->range_count - 1) && entry == range->entry_count;

            uvm_pte_batch_end(&pte_batch);

            if (last_push) {
                // Invalidate TLBs as part of the last push
                uvm_tlb_batch_single_invalidate(tree,
                                                &push,
                                                range_vec->start,
                                                range_vec->size,
                                                range_vec->page_size,
                                                tlb_membar);
            }
            else {
                // For pushes prior to the last one, uvm_pte_batch_end() has
                // already pushed a membar that's enough to order the PTE writes
                // with the TLB invalidate in the last push and that's all
                // that's needed.
                uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
            }

            page_tree_end(tree, &push);

            // Skip the tracking if in unit test mode
            if (!tree->gpu->channel_manager)
                continue;

            // The push acquired the tracker so it's ok to just overwrite it with
            // the entry tracking the push.
            uvm_tracker_overwrite_with_push(&tracker, &push);
        }
    }

done:
    tracker_status = uvm_tracker_wait_deinit(&tracker);
    if (status == NV_OK)
        status = tracker_status;
    return status;
}
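
// An identity (flat) mapping maps virtual address (base + offset) to physical
// address (phys_offset + offset) in the target aperture, so a physical address
// can be converted to its virtual alias by adding a constant base.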
typedef struct identity_mapping_pte_maker_data_struct
{
    NvU64 phys_offset;
    uvm_aperture_t aperture;
} identity_mapping_pte_maker_data_t;

static NvU64 identity_mapping_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *data)
{
    identity_mapping_pte_maker_data_t *vpdata = (identity_mapping_pte_maker_data_t *)data;
    NvU64 pte_flags = vpdata->aperture == UVM_APERTURE_VID ? UVM_MMU_PTE_FLAGS_CACHED : UVM_MMU_PTE_FLAGS_NONE;
    return range_vec->tree->hal->make_pte(vpdata->aperture,
                                          offset + vpdata->phys_offset,
                                          UVM_PROT_READ_WRITE_ATOMIC,
                                          pte_flags);
}

static NV_STATUS create_identity_mapping(uvm_gpu_t *gpu,
                                         NvU64 base,
                                         NvU64 size,
                                         uvm_aperture_t aperture,
                                         NvU64 phys_offset,
                                         NvU32 page_size,
                                         uvm_pmm_alloc_flags_t pmm_flags,
                                         uvm_page_table_range_vec_t **range_vec)
{
    NV_STATUS status;
    identity_mapping_pte_maker_data_t data =
    {
        .phys_offset = phys_offset,
        .aperture = aperture
    };

    status = uvm_page_table_range_vec_create(&gpu->address_space_tree,
                                             base,
                                             size,
                                             page_size,
                                             pmm_flags,
                                             range_vec);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to init range vec for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
                      aperture,
                      base,
                      base + size,
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    status = uvm_page_table_range_vec_write_ptes(*range_vec, UVM_MEMBAR_NONE, identity_mapping_pte_maker, &data);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Failed to write PTEs for aperture %d identity mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
                      aperture,
                      base,
                      base + size,
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        return status;
    }

    return NV_OK;
}

static void destroy_identity_mapping(uvm_gpu_identity_mapping_t *mapping)
{
    if (mapping->range_vec == NULL)
        return;

    (void)uvm_page_table_range_vec_clear_ptes(mapping->range_vec, UVM_MEMBAR_SYS);
    uvm_page_table_range_vec_destroy(mapping->range_vec);
    mapping->range_vec = NULL;
}

bool uvm_mmu_gpu_needs_static_vidmem_mapping(uvm_gpu_t *gpu)
{
    return false;
}

bool uvm_mmu_gpu_needs_dynamic_vidmem_mapping(uvm_gpu_t *gpu)
{
    return uvm_gpu_is_virt_mode_sriov_heavy(gpu);
}

bool uvm_mmu_gpu_needs_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
{
    return uvm_gpu_is_virt_mode_sriov_heavy(gpu);
}
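
// Create a static identity mapping of all of vidmem at
// gpu->parent->flat_vidmem_va_base, using the largest page size supported by
// the GPU's kernel page tree. No-op unless the GPU requires a static vidmem
// mapping.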
static NV_STATUS create_static_vidmem_mapping(uvm_gpu_t *gpu)
{
    NvU32 page_size;
    NvU64 size;
    uvm_aperture_t aperture = UVM_APERTURE_VID;
    NvU64 phys_offset = 0;
    uvm_gpu_identity_mapping_t *flat_mapping = &gpu->static_flat_mapping;

    if (!uvm_mmu_gpu_needs_static_vidmem_mapping(gpu))
        return NV_OK;

    UVM_ASSERT(!uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu));

    page_size = uvm_mmu_biggest_page_size(&gpu->address_space_tree);
    size = UVM_ALIGN_UP(gpu->mem_info.max_allocatable_address + 1, page_size);

    UVM_ASSERT(page_size);
    UVM_ASSERT(size);
    UVM_ASSERT(size <= UVM_GPU_MAX_PHYS_MEM);

    flat_mapping->base = gpu->parent->flat_vidmem_va_base;

    return create_identity_mapping(gpu,
                                   flat_mapping->base,
                                   size,
                                   aperture,
                                   phys_offset,
                                   page_size,
                                   UVM_PMM_ALLOC_FLAGS_EVICT,
                                   &flat_mapping->range_vec);
}

static void destroy_static_vidmem_mapping(uvm_gpu_t *gpu)
{
    if (!uvm_mmu_gpu_needs_static_vidmem_mapping(gpu))
        return;

    destroy_identity_mapping(&gpu->static_flat_mapping);
}
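
// Create an identity mapping of the peer GPU's vidmem through the peer
// aperture, so virtual peer copies can address peer memory by physical offset.
// Only needed when peer copies use virtual addressing; for NVSwitch-connected
// peers the fabric memory window start is added to the generated PTEs.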
NV_STATUS uvm_mmu_create_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
{
    NvU32 page_size;
    NvU64 size;
    uvm_aperture_t aperture;
    NvU64 phys_offset;
    uvm_gpu_identity_mapping_t *peer_mapping;

    if (gpu->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_VIRTUAL || peer->mem_info.size == 0)
        return NV_OK;

    page_size = uvm_mmu_biggest_page_size(&gpu->address_space_tree);
    size = UVM_ALIGN_UP(peer->mem_info.max_allocatable_address + 1, page_size);
    aperture = uvm_gpu_peer_aperture(gpu, peer);
    peer_mapping = uvm_gpu_get_peer_mapping(gpu, peer->id);
    phys_offset = 0ULL;

    if (uvm_gpus_are_nvswitch_connected(gpu, peer)) {
        // Add the 47-bit physical address routing bits for this peer to the
        // generated PTEs
        phys_offset = peer->parent->nvswitch_info.fabric_memory_window_start;
    }

    UVM_ASSERT(page_size);
    UVM_ASSERT(size);
    UVM_ASSERT(size <= UVM_PEER_IDENTITY_VA_SIZE);
    UVM_ASSERT(peer_mapping->base);

    return create_identity_mapping(gpu,
                                   peer_mapping->base,
                                   size,
                                   aperture,
                                   phys_offset,
                                   page_size,
                                   UVM_PMM_ALLOC_FLAGS_EVICT,
                                   &peer_mapping->range_vec);
}

void uvm_mmu_destroy_peer_identity_mappings(uvm_gpu_t *gpu, uvm_gpu_t *peer)
{
    if (gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL)
        destroy_identity_mapping(uvm_gpu_get_peer_mapping(gpu, peer->id));
}

void uvm_mmu_init_gpu_chunk_sizes(uvm_parent_gpu_t *parent_gpu)
{
    uvm_chunk_sizes_mask_t sizes = page_sizes_for_big_page_size(parent_gpu, UVM_PAGE_SIZE_64K) |
                                   page_sizes_for_big_page_size(parent_gpu, UVM_PAGE_SIZE_128K) |
                                   PAGE_SIZE;

    // Although we may have to map PTEs smaller than PAGE_SIZE, user (managed)
    // memory is never allocated with granularity smaller than PAGE_SIZE. Force
    // PAGE_SIZE to be supported and the smallest allowed size so we don't have
    // to handle allocating multiple chunks per page.
    parent_gpu->mmu_user_chunk_sizes = sizes & PAGE_MASK;

    // Ampere+ GPUs support the 512MB page size, but the maximum chunk size is
    // 2MB (i.e., UVM_CHUNK_SIZE_MAX), so mask out any supported page size
    // greater than UVM_CHUNK_SIZE_MAX from the chunk size list.
    parent_gpu->mmu_user_chunk_sizes &= UVM_CHUNK_SIZES_MASK;

    parent_gpu->mmu_kernel_chunk_sizes = allocation_sizes_for_big_page_size(parent_gpu, UVM_PAGE_SIZE_64K) |
                                         allocation_sizes_for_big_page_size(parent_gpu, UVM_PAGE_SIZE_128K);
}

void uvm_mmu_init_gpu_peer_addresses(uvm_gpu_t *gpu)
{
    if (gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_VIRTUAL) {
        uvm_gpu_id_t gpu_id;

        for_each_gpu_id(gpu_id) {
            uvm_gpu_get_peer_mapping(gpu, gpu_id)->base = gpu->parent->rm_va_base +
                                                          gpu->parent->rm_va_size +
                                                          UVM_PEER_IDENTITY_VA_SIZE * uvm_id_gpu_index(gpu_id);
        }

        UVM_ASSERT(gpu->parent->uvm_mem_va_base >=
                   uvm_gpu_get_peer_mapping(gpu, uvm_gpu_id_from_value(UVM_ID_MAX_GPUS - 1))->base +
                   UVM_PEER_IDENTITY_VA_SIZE);
    }
}
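
// Root chunk mappings are tracked per 2MB root chunk (UVM_CHUNK_SIZE_MAX): a
// mapping's array index is its root chunk's physical address divided by
// UVM_CHUNK_SIZE_MAX. For example (illustrative only), a chunk at physical
// address 0x12345678 belongs to root chunk index 0x91, which covers
// [0x12200000, 0x12400000).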
static size_t root_chunk_mapping_index(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mapping_t *root_chunk_mapping)
{
    return root_chunk_mapping - gpu->root_chunk_mappings.array;
}

static NvU64 root_chunk_mapping_physical_address(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mapping_t *root_chunk_mapping)
{
    return UVM_CHUNK_SIZE_MAX * root_chunk_mapping_index(gpu, root_chunk_mapping);
}

static void root_chunk_mapping_lock(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mapping_t *root_chunk_mapping)
{
    uvm_bit_lock(&gpu->root_chunk_mappings.bitlocks, root_chunk_mapping_index(gpu, root_chunk_mapping));
}

static void root_chunk_mapping_unlock(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mapping_t *root_chunk_mapping)
{
    uvm_bit_unlock(&gpu->root_chunk_mappings.bitlocks, root_chunk_mapping_index(gpu, root_chunk_mapping));
}

static uvm_gpu_root_chunk_mapping_t *root_chunk_mapping_from_address(uvm_gpu_t *gpu, NvU64 addr)
{
    size_t index = addr / UVM_CHUNK_SIZE_MAX;

    UVM_ASSERT(addr <= gpu->mem_info.max_allocatable_address);

    return gpu->root_chunk_mappings.array + index;
}

static uvm_gpu_root_chunk_mapping_t *root_chunk_mapping_from_chunk(uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
{
    return root_chunk_mapping_from_address(gpu, chunk->address);
}

static void destroy_dynamic_vidmem_mapping(uvm_gpu_t *gpu)
{
    size_t i;

    if (!uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
        return;

    if (gpu->root_chunk_mappings.array == NULL)
        return;

    uvm_bit_locks_deinit(&gpu->root_chunk_mappings.bitlocks);

    for (i = 0; i < gpu->root_chunk_mappings.count; ++i) {
        uvm_gpu_root_chunk_mapping_t *root_chunk_mapping = gpu->root_chunk_mappings.array + i;

        UVM_ASSERT(root_chunk_mapping->range == NULL);
        UVM_ASSERT(root_chunk_mapping->num_mapped_pages == 0);
    }

    uvm_kvfree(gpu->root_chunk_mappings.array);
    gpu->root_chunk_mappings.array = NULL;
}

static NV_STATUS create_dynamic_vidmem_mapping(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    size_t count;

    if (!uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
        return NV_OK;

    UVM_ASSERT(!uvm_mmu_gpu_needs_static_vidmem_mapping(gpu));
    BUILD_BUG_ON(UVM_PAGE_SIZE_2M != UVM_CHUNK_SIZE_MAX);
    UVM_ASSERT(uvm_mmu_page_size_supported(&gpu->address_space_tree, UVM_PAGE_SIZE_2M));
    UVM_ASSERT(gpu->pmm.initialized);

    count = gpu->pmm.root_chunks.count;

    gpu->root_chunk_mappings.array = uvm_kvmalloc_zero(sizeof(*gpu->root_chunk_mappings.array) * count);
    if (gpu->root_chunk_mappings.array == NULL)
        return NV_ERR_NO_MEMORY;

    gpu->root_chunk_mappings.count = count;

    status = uvm_bit_locks_init(&gpu->root_chunk_mappings.bitlocks, count, UVM_LOCK_ORDER_CHUNK_MAPPING);
    if (status != NV_OK)
        goto error;

    return NV_OK;

error:
    destroy_dynamic_vidmem_mapping(gpu);
    return status;
}

// Remove a root chunk mapping from the flat mapping address segment.
//
// This function is a specialized version of destroy_identity_mapping
// because the mapping fits into a single 2MB PTE, so there is no need to use
// page table range vectors.
static void root_chunk_mapping_destroy(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mapping_t *root_chunk_mapping)
{
    NvU64 pa, va;
    uvm_gpu_address_t gpu_virtual_address;
    uvm_page_tree_t *tree;
    uvm_gpu_phys_address_t entry_addr;
    uvm_push_t push;
    NvU32 entry_size;
    uvm_pte_batch_t pte_batch;
    NvU32 page_size;
    NvU64 size;
    NvU64 invalid_pte;
    uvm_page_table_range_t *range = root_chunk_mapping->range;

    UVM_ASSERT(range != NULL);

    tree = &gpu->address_space_tree;
    pa = root_chunk_mapping_physical_address(gpu, root_chunk_mapping);
    gpu_virtual_address = uvm_gpu_address_virtual_from_vidmem_phys(gpu, pa);
    va = gpu_virtual_address.address;
    page_size = range->page_size;
    size = page_size;

    (void) page_tree_begin_acquire(tree, NULL, &push, "Removing linear mapping at [0x%llx, 0x%llx)", va, va + size);

    uvm_pte_batch_begin(&push, &pte_batch);
    entry_addr = uvm_page_table_range_entry_address(tree, range, 0);
    invalid_pte = 0;
    entry_size = uvm_mmu_pte_size(tree, page_size);
    uvm_pte_batch_clear_ptes(&pte_batch, entry_addr, invalid_pte, entry_size, range->entry_count);
    uvm_pte_batch_end(&pte_batch);

    uvm_tlb_batch_single_invalidate(tree, &push, va, size, page_size, UVM_MEMBAR_SYS);

    (void) page_tree_end_and_wait(tree, &push);

    uvm_page_tree_put_ptes(tree, range);

    uvm_kvfree(range);
    root_chunk_mapping->range = NULL;
}

// Add a root chunk mapping to the flat mapping address segment located in the
// UVM-internal GPU kernel address space.
//
// This function is a specialized version of create_identity_mapping because
// the mapping fits into a single 2MB PTE, so there is no need to use page
// table range vectors.
static NV_STATUS root_chunk_mapping_create(uvm_gpu_t *gpu, uvm_gpu_root_chunk_mapping_t *root_chunk_mapping)
{
    NV_STATUS status;
    NvU64 pa, va;
    uvm_gpu_address_t gpu_virtual_address;
    uvm_page_table_range_t *range;
    uvm_page_tree_t *tree;
    uvm_gpu_phys_address_t entry_addr;
    uvm_push_t push;
    NvU64 pte_bits;
    NvU32 entry_size;
    NvU32 page_size = UVM_CHUNK_SIZE_MAX;
    NvU64 size = UVM_CHUNK_SIZE_MAX;

    range = uvm_kvmalloc_zero(sizeof(*range));
    if (range == NULL)
        return NV_ERR_NO_MEMORY;

    tree = &gpu->address_space_tree;
    pa = root_chunk_mapping_physical_address(gpu, root_chunk_mapping);
    gpu_virtual_address = uvm_gpu_address_virtual_from_vidmem_phys(gpu, pa);
    va = gpu_virtual_address.address;

    UVM_ASSERT(IS_ALIGNED(va, page_size));

    // A VA block lock is likely to be held, so eviction is not requested for
    // allocations of page tree backing storage.
    status = uvm_page_tree_get_ptes(tree, page_size, va, size, UVM_PMM_ALLOC_FLAGS_NONE, range);
    if (status != NV_OK) {
        uvm_kvfree(range);
        return status;
    }

    UVM_ASSERT(range->entry_count == 1);

    status = page_tree_begin_acquire(tree, NULL, &push, "Adding linear mapping at [0x%llx, 0x%llx)", va, va + size);
    if (status != NV_OK)
        goto error;

    entry_addr = uvm_page_table_range_entry_address(tree, range, 0);
    pte_bits = tree->hal->make_pte(UVM_APERTURE_VID, pa, UVM_PROT_READ_WRITE_ATOMIC, UVM_MMU_PTE_FLAGS_CACHED);
    entry_size = uvm_mmu_pte_size(tree, page_size);
    uvm_pte_batch_single_write_ptes(&push, entry_addr, &pte_bits, entry_size, range->entry_count);

    uvm_tlb_batch_single_invalidate(tree, &push, va, page_size, page_size, UVM_MEMBAR_NONE);

    status = page_tree_end_and_wait(tree, &push);
    if (status != NV_OK)
        goto error;

    root_chunk_mapping->range = range;

    return NV_OK;

error:
    uvm_page_tree_put_ptes(tree, range);
    uvm_kvfree(range);

    return status;
}
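
// Map the root chunk containing the given user chunk into the GPU's flat
// vidmem mapping, creating the 2MB mapping on demand. The mapping is reference
// counted in units of PAGE_SIZE, so multiple chunk mappings within the same
// root chunk share a single PTE.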
NV_STATUS uvm_mmu_chunk_map(uvm_gpu_chunk_t *chunk)
{
    uvm_gpu_root_chunk_mapping_t *root_chunk_mapping;
    uvm_chunk_size_t chunk_size;
    NvU16 num_mapped_pages;
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(chunk);

    if (!uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
        return NV_OK;

    chunk_size = uvm_gpu_chunk_get_size(chunk);
    num_mapped_pages = chunk_size / PAGE_SIZE;

    // While there could be legitimate use cases for mapping a kernel chunk,
    // currently it is disallowed. There are a couple of reasons for this.
    // First, in SR-IOV heavy, disallowing kernel chunks prevents unintended
    // mappings of page tables/directories when they share the backing
    // allocation i.e. the memory backing the kernel chunk is in the same root
    // chunk as the memory backing a GPU page tree node. Second, root chunk
    // mappings are reference counted as multiples of PAGE_SIZE. User chunk
    // sizes are guaranteed to be a multiple of that page size, but kernel
    // chunk sizes can be smaller.
    UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));

    UVM_ASSERT(PAGE_ALIGNED(chunk_size));

    root_chunk_mapping = root_chunk_mapping_from_chunk(gpu, chunk);
    root_chunk_mapping_lock(gpu, root_chunk_mapping);

    if (root_chunk_mapping->num_mapped_pages == 0) {
        UVM_ASSERT(root_chunk_mapping->range == NULL);

        status = root_chunk_mapping_create(gpu, root_chunk_mapping);
        if (status != NV_OK)
            goto out;
    }

    // Check for reference counting overflow. The number of mapped pages can be
    // above UVM_CHUNK_SIZE_MAX / PAGE_SIZE because the same chunk can be
    // mapped multiple times.
    UVM_ASSERT((NV_U32_MAX - root_chunk_mapping->num_mapped_pages) >= num_mapped_pages);

    root_chunk_mapping->num_mapped_pages += num_mapped_pages;

out:
    root_chunk_mapping_unlock(gpu, root_chunk_mapping);
    return status;
}
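
// Drop the chunk's contribution to its root chunk mapping's reference count,
// waiting on the given tracker first, and tear down the 2MB mapping once the
// count reaches zero.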
void uvm_mmu_chunk_unmap(uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker)
{
    uvm_gpu_root_chunk_mapping_t *root_chunk_mapping;
    uvm_chunk_size_t chunk_size;
    NvU16 num_unmapped_pages;
    uvm_gpu_t *gpu;

    if (chunk == NULL)
        return;

    gpu = uvm_gpu_chunk_get_gpu(chunk);
    if (!uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu))
        return;

    if (tracker != NULL)
        uvm_tracker_wait(tracker);

    chunk_size = uvm_gpu_chunk_get_size(chunk);
    num_unmapped_pages = chunk_size / PAGE_SIZE;

    UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
    UVM_ASSERT(PAGE_ALIGNED(chunk_size));

    root_chunk_mapping = root_chunk_mapping_from_chunk(gpu, chunk);
    root_chunk_mapping_lock(gpu, root_chunk_mapping);

    if (root_chunk_mapping->range != NULL) {
        UVM_ASSERT(root_chunk_mapping->num_mapped_pages >= num_unmapped_pages);

        root_chunk_mapping->num_mapped_pages -= num_unmapped_pages;

        if (root_chunk_mapping->num_mapped_pages == 0)
            root_chunk_mapping_destroy(gpu, root_chunk_mapping);
    }
    else {
        UVM_ASSERT(root_chunk_mapping->num_mapped_pages == 0);
    }

    root_chunk_mapping_unlock(gpu, root_chunk_mapping);
}
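
// Flat sysmem mappings are tracked in fixed-size segments of
// gpu->sysmem_mappings.mapping_size bytes of system physical address space; a
// segment's array index is its base physical address divided by the mapping
// size.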
static size_t sysmem_mapping_index(uvm_gpu_t *gpu, uvm_gpu_identity_mapping_t *sysmem_mapping)
{
    return sysmem_mapping - gpu->sysmem_mappings.array;
}

static NvU64 sysmem_mapping_physical_address(uvm_gpu_t *gpu, uvm_gpu_identity_mapping_t *sysmem_mapping)
{
    return gpu->sysmem_mappings.mapping_size * sysmem_mapping_index(gpu, sysmem_mapping);
}

static uvm_gpu_identity_mapping_t *sysmem_mapping_from_address(uvm_gpu_t *gpu, NvU64 pa)
{
    size_t index = pa / gpu->sysmem_mappings.mapping_size;

    UVM_ASSERT(index < gpu->sysmem_mappings.count);

    return gpu->sysmem_mappings.array + index;
}

static void sysmem_mapping_lock(uvm_gpu_t *gpu, uvm_gpu_identity_mapping_t *sysmem_mapping)
{
    uvm_bit_lock(&gpu->sysmem_mappings.bitlocks, sysmem_mapping_index(gpu, sysmem_mapping));
}

static void sysmem_mapping_unlock(uvm_gpu_t *gpu, uvm_gpu_identity_mapping_t *sysmem_mapping)
{
    uvm_bit_unlock(&gpu->sysmem_mappings.bitlocks, sysmem_mapping_index(gpu, sysmem_mapping));
}

static void destroy_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
{
    size_t i;

    if (!uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu))
        return;

    if (gpu->sysmem_mappings.array == NULL)
        return;

    uvm_bit_locks_deinit(&gpu->sysmem_mappings.bitlocks);

    for (i = 0; i < gpu->sysmem_mappings.count; ++i)
        destroy_identity_mapping(gpu->sysmem_mappings.array + i);

    uvm_kvfree(gpu->sysmem_mappings.array);
    gpu->sysmem_mappings.array = NULL;
}
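
// Size the array of sysmem mapping segments backing the flat sysmem mapping.
// As a worked illustration of the sizing below (derived, not quoted from the
// code): with a 128TB DMA addressable window and the 4GB granularity chosen
// below, the array has 128TB / 4GB = 32768 entries, each populated lazily by
// uvm_mmu_sysmem_map().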
static NV_STATUS create_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
{
    NV_STATUS status;
    size_t count;
    NvU64 mapping_size;
    NvU64 flat_sysmem_va_size;

    if (!uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu))
        return NV_OK;

    UVM_ASSERT(gpu->parent->flat_sysmem_va_base != 0);

    // The DMA addressable window is the maximum system physical memory
    // addressable by the GPU (this limit is 128TB in Pascal-Ampere). The
    // virtual mapping to sysmem is linear, so its size matches that of the
    // physical address space.
    flat_sysmem_va_size = gpu->parent->dma_addressable_limit + 1 - gpu->parent->dma_addressable_start;

    // The optimal mapping granularity is dependent on multiple factors:
    // application access patterns, distribution of system memory pages across
    // the physical address space, mapping costs (themselves highly variable:
    // in SR-IOV each mapping addition adds a lot of overhead due to vGPU
    // plugin involvement), metadata memory footprint (inversely proportional
    // to the mapping size), etc.
    mapping_size = 4ULL * 1024 * 1024 * 1024;

    // The mapping size should be at least 1GB, due to bitlock limitations.
    // This shouldn't be a problem because the expectation is to use 512MB
    // PTEs, and using a granularity of 1GB already results in allocating a
    // large array of sysmem mappings with 128K entries.
    UVM_ASSERT(is_power_of_2(mapping_size));
    UVM_ASSERT(mapping_size >= 1ULL * 1024 * 1024 * 1024);
    UVM_ASSERT(mapping_size >= uvm_mmu_biggest_page_size(&gpu->address_space_tree));
    UVM_ASSERT(mapping_size <= flat_sysmem_va_size);

    flat_sysmem_va_size = UVM_ALIGN_UP(flat_sysmem_va_size, mapping_size);

    UVM_ASSERT((gpu->parent->flat_sysmem_va_base + flat_sysmem_va_size) <= gpu->parent->uvm_mem_va_base);

    count = flat_sysmem_va_size / mapping_size;

    gpu->sysmem_mappings.array = uvm_kvmalloc_zero(sizeof(*gpu->sysmem_mappings.array) * count);
    if (gpu->sysmem_mappings.array == NULL)
        return NV_ERR_NO_MEMORY;

    gpu->sysmem_mappings.mapping_size = mapping_size;
    gpu->sysmem_mappings.count = count;

    status = uvm_bit_locks_init(&gpu->sysmem_mappings.bitlocks, count, UVM_LOCK_ORDER_CHUNK_MAPPING);
    if (status != NV_OK)
        goto error;

    return NV_OK;

error:
    destroy_dynamic_sysmem_mapping(gpu);
    return status;
}
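
// Ensure that the flat sysmem mapping covers the physical range
// [pa, pa + size) by creating, on demand, the identity mapping for every
// mapping_size-aligned segment overlapping that range. Already-mapped segments
// are left untouched, even on error.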
NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
{
    NvU64 curr_pa;

    if (!uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu))
        return NV_OK;

    curr_pa = UVM_ALIGN_DOWN(pa, gpu->sysmem_mappings.mapping_size);

    while (curr_pa < (pa + size)) {
        NV_STATUS status = NV_OK;
        uvm_gpu_identity_mapping_t *sysmem_mapping = sysmem_mapping_from_address(gpu, curr_pa);

        sysmem_mapping_lock(gpu, sysmem_mapping);

        if (sysmem_mapping->range_vec == NULL) {
            uvm_gpu_address_t virtual_address = uvm_gpu_address_virtual_from_sysmem_phys(gpu, curr_pa);
            NvU64 phys_offset = curr_pa;
            NvU32 page_size = uvm_mmu_biggest_page_size(&gpu->address_space_tree);
            uvm_pmm_alloc_flags_t pmm_flags;

            // No eviction is requested when allocating the page tree storage,
            // because in the common case the VA block lock is held.
            pmm_flags = UVM_PMM_ALLOC_FLAGS_NONE;

            sysmem_mapping->base = uvm_parent_gpu_canonical_address(gpu->parent, virtual_address.address);

            status = create_identity_mapping(gpu,
                                             sysmem_mapping->base,
                                             gpu->sysmem_mappings.mapping_size,
                                             UVM_APERTURE_SYS,
                                             phys_offset,
                                             page_size,
                                             pmm_flags,
                                             &sysmem_mapping->range_vec);
        }

        sysmem_mapping_unlock(gpu, sysmem_mapping);

        // In case of error, don't undo previous mapping additions, since a
        // concurrent thread may already have pushed work to the GPU that is
        // dependent on the new mappings.
        if (status != NV_OK)
            return status;

        curr_pa += gpu->sysmem_mappings.mapping_size;
    }

    return NV_OK;
}
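
// Create all flat (identity) mappings required by this GPU: the dynamic sysmem
// mapping, and either the static or the dynamic vidmem mapping. Each creation
// call is a no-op when the corresponding mapping isn't needed.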
NV_STATUS uvm_mmu_create_flat_mappings(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    status = create_dynamic_sysmem_mapping(gpu);
    if (status != NV_OK)
        return status;

    status = create_static_vidmem_mapping(gpu);
    if (status != NV_OK)
        goto error;

    status = create_dynamic_vidmem_mapping(gpu);
    if (status != NV_OK)
        goto error;

    return NV_OK;

error:
    uvm_mmu_destroy_flat_mappings(gpu);
    return status;
}

void uvm_mmu_destroy_flat_mappings(uvm_gpu_t *gpu)
{
    destroy_dynamic_vidmem_mapping(gpu);
    destroy_static_vidmem_mapping(gpu);
    destroy_dynamic_sysmem_mapping(gpu);
}
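
// Test-only ioctl handler: validates the parameters, then pushes a targeted
// TLB invalidate on the MEMOPS channel of the requested GPU and waits for it
// to complete.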
NV_STATUS uvm_test_invalidate_tlb(UVM_TEST_INVALIDATE_TLB_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_gpu_t *gpu = NULL;
    uvm_push_t push;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_gpu_va_space_t *gpu_va_space;

    // Check parameter values
    if (params->membar < UvmInvalidateTlbMemBarNone ||
        params->membar > UvmInvalidateTlbMemBarLocal) {
        return NV_ERR_INVALID_PARAMETER;
    }

    if (params->target_va_mode < UvmTargetVaModeAll ||
        params->target_va_mode > UvmTargetVaModeTargeted) {
        return NV_ERR_INVALID_PARAMETER;
    }

    if (params->page_table_level < UvmInvalidatePageTableLevelAll ||
        params->page_table_level > UvmInvalidatePageTableLevelPde4) {
        return NV_ERR_INVALID_PARAMETER;
    }

    uvm_va_space_down_read(va_space);

    gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->gpu_uuid);
    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto unlock_exit;
    }

    gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
    UVM_ASSERT(gpu_va_space);

    status = uvm_push_begin(gpu->channel_manager,
                            UVM_CHANNEL_TYPE_MEMOPS,
                            &push,
                            "Pushing test invalidate, GPU %s",
                            uvm_gpu_name(gpu));
    if (status == NV_OK)
        gpu->parent->host_hal->tlb_invalidate_test(&push, uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr, params);

unlock_exit:
    // Wait for the invalidation to be performed
    if (status == NV_OK)
        status = uvm_push_end_and_wait(&push);

    uvm_va_space_up_read(va_space);

    return status;
}