mirror of
https://github.com/NVIDIA/open-gpu-kernel-modules.git
synced 2024-12-13 23:48:48 +01:00
1982 lines
91 KiB
C
1982 lines
91 KiB
C
|
/*******************************************************************************
|
||
|
Copyright (c) 2015-2022 NVIDIA Corporation
|
||
|
|
||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
of this software and associated documentation files (the "Software"), to
|
||
|
deal in the Software without restriction, including without limitation the
|
||
|
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
||
|
sell copies of the Software, and to permit persons to whom the Software is
|
||
|
furnished to do so, subject to the following conditions:
|
||
|
|
||
|
The above copyright notice and this permission notice shall be
|
||
|
included in all copies or substantial portions of the Software.
|
||
|
|
||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||
|
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||
|
DEALINGS IN THE SOFTWARE.
|
||
|
|
||
|
*******************************************************************************/
|
||
|
|
||
|
#ifndef __UVM_VA_BLOCK_H__
|
||
|
#define __UVM_VA_BLOCK_H__
|
||
|
|
||
|
#include "uvm_forward_decl.h"
|
||
|
#include "uvm_types.h"
|
||
|
#include "uvm_linux.h"
|
||
|
#include "nv-kref.h"
|
||
|
#include "uvm_common.h"
|
||
|
#include "uvm_perf_module.h"
|
||
|
#include "uvm_processors.h"
|
||
|
#include "uvm_lock.h"
|
||
|
#include "uvm_test_ioctl.h"
|
||
|
#include "uvm_tracker.h"
|
||
|
#include "uvm_pmm_gpu.h"
|
||
|
#include "uvm_perf_thrashing.h"
|
||
|
#include "uvm_perf_utils.h"
|
||
|
#include "uvm_va_block_types.h"
|
||
|
#include "uvm_range_tree.h"
|
||
|
#include "uvm_mmu.h"
|
||
|
#include "nv-kthread-q.h"
|
||
|
|
||
|
#include <linux/mmu_notifier.h>
|
||
|
|
||
|
// VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
|
||
|
// (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
|
||
|
//
|
||
|
// UVM: uvm_va_space -> uvm_va_range -> uvm_va_block
|
||
|
// HMM: uvm_va_space -> uvm_va_block
|
||
|
//
|
||
|
// Each VA block is contained within a single VA range, and contains state on
|
||
|
// VAs covered by that block. Most importantly, the block tracks the current
|
||
|
// state of the virtual-to-physical mappings for all VAs within that block
|
||
|
// across all processors, along with the physical residency location for each
|
||
|
// VA.
|
||
|
//
|
||
|
// The block serializes both CPU and GPU operations on all VAs under that block.
|
||
|
// The CPU work is serialized with the block lock, and the GPU work is
|
||
|
// serialized by the block work tracker which itself is protected by the block
|
||
|
// lock.
|
||
|
//
|
||
|
// The size of each block varies from the size of the smallest VA range
|
||
|
// (PAGE_SIZE) to the max block size specified by UVM_VA_BLOCK_BITS. No block
|
||
|
// will span a 2^UVM_VA_BLOCK_BITS boundary in VA space. The size of the block
|
||
|
// is determined by the alignment of the parent VA range and the block's
|
||
|
// placement within the range.
|
||
|
//
|
||
|
// Note that this means user space will get best allocation efficiency if it
|
||
|
// allocates memory in 2^UVM_VA_BLOCK_BITS naturally-aligned chunks.
|
||
|
|
||
|
// enums used for indexing into the array of pte_bits bitmaps in the VA block
|
||
|
// which hold the current state of each PTE. For a given {processor, PTE}, the
|
||
|
// bits represented here must be enough to re-create the non-address portion of
|
||
|
// the PTE for that processor.
|
||
|
|
||
|
// If _READ is not set, the PTE mapping is not valid.
|
||
|
// If _WRITE is set, _READ is also set (_WRITE implies _READ).
|
||
|
// Indices into the array of CPU pte_bits page masks. UVM_PTE_BITS_CPU_MAX is
// not a PTE bit itself; it is the number of masks in that array.
typedef enum
{
    UVM_PTE_BITS_CPU_READ,
    UVM_PTE_BITS_CPU_WRITE,
    UVM_PTE_BITS_CPU_MAX
} uvm_pte_bits_cpu_t;
|
||
|
|
||
|
// If _READ is not set, the PTE mapping is not valid.
|
||
|
// If _WRITE is set, _READ is also set (_WRITE implies _READ).
|
||
|
// If _ATOMIC is set, _WRITE is also set (_ATOMIC implies _WRITE and _READ).
|
||
|
//
|
||
|
// TODO: Bug 1764925: Track volatile here too if we add GPU L2 caching
|
||
|
// Indices into the array of GPU pte_bits page masks. UVM_PTE_BITS_GPU_MAX is
// not a PTE bit itself; it is the number of masks in that array.
typedef enum
{
    UVM_PTE_BITS_GPU_READ,
    UVM_PTE_BITS_GPU_WRITE,
    UVM_PTE_BITS_GPU_ATOMIC,
    UVM_PTE_BITS_GPU_MAX
} uvm_pte_bits_gpu_t;
|
||
|
|
||
|
typedef struct
|
||
|
{
|
||
|
// Per-page residency bit vector, used for fast traversal
|
||
|
// of resident pages.
|
||
|
//
|
||
|
// This follows the same semantics as the CPU residency bit vector and
|
||
|
// notably each bit still represents a PAGE_SIZE amount of data, but the
|
||
|
// physical GPU memory is tracked by an array of GPU chunks below.
|
||
|
uvm_page_mask_t resident;
|
||
|
|
||
|
// Pages that have been evicted to sysmem
|
||
|
uvm_page_mask_t evicted;
|
||
|
|
||
|
NvU64 *cpu_chunks_dma_addrs;
|
||
|
|
||
|
// Array of naturally-aligned chunks. Each chunk has the largest possible
|
||
|
// size which can fit within the block, so they are not uniform size.
|
||
|
//
|
||
|
// The number of chunks in the array is calculated using
|
||
|
// block_num_gpu_chunks. The size of each chunk is calculated using
|
||
|
// block_gpu_chunk_index.
|
||
|
uvm_gpu_chunk_t **chunks;
|
||
|
|
||
|
// These page table ranges are not necessarily all used at the same time.
|
||
|
// The block might also be too small or not aligned properly to use the
|
||
|
// larger ranges, in which case they're never allocated.
|
||
|
//
|
||
|
// Once a range is allocated we keep it around to avoid constant allocation
|
||
|
// overhead when doing PTE splitting and merging.
|
||
|
//
|
||
|
// Check range.table to see if a given range has been allocated yet.
|
||
|
//
|
||
|
// page_table_range_big's range covers the big PTEs which fit within the
|
||
|
// interior of this block. See the big_ptes field.
|
||
|
uvm_page_table_range_t page_table_range_2m;
|
||
|
uvm_page_table_range_t page_table_range_big;
|
||
|
uvm_page_table_range_t page_table_range_4k;
|
||
|
|
||
|
// These flags are ignored unless the {block, gpu} pair supports a 2M page
|
||
|
// size. In that case it's the responsibility of the block code to make the
|
||
|
// lower page tables active by calling uvm_page_tree_write_pde.
|
||
|
//
|
||
|
// They can be allocated and activated separately, so we have to track them
|
||
|
// separately.
|
||
|
//
|
||
|
// Activated only means that uvm_page_tree_write_pde has been called at some
|
||
|
// point in the past with the appropriate range allocated. It does not imply
|
||
|
// that the 2M entry is a PDE (see pte_is_2m).
|
||
|
bool activated_big;
|
||
|
bool activated_4k;
|
||
|
|
||
|
// For {block, gpu} pairs which support the 2M page size, the page table
|
||
|
// ranges are uninitialized on allocation. This flag tracks whether the big
|
||
|
// PTEs have been initialized.
|
||
|
//
|
||
|
// We don't need an equivalent flag for the 4k range because we always write
|
||
|
// just the 4k PTEs not covered by higher-level PTEs. Big PTEs however can
|
||
|
// be allocated and activated late while the 4k PTEs are already active, in
|
||
|
// which case we need to initialize the entire big range.
|
||
|
bool initialized_big;
|
||
|
|
||
|
// Sticky state to split PTEs to 4k and keep them there. Used when a fatal
|
||
|
// fault has been detected on this GPU to avoid false dependencies within
|
||
|
// the uTLB for fatal and non-fatal faults on the same larger PTE, which
|
||
|
// could lead to wrong fault attribution.
|
||
|
bool force_4k_ptes;
|
||
|
|
||
|
// This table shows the HW PTE states given all permutations of pte_is_2m,
|
||
|
// big_ptes, and pte_bits. Note that the first row assumes that the 4k page
|
||
|
// tables have been allocated (if not, then no PDEs are allocated either).
|
||
|
//
|
||
|
// |-------------- SW state --------------|------------------- HW state --------------------|
|
||
|
// pte_is_2m pte_is_big pte_bits[READ] | Page size PDE0(2M only) Big PTE 4k PTE
|
||
|
// ----------------------------------------------------------------------------------------
|
||
|
// 0 0 0 | 4k Valid PDE Invalid [1] Invalid
|
||
|
// 0 0 1 | 4k Valid PDE Invalid [1] Valid
|
||
|
// 0 1 0 | Big Valid PDE Unmapped [2] x
|
||
|
// 0 1 1 | Big Valid PDE Valid x
|
||
|
// 1 must be 0 0 | 2M Invalid x x
|
||
|
// 1 must be 0 1 | 2M Valid PTE x x
|
||
|
//
|
||
|
// [1]: The big PTE may be unallocated, in which case its pointer won't be
|
||
|
// valid in the parent PDE. If the big PTE is allocated, it will be
|
||
|
// invalid so the 4k PTEs are active.
|
||
|
//
|
||
|
// [2]: The unmapped big PTE pattern differs from the invalid pattern, and
|
||
|
// it prevents HW from reading the 4k entries. See the unmapped_pte()
|
||
|
// MMU HAL function.
|
||
|
|
||
|
// If pte_is_2m is true, there is a 2M PTE covering this VA block (valid or
|
||
|
// invalid). If false then we're in one of the following scenarios:
|
||
|
// 1) This {block, gpu} does not support 2M pages.
|
||
|
// 2) 2M pages are supported but the page_table_range_2m has not been
|
||
|
// allocated (implying that the other page table ranges have not been
|
||
|
// allocated either).
|
||
|
// 3) page_table_range_2m has been allocated, but the big_ptes bitmap should
|
||
|
// be used to determine the mix of big and 4k PTEs.
|
||
|
bool pte_is_2m;
|
||
|
|
||
|
// When pte_is_2m is false, this block consists of any possible mix of big
|
||
|
// and 4k PTEs. This bitmap describes that mix. A set bit indicates that the
|
||
|
// corresponding big-page-sized region of the block is covered by a big PTE.
|
||
|
// A cleared bit indicates that it is covered by 4k PTEs.
|
||
|
//
|
||
|
// Neither setting implies that the PTE currently has a valid mapping, it
|
||
|
// just indicates which PTE is read by the GPU (see the table above).
|
||
|
//
|
||
|
// The indices represent the corresponding big PTEs in the block's interior.
|
||
|
// For example, a block with alignment and size of one 4k page on either
|
||
|
// side of a big page will only use bit 0. Use uvm_va_block_big_page_index to look
|
||
|
// the big_ptes index of a page.
|
||
|
//
|
||
|
// The block might not be able to fit any big PTEs, in which case this
|
||
|
// bitmap is always zero. Use uvm_va_block_gpu_num_big_pages to find the number of
|
||
|
// valid bits in this mask.
|
||
|
DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
|
||
|
|
||
|
// See the comments for uvm_va_block_mmap_t::cpu.pte_bits.
|
||
|
//
|
||
|
// The major difference is that these bits are always accurate since, unlike
|
||
|
// the CPU PTEs, the UVM driver is in full control of these mappings.
|
||
|
//
|
||
|
// Note that the granularity is always PAGE_SIZE, not whatever GPU PTE size
|
||
|
// happens to currently map these regions. PAGE_SIZE is the minimum
|
||
|
// granularity of operations on the VA blocks. As a future optimization we
|
||
|
// could consider sub-PAGE_SIZE operations if PAGE_SIZE > 4K and the CPU
|
||
|
// isn't involved, for example false sharing among peer GPUs.
|
||
|
uvm_page_mask_t pte_bits[UVM_PTE_BITS_GPU_MAX];
|
||
|
|
||
|
} uvm_va_block_gpu_state_t;
|
||
|
|
||
|
// TODO: Bug 1766180: Worst-case we could have one of these per system page.
|
||
|
// Options:
|
||
|
// 1) Rely on the OOM killer to prevent the user from trying to do that
|
||
|
// 2) Be much more space-conscious in this struct (difficult)
|
||
|
// 3) Cap the per-process range and/or block count, like vm.max_map_count
|
||
|
// does for vmas
|
||
|
struct uvm_va_block_struct
|
||
|
{
|
||
|
// Reference count for this block. References are held by:
|
||
|
// - The parent VA range for managed blocks or VA space for HMM blocks
|
||
|
// - The reverse map
|
||
|
// - The eviction path temporarily when attempting to evict a GPU page under
|
||
|
// this block
|
||
|
//
|
||
|
// This isn't protected by the lock on the eviction path, so it must be
|
||
|
// atomic. nv_kref provides that.
|
||
|
nv_kref_t kref;
|
||
|
|
||
|
// Lock protecting the block. See the comment at the top of uvm.c.
|
||
|
uvm_mutex_t lock;
|
||
|
|
||
|
// Parent VA range. UVM managed blocks have this set. HMM blocks will have
|
||
|
// va_range set to NULL and hmm.va_space set instead. Dead blocks that are
|
||
|
// waiting for the last ref count to be removed have va_range and
|
||
|
// hmm.va_space set to NULL (could be either type of block).
|
||
|
//
|
||
|
// This field can be read while holding either the block lock or just the VA
|
||
|
// space lock in read mode, since it can only change when the VA space lock
|
||
|
// is held in write mode.
|
||
|
uvm_va_range_t *va_range;
|
||
|
|
||
|
// Virtual address [start, end] covered by this block. These fields can be
|
||
|
// read while holding either the block lock or just the VA space lock in
|
||
|
// read mode, since they can only change when the VA space lock is held in
|
||
|
// write mode.
|
||
|
NvU64 start;
|
||
|
NvU64 end;
|
||
|
|
||
|
// Per-processor residency bit vector, used for fast lookup of which
|
||
|
// processors are active in this block.
|
||
|
//
|
||
|
// A set bit means the corresponding processor has a coherent physical copy
|
||
|
// of memory somewhere in the block. The per-processor state must then be
|
||
|
// inspected to find out which pages. The processor may or may not have a
|
||
|
// mapping to that physical memory, however.
|
||
|
//
|
||
|
// A cleared bit means the corresponding processor does not have a coherent
|
||
|
// physical copy of any pages under this block. The processor may still have
|
||
|
// cached pages allocated for future use, however. It also may have mappings
|
||
|
// to pages resident on other processors.
|
||
|
uvm_processor_mask_t resident;
|
||
|
|
||
|
// Per-processor mapping bit vector, used for fast lookup of which
|
||
|
// processors are active in this block.
|
||
|
//
|
||
|
// A set bit means the corresponding processor has an active, valid page
|
||
|
// table mapping to some VA in this block. The per-processor pte_bits state
|
||
|
// must then be inspected to find out the mapping address and permissions.
|
||
|
//
|
||
|
// A cleared bit means the corresponding processor has no virtual mappings
|
||
|
// within this block (all pte_bits entries are 0).
|
||
|
uvm_processor_mask_t mapped;
|
||
|
|
||
|
// Per-processor evicted bit vector, used for fast lookup of which GPUs
|
||
|
// have evicted pages in this block.
|
||
|
//
|
||
|
// A set bit means the corresponding processor was the residency of some of
|
||
|
// the pages in the block when they were evicted due to memory capacity
|
||
|
// limitations. The per-processor state must then be inspected to find out
|
||
|
// which pages.
|
||
|
//
|
||
|
// A cleared bit means the corresponding processor has no evicted pages
|
||
|
// within this block (all evicted entries are 0).
|
||
|
uvm_processor_mask_t evicted_gpus;
|
||
|
|
||
|
struct
|
||
|
{
|
||
|
// Per-page residency bit vector, used for fast traversal of resident
|
||
|
// pages.
|
||
|
//
|
||
|
// A set bit means the CPU has a coherent copy of the physical page
|
||
|
// resident in its memory, and that the corresponding entry in the pages
|
||
|
// array is present. This does not mean that the coherent copy is
|
||
|
// currently mapped anywhere, however. A page may be resident on
|
||
|
// multiple processors when in read-duplicate mode.
|
||
|
//
|
||
|
// A cleared bit means the CPU does not have a coherent copy of that
|
||
|
// page resident. The corresponding entry in the pages array may or may
|
||
|
// not present. If the entry is present, it's a cached page which can be
|
||
|
// reused in the future.
|
||
|
//
|
||
|
// Allocating PAGES_PER_UVM_VA_BLOCK is overkill when the block is
|
||
|
// smaller than UVM_VA_BLOCK_SIZE, but it's not much extra memory
|
||
|
// overhead on the whole.
|
||
|
uvm_page_mask_t resident;
|
||
|
|
||
|
// CPU memory chunks represent physically contiguous CPU memory
|
||
|
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
|
||
|
// This member is meant to hold an opaque value indicating the CPU
|
||
|
// chunk storage method. For more details on CPU chunk storage,
|
||
|
// see uvm_cpu_chunk_storage_type_t in uvm_pmm_sysmem.c.
|
||
|
unsigned long chunks;
|
||
|
|
||
|
// Per-page allocation bit vector.
|
||
|
//
|
||
|
// A set bit means that a CPU page has been allocated for the
|
||
|
// corresponding page index.
|
||
|
uvm_page_mask_t allocated;
|
||
|
|
||
|
// Per-page mapping bit vectors, one per bit we need to track. These are
|
||
|
// used for fast traversal of valid mappings in the block. These contain
|
||
|
// all non-address bits needed to establish a virtual mapping on this
|
||
|
// processor (permissions, cacheability, etc).
|
||
|
//
|
||
|
// A cleared bit in UVM_PTE_BITS_CPU_READ means the CPU has no valid
|
||
|
// virtual mapping to that address (the access will fault). Further,
|
||
|
// UVM_PTE_BITS_CPU_WRITE is guaranteed to also be clear.
|
||
|
//
|
||
|
// A set bit in UVM_PTE_BITS_CPU_READ means the CPU has a valid mapping
|
||
|
// at that address with at least read permissions. The physical page for
|
||
|
// that mapping is contained in the pages array. If
|
||
|
// UVM_PTE_BITS_CPU_WRITE is not set, the mapping is read-only.
|
||
|
// Otherwise, the mapping is read-write.
|
||
|
//
|
||
|
// Note that this is the maximum permissions a PTE could have, but not
|
||
|
// necessarily the actual current permissions of the CPU PTEs. The UVM
|
||
|
// driver will never change the PTEs without updating this state, but
|
||
|
// the kernel can downgrade our CPU mappings at any time without
|
||
|
// notifying the UVM driver (for example in response to user space
|
||
|
// calling madvise with MADV_DONTNEED).
|
||
|
uvm_page_mask_t pte_bits[UVM_PTE_BITS_CPU_MAX];
|
||
|
|
||
|
// Whether the CPU has ever mapped a page on this VA block. This is
|
||
|
// used to force GMMU PDE1 pre-population on ATS systems. See
|
||
|
// pre_populate_gpu_pde1 in uvm_va_block.c for more information.
|
||
|
NvU8 ever_mapped : 1;
|
||
|
|
||
|
// We can get "unexpected" faults if multiple CPU threads fault on the
|
||
|
// same address simultaneously and race to create the mapping. Since
|
||
|
// our CPU fault handler always unmaps to handle the case where the
|
||
|
// kernel downgrades our CPU mappings, we can introduce an infinite
|
||
|
// stream of CPU faults in multi-threaded workloads.
|
||
|
//
|
||
|
// In order to handle this scenario, we keep track of the first thread
|
||
|
// that faulted on a page with valid permissions and the timestamp.
|
||
|
// Then, we keep track of the subsequent faults on that page during a
|
||
|
// window of time. If the first thread faults again on the page, that
|
||
|
// will indicate that the mapping has been downgraded by the kernel and
|
||
|
// we need to remap it. Faults from the rest of threads are just
|
||
|
// ignored. The information is also cleared on the following events:
|
||
|
// - The tracking window finishes
|
||
|
// - The page is unmapped
|
||
|
struct
|
||
|
{
|
||
|
// Timestamp when the first fault was detected. This also is used
|
||
|
// as a flag that the contents of this struct are valid
|
||
|
NvU64 first_fault_stamp;
|
||
|
|
||
|
// First thread that faulted while having valid permissions. we
|
||
|
// don't take a reference on the pid so we shouldn't ever use it
|
||
|
// for task-lookup in the kernel. We only use it as a heuristic so
|
||
|
// it's OK if the pid gets destroyed or reused.
|
||
|
pid_t first_pid;
|
||
|
|
||
|
// Index of the page whose faults are being tracked
|
||
|
uvm_page_index_t page_index;
|
||
|
} fault_authorized;
|
||
|
} cpu;
|
||
|
|
||
|
// Per-GPU residency and mapping state
|
||
|
//
|
||
|
// TODO: Bug 1766180: Even though these are pointers, making this a static
|
||
|
// array will use up a non-trivial amount of storage for small blocks.
|
||
|
// In most cases we won't have anywhere near this many GPUs active
|
||
|
// anyway. Consider using a dense array of just the GPUs registered in
|
||
|
// this VA space, depending on the perf of accessing that array and on
|
||
|
// how noticeable this memory overhead actually is.
|
||
|
uvm_va_block_gpu_state_t *gpus[UVM_ID_MAX_GPUS];
|
||
|
|
||
|
// Mask to keep track of the pages that are read-duplicate
|
||
|
uvm_page_mask_t read_duplicated_pages;
|
||
|
|
||
|
// Mask to keep track of the pages that are not mapped on any non-UVM-Lite
|
||
|
// processor.
|
||
|
// 0: Page is definitely not mapped by any processors
|
||
|
// 1: Page may or may not be mapped by a processor
|
||
|
//
|
||
|
// This mask sets the bit when the page is mapped on any non-UVM-Lite
|
||
|
// processor but it is not always unset on unmap (to avoid a performance
|
||
|
// impact). Therefore, it can contain false negatives. It should be only
|
||
|
// used for opportunistic optimizations that have a fast path for pages
|
||
|
// that are not mapped anywhere (see uvm_va_block_migrate_locked, for
|
||
|
// example), but not the other way around.
|
||
|
uvm_page_mask_t maybe_mapped_pages;
|
||
|
|
||
|
// Tracks all outstanding GPU work related to this block: GPU copies, PTE
|
||
|
// updates, TLB invalidates, etc. The residency and mapping state is only
|
||
|
// valid once this tracker is done.
|
||
|
//
|
||
|
// CPU operations need to wait for this tracker to be done. GPU operations
|
||
|
// need to acquire it before pushing their work, then that work must be
|
||
|
// added to this tracker before the block's lock is dropped.
|
||
|
uvm_tracker_t tracker;
|
||
|
|
||
|
// A queue item for establishing eviction mappings in a deferred way
|
||
|
nv_kthread_q_item_t eviction_mappings_q_item;
|
||
|
|
||
|
uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
|
||
|
|
||
|
#if UVM_IS_CONFIG_HMM()
|
||
|
struct
|
||
|
{
|
||
|
|
||
|
// The MMU notifier is registered per va_block.
|
||
|
struct mmu_interval_notifier notifier;
|
||
|
|
||
|
|
||
|
// Parent VA space pointer. It is NULL for UVM managed blocks or if
|
||
|
// the HMM block is dead. This field can be read while holding the
|
||
|
// block lock and is only modified while holding the va_space write
|
||
|
// lock and va_block lock (same as the va_range pointer).
|
||
|
uvm_va_space_t *va_space;
|
||
|
|
||
|
// Tree of uvm_va_policy_node_t. The policy node ranges always cover
|
||
|
// all or part of a VMA range or a contiguous range of VMAs within the
|
||
|
// va_block. Policy nodes are resized or deleted when the underlying
|
||
|
// VMA range is changed by Linux via the invalidate() callback.
|
||
|
// Otherwise, policies could be stale after munmap().
|
||
|
// Locking: The va_block lock is needed to access or modify the tree.
|
||
|
uvm_range_tree_t va_policy_tree;
|
||
|
|
||
|
// Storage node for range tree of va_blocks.
|
||
|
uvm_range_tree_node_t node;
|
||
|
} hmm;
|
||
|
#endif
|
||
|
};
|
||
|
|
||
|
// We define additional per-VA Block fields for testing. When
|
||
|
// uvm_enable_builtin_tests is defined, all VA Blocks will have
|
||
|
// uvm_va_block_wrapper_t size. Otherwise, the test fields are not available.
|
||
|
// Use the uvm_va_block_get_test function defined below to obtain a safe
|
||
|
// pointer to uvm_va_block_test_t from a uvm_va_block_t pointer.
|
||
|
struct uvm_va_block_wrapper_struct
|
||
|
{
|
||
|
uvm_va_block_t block;
|
||
|
|
||
|
struct uvm_va_block_test_struct
|
||
|
{
|
||
|
// Count of how many page table allocations should be forced to retry
|
||
|
// with eviction enabled. Used for testing only.
|
||
|
NvU32 page_table_allocation_retry_force_count;
|
||
|
|
||
|
// Count of how many user pages allocations should be forced to retry
|
||
|
// with eviction enabled. Used for testing only.
|
||
|
NvU32 user_pages_allocation_retry_force_count;
|
||
|
|
||
|
// Mask of chunk sizes to be used for CPU chunk allocations.
|
||
|
// The actual set of chunk sizes to be used will be the set resulting
|
||
|
// from AND'ing this value with the value of
|
||
|
// uvm_cpu_chunk_allocation_sizes module parameter.
|
||
|
NvU32 cpu_chunk_allocation_size_mask;
|
||
|
|
||
|
// Force the next eviction attempt on this block to fail. Used for
|
||
|
// testing only.
|
||
|
bool inject_eviction_error;
|
||
|
|
||
|
// Subsequent operations that need to allocate CPU pages will fail. As
|
||
|
// opposed to other error injection settings, this one is persistent.
|
||
|
// This is because this error is supposed to be fatal and tests verify
|
||
|
// the state of the VA blocks after the failure. However, some tests
|
||
|
// use kernels to trigger migrations and a fault replay could trigger
|
||
|
// a successful migration if this error flag is cleared.
|
||
|
bool inject_cpu_pages_allocation_error;
|
||
|
|
||
|
// Force the next successful chunk allocation to then fail. Used for testing
|
||
|
// only to simulate driver metadata allocation failure.
|
||
|
bool inject_populate_error;
|
||
|
} test;
|
||
|
};
|
||
|
|
||
|
// Tracking needed for supporting allocation-retry of user GPU memory
|
||
|
typedef struct
|
||
|
{
|
||
|
// A tracker used for all allocations from PMM.
|
||
|
uvm_tracker_t tracker;
|
||
|
|
||
|
// List of allocated chunks (uvm_gpu_chunk_t). Currently all chunks are of
|
||
|
// the same size. However it can contain chunks from multiple GPUs. All
|
||
|
// remaining free chunks are freed when the operation is finished with
|
||
|
// uvm_va_block_retry_deinit().
|
||
|
struct list_head free_chunks;
|
||
|
|
||
|
// List of chunks allocated and used during the block operation. This list
|
||
|
// can contain chunks from multiple GPUs. All the used chunks are unpinned
|
||
|
// when the operation is finished with uvm_va_block_retry_deinit().
|
||
|
struct list_head used_chunks;
|
||
|
} uvm_va_block_retry_t;
|
||
|
|
||
|
// Module load/exit
|
||
|
NV_STATUS uvm_va_block_init(void);
|
||
|
void uvm_va_block_exit(void);
|
||
|
|
||
|
// Allocates and initializes the block. The block's ref count is initialized to
|
||
|
// 1. The caller is responsible for inserting the block into its parent
|
||
|
// va_range.
|
||
|
//
|
||
|
// The caller must be holding the VA space lock in at least read mode.
|
||
|
//
|
||
|
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
|
||
|
NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
|
||
|
NvU64 start,
|
||
|
NvU64 end,
|
||
|
uvm_va_block_t **out_block);
|
||
|
|
||
|
// Internal function called only when uvm_va_block_release drops the ref count
|
||
|
// to 0. Do not call directly.
|
||
|
void uvm_va_block_destroy(nv_kref_t *kref);
|
||
|
|
||
|
// Takes an additional reference on the block's kref. va_block must be
// non-NULL: it is dereferenced unconditionally.
static inline void uvm_va_block_retain(uvm_va_block_t *va_block)
{
    nv_kref_get(&va_block->kref);
}
|
||
|
|
||
|
// Drops a reference on the block, passing uvm_va_block_destroy as the kref
// release callback. Passing NULL is allowed and does nothing.
static inline void uvm_va_block_release(uvm_va_block_t *va_block)
{
    if (va_block) {
        // The calling thread shouldn't be holding the block's mutex when
        // releasing the block as it might get destroyed.
        uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_BLOCK);

        nv_kref_put(&va_block->kref, uvm_va_block_destroy);
    }
}
|
||
|
|
||
|
// Same as uvm_va_block_release but the caller may be holding the VA block lock.
|
||
|
// The caller must ensure that the refcount will not get to zero in this call.
|
||
|
static inline void uvm_va_block_release_no_destroy(uvm_va_block_t *va_block)
{
    // Unlike uvm_va_block_release, there is no NULL check and no lock-order
    // assertion here: va_block is dereferenced unconditionally.
    int destroyed = nv_kref_put(&va_block->kref, uvm_va_block_destroy);

    // The caller promised this was not the last reference, so the put must not
    // have reported that the block was destroyed.
    UVM_ASSERT(!destroyed);
}
|
||
|
|
||
|
// Returns true if the block is managed by HMM.
|
||
|
// Locking: This can be called while holding either the block lock or just the
|
||
|
// VA space lock in read mode, since it can only change when the VA space lock
|
||
|
// is held in write mode.
|
||
|
static inline bool uvm_va_block_is_hmm(uvm_va_block_t *va_block)
{
#if UVM_IS_CONFIG_HMM()
    // hmm.va_space is only set for live HMM blocks; it is NULL for UVM managed
    // blocks and for dead HMM blocks.
    return va_block->hmm.va_space != NULL;
#else
    // The hmm sub-struct does not exist in this configuration.
    return false;
#endif
}
|
||
|
|
||
|
// Return true if the block is dead.
|
||
|
// Locking: This can be called while holding either the block lock or just the
|
||
|
// VA space lock in read mode, since it can only change when the VA space lock
|
||
|
// is held in write mode.
|
||
|
static inline bool uvm_va_block_is_dead(uvm_va_block_t *va_block)
{
    // A live UVM managed block still points at its parent VA range.
    if (va_block->va_range)
        return false;

#if UVM_IS_CONFIG_HMM()
    // A live HMM block still points at its VA space.
    if (va_block->hmm.va_space)
        return false;
#endif

    // Neither parent pointer is set: the block is dead.
    return true;
}
|
||
|
|
||
|
// Returns the per-GPU state pointer stored in the block's gpus array for
// gpu_id. The slot is selected with uvm_id_gpu_index(gpu_id) and returned
// as-is, without any validation.
static inline uvm_va_block_gpu_state_t *uvm_va_block_gpu_state_get(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id)
{
    return va_block->gpus[uvm_id_gpu_index(gpu_id)];
}
|
||
|
|
||
|
// Return the va_space pointer of the given block or NULL if the block is dead.
|
||
|
// Locking: This can be called while holding either the block lock or just the
|
||
|
// VA space lock in read mode, since it can only change when the VA space lock
|
||
|
// is held in write mode.
|
||
|
uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block);
|
||
|
|
||
|
// Return the va_space pointer of the given block assuming the block is not dead
|
||
|
// (asserts that it is not dead and asserts va_space is not NULL).
|
||
|
// Locking: This can be called while holding either the block lock or just the
|
||
|
// VA space lock in read mode, since it can only change when the VA space lock
|
||
|
// is held in write mode.
|
||
|
uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block);
|
||
|
|
||
|
// Dynamic cache-based allocation for uvm_va_block_context_t.
|
||
|
//
|
||
|
// See uvm_va_block_context_init() for a description of the mm parameter.
|
||
|
uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm);
|
||
|
void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context);
|
||
|
|
||
|
// Initialization of an already-allocated uvm_va_block_context_t.
|
||
|
//
|
||
|
// mm is used to initialize the value of va_block_context->mm. NULL is allowed.
|
||
|
static inline void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
{
    UVM_ASSERT(va_block_context);

    // Write garbage into the VA Block context to ensure that the UVM code
    // clears masks appropriately
    if (UVM_IS_DEBUG())
        memset(va_block_context, 0xff, sizeof(*va_block_context));

    // Must come after the memset above, which overwrites the whole context
    // including the mm field.
    va_block_context->mm = mm;
}
|
||
|
|
||
|
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
|
||
|
// and page masks could simplify the below APIs and their implementations
|
||
|
// at the cost of having to scan the whole mask for small regions.
|
||
|
// Investigate the performance effects of doing that.
|
||
|
|
||
|
// Moves the physical pages of the given region onto the destination processor.
|
||
|
// If page_mask is non-NULL, the movement is further restricted to only those
|
||
|
// pages in the region which are present in the mask.
|
||
|
//
|
||
|
// prefetch_page_mask may be passed as a subset of page_mask when cause is
|
||
|
// UVM_MAKE_RESIDENT_CAUSE_FAULT to indicate pages that have been pulled due
|
||
|
// to automatic page prefetching heuristics. For pages in this mask,
|
||
|
// UVM_MAKE_RESIDENT_CAUSE_PREFETCH will be reported in migration events,
|
||
|
// instead.
|
||
|
//
|
||
|
// This function breaks read duplication for all given pages even if they
|
||
|
// don't migrate. Pages which are not resident on the destination processor
|
||
|
// will also be unmapped from all existing processors, be populated in the
|
||
|
// destination processor's memory, and copied to the new physical location.
|
||
|
// Any new memory will be zeroed if it is the first allocation for that page
|
||
|
// in the system.
|
||
|
//
|
||
|
// This function does not create any new virtual mappings.
|
||
|
//
|
||
|
// This function acquires/waits for the va_block tracker and updates that
|
||
|
// tracker with any new work pushed.
|
||
|
//
|
||
|
// Allocation-retry: this operation may need to perform eviction to be able to
|
||
|
// allocate GPU memory successfully and if that happens,
|
||
|
// NV_ERR_MORE_PROCESSING_REQUIRED will be returned. That also means that the
|
||
|
// block's lock has been unlocked and relocked as part of the call and that the
|
||
|
// whole sequence of operations performed under the block's lock needs to be
|
||
|
// attempted again. To facilitate that, the caller needs to provide the same
|
||
|
// va_block_retry struct for each attempt that has been initialized before the first
|
||
|
// attempt and needs to be deinitialized after the last one. Most callers can
|
||
|
// just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the caller.
|
||
|
//
|
||
|
// If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
|
||
|
// user memory is guaranteed not to happen. Allocation-retry of page tables can
|
||
|
// still occur though.
|
||
|
//
|
||
|
// va_block_context must be non-NULL. This function will set a bit in
|
||
|
// va_block_context->make_resident.pages_changed_residency for each page that
|
||
|
// changed residency (due to a migration or first population) as a result of the
|
||
|
// operation. This function only sets bits in that mask. It is the caller's
|
||
|
// responsibility to zero the mask or not first.
|
||
|
//
|
||
|
// Notably any status other than NV_OK indicates that the block's lock might
|
||
|
// have been unlocked and relocked.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock.
|
||
|
NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_retry_t *va_block_retry,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t dest_id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask,
|
||
|
const uvm_page_mask_t *prefetch_page_mask,
|
||
|
uvm_make_resident_cause_t cause);
|
||
|
|
||
|
// Similar to uvm_va_block_make_resident (read documentation there). The main
|
||
|
// differences are:
|
||
|
// - Pages are copied not moved (i.e. other copies of the page are not
|
||
|
// unmapped)
|
||
|
// - Processors with a resident copy of pages that migrated have write and
|
||
|
// atomic access permission revoked, unlike in uvm_va_block_make_resident
|
||
|
// where they are unmapped
|
||
|
// - All remote mappings (due to either SetAccessedBy or performance heuristics)
|
||
|
// are broken
|
||
|
// - LOCKING: If va_block_context->mm != NULL, va_block_context->mm->mmap_lock
|
||
|
// must be held in at least read mode.
|
||
|
NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_retry_t *va_block_retry,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t dest_id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask,
|
||
|
const uvm_page_mask_t *prefetch_page_mask,
|
||
|
uvm_make_resident_cause_t cause);
|
||
|
|
||
|
// Creates or upgrades a mapping from the input processor to the given virtual
|
||
|
// address region. Pages which already have new_prot permissions or higher are
|
||
|
// skipped, so this call ensures that the range is mapped with at least new_prot
|
||
|
// permissions. new_prot must not be UVM_PROT_NONE. uvm_va_block_unmap or
|
||
|
// uvm_va_block_revoke_prot should be used to downgrade permissions instead.
|
||
|
//
|
||
|
// The mapped pages are described by the region parameter and the map page mask
|
||
|
// that allows the caller to restrict the map operation to specific pages within
|
||
|
// the region. If the page mask is NULL then the whole region is mapped.
|
||
|
//
|
||
|
// If the input processor is a GPU with no GPU VA space registered, or if the
|
||
|
// input processor is the CPU and this thread is not allowed to create CPU
|
||
|
// mappings, this function does nothing. CPU mappings are only allowed if
|
||
|
// uvm_va_range_vma_check(va_block_context->mm) is valid, so the caller must
|
||
|
// set va_block_context->mm before calling this function.
|
||
|
//
|
||
|
// cause specifies the cause to be reported in events in case a remote mapping
|
||
|
// is created.
|
||
|
//
|
||
|
// Any CPU mappings will wait for the va_block tracker. If this function pushes
|
||
|
// GPU work it will first acquire the va_block tracker, then add the pushed work
|
||
|
// to out_tracker. It is the caller's responsibility to add this work to
|
||
|
// va_block's tracker. Note that while it is generally safe to run map
|
||
|
// operations on different GPUs concurrently, two PTE operations (map, unmap,
|
||
|
// revoke) on the same GPU must be serialized even if they target different
|
||
|
// pages because the earlier operation can cause a PTE split or merge which is
|
||
|
// assumed by the later operation.
|
||
|
//
|
||
|
// va_block_context must not be NULL.
|
||
|
//
|
||
|
// If allocation-retry was required as part of the operation and was successful,
|
||
|
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
|
||
|
// out_tracker were added to the block's tracker and then the block's lock was
|
||
|
// unlocked and relocked.
|
||
|
//
|
||
|
// In general, any status other than NV_OK indicates that the block's lock might
|
||
|
// have been unlocked and relocked.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
|
||
|
// NULL, va_block_context->mm->mmap_lock must be held in at least read
|
||
|
// mode.
|
||
|
NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *map_page_mask,
|
||
|
uvm_prot_t new_prot,
|
||
|
UvmEventMapRemoteCause cause,
|
||
|
uvm_tracker_t *out_tracker);
|
||
|
|
||
|
// Like uvm_va_block_map, except it maps all processors in the input mask. The
|
||
|
// VA block tracker contains all map operations on return.
|
||
|
//
|
||
|
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
|
||
|
// uvm_va_block_map() indicating that the operation needs to be retried.
|
||
|
NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
const uvm_processor_mask_t *map_processor_mask,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *map_page_mask,
|
||
|
uvm_prot_t new_prot,
|
||
|
UvmEventMapRemoteCause cause);
|
||
|
|
||
|
// Unmaps virtual regions from a single processor. This does not free page
|
||
|
// tables or physical memory. This is safe to call on the eviction path, but the
|
||
|
// caller must ensure that the block hasn't been killed.
|
||
|
//
|
||
|
// The unmapped pages are described by the region parameter and the unmap page
|
||
|
// mask that allows the caller to restrict the unmap operation to specific pages
|
||
|
// within the region. If the page mask is NULL then the whole region is
|
||
|
// unmapped.
|
||
|
//
|
||
|
// If id is UVM_ID_CPU, this is guaranteed to return NV_OK, and this is safe to
|
||
|
// call without holding a reference on the mm which owns the associated vma.
|
||
|
//
|
||
|
// Any CPU unmappings will wait for the va_block tracker. If this function
|
||
|
// pushes GPU work it will first acquire the va_block tracker, then add the
|
||
|
// pushed work to out_tracker. It is the caller's responsibility to add this
|
||
|
// work to va_block's tracker. Note that while it is generally safe to run unmap
|
||
|
// operations on different GPUs concurrently, two PTE operations (map, unmap,
|
||
|
// revoke) on the same GPU must be serialized even if they target different
|
||
|
// pages because the earlier operation can cause a PTE split or merge which is
|
||
|
// assumed by the later operation.
|
||
|
//
|
||
|
// va_block_context must not be NULL.
|
||
|
//
|
||
|
// If allocation-retry was required as part of the operation and was successful,
|
||
|
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
|
||
|
// out_tracker were added to the block's tracker and then the block's lock was
|
||
|
// unlocked and relocked. It is guaranteed that retry will not be required if
|
||
|
// the unmap does not cause a PTE split. Examples of operations which will not
|
||
|
// cause a PTE split include unmapping the entire block, unmapping all PTEs with
|
||
|
// matching attributes, and unmapping all PTEs which point to the same physical
|
||
|
// chunk.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock.
|
||
|
NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *unmap_page_mask,
|
||
|
uvm_tracker_t *out_tracker);
|
||
|
|
||
|
// Like uvm_va_block_unmap, except it unmaps all processors in the input mask.
|
||
|
// The VA block tracker contains all unmap operations on return.
|
||
|
NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
const uvm_processor_mask_t *unmap_processor_mask,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *unmap_page_mask);
|
||
|
|
||
|
// Function called when the preferred location changes. Notably:
|
||
|
// - Mark all CPU pages as dirty because the new processor may not have
|
||
|
// up-to-date data.
|
||
|
// - Unmap the preferred location's processor from any pages in this region
|
||
|
// which are not resident on the preferred location.
|
||
|
// LOCKING: The caller must hold the VA block lock.
|
||
|
NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context);
|
||
|
|
||
|
// Maps the given processor to all resident pages in this block, as allowed by
|
||
|
// location and policy. Waits for the operation to complete before returning.
|
||
|
//
|
||
|
// LOCKING: This takes and releases the VA block lock. If va_block_context->mm
|
||
|
// != NULL, va_block_context->mm->mmap_lock must be held in at least
|
||
|
// read mode.
|
||
|
NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t processor_id);
|
||
|
|
||
|
// Breaks SetAccessedBy and remote mappings
|
||
|
//
|
||
|
// va_block_context must NOT be NULL
|
||
|
//
|
||
|
// LOCKING: This takes and releases the VA block lock. If va_block_context->mm
|
||
|
// != NULL, va_block_context->mm->mmap_lock must be held in at least
|
||
|
// read mode.
|
||
|
NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context);
|
||
|
|
||
|
// Restores SetAccessedBy mappings
|
||
|
//
|
||
|
// va_block_context must NOT be NULL
|
||
|
//
|
||
|
// LOCKING: This takes and releases the VA block lock. If va_block_context->mm
|
||
|
// != NULL, va_block_context->mm->mmap_lock must be held in at least
|
||
|
// read mode.
|
||
|
NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context);
|
||
|
|
||
|
// API for access privilege revocation
|
||
|
//
|
||
|
// Revoke prot_to_revoke access permissions for the given processor.
|
||
|
//
|
||
|
// The revoked pages are described by the region parameter and the revoke page
|
||
|
// mask that allows the caller to restrict the revoke operation to specific
|
||
|
// pages within the region.
|
||
|
//
|
||
|
// prot_to_revoke must be greater than UVM_PROT_READ_ONLY. Caller should call
|
||
|
// unmap explicitly if it wants to revoke all access privileges.
|
||
|
//
|
||
|
// If id is UVM_ID_CPU, and prot_to_revoke is UVM_PROT_READ_WRITE_ATOMIC, no
|
||
|
// action is performed. If the processor id corresponds to the CPU and the
|
||
|
// caller cannot establish CPU mappings because it does not have a reference on
|
||
|
// vma->vm_mm (va_block_context->mm != vma->vm_mm), the page will be simply
|
||
|
// unmapped. Caller should call unmap explicitly if it wants to revoke all
|
||
|
// access privileges.
|
||
|
//
|
||
|
// Any CPU revocation will wait for the va_block tracker. If this function
|
||
|
// pushes GPU work it will first acquire the va_block tracker, then add the
|
||
|
// pushed work to out_tracker. It is the caller's responsibility to add this
|
||
|
// work to va_block's tracker. Note that while it is generally safe to run
|
||
|
// revocation operations on different GPUs concurrently, two PTE operations
|
||
|
// (map, unmap, revoke) on the same GPU must be serialized even if they target
|
||
|
// different pages because the earlier operation can cause a PTE split or merge
|
||
|
// which is assumed by the later operation.
|
||
|
//
|
||
|
// va_block_context must not be NULL.
|
||
|
//
|
||
|
// If allocation-retry was required as part of the operation and was successful,
|
||
|
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
|
||
|
// out_tracker were added to the block's tracker and then the block's lock was
|
||
|
// unlocked and relocked.
|
||
|
//
|
||
|
// In general, any status other than NV_OK indicates that the block's lock might
|
||
|
// have been unlocked and relocked.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
|
||
|
// NULL, va_block_context->mm->mmap_lock must be held in at least read
|
||
|
// mode.
|
||
|
NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *revoke_page_mask,
|
||
|
uvm_prot_t prot_to_revoke,
|
||
|
uvm_tracker_t *out_tracker);
|
||
|
|
||
|
// Like uvm_va_block_revoke_prot(), except it revokes all processors in the
|
||
|
// input mask. The VA block tracker contains all revocation operations on
|
||
|
// return.
|
||
|
//
|
||
|
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
|
||
|
// uvm_va_block_revoke_prot() indicating that the operation needs to be retried.
|
||
|
NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
const uvm_processor_mask_t *revoke_processor_mask,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *revoke_page_mask,
|
||
|
uvm_prot_t prot_to_revoke);
|
||
|
|
||
|
// Tries to map all pages in the given region and map_page_mask with at most
|
||
|
// max_prot privileges for appropriate processors as determined by the
|
||
|
// accessed_by mask, heuristics and the given processor mask (excluding
|
||
|
// processor_id, which triggered the migration and should have already been
|
||
|
// mapped).
|
||
|
//
|
||
|
// va_block_context must not be NULL.
|
||
|
//
|
||
|
// This function acquires/waits for the va_block tracker and updates that
|
||
|
// tracker with any new work pushed.
|
||
|
//
|
||
|
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
|
||
|
// uvm_va_block_map() indicating that the operation needs to be retried.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
|
||
|
// NULL, va_block_context->mm->mmap_lock must be held in at least read
|
||
|
// mode.
|
||
|
NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t new_residency,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *map_page_mask,
|
||
|
uvm_prot_t max_prot,
|
||
|
const uvm_processor_mask_t *processor_mask);
|
||
|
|
||
|
// Maps processors using SetAccessedBy to all resident pages in the region
|
||
|
// parameter. On Volta+ it is also used to map evicted pages that can be later
|
||
|
// pulled back by using access counters.
|
||
|
//
|
||
|
// This function acquires/waits for the va_block tracker and updates that
|
||
|
// tracker with any new work pushed.
|
||
|
//
|
||
|
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
|
||
|
// uvm_va_block_map() indicating that the operation needs to be retried.
|
||
|
//
|
||
|
// va_block_context must not be NULL.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
|
||
|
// NULL, va_block_context->mm->mmap_lock must be held in at least read
|
||
|
// mode.
|
||
|
NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask,
|
||
|
UvmEventMapRemoteCause cause);
|
||
|
|
||
|
// Notifies the VA block that a new GPU VA space has been created.
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space);
|
||
|
|
||
|
// Destroys the VA block's mappings and page tables on the GPU, if it has any.
|
||
|
//
|
||
|
// If mm != NULL, that mm is used for any CPU mappings which may be created as
|
||
|
// a result of this call. See uvm_va_block_context_t::mm for details.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock. If mm != NULL, the caller
|
||
|
// must hold mm->mmap_lock in at least read mode.
|
||
|
void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space, struct mm_struct *mm);
|
||
|
|
||
|
// Creates any mappings necessary in this VA block between the two GPUs, in
|
||
|
// either direction.
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
|
||
|
|
||
|
// Unmaps all page tables in this VA block which have peer mappings between
|
||
|
// the two GPUs, in either direction.
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
|
||
|
|
||
|
// Unmap any mappings from GPU to the preferred location.
|
||
|
//
|
||
|
// The GPU has to be in UVM-Lite mode.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
|
||
|
|
||
|
// Frees all memory under this block associated with this GPU. Any portion of
|
||
|
// the block which is resident on the GPU is evicted to sysmem before being
|
||
|
// freed.
|
||
|
//
|
||
|
// If mm != NULL, that mm is used for any CPU mappings which may be created as
|
||
|
// a result of this call. See uvm_va_block_context_t::mm for details.
|
||
|
//
|
||
|
// LOCKING: This takes and releases the VA block lock. If mm != NULL, the caller
|
||
|
// must hold mm->mmap_lock in at least read mode.
|
||
|
void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
|
||
|
|
||
|
// Unmaps all memory associated with the block and drops the ref count of the
|
||
|
// block. This allows the caller to free resources associated with this block
|
||
|
// regardless of the block's current ref count. Most importantly it allows the
|
||
|
// VA covered by this block to be immediately available for other page table
|
||
|
// mappings upon return.
|
||
|
//
|
||
|
// This clears block->va_range, so only the VA range destroy path should call
|
||
|
// it. Other paths with references on this block, specifically the eviction path
|
||
|
// which temporarily takes a reference to the block, must always check the block
|
||
|
// state after taking the block lock to see if their mapping is still in place.
|
||
|
//
|
||
|
// All of the unmap and state destruction steps are also performed when the ref
|
||
|
// count goes to 0, so this function only needs to be called if the block's
|
||
|
// resources need to be reclaimed immediately.
|
||
|
//
|
||
|
// The caller should not lock the block before calling this function.
|
||
|
//
|
||
|
// This performs a uvm_va_block_release.
|
||
|
void uvm_va_block_kill(uvm_va_block_t *va_block);
|
||
|
|
||
|
// Exactly the same split semantics as uvm_va_range_split, including error
|
||
|
// handling. See that function's comments for details.
|
||
|
//
|
||
|
// new_va_block's va_range is set to new_va_range before any reverse mapping is
|
||
|
// established to the new block, but the caller is responsible for inserting the
|
||
|
// new block into the range.
|
||
|
NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
|
||
|
NvU64 new_end,
|
||
|
uvm_va_block_t **new_va_block,
|
||
|
uvm_va_range_t *new_va_range);
|
||
|
|
||
|
// Exactly the same split semantics as uvm_va_block_split, including error
|
||
|
// handling except the existing_va_block block lock needs to be held and
|
||
|
// the new_va_block has to be preallocated.
|
||
|
//
|
||
|
// new_va_block's va_range is set to new_va_range before any reverse mapping is
|
||
|
// established to the new block, but the caller is responsible for inserting the
|
||
|
// new block into the range.
|
||
|
NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
|
||
|
NvU64 new_end,
|
||
|
uvm_va_block_t *new_va_block,
|
||
|
uvm_va_range_t *new_va_range);
|
||
|
|
||
|
// Handles a CPU fault in the given VA block, performing any operations
|
||
|
// necessary to establish a coherent CPU mapping (migrations, cache invalidates,
|
||
|
// etc.).
|
||
|
//
|
||
|
// Locking:
|
||
|
// - vma->vm_mm->mmap_lock must be held in at least read mode. Note, that
|
||
|
// might not be the same as current->mm->mmap_lock.
|
||
|
// - va_space lock must be held in at least read mode
|
||
|
//
|
||
|
// service_context->block_context.mm is ignored and vma->vm_mm is used instead.
|
||
|
//
|
||
|
// Returns NV_ERR_INVALID_ACCESS_TYPE if a CPU mapping to fault_addr cannot be
|
||
|
// accessed, for example because it's within a range group which is non-
|
||
|
// migratable.
|
||
|
NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
|
||
|
NvU64 fault_addr,
|
||
|
bool is_write,
|
||
|
uvm_service_block_context_t *service_context);
|
||
|
|
||
|
// Performs any operations necessary to establish a coherent mapping
|
||
|
// (migrations, cache invalidates, etc.) in response to the given service block
|
||
|
// context
|
||
|
//
|
||
|
// Locking:
|
||
|
// - service_context->block_context.mm->mmap_lock must be held in at least
|
||
|
// read mode, if valid.
|
||
|
// - va_space lock must be held in at least read mode
|
||
|
// - va_block lock must be held
|
||
|
//
|
||
|
// If allocation-retry was required as part of the operation and was successful,
|
||
|
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock was
|
||
|
// unlocked and relocked.
|
||
|
//
|
||
|
// NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
|
||
|
// and the performance heuristics logic decided to throttle execution.
|
||
|
// Any other error code different than NV_OK indicates OOM or a global fatal
|
||
|
// error.
|
||
|
NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
|
||
|
uvm_va_block_t *va_block,
|
||
|
uvm_va_block_retry_t *block_retry,
|
||
|
uvm_service_block_context_t *service_context);
|
||
|
|
||
|
// Size of the block in bytes. Guaranteed to be a page-aligned value between
|
||
|
// PAGE_SIZE and UVM_VA_BLOCK_SIZE.
|
||
|
static inline NvU64 uvm_va_block_size(uvm_va_block_t *block)
|
||
|
{
|
||
|
NvU64 size = block->end - block->start + 1;
|
||
|
UVM_ASSERT(PAGE_ALIGNED(size));
|
||
|
UVM_ASSERT(size >= PAGE_SIZE);
|
||
|
UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
|
||
|
return size;
|
||
|
}
|
||
|
|
||
|
// Number of pages with PAGE_SIZE in the block
|
||
|
static inline size_t uvm_va_block_num_cpu_pages(uvm_va_block_t *block)
|
||
|
{
|
||
|
return uvm_va_block_size(block) / PAGE_SIZE;
|
||
|
}
|
||
|
|
||
|
// VA of the given page using CPU page size. page_index must be valid
|
||
|
static inline NvU64 uvm_va_block_cpu_page_address(uvm_va_block_t *block, uvm_page_index_t page_index)
|
||
|
{
|
||
|
UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(block));
|
||
|
return block->start + PAGE_SIZE * page_index;
|
||
|
}
|
||
|
|
||
|
// Get the page physical address on the given GPU
|
||
|
//
|
||
|
// This will assert that GPU state is indeed present.
|
||
|
uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_gpu_t *gpu);
|
||
|
|
||
|
// Returns true if address lies within [block->start, block->end], with both
// bounds inclusive, and false otherwise.
static bool uvm_va_block_contains_address(uvm_va_block_t *block, NvU64 address)
{
    if (address < block->start)
        return false;

    return address <= block->end;
}
|
||
|
|
||
|
// Obtain a pointer to the uvm_va_block_test_t structure for the given VA
|
||
|
// block. If uvm_enable_builtin_tests is unset, NULL will be returned.
|
||
|
static uvm_va_block_test_t *uvm_va_block_get_test(uvm_va_block_t *va_block)
|
||
|
{
|
||
|
if (uvm_enable_builtin_tests)
|
||
|
return &container_of(va_block, uvm_va_block_wrapper_t, block)->test;
|
||
|
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
// Get the page residency mask for a processor if it's known to be there.
|
||
|
//
|
||
|
// If the processor is a GPU, this will assert that GPU state is indeed present.
|
||
|
uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
|
||
|
|
||
|
// Get the page mapped mask for a processor. The returned mask cannot be
|
||
|
// directly modified by the caller
|
||
|
//
|
||
|
// If the processor is a GPU, this will assert that GPU state is indeed present.
|
||
|
const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor);
|
||
|
|
||
|
// VA block lookup functions. There are a number of permutations which might be
|
||
|
// useful, such as looking up the block from {va_space, va_range} x {addr,
|
||
|
// block index}. The ones implemented here and in uvm_va_range.h support the
|
||
|
// primary two use cases, which are:
|
||
|
// 1) Iterating over all VA blocks in a VA range. This uses block indices on the
|
||
|
// VA range:
|
||
|
// uvm_va_range_num_blocks
|
||
|
// uvm_va_range_block_index
|
||
|
// uvm_va_range_block
|
||
|
// uvm_va_range_block_create
|
||
|
// 2) Operating on a single VA block (fault). This looks up the block using the
|
||
|
// VA space and address:
|
||
|
// uvm_va_block_find
|
||
|
// uvm_va_block_find_create
|
||
|
|
||
|
// Finds the UVM or HMM VA block containing addr, if any. The va_space->lock
|
||
|
// must be held in at least read mode. Return values:
|
||
|
// NV_ERR_INVALID_ADDRESS addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
|
||
|
// a HMM enabled VMA.
|
||
|
//
|
||
|
// NV_ERR_OBJECT_NOT_FOUND addr is valid but no block has been allocated to
|
||
|
// cover it yet
|
||
|
//
|
||
|
// NV_OK The block was returned successfully
|
||
|
NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block);
|
||
|
|
||
|
// Same as uvm_va_block_find except that the block is created if not found.
|
||
|
// If addr is covered by a UVM_VA_RANGE_TYPE_MANAGED va_range, a managed block
|
||
|
// will be created. Otherwise, if addr is not covered by any va_range, mm is
|
||
|
// non-NULL, and HMM is enabled in the va_space, an HMM block will be created.
|
||
|
// In either case, if mm is non-NULL, it must be retained and locked in at
|
||
|
// least read mode. Return values:
|
||
|
// NV_ERR_INVALID_ADDRESS addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
|
||
|
// a HMM enabled VMA.
|
||
|
// NV_ERR_NO_MEMORY memory could not be allocated.
|
||
|
NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
|
||
|
struct mm_struct *mm,
|
||
|
NvU64 addr,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_va_block_t **out_block);
|
||
|
|
||
|
// Same as uvm_va_block_find_create except that only UVM managed va_blocks are
|
||
|
// created if not already present in the VA range.
|
||
|
static NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
|
||
|
NvU64 addr,
|
||
|
uvm_va_block_t **out_block)
|
||
|
{
|
||
|
return uvm_va_block_find_create(va_space, NULL, addr, NULL, out_block);
|
||
|
}
|
||
|
|
||
|
// Look up a chunk backing a specific address within the VA block. Returns NULL if none.
|
||
|
uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address);
|
||
|
|
||
|
// Selects what a block migration does; passed as the mode argument of
// uvm_va_block_migrate_locked().
typedef enum
{
    // NOTE(review): judging by the name, pages are only made resident on the
    // destination and no mappings are added afterwards - confirm against the
    // implementation.
    UVM_MIGRATE_MODE_MAKE_RESIDENT = 0,

    // NOTE(review): judging by the name, pages are made resident and then
    // mapped - confirm against the implementation.
    UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP = 1,
} uvm_migrate_mode_t;
|
||
|
|
||
|
// Implementation of the UvmMigrate() API at the VA block scope.
|
||
|
//
|
||
|
// The out_tracker can be NULL.
|
||
|
//
|
||
|
// If mode is UVM_MIGRATE_MODE_MAKE_RESIDENT, mappings are not added after pages have been
|
||
|
// migrated.
|
||
|
//
|
||
|
// The caller needs to handle allocation-retry. va_block_retry can be NULL if
|
||
|
// the destination is the CPU.
|
||
|
//
|
||
|
// va_block_context must not be NULL.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock. If va_block_context->mm !=
|
||
|
// NULL, va_block_context->mm->mmap_lock must be held in at least
|
||
|
// read mode.
|
||
|
NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_retry_t *va_block_retry,
|
||
|
uvm_va_block_context_t *va_block_context,
|
||
|
uvm_va_block_region_t region,
|
||
|
uvm_processor_id_t dest_id,
|
||
|
uvm_migrate_mode_t mode,
|
||
|
uvm_tracker_t *out_tracker);
|
||
|
|
||
|
// Write block's data from a CPU buffer
|
||
|
//
|
||
|
// The [dst, dst + size) range has to fit within a single PAGE_SIZE page.
|
||
|
//
|
||
|
// The caller needs to support allocation-retry of page tables.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_context_t *block_context,
|
||
|
NvU64 dst,
|
||
|
uvm_mem_t *src,
|
||
|
size_t size);
|
||
|
|
||
|
// Read block's data into a CPU buffer
|
||
|
//
|
||
|
// The [src, src + size) range has to fit within a single PAGE_SIZE page.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst, NvU64 src, size_t size);
|
||
|
|
||
|
// Initialize va block retry tracking
|
||
|
void uvm_va_block_retry_init(uvm_va_block_retry_t *uvm_va_block_retry);
|
||
|
|
||
|
// Deinitialize va block retry tracking after a block operation
|
||
|
//
|
||
|
// Frees all the remaining free chunks and unpins all the used chunks.
|
||
|
void uvm_va_block_retry_deinit(uvm_va_block_retry_t *uvm_va_block_retry, uvm_va_block_t *va_block);
|
||
|
|
||
|
// Evict all chunks from the block that are subchunks of the passed in root_chunk.
|
||
|
//
|
||
|
// Add all the work tracking the eviction to the tracker.
|
||
|
//
|
||
|
// Returns NV_OK if the block is dead or doesn't have any subchunks of the
|
||
|
// root_chunk.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock
|
||
|
NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
|
||
|
uvm_gpu_t *gpu,
|
||
|
uvm_gpu_chunk_t *root_chunk,
|
||
|
uvm_tracker_t *tracker);
|
||
|
|
||
|
NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp);
|
||
|
NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp);
|
||
|
NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp);
|
||
|
NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp);
|
||
|
|
||
|
// Compute the offset in system pages of addr from the start of va_block.
|
||
|
static uvm_page_index_t uvm_va_block_cpu_page_index(uvm_va_block_t *va_block, NvU64 addr)
|
||
|
{
|
||
|
UVM_ASSERT(addr >= va_block->start);
|
||
|
UVM_ASSERT(addr <= va_block->end);
|
||
|
return (addr - va_block->start) / PAGE_SIZE;
|
||
|
}
|
||
|
|
||
|
// Computes the size and index in the gpu_state chunks array of the GPU chunk
|
||
|
// which corresponds to the given page_index of the VA region.
|
||
|
size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
|
||
|
NvU64 size,
|
||
|
uvm_gpu_t *gpu,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_chunk_size_t *out_chunk_size);
|
||
|
|
||
|
// If there are any resident CPU pages in the block, mark them as dirty
|
||
|
void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block);
|
||
|
|
||
|
// Sets the internal state required to handle fault cancellation
|
||
|
//
|
||
|
// This function may require allocating page tables to split big pages into 4K
|
||
|
// pages. If allocation-retry was required as part of the operation and was
|
||
|
// successful, NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case the
|
||
|
// block's lock was unlocked and relocked.
|
||
|
//
|
||
|
// LOCKING: The caller must hold the va_block lock.
|
||
|
NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu);
|
||
|
|
||
|
//
|
||
|
// uvm_va_block_region_t helpers
|
||
|
//
|
||
|
|
||
|
// Build the region [first, outer) of page indices. first must not exceed
// outer. The build-time check guarantees that a page index type is wide enough
// to represent every page of a VA block.
static uvm_va_block_region_t uvm_va_block_region(uvm_page_index_t first, uvm_page_index_t outer)
{
    uvm_va_block_region_t new_region = { .first = first, .outer = outer };

    BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= (1 << (sizeof(first) * 8)));

    UVM_ASSERT(first <= outer);

    return new_region;
}
|
||
|
|
||
|
// Build the single-page region [page_index, page_index + 1).
static uvm_va_block_region_t uvm_va_block_region_for_page(uvm_page_index_t page_index)
{
    uvm_page_index_t outer = page_index + 1;

    return uvm_va_block_region(page_index, outer);
}
|
||
|
|
||
|
// Number of pages spanned by region, i.e. outer - first.
static size_t uvm_va_block_region_num_pages(uvm_va_block_region_t region)
{
    size_t page_count = region.outer - region.first;

    return page_count;
}
|
||
|
|
||
|
// Size of region in bytes: its page count multiplied by PAGE_SIZE.
static NvU64 uvm_va_block_region_size(uvm_va_block_region_t region)
{
    size_t page_count = uvm_va_block_region_num_pages(region);

    return page_count * PAGE_SIZE;
}
|
||
|
|
||
|
// Virtual address of the first byte of region within va_block.
static NvU64 uvm_va_block_region_start(uvm_va_block_t *va_block, uvm_va_block_region_t region)
{
    return region.first * PAGE_SIZE + va_block->start;
}
|
||
|
|
||
|
// Virtual address of the last byte of region within va_block (inclusive end).
static NvU64 uvm_va_block_region_end(uvm_va_block_t *va_block, uvm_va_block_region_t region)
{
    // Address one past the region, then step back a byte for the inclusive end
    return va_block->start + region.outer * PAGE_SIZE - 1;
}
|
||
|
|
||
|
// True when subregion lies entirely within region.
static bool uvm_va_block_region_contains_region(uvm_va_block_region_t region, uvm_va_block_region_t subregion)
{
    if (subregion.first < region.first)
        return false;

    return subregion.outer <= region.outer;
}
|
||
|
|
||
|
// True when page_index falls inside region. Implemented as a containment test
// on the single-page region for page_index.
static bool uvm_va_block_region_contains_page(uvm_va_block_region_t region, uvm_page_index_t page_index)
{
    uvm_va_block_region_t page_region = uvm_va_block_region_for_page(page_index);

    return uvm_va_block_region_contains_region(region, page_region);
}
|
||
|
|
||
|
// Create a block range from a va block and start and end virtual addresses
|
||
|
// within the block.
|
||
|
static uvm_va_block_region_t uvm_va_block_region_from_start_end(uvm_va_block_t *va_block, NvU64 start, NvU64 end)
|
||
|
{
|
||
|
uvm_va_block_region_t region;
|
||
|
|
||
|
UVM_ASSERT(start < end);
|
||
|
UVM_ASSERT(start >= va_block->start);
|
||
|
UVM_ASSERT(end <= va_block->end);
|
||
|
UVM_ASSERT(PAGE_ALIGNED(start));
|
||
|
UVM_ASSERT(PAGE_ALIGNED(end + 1));
|
||
|
|
||
|
region.first = uvm_va_block_cpu_page_index(va_block, start);
|
||
|
region.outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
|
||
|
|
||
|
return region;
|
||
|
}
|
||
|
|
||
|
// Same as uvm_va_block_region_from_start_end(), with the range given as a
// start address and a size in bytes.
static uvm_va_block_region_t uvm_va_block_region_from_start_size(uvm_va_block_t *va_block, NvU64 start, NvU64 size)
{
    NvU64 last_byte = start + size - 1;

    return uvm_va_block_region_from_start_end(va_block, start, last_byte);
}
|
||
|
|
||
|
// Build the region spanning every CPU page of va_block.
static uvm_va_block_region_t uvm_va_block_region_from_block(uvm_va_block_t *va_block)
{
    uvm_page_index_t outer = uvm_va_block_num_cpu_pages(va_block);

    return uvm_va_block_region(0, outer);
}
|
||
|
|
||
|
// Report whether the bit for page_index is set in mask.
static bool uvm_page_mask_test(const uvm_page_mask_t *mask, uvm_page_index_t page_index)
{
    bool is_set;

    UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);

    is_set = test_bit(page_index, mask->bitmap);

    return is_set;
}
|
||
|
|
||
|
// Set the bit for page_index in mask and return the result of
// __test_and_set_bit() (the non-atomic kernel helper).
static bool uvm_page_mask_test_and_set(uvm_page_mask_t *mask, uvm_page_index_t page_index)
{
    bool was_set;

    UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);

    was_set = __test_and_set_bit(page_index, mask->bitmap);

    return was_set;
}
|
||
|
|
||
|
// Clear the bit for page_index in mask and return the result of
// __test_and_clear_bit() (the non-atomic kernel helper).
static bool uvm_page_mask_test_and_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index)
{
    bool was_set;

    UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);

    was_set = __test_and_clear_bit(page_index, mask->bitmap);

    return was_set;
}
|
||
|
|
||
|
// Set the bit for page_index in mask using the non-atomic __set_bit().
static void uvm_page_mask_set(uvm_page_mask_t *mask, uvm_page_index_t page_index)
{
    unsigned long *bits = mask->bitmap;

    UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);

    __set_bit(page_index, bits);
}
|
||
|
|
||
|
// Clear the bit for page_index in mask using the non-atomic __clear_bit().
static void uvm_page_mask_clear(uvm_page_mask_t *mask, uvm_page_index_t page_index)
{
    unsigned long *bits = mask->bitmap;

    UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);

    __clear_bit(page_index, bits);
}
|
||
|
|
||
|
static bool uvm_page_mask_region_test(const uvm_page_mask_t *mask,
|
||
|
uvm_va_block_region_t region,
|
||
|
uvm_page_index_t page_index)
|
||
|
{
|
||
|
if (!uvm_va_block_region_contains_page(region, page_index))
|
||
|
return false;
|
||
|
|
||
|
return !mask || uvm_page_mask_test(mask, page_index);
|
||
|
}
|
||
|
|
||
|
// Count the bits set in mask within region. Computed as the weight of
// [0, region.outer) minus the weight of [0, region.first).
static NvU32 uvm_page_mask_region_weight(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
{
    NvU32 weight_below_first = 0;

    if (region.first > 0)
        weight_below_first = bitmap_weight(mask->bitmap, region.first);

    return bitmap_weight(mask->bitmap, region.outer) - weight_below_first;
}
|
||
|
|
||
|
// True when mask has no bit set inside region, i.e. the search for a set bit
// starting at region.first runs all the way to region.outer.
static bool uvm_page_mask_region_empty(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
{
    unsigned long first_set = find_next_bit(mask->bitmap, region.outer, region.first);

    return first_set == region.outer;
}
|
||
|
|
||
|
// True when every bit of mask inside region is set, i.e. the search for a
// clear bit starting at region.first runs all the way to region.outer.
static bool uvm_page_mask_region_full(const uvm_page_mask_t *mask, uvm_va_block_region_t region)
{
    unsigned long first_clear = find_next_zero_bit(mask->bitmap, region.outer, region.first);

    return first_clear == region.outer;
}
|
||
|
|
||
|
// Set every bit of mask that falls inside region.
static void uvm_page_mask_region_fill(uvm_page_mask_t *mask, uvm_va_block_region_t region)
{
    bitmap_set(mask->bitmap, region.first, uvm_va_block_region_num_pages(region));
}
|
||
|
|
||
|
// Clear every bit of mask that falls inside region.
static void uvm_page_mask_region_clear(uvm_page_mask_t *mask, uvm_va_block_region_t region)
{
    bitmap_clear(mask->bitmap, region.first, uvm_va_block_region_num_pages(region));
}
|
||
|
|
||
|
// Clear every bit of mask that falls outside region: the bits below
// region.first and the bits from region.outer up to PAGES_PER_UVM_VA_BLOCK.
static void uvm_page_mask_region_clear_outside(uvm_page_mask_t *mask, uvm_va_block_region_t region)
{
    unsigned long *bits = mask->bitmap;

    // Leading part: [0, region.first)
    if (region.first > 0)
        bitmap_clear(bits, 0, region.first);

    // Trailing part: [region.outer, PAGES_PER_UVM_VA_BLOCK)
    if (region.outer < PAGES_PER_UVM_VA_BLOCK)
        bitmap_clear(bits, region.outer, PAGES_PER_UVM_VA_BLOCK - region.outer);
}
|
||
|
|
||
|
// Clear all PAGES_PER_UVM_VA_BLOCK bits of mask.
static void uvm_page_mask_zero(uvm_page_mask_t *mask)
{
    unsigned long *bits = mask->bitmap;

    bitmap_zero(bits, PAGES_PER_UVM_VA_BLOCK);
}
|
||
|
|
||
|
static bool uvm_page_mask_empty(const uvm_page_mask_t *mask)
|
||
|
{
|
||
|
return bitmap_empty(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
|
||
|
}
|
||
|
|
||
|
static bool uvm_page_mask_full(const uvm_page_mask_t *mask)
|
||
|
{
|
||
|
return bitmap_full(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
|
||
|
}
|
||
|
|
||
|
// mask_out = mask_in1 & mask_in2. Returns the value reported by bitmap_and().
static bool uvm_page_mask_and(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
{
    bool result = bitmap_and(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);

    return result;
}
|
||
|
|
||
|
// mask_out = mask_in1 & ~mask_in2. Returns the value reported by
// bitmap_andnot().
static bool uvm_page_mask_andnot(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
{
    bool result = bitmap_andnot(mask_out->bitmap, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);

    return result;
}
|
||
|
|
||
|
// mask_out = mask_in1 | mask_in2
static void uvm_page_mask_or(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in1, const uvm_page_mask_t *mask_in2)
{
    unsigned long *dst = mask_out->bitmap;

    bitmap_or(dst, mask_in1->bitmap, mask_in2->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
|
||
|
|
||
|
// mask_out = ~mask_in, over PAGES_PER_UVM_VA_BLOCK bits
static void uvm_page_mask_complement(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in)
{
    unsigned long *dst = mask_out->bitmap;

    bitmap_complement(dst, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
|
||
|
|
||
|
// Copy all PAGES_PER_UVM_VA_BLOCK bits of mask_in into mask_out.
static void uvm_page_mask_copy(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in)
{
    unsigned long *dst = mask_out->bitmap;

    bitmap_copy(dst, mask_in->bitmap, PAGES_PER_UVM_VA_BLOCK);
}
|
||
|
|
||
|
static NvU32 uvm_page_mask_weight(const uvm_page_mask_t *mask)
|
||
|
{
|
||
|
return bitmap_weight(mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
|
||
|
}
|
||
|
|
||
|
static bool uvm_page_mask_subset(const uvm_page_mask_t *subset, const uvm_page_mask_t *mask)
|
||
|
{
|
||
|
return bitmap_subset(subset->bitmap, mask->bitmap, PAGES_PER_UVM_VA_BLOCK);
|
||
|
}
|
||
|
|
||
|
static bool uvm_page_mask_init_from_region(uvm_page_mask_t *mask_out,
|
||
|
uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *mask_in)
|
||
|
{
|
||
|
uvm_page_mask_zero(mask_out);
|
||
|
uvm_page_mask_region_fill(mask_out, region);
|
||
|
|
||
|
if (mask_in)
|
||
|
return uvm_page_mask_and(mask_out, mask_out, mask_in);
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// mask_out = mask_in shifted right by shift bits (bitmap_shift_right())
static void uvm_page_mask_shift_right(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift)
{
    unsigned long *dst = mask_out->bitmap;

    bitmap_shift_right(dst, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK);
}
|
||
|
|
||
|
// mask_out = mask_in shifted left by shift bits (bitmap_shift_left())
static void uvm_page_mask_shift_left(uvm_page_mask_t *mask_out, const uvm_page_mask_t *mask_in, unsigned shift)
{
    unsigned long *dst = mask_out->bitmap;

    bitmap_shift_left(dst, mask_in->bitmap, shift, PAGES_PER_UVM_VA_BLOCK);
}
|
||
|
|
||
|
static bool uvm_page_mask_intersects(const uvm_page_mask_t *mask1, const uvm_page_mask_t *mask2)
|
||
|
{
|
||
|
return bitmap_intersects(mask1->bitmap, mask2->bitmap, PAGES_PER_UVM_VA_BLOCK);
|
||
|
}
|
||
|
|
||
|
// Print the given page mask on the given buffer using hex symbols. The
|
||
|
// minimum required size of the buffer is UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE.
|
||
|
static void uvm_page_mask_print(const uvm_page_mask_t *mask, char *buffer)
|
||
|
{
|
||
|
// There are two cases, which depend on PAGE_SIZE
|
||
|
if (PAGES_PER_UVM_VA_BLOCK > 32) {
|
||
|
NvLength current_long_idx = UVM_PAGE_MASK_WORDS - 1;
|
||
|
const char *buffer_end = buffer + UVM_PAGE_MASK_PRINT_MIN_BUFFER_SIZE;
|
||
|
|
||
|
UVM_ASSERT(sizeof(*mask->bitmap) == 8);
|
||
|
|
||
|
// For 4KB pages, we need to iterate over multiple words
|
||
|
do {
|
||
|
NvU64 current_long = mask->bitmap[current_long_idx];
|
||
|
|
||
|
buffer += sprintf(buffer, "%016llx", current_long);
|
||
|
if (current_long_idx != 0)
|
||
|
buffer += sprintf(buffer, ":");
|
||
|
} while (current_long_idx-- != 0);
|
||
|
|
||
|
UVM_ASSERT(buffer <= buffer_end);
|
||
|
}
|
||
|
else {
|
||
|
NvU32 value = (unsigned)*mask->bitmap;
|
||
|
|
||
|
UVM_ASSERT(PAGES_PER_UVM_VA_BLOCK == 32);
|
||
|
|
||
|
// For 64KB pages, a single print suffices
|
||
|
sprintf(buffer, "%08x", value);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static uvm_va_block_region_t uvm_va_block_first_subregion_in_mask(uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask)
|
||
|
{
|
||
|
uvm_va_block_region_t subregion;
|
||
|
|
||
|
if (!page_mask)
|
||
|
return region;
|
||
|
|
||
|
subregion.first = find_next_bit(page_mask->bitmap, region.outer, region.first);
|
||
|
subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1);
|
||
|
return subregion;
|
||
|
}
|
||
|
|
||
|
static uvm_va_block_region_t uvm_va_block_next_subregion_in_mask(uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask,
|
||
|
uvm_va_block_region_t previous_subregion)
|
||
|
{
|
||
|
uvm_va_block_region_t subregion;
|
||
|
|
||
|
if (!page_mask) {
|
||
|
subregion.first = region.outer;
|
||
|
subregion.outer = region.outer;
|
||
|
return subregion;
|
||
|
}
|
||
|
|
||
|
subregion.first = find_next_bit(page_mask->bitmap, region.outer, previous_subregion.outer + 1);
|
||
|
subregion.outer = find_next_zero_bit(page_mask->bitmap, region.outer, subregion.first + 1);
|
||
|
return subregion;
|
||
|
}
|
||
|
|
||
|
// Walk the contiguous runs of set bits of page_mask that fall inside region,
// storing each run in subregion. A NULL page_mask behaves like a fully set
// mask: the loop body runs once with subregion equal to region.
//
// Note that region is evaluated on every iteration.
#define for_each_va_block_subregion_in_mask(subregion, page_mask, region)                           \
    for ((subregion) = uvm_va_block_first_subregion_in_mask((region), (page_mask));                 \
         (subregion).first != (region).outer;                                                       \
         (subregion) = uvm_va_block_next_subregion_in_mask((region), (page_mask), (subregion)))
|
||
|
|
||
|
static uvm_page_index_t uvm_va_block_first_page_in_mask(uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask)
|
||
|
{
|
||
|
if (page_mask)
|
||
|
return find_next_bit(page_mask->bitmap, region.outer, region.first);
|
||
|
else
|
||
|
return region.first;
|
||
|
}
|
||
|
|
||
|
static uvm_page_index_t uvm_va_block_next_page_in_mask(uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask,
|
||
|
uvm_page_index_t previous_page)
|
||
|
{
|
||
|
if (page_mask) {
|
||
|
return find_next_bit(page_mask->bitmap, region.outer, previous_page + 1);
|
||
|
}
|
||
|
else {
|
||
|
UVM_ASSERT(previous_page < region.outer);
|
||
|
return previous_page + 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static uvm_page_index_t uvm_va_block_first_unset_page_in_mask(uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask)
|
||
|
{
|
||
|
if (page_mask)
|
||
|
return find_next_zero_bit(page_mask->bitmap, region.outer, region.first);
|
||
|
else
|
||
|
return region.first;
|
||
|
}
|
||
|
|
||
|
static uvm_page_index_t uvm_va_block_next_unset_page_in_mask(uvm_va_block_region_t region,
|
||
|
const uvm_page_mask_t *page_mask,
|
||
|
uvm_page_index_t previous_page)
|
||
|
{
|
||
|
if (page_mask) {
|
||
|
return find_next_zero_bit(page_mask->bitmap, region.outer, previous_page + 1);
|
||
|
}
|
||
|
else {
|
||
|
UVM_ASSERT(previous_page < region.outer);
|
||
|
return previous_page + 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static NvU64 uvm_reverse_map_start(const uvm_reverse_map_t *reverse_map)
|
||
|
{
|
||
|
return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first);
|
||
|
}
|
||
|
|
||
|
static NvU64 uvm_reverse_map_end(const uvm_reverse_map_t *reverse_map)
|
||
|
{
|
||
|
return uvm_va_block_cpu_page_address(reverse_map->va_block, reverse_map->region.first) +
|
||
|
uvm_va_block_region_size(reverse_map->region) - 1;
|
||
|
}
|
||
|
|
||
|
// Visit, in increasing order, every page of region whose bit is set in
// page_mask. A NULL page_mask behaves like a fully set mask, so every page of
// the region is visited.
//
// Note that region is evaluated on every iteration.
#define for_each_va_block_page_in_region_mask(page_index, page_mask, region)                        \
    for ((page_index) = uvm_va_block_first_page_in_mask((region), (page_mask));                     \
         (page_index) != (region).outer;                                                            \
         (page_index) = uvm_va_block_next_page_in_mask((region), (page_mask), (page_index)))
|
||
|
|
||
|
// Variant of for_each_va_block_page_in_region_mask whose region is the whole
// of va_block.
#define for_each_va_block_page_in_mask(page_index, page_mask, va_block)                             \
    for_each_va_block_page_in_region_mask((page_index), (page_mask), uvm_va_block_region_from_block(va_block))
|
||
|
|
||
|
// Counterpart of for_each_va_block_page_in_region_mask that visits the pages
// of region whose bit is clear in page_mask.
//
// Note that region is evaluated on every iteration.
#define for_each_va_block_unset_page_in_region_mask(page_index, page_mask, region)                  \
    for ((page_index) = uvm_va_block_first_unset_page_in_mask((region), (page_mask));               \
         (page_index) != (region).outer;                                                            \
         (page_index) = uvm_va_block_next_unset_page_in_mask((region), (page_mask), (page_index)))
|
||
|
|
||
|
// Counterpart of for_each_va_block_page_in_mask that visits the pages of
// va_block whose bit is clear in page_mask.
#define for_each_va_block_unset_page_in_mask(page_index, page_mask, va_block)                       \
    for_each_va_block_unset_page_in_region_mask((page_index), (page_mask), uvm_va_block_region_from_block(va_block))
|
||
|
|
||
|
// Visit every page of region (no mask filtering).
#define for_each_va_block_page_in_region(page_index, region)                                        \
    for_each_va_block_page_in_region_mask((page_index), NULL, (region))
|
||
|
|
||
|
// Visit every page of va_block.
#define for_each_va_block_page(page_index, va_block)                                                \
    for_each_va_block_page_in_region((page_index), uvm_va_block_region_from_block(va_block))
|
||
|
|
||
|
// Initialize bitmap_tree for page_count leaves: the level count is
// ilog2(page_count rounded up to a power of two) + 1, and the page mask starts
// out empty.
static void uvm_va_block_bitmap_tree_init_from_page_count(uvm_va_block_bitmap_tree_t *bitmap_tree, size_t page_count)
{
    uvm_page_mask_zero(&bitmap_tree->pages);

    bitmap_tree->level_count = ilog2(roundup_pow_of_two(page_count)) + 1;
    bitmap_tree->leaf_count = page_count;
}
|
||
|
|
||
|
// Initialize bitmap_tree with one leaf per CPU page of va_block.
static void uvm_va_block_bitmap_tree_init(uvm_va_block_bitmap_tree_t *bitmap_tree, uvm_va_block_t *va_block)
{
    uvm_va_block_bitmap_tree_init_from_page_count(bitmap_tree, uvm_va_block_num_cpu_pages(va_block));
}
|
||
|
|
||
|
static void uvm_va_block_bitmap_tree_iter_init(const uvm_va_block_bitmap_tree_t *bitmap_tree,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_va_block_bitmap_tree_iter_t *iter)
|
||
|
{
|
||
|
UVM_ASSERT(bitmap_tree->level_count > 0);
|
||
|
UVM_ASSERT_MSG(page_index < bitmap_tree->leaf_count,
|
||
|
"%zd vs %zd",
|
||
|
(size_t)page_index,
|
||
|
(size_t)bitmap_tree->leaf_count);
|
||
|
|
||
|
iter->level_idx = bitmap_tree->level_count - 1;
|
||
|
iter->node_idx = page_index;
|
||
|
}
|
||
|
|
||
|
static uvm_va_block_region_t uvm_va_block_bitmap_tree_iter_get_range(const uvm_va_block_bitmap_tree_t *bitmap_tree,
|
||
|
const uvm_va_block_bitmap_tree_iter_t *iter)
|
||
|
{
|
||
|
NvU16 range_leaves = uvm_perf_tree_iter_leaf_range(bitmap_tree, iter);
|
||
|
NvU16 range_start = uvm_perf_tree_iter_leaf_range_start(bitmap_tree, iter);
|
||
|
uvm_va_block_region_t subregion = uvm_va_block_region(range_start, range_start + range_leaves);
|
||
|
|
||
|
UVM_ASSERT(iter->level_idx >= 0);
|
||
|
UVM_ASSERT(iter->level_idx < bitmap_tree->level_count);
|
||
|
|
||
|
return subregion;
|
||
|
}
|
||
|
|
||
|
static NvU16 uvm_va_block_bitmap_tree_iter_get_count(const uvm_va_block_bitmap_tree_t *bitmap_tree,
|
||
|
const uvm_va_block_bitmap_tree_iter_t *iter)
|
||
|
{
|
||
|
uvm_va_block_region_t subregion = uvm_va_block_bitmap_tree_iter_get_range(bitmap_tree, iter);
|
||
|
|
||
|
return uvm_page_mask_region_weight(&bitmap_tree->pages, subregion);
|
||
|
}
|
||
|
|
||
|
// Traverse the tree levels for page, starting at the last level
// (level_count - 1) and decrementing iter->level_idx until it drops below 0.
// On each step counter holds the number of set pages under the current node;
// it is set to 0 once the traversal is exhausted.
#define uvm_va_block_bitmap_tree_traverse_counters(counter,tree,page,iter)                          \
    for (uvm_va_block_bitmap_tree_iter_init((tree), (page), (iter)),                                \
         (counter) = uvm_va_block_bitmap_tree_iter_get_count((tree), (iter));                       \
         (iter)->level_idx >= 0;                                                                    \
         (counter) = (--(iter)->level_idx < 0) ?                                                    \
                         0 :                                                                        \
                         uvm_va_block_bitmap_tree_iter_get_count((tree), (iter)))
|
||
|
|
||
|
// Return the block region covered by the given chunk size. page_index must be
|
||
|
// any page within the block known to be covered by the chunk.
|
||
|
static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block,
|
||
|
uvm_chunk_size_t chunk_size,
|
||
|
uvm_page_index_t page_index)
|
||
|
{
|
||
|
NvU64 page_addr = uvm_va_block_cpu_page_address(block, page_index);
|
||
|
NvU64 chunk_start_addr = UVM_ALIGN_DOWN(page_addr, chunk_size);
|
||
|
uvm_page_index_t first = (uvm_page_index_t)((chunk_start_addr - block->start) / PAGE_SIZE);
|
||
|
return uvm_va_block_region(first, first + (chunk_size / PAGE_SIZE));
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// Helpers for page state (permissions, size, residency)
|
||
|
//
|
||
|
|
||
|
// Compute the gpus that have at least the given access permissions for the
|
||
|
// range described by region and page_mask. The function sets the bit if any
|
||
|
// page in the region has the permissions.
|
||
|
void uvm_va_block_region_authorized_gpus(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_region_t region,
|
||
|
uvm_prot_t access_permission,
|
||
|
uvm_processor_mask_t *authorized_gpus);
|
||
|
|
||
|
// Compute the processors that have at least the given access permissions for the
|
||
|
// range described by region and page_mask. The function sets the bit if any
|
||
|
// page in the region has the permissions.
|
||
|
void uvm_va_block_region_authorized_processors(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_region_t region,
|
||
|
uvm_prot_t access_permission,
|
||
|
uvm_processor_mask_t *authorized_processors);
|
||
|
|
||
|
void uvm_va_block_page_authorized_gpus(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_prot_t access_permission,
|
||
|
uvm_processor_mask_t *authorized_gpus);
|
||
|
|
||
|
void uvm_va_block_page_authorized_processors(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_prot_t access_permission,
|
||
|
uvm_processor_mask_t *authorized_processors);
|
||
|
|
||
|
bool uvm_va_block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_region_t region,
|
||
|
uvm_gpu_id_t gpu_id,
|
||
|
uvm_prot_t required_prot);
|
||
|
|
||
|
bool uvm_va_block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
|
||
|
uvm_va_block_region_t region,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
uvm_prot_t required_prot);
|
||
|
|
||
|
bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_gpu_id_t gpu_id,
|
||
|
uvm_prot_t required_prot);
|
||
|
|
||
|
bool uvm_va_block_page_is_processor_authorized(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
uvm_prot_t required_prot);
|
||
|
|
||
|
// Compute the gpus that have a copy of the given page resident in their memory
|
||
|
void uvm_va_block_page_resident_gpus(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_processor_mask_t *resident_gpus);
|
||
|
|
||
|
// Compute the processors that have a copy of the given page resident in their memory
|
||
|
void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_processor_mask_t *resident_processors);
|
||
|
|
||
|
// Count how many processors have a copy of the given page resident in their memory
|
||
|
NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index);
|
||
|
|
||
|
// Get the processor with a resident copy of a page closest to the given processor
|
||
|
uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_processor_id_t processor);
|
||
|
|
||
|
uvm_processor_id_t uvm_va_block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_processor_id_t processor,
|
||
|
const uvm_processor_mask_t *processor_mask);
|
||
|
|
||
|
// Get CPU page size or 0 if it is not mapped
|
||
|
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index);
|
||
|
|
||
|
// Get GPU page size or 0 if it is not mapped on the given GPU
|
||
|
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
|
||
|
|
||
|
// Get page size or 0 if it is not mapped on the given processor
|
||
|
static NvU32 uvm_va_block_page_size_processor(uvm_va_block_t *va_block,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
uvm_page_index_t page_index)
|
||
|
{
|
||
|
if (UVM_ID_IS_CPU(processor_id))
|
||
|
return uvm_va_block_page_size_cpu(va_block, page_index);
|
||
|
else
|
||
|
return uvm_va_block_page_size_gpu(va_block, processor_id, page_index);
|
||
|
}
|
||
|
|
||
|
// Returns the big page size for the GPU VA space of the block
|
||
|
NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu);
|
||
|
|
||
|
// Returns the number of big pages in the VA block for the given size
|
||
|
size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size);
|
||
|
|
||
|
// Returns the number of big pages in the VA block for the big page size on the
|
||
|
// given GPU
|
||
|
static size_t uvm_va_block_gpu_num_big_pages(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
|
||
|
{
|
||
|
return uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu));
|
||
|
}
|
||
|
|
||
|
// Returns the start address of the given big page index and big page size
|
||
|
NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size);
|
||
|
|
||
|
// Returns the region [start, end] of the given big page index and big page size
|
||
|
uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
|
||
|
size_t big_page_index,
|
||
|
NvU32 big_page_size);
|
||
|
|
||
|
// Returns the largest sub-region region of [start, end] which can fit big
|
||
|
// pages. If the region cannot fit any big pages, an invalid region (0, 0) is
|
||
|
// returned.
|
||
|
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
|
||
|
|
||
|
// Returns the big page index (the bit index within
|
||
|
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
|
||
|
// page_index cannot be covered by a big PTE due to alignment or block size,
|
||
|
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
|
||
|
size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size);
|
||
|
|
||
|
// Returns the new residency for a page that faulted or triggered access
|
||
|
// counter notifications. The read_duplicate output parameter indicates if the
|
||
|
// page meets the requirements to be read-duplicated
|
||
|
uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
|
||
|
uvm_page_index_t page_index,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
NvU32 access_type_mask,
|
||
|
uvm_va_policy_t *policy,
|
||
|
const uvm_perf_thrashing_hint_t *thrashing_hint,
|
||
|
uvm_service_operation_t operation,
|
||
|
bool *read_duplicate);
|
||
|
|
||
|
// Return the maximum mapping protection for processor_id that will not require
|
||
|
// any permision revocation on the rest of processors.
|
||
|
uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
|
||
|
uvm_processor_id_t processor_id,
|
||
|
uvm_page_index_t page_index);
|
||
|
|
||
|
// A helper macro for handling allocation-retry
//
// The macro takes a VA block, uvm_va_block_retry_t struct and a function call
// to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED. The macro
// evaluates to the final status returned by the call.
//
// block_retry can be NULL if it's not necessary for the function call,
// otherwise it will be initialized and deinitialized by the macro.
//
// The macro also locks and unlocks the block's lock internally as it's expected
// that the block's lock has been unlocked and relocked whenever the function call
// returns NV_ERR_MORE_PROCESSING_REQUIRED and this makes it clear that the
// block's state is not locked across these calls.
//
// All macro-local variables carry a __ prefix so that the call expression,
// which is expanded inside the macro's scope, can still refer to the caller's
// own variables (for example one named status) without being shadowed.
#define UVM_VA_BLOCK_LOCK_RETRY(va_block, block_retry, call) ({     \
    NV_STATUS __status;                                             \
    uvm_va_block_t *__block = (va_block);                           \
    uvm_va_block_retry_t *__retry = (block_retry);                  \
                                                                    \
    uvm_va_block_retry_init(__retry);                               \
                                                                    \
    uvm_mutex_lock(&__block->lock);                                 \
                                                                    \
    do {                                                            \
        __status = (call);                                          \
    } while (__status == NV_ERR_MORE_PROCESSING_REQUIRED);          \
                                                                    \
    uvm_mutex_unlock(&__block->lock);                               \
                                                                    \
    uvm_va_block_retry_deinit(__retry, __block);                    \
                                                                    \
    __status;                                                       \
})
|
||
|
|
||
|
// A helper macro for handling allocation-retry
//
// The macro takes a VA block, uvm_va_block_retry_t struct and a function call
// to retry as long as it returns NV_ERR_MORE_PROCESSING_REQUIRED. The macro
// evaluates to the final status returned by the call.
//
// block_retry can be NULL if it's not necessary for the function call,
// otherwise it will be initialized and deinitialized by the macro.
//
// This macro, as opposed to UVM_VA_BLOCK_LOCK_RETRY(), expects the block lock
// to be already taken. Notably the block's lock might be unlocked and relocked
// as part of the call.
//
// All macro-local variables carry a __ prefix so that the call expression,
// which is expanded inside the macro's scope, can still refer to the caller's
// own variables (for example one named status) without being shadowed.
#define UVM_VA_BLOCK_RETRY_LOCKED(va_block, block_retry, call) ({   \
    NV_STATUS __status;                                             \
    uvm_va_block_t *__block = (va_block);                           \
    uvm_va_block_retry_t *__retry = (block_retry);                  \
                                                                    \
    uvm_va_block_retry_init(__retry);                               \
                                                                    \
    uvm_assert_mutex_locked(&__block->lock);                        \
                                                                    \
    do {                                                            \
        __status = (call);                                          \
    } while (__status == NV_ERR_MORE_PROCESSING_REQUIRED);          \
                                                                    \
    uvm_va_block_retry_deinit(__retry, __block);                    \
                                                                    \
    __status;                                                       \
})
|
||
|
|
||
|
#endif // __UVM_VA_BLOCK_H__
|