/*******************************************************************************

    Copyright (c) 2017-2021 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_PMM_SYSMEM_H__
#define __UVM_PMM_SYSMEM_H__

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_lock.h"

// Module to handle per-GPU user mappings to sysmem physical memory. Notably,
// this implements a reverse map of the DMA address to {va_block, virt_addr}.
// This is required by the GPU access counters feature since they may provide a
// physical address in the notification packet (GPA notifications). We use the
// table to obtain the VAs of the memory regions being accessed remotely. The
// reverse map is implemented by a radix tree, which is indexed using the
// DMA address. For now, only PAGE_SIZE translations are supported (i.e. no
// big/huge pages).
//
// TODO: Bug 1995015: add support for physically-contiguous mappings.
struct uvm_pmm_sysmem_mappings_struct
{
    uvm_gpu_t *gpu;

    struct radix_tree_root reverse_map_tree;

    uvm_mutex_t reverse_map_lock;
};

// See comments in uvm_linux.h
#ifdef NV_RADIX_TREE_REPLACE_SLOT_PRESENT
#define uvm_pmm_sysmem_mappings_indirect_supported() true
#else
#define uvm_pmm_sysmem_mappings_indirect_supported() false
#endif

// Global initialization/exit functions that need to be called during driver
// initialization/tear-down. These are needed to allocate/free global internal
// data structures.
NV_STATUS uvm_pmm_sysmem_init(void);
void uvm_pmm_sysmem_exit(void);

// Initialize per-GPU sysmem mapping tracking.
NV_STATUS uvm_pmm_sysmem_mappings_init(uvm_gpu_t *gpu, uvm_pmm_sysmem_mappings_t *sysmem_mappings);

// Destroy per-GPU sysmem mapping tracking. The caller must ensure that all the
// mappings have been removed before calling this function.
void uvm_pmm_sysmem_mappings_deinit(uvm_pmm_sysmem_mappings_t *sysmem_mappings);

// If the GPU used to initialize sysmem_mappings supports access counters, the
// dma_addr -> {va_block, virt_addr} mapping is inserted in the reverse map.
NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                  NvU64 dma_addr,
                                                  NvU64 virt_addr,
                                                  NvU64 region_size,
                                                  uvm_va_block_t *va_block,
                                                  uvm_processor_id_t owner);

static NV_STATUS uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                               NvU64 dma_addr,
                                                               NvU64 virt_addr,
                                                               NvU64 region_size,
                                                               uvm_va_block_t *va_block,
                                                               uvm_gpu_id_t owner)
{
    if (!uvm_pmm_sysmem_mappings_indirect_supported())
        return NV_OK;

    return uvm_pmm_sysmem_mappings_add_gpu_mapping(sysmem_mappings,
                                                   dma_addr,
                                                   virt_addr,
                                                   region_size,
                                                   va_block,
                                                   owner);
}

// If the GPU used to initialize sysmem_mappings supports access counters, the
// entries for the physical region starting at dma_addr are removed from the
// reverse map.
void uvm_pmm_sysmem_mappings_remove_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr);

static void uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr)
{
    if (uvm_pmm_sysmem_mappings_indirect_supported())
        uvm_pmm_sysmem_mappings_remove_gpu_mapping(sysmem_mappings, dma_addr);
}

// Like uvm_pmm_sysmem_mappings_remove_gpu_mapping but it doesn't assert if the
// mapping doesn't exist. See uvm_va_block_evict_chunks for more information.
void uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(uvm_pmm_sysmem_mappings_t *sysmem_mappings, NvU64 dma_addr);

// If the GPU used to initialize sysmem_mappings supports access counters, the
// mapping for the region starting at dma_addr is updated with va_block.
// This is required on VA block split.
void uvm_pmm_sysmem_mappings_reparent_gpu_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                  NvU64 dma_addr,
                                                  uvm_va_block_t *va_block);

static void uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                               NvU64 dma_addr,
                                                               uvm_va_block_t *va_block)
{
    if (uvm_pmm_sysmem_mappings_indirect_supported())
        uvm_pmm_sysmem_mappings_reparent_gpu_mapping(sysmem_mappings, dma_addr, va_block);
}

// If the GPU used to initialize sysmem_mappings supports access counters, the
// mapping for the region starting at dma_addr is split into regions of
// new_region_size. new_region_size must be a power of two and smaller than the
// previously-registered size.
NV_STATUS uvm_pmm_sysmem_mappings_split_gpu_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                     NvU64 dma_addr,
                                                     NvU64 new_region_size);

static NV_STATUS uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                                  NvU64 dma_addr,
                                                                  NvU64 new_region_size)
{
    if (!uvm_pmm_sysmem_mappings_indirect_supported())
        return NV_OK;

    return uvm_pmm_sysmem_mappings_split_gpu_mappings(sysmem_mappings, dma_addr, new_region_size);
}

// If the GPU used to initialize sysmem_mappings supports access counters, all
// the mappings within the region [dma_addr, dma_addr + new_region_size) are
// merged into a single mapping. new_region_size must be a power of two. The
// whole region must be previously populated with mappings and all of them must
// have the same VA block and processor owner.
void uvm_pmm_sysmem_mappings_merge_gpu_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                NvU64 dma_addr,
                                                NvU64 new_region_size);

static void uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                                             NvU64 dma_addr,
                                                             NvU64 new_region_size)
{
    if (uvm_pmm_sysmem_mappings_indirect_supported())
        uvm_pmm_sysmem_mappings_merge_gpu_mappings(sysmem_mappings, dma_addr, new_region_size);
}

// Obtain the {va_block, virt_addr} information for the mappings in the given
// [dma_addr:dma_addr + region_size) range. dma_addr and region_size must be
// page-aligned.
//
// Valid translations are written to out_mappings sequentially (there are no
// gaps). At most max_out_mappings are written. The caller is required to
// provide enough entries in out_mappings.
//
// The VA block in each returned translation entry is retained, and it's up to
// the caller to release it.
size_t uvm_pmm_sysmem_mappings_dma_to_virt(uvm_pmm_sysmem_mappings_t *sysmem_mappings,
                                           NvU64 dma_addr,
                                           NvU64 region_size,
                                           uvm_reverse_map_t *out_mappings,
                                           size_t max_out_mappings);

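// Usage sketch (illustrative only, not part of this header's API): a handler
// for a GPA access counter notification might translate the notified DMA
// address back to CPU virtual ranges as below. The names sysmem_mappings
// (the GPU's uvm_pmm_sysmem_mappings_t instance), translations[i].va_block
// and uvm_va_block_release() are assumptions made for this example; dma_addr
// is assumed to already be page-aligned as required above.
//
//     uvm_reverse_map_t translations[8];
//     size_t num_translations, i;
//
//     num_translations = uvm_pmm_sysmem_mappings_dma_to_virt(sysmem_mappings,
//                                                            dma_addr,
//                                                            PAGE_SIZE,
//                                                            translations,
//                                                            ARRAY_SIZE(translations));
//
//     for (i = 0; i < num_translations; i++) {
//         // Each translation's VA block was retained by the lookup and must
//         // be released by the caller once it is done with it.
//         uvm_va_block_t *va_block = translations[i].va_block;
//
//         // ... service the notification for this VA block ...
//
//         uvm_va_block_release(va_block);
//     }
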
#define UVM_CPU_CHUNK_SIZES PAGE_SIZE

#if UVM_CPU_CHUNK_SIZES == PAGE_SIZE
#define UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE() 1
typedef struct page uvm_cpu_chunk_t;
#else
#define UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE() 0
typedef struct uvm_cpu_chunk_struct uvm_cpu_chunk_t;

// CPU memory chunk descriptor.
// CPU memory chunks represent a physically contiguous CPU memory
// allocation.
// CPU memory chunks can be created due to CPU page allocation or
// CPU chunk splitting. Chunks created due to page allocations are
// referred to as "physical chunks", while chunks resulting from
// splitting are referred to as "logical chunks".
struct uvm_cpu_chunk_struct
{
    // Pointer to the CPU page backing this CPU chunk.
    // For physical chunks, this will point to the head page. Physical
    // chunk allocation will set the reference count for the struct
    // page (compound or not) to 1.
    //
    // For logical chunks, this will point to the struct page from
    // the compound page array corresponding to the correct page index.
    // Because freeing a logical chunk does not result in freeing of
    // any struct page(s) and both physical and logical chunks are
    // reference counted, there is no need to take separate references
    // to the struct page for logical chunks.
    struct page *page;

    // For logical chunks, this points to the parent chunk (which
    // could also be a logical chunk). For physical chunks, this
    // is NULL.
    uvm_cpu_chunk_t *parent;

    // Page offset of this chunk within the physical size of
    // the parent.
    uvm_page_index_t offset;

    // Region within the VA block covered by this CPU chunk.
    uvm_va_block_region_t region;

    // Chunk reference count used when a CPU chunk is split. Each
    // child sub-chunk will increment the reference count of its
    // parent.
    nv_kref_t refcount;

    // Size of the chunk at the time of its creation.
    // For chunks that are the result of a split, this
    // value will be the size of the chunk prior to the
    // split.
    // For chunks resulting from page allocations (physical),
    // this value is the size of the physical allocation.
    size_t log2_phys_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);

    struct {
        // Per-GPU array of DMA mapping addresses for the chunk.
        // The DMA mapping addresses for logical chunks are adjusted
        // to the correct offset within the parent chunk.
        union {
            NvU64 static_entry;
            NvU64 *dynamic_entries;
        };
        uvm_processor_mask_t dma_addrs_mask;
    } gpu_mappings;

    // Lock protecting dirty_bitmap
    uvm_spinlock_t lock;

    // A dynamically allocated bitmap (one bit per PAGE_SIZE page) used
    // to track the dirty state of each PAGE_SIZE page.
    // Dirty state is tracked only by physical chunks. Therefore,
    // for logical chunks this will be NULL.
    unsigned long *dirty_bitmap;
};
#endif // UVM_CPU_CHUNK_SIZES == PAGE_SIZE

// Return the set of allowed CPU chunk allocation sizes.
uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void);

// Allocate a physical CPU chunk for the specified page index and owned by
// va_block.
//
// The size of the allocated CPU chunk may be any of the allowed sizes and
// depends on several factors:
//   * Allocation will be attempted in reverse order - highest to lowest - in
//     order to ensure that the highest possible size is used.
//   * An allocation size will be used if:
//     - the VA region within the block covered by the allocation size is
//       aligned to that allocation size,
//     - the VA block region corresponding to the allocation size is empty
//       (has no previously populated pages), and
//     - the system allows a page allocation of that size.
//
// If mm is not NULL, the chunk's memory will be added to the mm's memory cgroup.
//
// If a CPU chunk allocation succeeds, NV_OK is returned. If new_chunk is not
// NULL it will be set to point to the newly allocated chunk. On failure,
// NV_ERR_NO_MEMORY is returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_va_block_t *va_block,
                              uvm_page_index_t page_index,
                              struct mm_struct *mm,
                              uvm_cpu_chunk_t **new_chunk);

// Insert a CPU chunk in the va_block's storage structures.
//
// On success, NV_OK is returned. On error,
//   - NV_ERR_NO_MEMORY is returned if memory allocation for any of the internal
//     structures did not succeed.
//   - NV_ERR_INVALID_ARGUMENT is returned if the size of the chunk to be inserted
//     is invalid.
//   - NV_ERR_INVALID_STATE is returned if a matching chunk already exists in the
//     block.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

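// Usage sketch (illustrative only): populating one page of a VA block with a
// freshly allocated CPU chunk. This is not the driver's canonical population
// path; mm may be NULL when no memory cgroup accounting is wanted, and the
// failure path assumes the chunk should be released with uvm_cpu_chunk_put().
//
//     uvm_cpu_chunk_t *chunk;
//     NV_STATUS status;
//
//     status = uvm_cpu_chunk_alloc(va_block, page_index, mm, &chunk);
//     if (status != NV_OK)
//         return status;
//
//     status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);
//     if (status != NV_OK) {
//         // Assumed cleanup: drop the reference from the allocation so the
//         // chunk (and its pages) are freed.
//         uvm_cpu_chunk_put(chunk);
//         return status;
//     }
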
// Remove a CPU chunk from the va_block's storage structures.
// The chunk is not freed, only removed from the block's storage structures.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

// Return the CPU chunk backing page_index within the VA block.
// If page_index is beyond the boundary of the VA block or a CPU chunk for
// the specified page has not been allocated and/or inserted into the block,
// NULL is returned.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *block, uvm_page_index_t page_index);

// Return the physical size of the CPU chunk.
// The physical size of the CPU chunk is the size of the physical CPU
// memory backing the CPU chunk. It is set at CPU chunk allocation time.
static uvm_chunk_size_t uvm_cpu_chunk_get_phys_size(uvm_cpu_chunk_t *chunk)
{
#if UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
    return (uvm_chunk_size_t)PAGE_SIZE;
#else
    return ((uvm_chunk_size_t)1) << chunk->log2_phys_size;
#endif
}

// Return the size of the CPU chunk. While the physical size of the CPU
// chunk reflects the size of the physical memory backing the chunk, this
// size is the effective size of the chunk and changes as a result of CPU
// chunk splits.
uvm_chunk_size_t uvm_cpu_chunk_get_size(uvm_cpu_chunk_t *chunk);

// Return the number of base system pages covered by the CPU chunk.
static size_t uvm_cpu_chunk_num_pages(uvm_cpu_chunk_t *chunk)
{
    UVM_ASSERT(chunk);
    return uvm_cpu_chunk_get_size(chunk) / PAGE_SIZE;
}

static bool uvm_cpu_chunk_is_physical(uvm_cpu_chunk_t *chunk)
{
#if UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
    return true;
#else
    return chunk->parent == NULL;
#endif
}

// Return a pointer to the struct page backing page_index within the owning
// VA block.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

// Take a reference to the CPU chunk.
void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk);

// Release a reference to the CPU chunk. When the reference count
// drops to zero, the CPU chunk will be freed. Physical CPU chunks
// will also free the CPU pages backing the chunk.
void uvm_cpu_chunk_put(uvm_cpu_chunk_t *chunk);

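// Usage sketch (illustrative only): pinning a chunk so it stays valid while it
// is used after the lookup. When the last reference is dropped the chunk is
// freed and, for physical chunks, so are its backing pages. The locking
// required around the lookup itself is not shown here.
//
//     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
//
//     if (chunk) {
//         uvm_cpu_chunk_get(chunk);
//
//         // ... operate on the chunk ...
//
//         uvm_cpu_chunk_put(chunk);
//     }
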
NV_STATUS uvm_cpu_chunk_gpu_mapping_alloc(uvm_va_block_t *va_block, uvm_gpu_id_t id);
void uvm_cpu_chunk_gpu_mapping_split(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t id);
void uvm_cpu_chunk_gpu_mapping_free(uvm_va_block_t *va_block, uvm_gpu_id_t id);

// Set the CPU chunk's DMA mapping address for the specified GPU ID.
NV_STATUS uvm_cpu_chunk_set_gpu_mapping_addr(uvm_va_block_t *va_block,
                                             uvm_page_index_t page_index,
                                             uvm_cpu_chunk_t *chunk,
                                             uvm_gpu_id_t id,
                                             NvU64 dma_addr);

// Get the CPU chunk's DMA mapping address for the specified GPU ID.
NvU64 uvm_cpu_chunk_get_gpu_mapping_addr(uvm_va_block_t *block,
                                         uvm_page_index_t page_index,
                                         uvm_cpu_chunk_t *chunk,
                                         uvm_gpu_id_t id);

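// Usage sketch (illustrative only): recording a chunk's DMA address for a GPU
// and reading it back later. How dma_addr is obtained (the GPU DMA-mapping
// path) is outside the scope of this header and not shown; gpu->id is assumed
// here to be the GPU's uvm_gpu_id_t.
//
//     NV_STATUS status;
//
//     status = uvm_cpu_chunk_set_gpu_mapping_addr(va_block, page_index, chunk, gpu->id, dma_addr);
//     if (status != NV_OK)
//         return status;
//
//     // Later, retrieve the recorded address, e.g. when building GPU page
//     // tables or issuing copies for this page.
//     NvU64 addr = uvm_cpu_chunk_get_gpu_mapping_addr(va_block, page_index, chunk, gpu->id);
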
#if !UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
// Split a CPU chunk into a set of CPU chunks of size new_size.
// new_size has to be one of the supported CPU chunk allocation sizes and has to
// be smaller than the current size of chunk.
//
// On success, NV_OK is returned. All new chunks will have chunk as parent and
// chunk's size will have been updated to new_size.
//
// Note that due to the way CPU chunks are managed and split, the number of
// newly created chunks will be (size_of(chunk) / new_size) - 1.
//
// On failure NV_ERR_NO_MEMORY will be returned. chunk's size will not be
// modified.
NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_chunk_size_t new_size);

// Merge chunk's parent to the highest possible CPU chunk size fully contained
// within the parent's owning VA block.
//
// The size to which chunks are merged is determined by finding the largest
// size from the set of allowed CPU chunk sizes that satisfies both criteria
// below:
//   * The VA range of the parent chunk resulting from the merge has to be
//     fully contained within the VA block.
//   * The start and end VA addresses of the parent based on its physical
//     size have to be aligned to the merge size.
//
// It is possible that a merge cannot be done if chunk does not have a parent
// (it is a physical chunk), chunk's owning VA block is not the same as
// its parent's owning VA block, or there is no chunk size that satisfies both
// of the above criteria.
//
// Return a pointer to the merged chunk. If a merge could not be done, return
// NULL.
uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk);

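// Usage sketch (illustrative only, for the !UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
// configuration): splitting a chunk and later merging back. The size below is
// an example value; a real new_size must be one of the sizes reported by
// uvm_cpu_chunk_get_allocation_sizes() and smaller than the chunk's current
// size.
//
//     uvm_chunk_size_t new_size = (uvm_chunk_size_t)(64 * 1024); // example only
//     NV_STATUS status;
//
//     // After a successful split, chunk's effective size becomes new_size and
//     // (old_size / new_size) - 1 new logical chunks exist, each holding a
//     // reference on chunk.
//     status = uvm_cpu_chunk_split(va_block, chunk, new_size);
//     if (status != NV_OK)
//         return status;
//
//     // Merging walks back up to the largest allowed size that is fully
//     // covered by the block and properly aligned. NULL means no merge was
//     // possible (for example, chunk is already a physical chunk).
//     uvm_cpu_chunk_t *merged = uvm_cpu_chunk_merge(va_block, chunk);
//     if (!merged)
//         return NV_OK;
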
// Mark the CPU sub-page page_index in the CPU chunk as dirty.
// page_index has to be a page within the chunk's region.
void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

// Mark the CPU sub-page page_index in the CPU chunk as clean.
// page_index has to be a page within the chunk's region.
void uvm_cpu_chunk_mark_clean(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

// Return true if the CPU sub-page page_index in the CPU chunk is dirty.
// page_index has to be a page within the chunk's region.
bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);

#else // UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()

static NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_chunk_size_t new_size)
{
    return NV_OK;
}

static uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk)
{
    return NULL;
}

static void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
    SetPageDirty(chunk);
}

static void uvm_cpu_chunk_mark_clean(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
    ClearPageDirty(chunk);
}

static bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
    return PageDirty(chunk);
}
#endif // !UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()

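// Usage sketch (illustrative only): the dirty-tracking calls above are the
// same in both configurations, whether a chunk is a bare struct page or a
// uvm_cpu_chunk_struct. For example, a path that writes a chunk's page on the
// CPU might mark it dirty, and a later writeback/copy path might test and
// clear it; which paths actually do this is an assumption of the example.
//
//     // A CPU write touched this page of the chunk.
//     uvm_cpu_chunk_mark_dirty(chunk, page_index);
//
//     // Later, only copy the page out if it is actually dirty.
//     if (uvm_cpu_chunk_is_dirty(chunk, page_index)) {
//         // ... copy or write the page back ...
//         uvm_cpu_chunk_mark_clean(chunk, page_index);
//     }
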
// Return the first CPU chunk in the block. If no CPU chunks have been
// allocated and/or inserted into the block, NULL is returned.
// If not NULL, page_index will be set to the first page of the block covered by
// the returned chunk.
uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_block(uvm_va_block_t *va_block, uvm_page_index_t *out_page_index);

// Return the next CPU chunk in the block owning chunk.
// previous_page_index is the index after which to start searching. Its value
// will be updated with the starting page index of the next chunk in the block.
uvm_cpu_chunk_t *uvm_cpu_chunk_next(uvm_va_block_t *va_block, uvm_page_index_t *previous_page_index);

#define for_each_cpu_chunk_in_block(chunk, page_index, va_block)                                                     \
    for ((chunk) = uvm_cpu_chunk_first_in_block((va_block), &(page_index));                                          \
         (chunk) != NULL;                                                                                            \
         (page_index) += uvm_cpu_chunk_num_pages(chunk) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)))

#define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block)                               \
    for ((chunk) = uvm_cpu_chunk_first_in_block((va_block), &(page_index)),                                          \
         (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages(chunk) : 0);                          \
         (chunk) != NULL;                                                                                            \
         (page_index) = (next_page_index) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)),              \
         (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages(chunk) : 0))

// Use a special symbol for the region so it does not replace the chunk's region
// structure member.
#define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, __region)                                    \
    for ((page_index) = uvm_va_block_first_page_in_mask((__region), &(va_block)->cpu.allocated),                     \
         (chunk) = uvm_cpu_chunk_get_chunk_for_page((va_block), (page_index));                                       \
         (chunk) != NULL && page_index < (__region).outer;                                                           \
         (page_index) += uvm_cpu_chunk_num_pages(chunk) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)))

#define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, __region)              \
    for ((page_index) = uvm_va_block_first_page_in_mask((__region), &(va_block)->cpu.allocated),                     \
         (chunk) = uvm_cpu_chunk_get_chunk_for_page((va_block), (page_index)),                                       \
         (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);                            \
         (chunk) != NULL && page_index < (__region).outer;                                                           \
         (page_index) = (next_page_index) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)),              \
         (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0))

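// Usage sketch (illustrative only): walking all CPU chunks of a block with the
// iterators above. The _safe variants must be used if chunks may be removed
// from the block while iterating; the loop below only reads, so the plain
// iterator is enough.
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index;
//
//     for_each_cpu_chunk_in_block(chunk, page_index, va_block) {
//         // page_index is the first block page covered by chunk; the chunk
//         // covers uvm_cpu_chunk_num_pages(chunk) pages starting there.
//     }
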
static NV_STATUS uvm_test_get_cpu_chunk_allocation_sizes(UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES_PARAMS *params,
                                                         struct file *filp)
{
    params->alloc_size_mask = (NvU32)uvm_cpu_chunk_get_allocation_sizes();
    return NV_OK;
}
#endif // __UVM_PMM_SYSMEM_H__