open-gpu-kernel-modules/kernel-open/nvidia-uvm/uvm_linux.h

/*******************************************************************************
    Copyright (c) 2013-2021 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

//
// uvm_linux.h
//
// This file, along with conftest.h and uvm_linux.c, helps to insulate
// the (out-of-tree) UVM driver from changes to the upstream Linux kernel.
//
//

#ifndef _UVM_LINUX_H
#define _UVM_LINUX_H

#include "nvtypes.h"

#include "nv-time.h"

#define NV_BUILD_MODULE_INSTANCES 0
#include "nv-linux.h"

#if defined(NV_LINUX_LOG2_H_PRESENT)
#include <linux/log2.h>
#endif
#if defined(NV_PRIO_TREE_PRESENT)
#include <linux/prio_tree.h>
#endif

#include <linux/jhash.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/mm.h>

#if defined(NV_ASM_BARRIER_H_PRESENT)
#include <asm/barrier.h>
#endif

#if defined(NV_LINUX_ATOMIC_H_PRESENT)
#include <linux/atomic.h>
#endif

#include <asm/current.h>

#include <linux/random.h>           /* get_random_bytes()               */
#include <linux/radix-tree.h>       /* Linux kernel radix tree          */

#include <linux/file.h>             /* fget()                           */

#include <linux/percpu.h>

#if defined(NV_LINUX_PRINTK_H_PRESENT)
#include <linux/printk.h>
#endif

#if defined(NV_LINUX_RATELIMIT_H_PRESENT)
#include <linux/ratelimit.h>
#endif

#if defined(NV_PNV_NPU2_INIT_CONTEXT_PRESENT)
#include <asm/powernv.h>
#endif

#if defined(NV_LINUX_SCHED_TASK_STACK_H_PRESENT)
#include <linux/sched/task_stack.h>
#endif

#include <linux/cpumask.h>
#include <linux/topology.h>

#include "nv-kthread-q.h"

    // UVM_THREAD_AFFINITY_SUPPORTED() evaluates to 1 when
    // NV_CPUMASK_OF_NODE_PRESENT is defined (presumably by conftest when the
    // kernel provides cpumask_of_node() -- confirm), and to 0 otherwise.
    #if defined(NV_CPUMASK_OF_NODE_PRESENT)
        #define UVM_THREAD_AFFINITY_SUPPORTED() 1
    #else
        #define UVM_THREAD_AFFINITY_SUPPORTED() 0
    #endif

// The ARM arch lacks support for cpumask_of_node() until kernel 4.7. It was
// added via commit 1a2db300348b ("arm64, numa: Add NUMA support for arm64
// platforms.") Callers should either check UVM_THREAD_AFFINITY_SUPPORTED()
// prior to calling this function or be prepared to deal with a NULL CPU
// mask.
static inline const struct cpumask *uvm_cpumask_of_node(int node)
{
    // Stays NULL when the kernel's cpumask_of_node() is not available, so
    // callers must be prepared for a NULL result.
    const struct cpumask *node_mask = NULL;

#if defined(NV_CPUMASK_OF_NODE_PRESENT)
    node_mask = cpumask_of_node(node);
#endif

    return node_mask;
}

    // UVM_IS_CONFIG_HMM() evaluates to 1 only when the kernel is configured
    // with both CONFIG_HMM_MIRROR and CONFIG_DEVICE_PRIVATE and
    // NV_MIGRATE_DEVICE_RANGE_PRESENT is defined; otherwise it evaluates to 0.
    #if defined(CONFIG_HMM_MIRROR) && defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_DEVICE_RANGE_PRESENT)
        #define UVM_IS_CONFIG_HMM() 1
    #else
        #define UVM_IS_CONFIG_HMM() 0
    #endif

// ATS prefetcher uses hmm_range_fault() to query residency information.
// hmm_range_fault() needs CONFIG_HMM_MIRROR. To detect racing CPU invalidates
// of memory regions while hmm_range_fault() is being called, MMU interval
// notifiers are needed.
    #if defined(CONFIG_HMM_MIRROR) && defined(NV_MMU_INTERVAL_NOTIFIER)
        #define UVM_HMM_RANGE_FAULT_SUPPORTED() 1
    #else
        #define UVM_HMM_RANGE_FAULT_SUPPORTED() 0
    #endif

// Various issues prevent us from using mmu_notifiers in older kernels. These
// include:
//  - ->release being called under RCU instead of SRCU: fixed by commit
//    21a92735f660eaecf69a6f2e777f18463760ec32, v3.7 (2012-10-08).
//  - Race conditions between mmu_notifier_release and mmu_notifier_unregister:
//    fixed by commit d34883d4e35c0a994e91dd847a82b4c9e0c31d83, v3.10
//    (2013-05-24).
//
// Unfortunately these issues aren't conftest-able, so instead we look for the
// presence of the invalidate_range callback in mmu_notifier_ops. This was added
// after all of the above issues were resolved, so we assume the fixes are
// present if we see the callback.
//
// The callback was added in commit 0f0a327fa12cd55de5e7f8c05a70ac3d047f405e,
// v3.19 (2014-11-13) and renamed in commit 1af5a8109904.
    #if defined(NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE) || \
        defined(NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS)
        #define UVM_CAN_USE_MMU_NOTIFIERS() 1
    #else
        #define UVM_CAN_USE_MMU_NOTIFIERS() 0
    #endif

// See bug 1707453 for further details about setting the minimum kernel version.
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0)
#  error This driver does not support kernels older than 4.4!
#endif

//
// printk.h already defined pr_fmt, so we have to redefine it so the pr_*
// routines pick up our version
//
#undef pr_fmt
#define NVIDIA_UVM_PRETTY_PRINTING_PREFIX "nvidia-uvm: "
#define pr_fmt(fmt) NVIDIA_UVM_PRETTY_PRINTING_PREFIX fmt

// Dummy printing function that maintains syntax and format specifier checking
// but doesn't print anything and doesn't evaluate the print parameters. This is
// roughly equivalent to the kernel's no_printk function. We use this instead
// because:
// 1) no_printk was not available until 2.6.36
// 2) Until 4.5 no_printk was implemented as a static function, meaning its
//    parameters were always evaluated
#define UVM_NO_PRINT(fmt, ...)          \
    do {                                \
        if (0)                          \
            printk(fmt, ##__VA_ARGS__); \
    } while (0)

#define NV_UVM_GFP_FLAGS (GFP_KERNEL)

#if defined(NVCPU_X86)
/*
 * 32-bit x86 variant: some old IA32 kernels have no 64/64 division routine,
 * only 64/32 division through do_div().
 *
 * Returns the quotient and stores the remainder in *remainder. Only the low
 * 32 bits of divisor take part in the division.
 */
static inline uint64_t NV_DIV64(uint64_t dividend, uint64_t divisor, uint64_t *remainder)
{
    /* do_div() only accepts a 32-bit divisor, so narrow it explicitly. */
    uint32_t divisor32 = (uint32_t)divisor;

    /* do_div() rewrites dividend with the quotient and yields the remainder. */
    *remainder = do_div(dividend, divisor32);

    return dividend;
}
#else
/*
 * Generic variant: every other 32/64-bit kernel we support (including
 * non-x86 kernels) can do 64/64 division directly.
 *
 * Returns the quotient and stores the remainder in *remainder.
 */
static inline uint64_t NV_DIV64(uint64_t dividend, uint64_t divisor, uint64_t *remainder)
{
    const uint64_t quotient = dividend / divisor;

    *remainder = dividend % divisor;

    return quotient;
}
#endif

/* Return a nanosecond-precise value */
static inline NvU64 NV_GETTIME(void)
{
    struct timespec64 tm;

    ktime_get_raw_ts64(&tm);
    return (NvU64) timespec64_to_ns(&tm);
}

#if !defined(NV_FIND_NEXT_BIT_WRAP_PRESENT)
    // Fallback used when NV_FIND_NEXT_BIT_WRAP_PRESENT is not defined.
    //
    // Searches addr for a set bit starting at offset; if none is found below
    // size, the search wraps around and continues over [0, offset). Returns
    // the index of the bit found, or size when neither search finds one.
    static inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset)
    {
        unsigned long found = find_next_bit(addr, size, offset);

        if (found >= size) {
            // Nothing in [offset, size): wrap and look below offset instead.
            found = find_first_bit(addr, offset);
            if (found >= offset)
                found = size;
        }

        return found;
    }
#endif

// for_each_set_bit_wrap and __for_each_wrap were introduced in v6.1-rc1
// by commit 4fe49b3b97c2640147c46519c2a6fdb06df34f5f
#if !defined(for_each_set_bit_wrap)
// Step helper for for_each_set_bit_wrap(): find the next set bit at or after
// n for an iteration that began at start.
//
// While n is still above start the search covers [n, size). Once that range
// is exhausted, or if n is already at or below start, it covers the range
// below start. Returns size when no further bit is found, which ends the
// iteration.
static inline unsigned long __for_each_wrap(const unsigned long *bitmap,
                                            unsigned long size,
                                            unsigned long start,
                                            unsigned long n)
{
    unsigned long from = n;
    unsigned long hit;

    // Not wrapped around yet: try the upper part first.
    if (from > start) {
        hit = find_next_bit(bitmap, size, from);
        if (hit < size)
            return hit;

        // Upper part exhausted; restart from the bottom of the bitmap.
        from = 0;
    }

    // Search the part below start.
    hit = find_next_bit(bitmap, start, from);
    if (hit >= start)
        hit = size;

    return hit;
}

// Iterate bit over the set bits of addr, beginning the search at start and
// wrapping around. The first bit comes from find_next_bit_wrap(), each
// following one from __for_each_wrap(), and the loop ends once a helper
// returns size. Note that every macro argument is evaluated more than once.
#define for_each_set_bit_wrap(bit, addr, size, start)                   \
    for ((bit) = find_next_bit_wrap((addr), (size), (start));           \
         (bit) < (size);                                                \
         (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))
#endif

// atomic_long_read_acquire and atomic_long_set_release were added in commit
// b5d47ef9ea5c5fe31d7eabeb79f697629bd9e2cb ("locking/atomics: Switch to
// generated atomic-long") in v5.1 (2019-05-05).
// TODO: Bug 3849079: We always use these definitions on newer kernels.
#define atomic_long_read_acquire uvm_atomic_long_read_acquire
// Read *p and then execute smp_mb(), so the barrier sits between the load and
// any memory accesses the caller performs afterwards. Returns the value read.
static inline long uvm_atomic_long_read_acquire(atomic_long_t *p)
{
    const long loaded = atomic_long_read(p);

    // The barrier must come after the load, not before it.
    smp_mb();

    return loaded;
}

#define atomic_long_set_release uvm_atomic_long_set_release
// Execute smp_mb() and then store v into *p, so the barrier sits between the
// caller's earlier memory accesses and the store.
static inline void uvm_atomic_long_set_release(atomic_long_t *p, long v)
{
    // The barrier must come before the store, not after it.
    smp_mb();
    atomic_long_set(p, v);
}

// Initialize tree so that it can be used together with radix_tree_preload().
//
// Declared static inline, like the other helpers in this header, so that
// translation units which include the header without calling the function do
// not each carry an unused static definition.
static inline void uvm_init_radix_tree_preloadable(struct radix_tree_root *tree)
{
    // GFP_NOWAIT, or some combination of flags that avoids setting
    // __GFP_DIRECT_RECLAIM (__GFP_WAIT prior to commit
    // d0164adc89f6bb374d304ffcc375c6d2652fe67d from Nov 2015), is required for
    // using radix_tree_preload() for the tree.
    INIT_RADIX_TREE(tree, GFP_NOWAIT);
}

#if !defined(NV_RADIX_TREE_EMPTY_PRESENT)
// Fallback used when NV_RADIX_TREE_EMPTY_PRESENT is not defined.
//
// Returns true when a gang lookup of at most one entry, starting at index 0,
// finds nothing in tree.
//
// Declared static inline, like the other helpers in this header, so that
// translation units which include the header without calling the function do
// not each carry an unused static definition.
static inline bool radix_tree_empty(struct radix_tree_root *tree)
{
    // Receives the looked-up entry, if any; only the count matters here.
    void *dummy;

    return radix_tree_gang_lookup(tree, &dummy, 0, 1) == 0;
}
#endif

// The radix tree root parameter was added to radix_tree_replace_slot in 4.10.
// That same change moved radix_tree_replace_slot from a header-only
// implementation to a .c file, but the symbol wasn't exported until later so
// we cannot use the function on 4.10. UVM uses this macro to ensure that
// radix_tree_replace_slot is not called when using that kernel.
#ifndef NV_RADIX_TREE_REPLACE_SLOT_PRESENT
    // radix_tree_replace_slot is unusable: any use trips an assertion and the
    // arguments are discarded.
    //
    // The expansion deliberately has no trailing semicolon, matching the
    // variants below. The caller's own ';' terminates the statement, which
    // keeps "if (x) NV_RADIX_TREE_REPLACE_SLOT(...); else ..." well-formed.
    #define NV_RADIX_TREE_REPLACE_SLOT(...) \
        UVM_ASSERT_MSG(false, "radix_tree_replace_slot cannot be used in 4.10\n")
#else
#if (NV_RADIX_TREE_REPLACE_SLOT_ARGUMENT_COUNT == 2)
    // Two-argument kernel API: the root parameter is accepted but unused.
    #define NV_RADIX_TREE_REPLACE_SLOT(root, slot, entry) \
        radix_tree_replace_slot((slot), (entry))
#elif  (NV_RADIX_TREE_REPLACE_SLOT_ARGUMENT_COUNT == 3)
    // Three-argument kernel API: all arguments are forwarded.
    #define NV_RADIX_TREE_REPLACE_SLOT(root, slot, entry) \
        radix_tree_replace_slot((root), (slot), (entry))
#else
#error "Unknown number of arguments"
#endif
#endif

// State handed to uvm_memcg_context_start() and uvm_memcg_context_end().
typedef struct
{
    // NOTE(review): presumably the memory cgroup made active by
    // uvm_memcg_context_start() -- confirm against the implementation.
    struct mem_cgroup *new_memcg;

    // NOTE(review): presumably the memory cgroup that was active beforehand
    // and is restored by uvm_memcg_context_end() -- confirm against the
    // implementation.
    struct mem_cgroup *old_memcg;
} uvm_memcg_context_t;

    // cgroup support requires set_active_memcg(). set_active_memcg() is an
    // inline function that requires int_active_memcg per-cpu symbol when called
    // from interrupt context. int_active_memcg is only exported by commit
    // c74d40e8b5e2a on >= 5.14 kernels.
    #if NV_IS_EXPORT_SYMBOL_PRESENT_int_active_memcg
        #define UVM_CGROUP_ACCOUNTING_SUPPORTED() 1
        #define NV_UVM_GFP_FLAGS_ACCOUNT              (NV_UVM_GFP_FLAGS | __GFP_ACCOUNT)

        // Begin a Cgroup accounting context.
        // All sysmem page allocations done with NV_UVM_GFP_FLAGS_ACCOUNT will be
        // charged to the mm's memory control group.
        //
        // If mm is NULL, the accounting context will not be switched. Please note
        // that in this case, any allocations which include NV_UVM_GFP_FLAGS_ACCOUNT
        // will be charged to the currently active context.
        //
        // Locking: uvm_memcg_context_t does not maintain its own locking. Callers must
        //          ensure that concurrent calls do not operate on the same context.
        void uvm_memcg_context_start(uvm_memcg_context_t *context, struct mm_struct *mm);

        // End the Cgroup accounting context started with uvm_memcg_context_start().
        // After this call, the previously active memory control group will be restored.
        //
        // Locking: Callers must ensure that concurrent calls do not operate on the same
        //          context.
        void uvm_memcg_context_end(uvm_memcg_context_t *context);
    #else // !NV_IS_EXPORT_SYMBOL_PRESENT_int_active_memcg
        #define UVM_CGROUP_ACCOUNTING_SUPPORTED() 0
        #define NV_UVM_GFP_FLAGS_ACCOUNT              (NV_UVM_GFP_FLAGS)

        // Stub for builds without cgroup accounting support: does nothing.
        static inline void uvm_memcg_context_start(uvm_memcg_context_t *context, struct mm_struct *mm)
        {
            (void)context;
            (void)mm;
        }

        // Stub for builds without cgroup accounting support: does nothing.
        static inline void uvm_memcg_context_end(uvm_memcg_context_t *context)
        {
            (void)context;
        }
    #endif // NV_IS_EXPORT_SYMBOL_PRESENT_int_active_memcg

#if defined(NVCPU_X86) || defined(NVCPU_X86_64)
  #include <asm/pgtable.h>
  #include <asm/pgtable_types.h>
#endif

// Added in 57bd1905b228f (acpi, x86/mm: Remove encryption mask from ACPI page
// protection type), v4.13
#if !defined(PAGE_KERNEL_NOENC)
  #define PAGE_KERNEL_NOENC PAGE_KERNEL
#endif

// uvm_pgprot_decrypted is a GPL-aware version of pgprot_decrypted that returns
// the given input when UVM cannot use GPL symbols, or pgprot_decrypted is not
// defined. Otherwise, the function is equivalent to pgprot_decrypted. UVM only
// depends on pgprot_decrypted when the driver is allowed to use GPL symbols:
// both AMD's SEV and Intel's TDX are only supported in conjunction with OpenRM.
//
// It is safe to invoke uvm_pgprot_decrypted in KVM + AMD SEV-SNP guests, even
// if the call is not required, because pgprot_decrypted(PAGE_KERNEL_NOENC) ==
// PAGE_KERNEL_NOENC.
//
// pgprot_decrypted was added by commit 21729f81ce8a ("x86/mm: Provide general
// kernel support for memory encryption") in v4.14 (2017-07-18)
static inline pgprot_t uvm_pgprot_decrypted(pgprot_t prot)
{
#if defined(pgprot_decrypted)
    // The kernel provides pgprot_decrypted as a macro: defer to it.
    return pgprot_decrypted(prot);
#else
    // No pgprot_decrypted macro: hand the protection bits back unchanged.
    return prot;
#endif
}

#endif // _UVM_LINUX_H