// kernel-open/nvidia-uvm/uvm_page_tree_test.c
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_test.h"
#include "uvm_test_ioctl.h"
#include "uvm_gpu.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_tlb_batch.h"
#include "uvm_mmu.h"
#include "uvm_kvmalloc.h"
// MAXWELL_*
#include "cla16f.h"
#include "clb0b5.h"
// PASCAL_*
#include "clb069.h" // MAXWELL_FAULT_BUFFER_A
#include "clc0b5.h"
#include "clc06f.h"
// VOLTA_*
#include "clc369.h" // MMU_FAULT_BUFFER
#include "clc3b5.h"
#include "clc36f.h"
// AMPERE_*
#include "clc56f.h"
#include "clc6b5.h"
// HOPPER_*
#include "clc8b5.h"
#include "clc86f.h"
// ARCHITECTURE_*
#include "ctrl2080mc.h"
#define BIG_PAGE_SIZE_PASCAL (1 << 16)
#define MAX_NUM_PAGE_SIZES (8)
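// CPU-side stand-in for the CE memset: writes the 64-bit pattern directly
// through the CPU mapping of the sysmem destination instead of pushing real
// copy engine methods.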
static void fake_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size)
{
size_t i;
UVM_ASSERT(dst.aperture == UVM_APERTURE_SYS);
for (i = 0; i < size; i += 8)
*(NvU64 *)phys_to_virt(dst.address + i) = value;
}
static void *cpu_addr_from_fake(uvm_gpu_address_t fake_gpu_addr)
{
if (fake_gpu_addr.is_virtual)
return (void*)fake_gpu_addr.address;
UVM_ASSERT(fake_gpu_addr.aperture == UVM_APERTURE_SYS);
return phys_to_virt(fake_gpu_addr.address);
}
static void fake_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size)
{
memcpy(cpu_addr_from_fake(dst), cpu_addr_from_fake(src), size);
}
static void fake_wait_for_idle(uvm_push_t *push)
{
}
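// CPU-side stand-in for a host NOP: just advances the push buffer by size
// bytes (size / 4 NvU32 method words) without emitting anything.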
static void fake_noop(uvm_push_t *push, NvU32 size)
{
push->next += size / 4;
}
static void fake_membar(uvm_push_t *push)
{
}
#define FAKE_TLB_INVALS_COUNT_MAX UVM_TLB_BATCH_MAX_ENTRIES
typedef struct
{
NvU64 base;
NvU64 size;
NvU64 page_size;
NvU32 depth;
uvm_membar_t membar;
} fake_tlb_invalidate_t;
static NvU32 g_fake_invals_count = 0;
static fake_tlb_invalidate_t *g_fake_invals = NULL;
static fake_tlb_invalidate_t *g_last_fake_inval;
static bool g_fake_tlb_invals_tracking_enabled = false;
// Allocate the tracking for TLB invalidates
static NV_STATUS fake_tlb_invals_alloc(void)
{
UVM_ASSERT(!g_fake_invals);
g_fake_invals = (fake_tlb_invalidate_t *)uvm_kvmalloc(sizeof(*g_fake_invals) * FAKE_TLB_INVALS_COUNT_MAX);
if (!g_fake_invals)
return NV_ERR_NO_MEMORY;
return NV_OK;
}
// Free the tracking for TLB invalidates
static void fake_tlb_invals_free(void)
{
uvm_kvfree(g_fake_invals);
g_fake_invals = NULL;
}
static void fake_tlb_invals_reset(void)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
g_fake_invals_count = 0;
}
static void fake_tlb_invals_enable(void)
{
UVM_ASSERT(g_fake_invals);
g_fake_tlb_invals_tracking_enabled = true;
}
static void fake_tlb_invals_disable(void)
{
UVM_ASSERT(g_fake_invals);
fake_tlb_invals_reset();
g_fake_tlb_invals_tracking_enabled = false;
}
// Fake TLB invalidate VA that just saves off the parameters so that they can be
// verified later.
static void fake_tlb_invalidate_va(uvm_push_t *push,
uvm_gpu_phys_address_t pdb,
NvU32 depth,
NvU64 base,
NvU64 size,
NvU64 page_size,
uvm_membar_t membar)
{
if (!g_fake_tlb_invals_tracking_enabled)
return;
++g_fake_invals_count;
if (g_fake_invals_count == FAKE_TLB_INVALS_COUNT_MAX + 1) {
// Assert on the first overflow
UVM_ASSERT(0);
}
if (g_fake_invals_count > FAKE_TLB_INVALS_COUNT_MAX)
return;
g_last_fake_inval = &g_fake_invals[g_fake_invals_count - 1];
g_last_fake_inval->base = base;
g_last_fake_inval->size = size;
g_last_fake_inval->page_size = page_size;
g_last_fake_inval->depth = depth;
g_last_fake_inval->membar = membar;
}
static void fake_tlb_invalidate_all(uvm_push_t *push, uvm_gpu_phys_address_t pdb, NvU32 depth, uvm_membar_t membar)
{
fake_tlb_invalidate_va(push, pdb, depth, 0, -1, 0, membar);
}
static bool assert_no_invalidate(void)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count != 0) {
UVM_TEST_PRINT("Expected no invalidates, but got %u instead\n", g_fake_invals_count);
return false;
}
return true;
}
static bool assert_and_reset_last_invalidate(NvU32 expected_depth, bool expected_membar)
{
bool result = true;
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate, but got none\n");
return false;
}
if (g_fake_invals_count > FAKE_TLB_INVALS_COUNT_MAX) {
UVM_TEST_PRINT("Too many invalidates %u\n", g_fake_invals_count);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
UVM_TEST_PRINT("Expected depth %u, got %u instead\n", expected_depth, g_last_fake_inval->depth);
result = false;
}
if ((g_last_fake_inval->membar == UVM_MEMBAR_NONE) == expected_membar) {
UVM_TEST_PRINT("Expected %s membar, got %s instead\n",
expected_membar ? "a" : "no",
uvm_membar_string(g_last_fake_inval->membar));
result = false;
}
fake_tlb_invals_reset();
return result;
}
static bool assert_last_invalidate_all(NvU32 expected_depth, bool expected_membar)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count != 1) {
UVM_TEST_PRINT("Expected a single invalidate, but got %u instead\n", g_fake_invals_count);
return false;
}
if (g_last_fake_inval->base != 0 || g_last_fake_inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate all but got range [0x%llx, 0x%llx) instead\n",
g_last_fake_inval->base,
g_last_fake_inval->base + g_last_fake_inval->size);
return false;
}
if (g_last_fake_inval->depth != expected_depth) {
UVM_TEST_PRINT("Expected depth %u, got %u instead\n", expected_depth, g_last_fake_inval->depth);
return false;
}
return true;
}
static bool assert_invalidate_range_specific(fake_tlb_invalidate_t *inval,
NvU64 base,
NvU64 size,
NvU64 page_size,
NvU32 expected_depth,
bool expected_membar)
{
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n", base, base + size);
return false;
}
if ((inval->base != base || inval->size != size) && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected invalidate range [0x%llx, 0x%llx), but got range [0x%llx, 0x%llx) instead\n",
base,
base + size,
inval->base,
inval->base + inval->size);
return false;
}
if (inval->depth != expected_depth) {
UVM_TEST_PRINT("Expected depth %u, got %u instead\n", expected_depth, inval->depth);
return false;
}
if (inval->page_size != page_size && inval->base != 0 && inval->size != -1) {
UVM_TEST_PRINT("Expected page size %llu, got %llu instead\n", page_size, inval->page_size);
return false;
}
return true;
}
static bool assert_invalidate_range(NvU64 base,
NvU64 size,
NvU64 page_size,
bool allow_inval_all,
NvU32 range_depth,
NvU32 all_depth,
bool expected_membar)
{
NvU32 i;
UVM_ASSERT(g_fake_tlb_invals_tracking_enabled);
if (g_fake_invals_count == 0) {
UVM_TEST_PRINT("Expected an invalidate for range [0x%llx, 0x%llx), but got none\n",
base, base + size);
return false;
}
for (i = 0; i < g_fake_invals_count; ++i) {
fake_tlb_invalidate_t *inval = &g_fake_invals[i];
if (inval->base == base && inval->size == size)
return assert_invalidate_range_specific(inval, base, size, page_size, range_depth, expected_membar);
}
if (g_fake_invals_count == 1 && allow_inval_all)
return assert_last_invalidate_all(all_depth, expected_membar);
UVM_TEST_PRINT("Couldn't find an invalidate for range [0x%llx, 0x%llx) in:\n", base, base + size);
for (i = 0; i < g_fake_invals_count; ++i) {
fake_tlb_invalidate_t *inval = &g_fake_invals[i];
UVM_TEST_PRINT(" range %d [0x%llx, 0x%llx)\n", i, inval->base, inval->base + inval->size);
}
return false;
}
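// Thin wrappers around uvm_page_tree_init() that place all page table
// allocations in sysmem (UVM_APERTURE_SYS) so the CPU-side fakes in this file
// can access them directly.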
static NV_STATUS test_page_tree_init(uvm_gpu_t *gpu, NvU32 big_page_size, uvm_page_tree_t *tree)
{
return uvm_page_tree_init(gpu, NULL, UVM_PAGE_TREE_TYPE_USER, big_page_size, UVM_APERTURE_SYS, tree);
}
static NV_STATUS test_page_tree_init_kernel(uvm_gpu_t *gpu, NvU32 big_page_size, uvm_page_tree_t *tree)
{
return uvm_page_tree_init(gpu, NULL, UVM_PAGE_TREE_TYPE_KERNEL, big_page_size, UVM_APERTURE_SYS, tree);
}
static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range)
{
uvm_mmu_mode_hal_t *hal = tree->gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
    // Maxwell GPUs don't use canonical form addresses, even on platforms that
    // support them.
start = (tree->type == UVM_PAGE_TREE_TYPE_USER) && (hal->num_va_bits() > 40) ?
uvm_parent_gpu_canonical_address(tree->gpu->parent, start) :
start;
return uvm_page_tree_get_ptes(tree, page_size, start, size, UVM_PMM_ALLOC_FLAGS_NONE, range);
}
static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
NvU64 page_size,
NvU64 start,
uvm_page_table_range_t *single)
{
uvm_mmu_mode_hal_t *hal = tree->gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// See comment above (test_page_tree_get_ptes)
start = (tree->type == UVM_PAGE_TREE_TYPE_USER) && (hal->num_va_bits() > 40) ?
uvm_parent_gpu_canonical_address(tree->gpu->parent, start) :
start;
return uvm_page_tree_get_entry(tree, page_size, start, UVM_PMM_ALLOC_FLAGS_NONE, single);
}
static NV_STATUS test_page_tree_alloc_table(uvm_page_tree_t *tree,
NvU64 page_size,
uvm_page_table_range_t *single,
uvm_page_table_range_t *children)
{
return uvm_page_tree_alloc_table(tree, page_size, UVM_PMM_ALLOC_FLAGS_NONE, single, children);
}
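// Get and put a single entry and check that neither operation issues a TLB
// invalidate.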
static bool assert_entry_no_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start)
{
uvm_page_table_range_t entry;
bool result = true;
if (test_page_tree_get_entry(tree, page_size, start, &entry) != NV_OK)
return false;
if (!assert_no_invalidate())
result = false;
uvm_page_tree_put_ptes(tree, &entry);
return assert_no_invalidate() && result;
}
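// Get and put a single entry and check that both operations invalidate at the
// expected depth. The get must not use a membar; the put uses one only if
// membar is true.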
static bool assert_entry_invalidate(uvm_page_tree_t *tree, NvU64 page_size, NvU64 start, NvU32 depth, bool membar)
{
uvm_page_table_range_t entry;
bool result = true;
if (test_page_tree_get_entry(tree, page_size, start, &entry) != NV_OK)
return false;
if (!assert_and_reset_last_invalidate(depth, false))
result = false;
uvm_page_tree_put_ptes(tree, &entry);
return assert_and_reset_last_invalidate(depth, membar) && result;
}
static NV_STATUS allocate_root(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 64 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 4);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_64K);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->entries[0]->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_64k_memory_57b_va(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 64 * 1024;
// We use a kernel-type page tree to decouple the test from the CPU VA width
// and canonical form address limits.
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0x100000000000000ULL, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 5);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_64K);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[1]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[1]->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[1]->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[1]->entries[0]->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[1]->entries[0]->entries[0]->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[1]->entries[0]->entries[0]->entries[0]->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_adjacent_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range1;
uvm_page_table_range_t range2;
NvLength size = 64 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range1), NV_OK);
TEST_CHECK_RET(range1.entry_count == 1);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0, size, &range2), NV_OK);
TEST_CHECK_RET(range2.entry_count == 1);
TEST_CHECK_RET(range1.table == range2.table);
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range1.start_index == 1);
TEST_CHECK_RET(range2.start_index == 0);
uvm_page_tree_put_ptes(&tree, &range1);
uvm_page_tree_put_ptes(&tree, &range2);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_adjacent_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
uvm_page_table_range_t next_range;
NvLength size = 64 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 2 * 1024 * 1024, size, &next_range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(next_range.table == tree.root->entries[0]->entries[0]->entries[0]->entries[2]);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_put_ptes(&tree, &next_range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_nearby_pde_64k_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
uvm_page_table_range_t next_range;
NvLength size = 64 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 6 * 1024 * 1024, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 2 * 1024 * 1024, size, &next_range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[0]->entries[0]->entries[0]->entries[6]);
TEST_CHECK_RET(next_range.table == tree.root->entries[0]->entries[0]->entries[0]->entries[2]);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_put_ptes(&tree, &next_range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
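// Allocate 16 64K ranges spaced 2MB apart starting at 512MB so they all fall
// under the same depth-3 directory, verify its ref count reaches 16, then free
// them all.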
static NV_STATUS allocate_then_free_all_16_64k(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range[16];
NvLength size = 64 * 1024;
NvLength stride = 32 * size;
NvLength start = stride * 256;
int i;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
for (i = 0; i < 16; i++)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, start + i * stride, size, range + i), NV_OK);
TEST_CHECK_RET(tree.root->entries[0]->entries[0]->entries[1]->ref_count == 16);
for (i = 0; i < 16; i++)
uvm_page_tree_put_ptes(&tree, range + i);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
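// Same as above, but the 16 ranges straddle a 512MB boundary high up in the
// kernel VA space, so they split 8/8 across two adjacent depth-3 directories.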
static NV_STATUS allocate_then_free_8_8_64k(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range[16];
NvLength size = 64 * 1024;
NvLength stride = 32 * size;
NvLength start = (248 * stride) + (256 * UVM_SIZE_1GB) + (128 * UVM_SIZE_1TB);
int i;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
for (i = 0; i < 16; i++)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, start + i * stride, size, range + i), NV_OK);
TEST_CHECK_RET(tree.root->entries[1]->entries[1]->entries[0]->ref_count == 8);
TEST_CHECK_RET(tree.root->entries[1]->entries[1]->entries[1]->ref_count == 8);
for (i = 0; i < 16; i++)
uvm_page_tree_put_ptes(&tree, range + i);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_single_page_2m(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
// use a start address not at the beginning of a PDE3 entry's range
NvU64 start = 34983UL * (1 << 21);
NvLength size = 1 << 21;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 3);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_2M);
uvm_page_tree_put_ptes(&tree, &range);
TEST_CHECK_RET(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_512m_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 512UL * 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_512M);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->entries[0]->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[0]->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_adjacent_512m_memory(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range1;
uvm_page_table_range_t range2;
NvLength size = 512UL * 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, size, size, &range1), NV_OK);
TEST_CHECK_RET(range1.entry_count == 1);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, 0, size, &range2), NV_OK);
TEST_CHECK_RET(range2.entry_count == 1);
TEST_CHECK_RET(range1.table == range2.table);
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]);
TEST_CHECK_RET(range1.start_index == 1);
TEST_CHECK_RET(range2.start_index == 0);
uvm_page_tree_put_ptes(&tree, &range1);
uvm_page_tree_put_ptes(&tree, &range2);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_single_page_512m(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
// use a start address not at the beginning of a PDE2 entry's range
NvU64 start = 3UL * 512 * 1024 * 1024;
NvLength size = 512UL * 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_512M);
uvm_page_tree_put_ptes(&tree, &range);
TEST_CHECK_RET(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_entire_table_4k(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = 1UL << 47;
NvLength size = 2 * UVM_SIZE_1MB;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, start, size, &range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[1]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range.entry_count == 512);
TEST_CHECK_RET(range.table->depth == 4);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_4K);
TEST_CHECK_RET(tree.root->ref_count == 1);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_entire_table_512m(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = 1UL << 48;
NvLength size = 512UL * UVM_PAGE_SIZE_512M;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[2]->entries[0]);
TEST_CHECK_RET(range.entry_count == 512);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_512M);
TEST_CHECK_RET(tree.root->ref_count == 1);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
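// Map a 2M PTE (plus an adjacent one to keep the directories alive), release
// it, then re-cover the same 2MB with 4K PTEs for the first 64KB and 64K PTEs
// for the rest, checking the resulting dual-PDE table layout.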
static NV_STATUS split_4k_from_2m(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range_2m;
uvm_page_table_range_t range_adj;
uvm_page_table_range_t range_4k;
uvm_page_table_range_t range_64k;
NvU64 start = 1UL << 48;
NvLength size = 2 * UVM_SIZE_1MB;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start, size, &range_2m), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start + size, size, &range_adj), NV_OK);
TEST_CHECK_RET(range_2m.entry_count == 1);
TEST_CHECK_RET(range_2m.table->depth == 3);
TEST_CHECK_RET(range_adj.entry_count == 1);
TEST_CHECK_RET(range_adj.table->depth == 3);
// Need to release the 2 MB page so that the reference count is right.
uvm_page_tree_put_ptes(&tree, &range_2m);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, start, 64 * 1024, &range_4k), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree,
UVM_PAGE_SIZE_64K,
start + 64 * 1024,
size - 64 * 1024,
&range_64k),
NV_OK);
TEST_CHECK_RET(range_4k.entry_count == 16);
TEST_CHECK_RET(range_4k.table->depth == 4);
TEST_CHECK_RET(range_4k.table == tree.root->entries[2]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range_4k.start_index == 0);
TEST_CHECK_RET(range_64k.entry_count == 31);
TEST_CHECK_RET(range_64k.table == tree.root->entries[2]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range_64k.start_index == 1);
// Free everything
uvm_page_tree_put_ptes(&tree, &range_adj);
uvm_page_tree_put_ptes(&tree, &range_4k);
uvm_page_tree_put_ptes(&tree, &range_64k);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS split_2m_from_512m(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range_512m;
uvm_page_table_range_t range_adj;
uvm_page_table_range_t range_2m;
NvU64 start = 1UL << 48;
NvLength size = UVM_PAGE_SIZE_512M;
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range_512m), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start + size, size, &range_adj), NV_OK);
TEST_CHECK_RET(range_512m.entry_count == 1);
TEST_CHECK_RET(range_512m.table->depth == 2);
TEST_CHECK_RET(range_adj.entry_count == 1);
TEST_CHECK_RET(range_adj.table->depth == 2);
// Need to release the 512M page so that the reference count is right.
uvm_page_tree_put_ptes(&tree, &range_512m);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start, size, &range_2m), NV_OK);
TEST_CHECK_RET(range_2m.entry_count == 256);
TEST_CHECK_RET(range_2m.table->depth == 3);
TEST_CHECK_RET(range_2m.table == tree.root->entries[2]->entries[0]->entries[0]);
TEST_CHECK_RET(range_2m.start_index == 0);
// Free everything
uvm_page_tree_put_ptes(&tree, &range_adj);
uvm_page_tree_put_ptes(&tree, &range_2m);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_512mb_range(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = 512 * (1 << 20);
NvU64 size = start;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 256);
TEST_CHECK_RET(range.table->depth == 3);
TEST_CHECK_RET(range.start_index == 0);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS get_2gb_range(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = 2 * UVM_SIZE_1GB;
NvU64 size = start;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 4);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.start_index == 4);
uvm_page_tree_put_ptes(&tree, &range);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
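// Allocate two adjacent 1MB ranges of 4K PTEs that share a single leaf table;
// its ref count must reach 512 and drop back to 256 when the first range is
// put.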
static NV_STATUS get_two_free_apart(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range1;
uvm_page_table_range_t range2;
NvLength size = 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, size, size, &range1), NV_OK);
TEST_CHECK_RET(range1.entry_count == 256);
TEST_CHECK_RET(range1.table->ref_count == 256);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, 0, size, &range2), NV_OK);
TEST_CHECK_RET(range2.entry_count == 256);
TEST_CHECK_RET(range2.table->ref_count == 512);
TEST_CHECK_RET(range1.table == range2.table);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range1.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
TEST_CHECK_RET(range1.start_index == 256);
TEST_CHECK_RET(range2.start_index == 0);
uvm_page_tree_put_ptes(&tree, &range1);
TEST_CHECK_RET(range2.table->ref_count == 256);
TEST_CHECK_RET(range2.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
uvm_page_tree_put_ptes(&tree, &range2);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
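// Map the same 1MB VA range with both 4K and 64K PTEs; the two leaf tables
// must hang off the same dual PDE with independent ref counts.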
static NV_STATUS get_overlapping_dual_pdes(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range4k;
uvm_page_table_range_t range64k;
NvLength size = 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, size, size, &range4k), NV_OK);
TEST_CHECK_RET(range4k.entry_count == 256);
TEST_CHECK_RET(range4k.table->ref_count == 256);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, size, size, &range64k), NV_OK);
TEST_CHECK_RET(range64k.entry_count == 16);
TEST_CHECK_RET(range64k.table->ref_count == 16);
// 4k page is second entry in a dual PDE
TEST_CHECK_RET(range64k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[0]);
TEST_CHECK_RET(range64k.start_index == 16);
TEST_CHECK_RET(range4k.start_index == 256);
uvm_page_tree_put_ptes(&tree, &range64k);
TEST_CHECK_RET(range4k.table->ref_count == 256);
TEST_CHECK_RET(range4k.table == tree.root->entries[0]->entries[0]->entries[0]->entries[1]);
uvm_page_tree_put_ptes(&tree, &range4k);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
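// Get a 45-entry 2M range, then release it in nine pieces of increasing size
// by patching entry_count/start_index, making sure the root ref count drops
// back to zero.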
static NV_STATUS split_and_free(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
// 45 = 1 + 2 + 3 + ... + 9
NvU64 size = 45 * (2 << 20);
NvU32 i;
NvU32 sum = 0;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 45);
TEST_CHECK_RET(range.table->depth == 3);
TEST_CHECK_RET(range.start_index == 0);
for (i = 1; i <= 9; i++) {
range.entry_count = i;
range.start_index = sum;
uvm_page_tree_put_ptes(&tree, &range);
sum += i;
}
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS check_sizes(uvm_gpu_t *gpu)
{
NvU64 user_sizes = UVM_PAGE_SIZE_2M;
NvU64 kernel_sizes = UVM_PAGE_SIZE_4K | 256;
if (UVM_PAGE_SIZE_64K >= PAGE_SIZE)
user_sizes |= UVM_PAGE_SIZE_64K;
if (UVM_PAGE_SIZE_4K >= PAGE_SIZE)
user_sizes |= UVM_PAGE_SIZE_4K;
TEST_CHECK_RET(gpu->parent->mmu_user_chunk_sizes == user_sizes);
TEST_CHECK_RET(gpu->parent->mmu_kernel_chunk_sizes == kernel_sizes);
return NV_OK;
}
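// Get a 2M-level entry (a PDE), then attach 4K and 64K child tables beneath it
// with uvm_page_tree_alloc_table(), checking parent ref counts and entry
// pointers as the pieces are released.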
static NV_STATUS fast_split_normal(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t parent;
uvm_page_table_range_t child_4k;
uvm_page_table_range_t child_64k;
NvU64 start = 0;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_2M, start, &parent), NV_OK);
TEST_CHECK_RET(parent.entry_count == 1);
TEST_CHECK_RET(parent.table->depth == 3);
TEST_CHECK_RET(parent.page_size == UVM_PAGE_SIZE_2M);
MEM_NV_CHECK_RET(test_page_tree_alloc_table(&tree, UVM_PAGE_SIZE_4K, &parent, &child_4k), NV_OK);
TEST_CHECK_RET(child_4k.table->host_parent == parent.table);
TEST_CHECK_RET(child_4k.entry_count == 512);
TEST_CHECK_RET(child_4k.page_size == UVM_PAGE_SIZE_4K);
TEST_CHECK_RET(parent.table->ref_count == 2);
TEST_CHECK_RET(parent.table->entries[1] == child_4k.table);
MEM_NV_CHECK_RET(test_page_tree_alloc_table(&tree, UVM_PAGE_SIZE_64K, &parent, &child_64k), NV_OK);
TEST_CHECK_RET(child_64k.table->host_parent == parent.table);
TEST_CHECK_RET(child_64k.entry_count == 32);
TEST_CHECK_RET(child_64k.page_size == UVM_PAGE_SIZE_64K);
TEST_CHECK_RET(parent.table->ref_count == 3);
TEST_CHECK_RET(parent.table->entries[0] == child_64k.table);
uvm_page_tree_put_ptes(&tree, &parent);
TEST_CHECK_RET(parent.table->ref_count == 2);
uvm_page_tree_put_ptes(&tree, &child_4k);
TEST_CHECK_RET(parent.table->entries[1] == NULL);
uvm_page_tree_put_ptes(&tree, &child_64k);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
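// Like fast_split_normal, but the 64K child table is allocated twice; the
// second call must back off to the existing table and bump its ref count
// instead of installing a new one.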
static NV_STATUS fast_split_double_backoff(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t parent;
uvm_page_table_range_t child_4k;
uvm_page_table_range_t child_64k;
uvm_page_table_range_t child_64k2;
NvU64 start = 0;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_2M, start, &parent), NV_OK);
TEST_CHECK_RET(parent.entry_count == 1);
TEST_CHECK_RET(parent.table->depth == 3);
TEST_CHECK_RET(parent.page_size == UVM_PAGE_SIZE_2M);
MEM_NV_CHECK_RET(test_page_tree_alloc_table(&tree, UVM_PAGE_SIZE_4K, &parent, &child_4k), NV_OK);
TEST_CHECK_RET(child_4k.table->host_parent == parent.table);
TEST_CHECK_RET(child_4k.entry_count == 512);
TEST_CHECK_RET(child_4k.page_size == UVM_PAGE_SIZE_4K);
TEST_CHECK_RET(parent.table->ref_count == 2);
TEST_CHECK_RET(parent.table->entries[1] == child_4k.table);
MEM_NV_CHECK_RET(test_page_tree_alloc_table(&tree, UVM_PAGE_SIZE_64K, &parent, &child_64k), NV_OK);
TEST_CHECK_RET(child_64k.table->host_parent == parent.table);
TEST_CHECK_RET(child_64k.entry_count == 32);
TEST_CHECK_RET(child_64k.page_size == UVM_PAGE_SIZE_64K);
TEST_CHECK_RET(parent.table->ref_count == 3);
TEST_CHECK_RET(parent.table->entries[0] == child_64k.table);
MEM_NV_CHECK_RET(test_page_tree_alloc_table(&tree, UVM_PAGE_SIZE_64K, &parent, &child_64k2), NV_OK);
TEST_CHECK_RET(child_64k2.table->host_parent == parent.table);
TEST_CHECK_RET(child_64k2.entry_count == 32);
TEST_CHECK_RET(child_64k2.table->ref_count == 64);
TEST_CHECK_RET(child_64k2.page_size == UVM_PAGE_SIZE_64K);
TEST_CHECK_RET(child_64k2.table == child_64k.table);
TEST_CHECK_RET(parent.table->ref_count == 3);
TEST_CHECK_RET(parent.table->entries[0] == child_64k2.table);
uvm_page_tree_put_ptes(&tree, &child_64k2);
uvm_page_tree_put_ptes(&tree, &parent);
TEST_CHECK_RET(parent.table->ref_count == 2);
uvm_page_tree_put_ptes(&tree, &child_4k);
TEST_CHECK_RET(parent.table->entries[1] == NULL);
uvm_page_tree_put_ptes(&tree, &child_64k);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
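// Check that getting and putting entries at various depths issues TLB
// invalidates at the expected depth: allocating new directories invalidates
// without a membar, while puts that free directories use one.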
static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
uvm_page_table_range_t entries[5];
int i;
// Depth 4
NvU64 extent_pte = UVM_PAGE_SIZE_2M;
// Depth 3
NvU64 extent_pde0 = extent_pte * (1ull << 8);
// Depth 2
NvU64 extent_pde1 = extent_pde0 * (1ull << 9);
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
fake_tlb_invals_enable();
TEST_CHECK_RET(assert_entry_invalidate(&tree, UVM_PAGE_SIZE_4K, 0, 0, true));
TEST_CHECK_RET(assert_entry_invalidate(&tree, UVM_PAGE_SIZE_4K, 0, 0, true));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, 0, &entries[0]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(0, false));
TEST_CHECK_RET(assert_entry_no_invalidate(&tree, UVM_PAGE_SIZE_4K, extent_pte - UVM_PAGE_SIZE_4K));
TEST_CHECK_RET(assert_entry_invalidate(&tree, UVM_PAGE_SIZE_64K, 0, 3, true));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_64K, 0, &entries[1]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(3, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde0, &entries[2]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(2, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde1, &entries[3]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(1, false));
TEST_CHECK_RET(test_page_tree_get_entry(&tree, UVM_PAGE_SIZE_4K, extent_pde2, &entries[4]) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(0, false));
for (i = 4; i > 1; --i) {
uvm_page_tree_put_ptes(&tree, &entries[i]);
TEST_CHECK_RET(assert_and_reset_last_invalidate(4 - i, true));
}
uvm_page_tree_put_ptes(&tree, &entries[0]);
TEST_CHECK_RET(assert_and_reset_last_invalidate(3, true));
uvm_page_tree_put_ptes(&tree, &entries[1]);
TEST_CHECK_RET(assert_and_reset_last_invalidate(0, true));
fake_tlb_invals_disable();
uvm_page_tree_deinit(&tree);
return status;
}
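// Batch i invalidates over ranges of alternating min/max page size, end the
// batch, then verify that each range was invalidated either individually or
// via an invalidate-all when per-VA invalidates can't cover the batch.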
static NV_STATUS test_tlb_batch_invalidates_case(uvm_page_tree_t *tree,
                                                 NvU64 base,
                                                 NvU64 size,
                                                 NvU64 min_page_size,
                                                 NvU64 max_page_size)
{
NV_STATUS status = NV_OK;
uvm_push_t push;
uvm_tlb_batch_t batch;
uvm_gpu_t *gpu = tree->gpu;
int i, j;
MEM_NV_CHECK_RET(uvm_push_begin_fake(gpu, &push), NV_OK);
for (i = 1; i < 10; ++i) {
// If invalidate all ends up being used, the expected depth is the
// minimum depth across all the ranges. Start off with the min page size
// as that's the deepest.
NvU32 expected_inval_all_depth = tree->hal->page_table_depth(min_page_size);
NvU64 total_pages = 0;
fake_tlb_invals_enable();
uvm_tlb_batch_begin(tree, &batch);
for (j = 0; j < i; ++j) {
            NvU64 used_max_page_size = (j & 1) ? max_page_size : min_page_size;
NvU32 expected_range_depth = tree->hal->page_table_depth(used_max_page_size);
expected_inval_all_depth = min(expected_inval_all_depth, expected_range_depth);
uvm_tlb_batch_invalidate(&batch,
base + (NvU64)j * 2 * size,
size,
min_page_size | used_max_page_size,
UVM_MEMBAR_NONE);
total_pages += size / min_page_size;
}
uvm_tlb_batch_end(&batch, &push, UVM_MEMBAR_NONE);
for (j = 0; j < i; ++j) {
            NvU64 used_max_page_size = (j & 1) ? max_page_size : min_page_size;
NvU32 expected_range_depth = tree->hal->page_table_depth(used_max_page_size);
bool allow_inval_all = (total_pages > gpu->parent->tlb_batch.max_pages) ||
!gpu->parent->tlb_batch.va_invalidate_supported ||
(i > UVM_TLB_BATCH_MAX_ENTRIES);
TEST_CHECK_RET(assert_invalidate_range(base + (NvU64)j * 2 * size,
size,
min_page_size,
allow_inval_all,
expected_range_depth,
expected_inval_all_depth,
false));
}
fake_tlb_invals_disable();
}
uvm_push_end_fake(&push);
return status;
}
static NV_STATUS test_tlb_batch_invalidates(uvm_gpu_t *gpu, const NvU64 *page_sizes, const NvU32 page_sizes_count)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
NvU32 min_index;
NvU32 max_index;
NvU32 size_index;
static const NvU32 sizes_in_max_pages[] = { 1, 2, 3, 5, 7, 32 };
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
for (min_index = 0; min_index < page_sizes_count; ++min_index) {
for (max_index = min_index; max_index < page_sizes_count; ++max_index) {
for (size_index = 0; size_index < ARRAY_SIZE(sizes_in_max_pages); ++size_index) {
NvU64 min_page_size = page_sizes[min_index];
NvU64 max_page_size = page_sizes[max_index];
NvU64 size = (NvU64)sizes_in_max_pages[size_index] * max_page_size;
TEST_CHECK_GOTO(test_tlb_batch_invalidates_case(&tree,
(NvU64)min_index * max_page_size,
size,
min_page_size,
max_page_size) == NV_OK, done);
}
}
}
done:
uvm_page_tree_deinit(&tree);
return status;
}
typedef struct
{
NvU64 count;
NV_STATUS status;
} test_pte_maker_data_t;
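// PTE maker callback for uvm_page_table_range_vec_write_ptes(): checks that
// offsets arrive in order, one page at a time, and returns a synthetic PTE
// value derived from the range vector's size and the offset.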
static NvU64 test_range_vec_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *void_data)
{
test_pte_maker_data_t *data = (test_pte_maker_data_t *)void_data;
if (range_vec->page_size * data->count != offset) {
data->status = NV_ERR_INVALID_STATE;
}
++data->count;
return range_vec->size + offset;
}
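// Read back every PTE in the range vector through its CPU mapping and check
// that it holds either the synthetic value written by the PTE maker or zero
// when the PTEs are expected to be cleared.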
static bool assert_range_vec_ptes(uvm_page_table_range_vec_t *range_vec, bool expecting_cleared)
{
NvU32 i;
NvU32 entry;
NvU64 offset = 0;
for (i = 0; i < range_vec->range_count; ++i) {
uvm_page_table_range_t *range = &range_vec->ranges[i];
for (entry = 0; entry < range->entry_count; ++entry) {
uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(range_vec->tree, range, entry);
NvU64 *pte = (NvU64*)phys_to_virt(pte_addr.address);
NvU64 expected_pte = expecting_cleared ? 0 : range_vec->size + offset;
if (*pte != expected_pte) {
UVM_TEST_PRINT("PTE is 0x%llx instead of 0x%llx for offset 0x%llx within range [0x%llx, 0x%llx)\n",
*pte,
expected_pte,
offset,
range_vec->start,
                               range_vec->start + range_vec->size);
return false;
}
offset += range_vec->page_size;
}
}
return true;
}
static NV_STATUS test_range_vec_write_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t membar)
{
test_pte_maker_data_t data = { 0 };
NvU32 page_table_depth = range_vec->tree->hal->page_table_depth(range_vec->page_size);
fake_tlb_invals_enable();
TEST_CHECK_RET(uvm_page_table_range_vec_write_ptes(range_vec, membar, test_range_vec_pte_maker, &data) == NV_OK);
TEST_CHECK_RET(data.status == NV_OK);
TEST_CHECK_RET(data.count == range_vec->size / range_vec->page_size);
TEST_CHECK_RET(assert_invalidate_range_specific(g_last_fake_inval,
range_vec->start,
range_vec->size,
range_vec->page_size,
page_table_depth,
membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, false));
fake_tlb_invals_disable();
return NV_OK;
}
static NV_STATUS test_range_vec_clear_ptes(uvm_page_table_range_vec_t *range_vec, uvm_membar_t membar)
{
NvU32 page_table_depth = range_vec->tree->hal->page_table_depth(range_vec->page_size);
fake_tlb_invals_enable();
TEST_CHECK_RET(uvm_page_table_range_vec_clear_ptes(range_vec, membar) == NV_OK);
TEST_CHECK_RET(assert_and_reset_last_invalidate(page_table_depth, membar != UVM_MEMBAR_NONE));
TEST_CHECK_RET(assert_range_vec_ptes(range_vec, true));
fake_tlb_invals_disable();
return NV_OK;
}
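// Create a range vector over [start, start + size) and run write/clear PTE
// round trips with different membar types before handing it back to the
// caller.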
static NV_STATUS test_range_vec_create(uvm_page_tree_t *tree,
NvU64 start,
NvU64 size,
NvU64 page_size,
uvm_page_table_range_vec_t **range_vec_out)
{
uvm_page_table_range_vec_t *range_vec;
uvm_pmm_alloc_flags_t pmm_flags = UVM_PMM_ALLOC_FLAGS_EVICT;
TEST_CHECK_RET(uvm_page_table_range_vec_create(tree, start, size, page_size, pmm_flags, &range_vec) == NV_OK);
TEST_CHECK_RET(test_range_vec_write_ptes(range_vec, UVM_MEMBAR_NONE) == NV_OK);
TEST_CHECK_RET(test_range_vec_clear_ptes(range_vec, UVM_MEMBAR_GPU) == NV_OK);
TEST_CHECK_RET(test_range_vec_write_ptes(range_vec, UVM_MEMBAR_NONE) == NV_OK);
TEST_CHECK_RET(test_range_vec_write_ptes(range_vec, UVM_MEMBAR_SYS) == NV_OK);
TEST_CHECK_RET(test_range_vec_clear_ptes(range_vec, UVM_MEMBAR_SYS) == NV_OK);
*range_vec_out = range_vec;
return NV_OK;
}
// Test page table range vector APIs.
// Notably the test leaks the page_tree and range_vec on error as it's hard to
// clean up on failure and the destructors would likely assert.
static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU64 page_size)
{
NV_STATUS status = NV_OK;
uvm_page_tree_t tree;
uvm_page_table_range_vec_t *range_vec;
uvm_page_table_range_vec_t upper_range_vec;
NvU64 pde_coverage;
NvU64 page_table_entries;
NvU64 start;
NvU64 size;
NvU32 i;
NvU64 offsets[4];
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, big_page_size, &tree), NV_OK);
pde_coverage = uvm_mmu_pde_coverage(&tree, page_size);
page_table_entries = pde_coverage / page_size;
// Interesting page offsets
offsets[0] = 0;
offsets[1] = 1;
offsets[2] = page_table_entries / 2;
offsets[3] = page_table_entries - 1;
// A single page
size = page_size;
for (i = 0; i < ARRAY_SIZE(offsets); ++i) {
NvU64 offset = offsets[i];
start = offset * page_size;
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 1);
TEST_CHECK_RET(range_vec->ranges[0].start_index == offset);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == 1);
uvm_page_table_range_vec_destroy(range_vec);
}
// A full page table extent offset by a non-zero multiple of page_size
size = pde_coverage;
for (i = 1; i < ARRAY_SIZE(offsets); ++i) {
NvU64 offset = offsets[i];
start = pde_coverage + offset * page_size;
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 2);
TEST_CHECK_RET(range_vec->ranges[0].start_index == offset);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == page_table_entries - offset);
TEST_CHECK_RET(range_vec->ranges[1].start_index == 0);
TEST_CHECK_RET(range_vec->ranges[1].entry_count == offset);
uvm_page_table_range_vec_destroy(range_vec);
}
// One page on each side of the page table extent boundary
start = pde_coverage - page_size;
size = 2 * page_size;
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 2);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == 1);
TEST_CHECK_RET(range_vec->ranges[1].entry_count == 1);
uvm_page_table_range_vec_destroy(range_vec);
// Two pages on each side of the page table extent boundary and a full page
// table extent in between
start = pde_coverage - 2 * page_size;
size = pde_coverage + 4 * page_size;
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 3);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == 2);
TEST_CHECK_RET(range_vec->ranges[1].start_index == 0);
TEST_CHECK_RET(range_vec->ranges[1].entry_count == page_table_entries);
TEST_CHECK_RET(range_vec->ranges[2].entry_count == 2);
uvm_page_table_range_vec_destroy(range_vec);
// Test splitting of a single page table extent in half
start = 0;
size = pde_coverage;
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(uvm_page_table_range_vec_split_upper(range_vec, (pde_coverage / 2) - 1, &upper_range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 1);
TEST_CHECK_RET(range_vec->start == 0);
TEST_CHECK_RET(range_vec->size == pde_coverage / 2);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == page_table_entries / 2);
TEST_CHECK_RET(upper_range_vec.range_count == 1);
TEST_CHECK_RET(upper_range_vec.start == pde_coverage / 2);
TEST_CHECK_RET(upper_range_vec.size == pde_coverage / 2);
TEST_CHECK_RET(upper_range_vec.ranges[0].entry_count == page_table_entries / 2);
uvm_page_table_range_vec_destroy(range_vec);
uvm_page_table_range_vec_deinit(&upper_range_vec);
// Test splitting of two page table extents into two vectors
size = pde_coverage * 2;
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(uvm_page_table_range_vec_split_upper(range_vec, pde_coverage - 1, &upper_range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 1);
TEST_CHECK_RET(range_vec->start == 0);
TEST_CHECK_RET(range_vec->size == pde_coverage);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == page_table_entries);
TEST_CHECK_RET(upper_range_vec.range_count == 1);
TEST_CHECK_RET(upper_range_vec.start == pde_coverage);
TEST_CHECK_RET(upper_range_vec.size == pde_coverage);
TEST_CHECK_RET(upper_range_vec.ranges[0].entry_count == page_table_entries);
uvm_page_table_range_vec_destroy(range_vec);
uvm_page_table_range_vec_deinit(&upper_range_vec);
// Test uneven split
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(uvm_page_table_range_vec_split_upper(range_vec,
pde_coverage + page_size - 1,
&upper_range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 2);
TEST_CHECK_RET(range_vec->start == 0);
TEST_CHECK_RET(range_vec->size == pde_coverage + page_size);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == page_table_entries);
TEST_CHECK_RET(range_vec->ranges[1].entry_count == 1);
TEST_CHECK_RET(upper_range_vec.range_count == 1);
TEST_CHECK_RET(upper_range_vec.start == pde_coverage + page_size);
TEST_CHECK_RET(upper_range_vec.size == pde_coverage - page_size);
TEST_CHECK_RET(upper_range_vec.ranges[0].entry_count == page_table_entries - 1);
uvm_page_table_range_vec_destroy(range_vec);
uvm_page_table_range_vec_deinit(&upper_range_vec);
// Test splitting a partial page table extent
start = 2 * page_size;
size = pde_coverage - (2 * page_size);
TEST_CHECK_RET(test_range_vec_create(&tree, start, size, page_size, &range_vec) == NV_OK);
TEST_CHECK_RET(uvm_page_table_range_vec_split_upper(range_vec,
start + (size / 2) - 1,
&upper_range_vec) == NV_OK);
TEST_CHECK_RET(range_vec->range_count == 1);
TEST_CHECK_RET(range_vec->start == start);
TEST_CHECK_RET(range_vec->size == size / 2);
TEST_CHECK_RET(range_vec->ranges[0].entry_count == (size / 2) / page_size);
TEST_CHECK_RET(upper_range_vec.range_count == 1);
TEST_CHECK_RET(upper_range_vec.start == start + (size / 2));
TEST_CHECK_RET(upper_range_vec.size == size / 2);
TEST_CHECK_RET(upper_range_vec.ranges[0].entry_count == (size / 2) / page_size);
uvm_page_table_range_vec_destroy(range_vec);
uvm_page_table_range_vec_deinit(&upper_range_vec);
uvm_page_tree_deinit(&tree);
return status;
}
static NV_STATUS alloc_64k_memory_maxwell(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 64 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, UVM_PAGE_SIZE_64K, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 1);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
static NV_STATUS alloc_128k_memory_maxwell(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 128 * 1024;
// 64k big page mode
MEM_NV_CHECK_RET(test_page_tree_init(gpu, UVM_PAGE_SIZE_64K, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 2);
TEST_CHECK_RET(range.table->depth == 1);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_64K);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(tree.root->entries[0]->ref_count == 2);
TEST_CHECK_RET(range.table == tree.root->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
// 128k big page mode
MEM_NV_CHECK_RET(test_page_tree_init(gpu, UVM_PAGE_SIZE_128K, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_128K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 1);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_128K);
TEST_CHECK_RET(tree.root->entries[0]->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[0]);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}
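// Build a minimal fake page table allocation carrying only a physical address,
// which is all the make_pde() HAL checks below need.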
static uvm_mmu_page_table_alloc_t fake_table_alloc(uvm_aperture_t aperture, NvU64 address)
{
return (uvm_mmu_page_table_alloc_t){.addr = uvm_gpu_phys_address(aperture, address) };
}
// Queries the supported page sizes of the GPU (uvm_gpu_t) and fills the
// page_sizes array with up to MAX_NUM_PAGE_SIZES entries. Returns the number
// of elements written to page_sizes.
static size_t get_page_sizes(uvm_gpu_t *gpu, NvU64 *page_sizes)
{
unsigned long page_size_log2;
unsigned long page_sizes_bitvec;
size_t count = 0;
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(BIG_PAGE_SIZE_PASCAL);
UVM_ASSERT(hal != NULL);
UVM_ASSERT(page_sizes != NULL);
page_sizes_bitvec = hal->page_sizes();
for_each_set_bit(page_size_log2, &page_sizes_bitvec, BITS_PER_LONG) {
NvU64 page_size = 1ULL << page_size_log2;
UVM_ASSERT(count < MAX_NUM_PAGE_SIZES);
page_sizes[count++] = page_size;
}
return count;
}
static NV_STATUS entry_test_page_size_pascal(uvm_gpu_t *gpu, size_t page_size)
{
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Page table entries
if (page_size == UVM_PAGE_SIZE_64K)
TEST_CHECK_RET(hal->unmapped_pte(page_size) == 0x20);
else
TEST_CHECK_RET(hal->unmapped_pte(page_size) == 0);
return NV_OK;
}
static NV_STATUS entry_test_page_size_volta(uvm_gpu_t *gpu, size_t page_size)
{
return entry_test_page_size_pascal(gpu, page_size);
}
static NV_STATUS entry_test_page_size_ampere(uvm_gpu_t *gpu, size_t page_size)
{
return entry_test_page_size_volta(gpu, page_size);
}
static NV_STATUS entry_test_page_size_hopper(uvm_gpu_t *gpu, size_t page_size)
{
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Page table entries
if (page_size == UVM_PAGE_SIZE_64K)
TEST_CHECK_RET(hal->unmapped_pte(page_size) == 0x18);
else
TEST_CHECK_RET(hal->unmapped_pte(page_size) == 0);
return NV_OK;
}
typedef NV_STATUS (*entry_test_page_size_func)(uvm_gpu_t *gpu, size_t page_size);
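// Check the Maxwell PDE/PTE encodings produced by the MMU HAL against
// hand-computed bit patterns, for both 64K and 128K big page modes.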
static NV_STATUS entry_test_maxwell(uvm_gpu_t *gpu)
{
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 pde_bits;
uvm_mmu_page_table_alloc_t *phys_allocs[2];
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_mmu_mode_hal_t *hal;
uvm_page_directory_t dir;
    NvU32 i, j;
    NvU64 big_page_size, page_size;
dir.depth = 0;
for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
big_page_size = big_page_sizes[i];
hal = gpu->parent->arch_hal->mmu_mode_hal(big_page_size);
memset(phys_allocs, 0, sizeof(phys_allocs));
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x0L);
phys_allocs[0] = &alloc_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x1BBBBBBD99999992LL);
phys_allocs[0] = &alloc_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(&pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits == 0x9999999E1BBBBBB1LL);
for (j = 0; j <= 2; j++) {
if (j == 0)
page_size = UVM_PAGE_SIZE_4K;
else
page_size = big_page_size;
if (page_size == UVM_PAGE_SIZE_4K)
TEST_CHECK_RET(hal->unmapped_pte(page_size) == 0);
else
TEST_CHECK_RET(hal->unmapped_pte(page_size) == 0x2);
}
        // uncached, i.e., the sysmem data is not cached in GPU's L2
        // cache. Set the volatile bit.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_NONE) == 0x599999991LL);
        // change to cached, clear the volatile bit.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x499999991LL);
// remove atomic
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x499999991LL);
// read only
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x9999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x8000000499999995LL);
// local video
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
0x1BBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x800000001BBBBBB5LL);
// peer 0
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_0,
0x1BBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x800000021BBBBBB5LL);
// peer 7
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_7,
0x1BBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x80000002FBBBBBB5LL);
}
return NV_OK;
}
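// Check the Pascal PDE/PTE encodings, including the dual (big/4K) PDE format
// used at depth 3, then run the per-page-size checks.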
static NV_STATUS entry_test_pascal(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
    // uncached, i.e., the sysmem data is not cached in GPU's L2 cache. Set
    // the volatile bit.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x399999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_NONE) == 0x3999999999990D);
    // change to cached, clear the volatile bit.
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x399999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x39999999999905);
// remove atomic
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x399999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x39999999999985);
// read only
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_SYS,
0x399999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x399999999999C5);
// local video
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_VID,
0x1BBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x1BBBBBBC1);
// peer 0
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_0,
0x1BBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x1BBBBBBC3);
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
return NV_OK;
}

static NV_STATUS entry_test_volta(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBB000LL);
uvm_page_directory_t dir;
// big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x399999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0x1BBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
dir.index_in_parent = 0;
dir.host_parent = NULL;
dir.depth = 0;
// Make sure cleared PDEs work as expected
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0);
memset(pde_bits, 0xFF, sizeof(pde_bits));
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0 && pde_bits[1] == 0);
// Sys and vidmem PDEs
phys_allocs[0] = &alloc_sys;
dir.depth = 0;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999990C);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
// Dual PDEs
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
dir.depth = 3;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x3999999999999C && pde_bits[1] == 0x1BBBBBB0A);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBBBA && pde_bits[1] == 0x3999999999990C);
// NO_ATS PDE1 (depth 2)
phys_allocs[0] = &alloc_vid;
dir.depth = 2;
hal->make_pde(pde_bits, phys_allocs, &dir, 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB2A);
else
TEST_CHECK_RET(pde_bits[0] == 0x1BBBBBB0A);
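    // The only difference between the two cases is bit 5 (0x20), which appears
    // to be the NO_ATS bit referenced above, set only when ATS is enabled.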
// peer 0 47-bit physical addressing
TEST_CHECK_RET(hal->make_pte(UVM_APERTURE_PEER_0,
0x5BBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x2DD1BBBBBBC3);
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
return NV_OK;
}

static NV_STATUS entry_test_ampere(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU32 i, num_page_sizes;
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_RET(entry_test_page_size(gpu, page_sizes[i]));
return NV_OK;
}

static NV_STATUS entry_test_hopper(uvm_gpu_t *gpu, entry_test_page_size_func entry_test_page_size)
{
NV_STATUS status = NV_OK;
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
NvU64 pde_bits[2];
uvm_page_directory_t *dirs[5];
size_t i, num_page_sizes;
uvm_mmu_page_table_alloc_t *phys_allocs[2] = {NULL, NULL};
uvm_mmu_page_table_alloc_t alloc_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999000LL);
uvm_mmu_page_table_alloc_t alloc_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBB000LL);
// Big versions have [11:8] set as well to test the page table merging
uvm_mmu_page_table_alloc_t alloc_big_sys = fake_table_alloc(UVM_APERTURE_SYS, 0x9999999999900LL);
uvm_mmu_page_table_alloc_t alloc_big_vid = fake_table_alloc(UVM_APERTURE_VID, 0xBBBBBBBB00LL);
uvm_mmu_mode_hal_t *hal = gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
memset(dirs, 0, sizeof(dirs));
    // Build a fake directory tree: directories at depths 0-4, 512 entries each,
    // chained through host_parent so make_pde() can walk up to the root.
for (i = 0; i < ARRAY_SIZE(dirs); i++) {
dirs[i] = uvm_kvmalloc_zero(sizeof(uvm_page_directory_t) + sizeof(dirs[i]->entries[0]) * 512);
TEST_CHECK_GOTO(dirs[i] != NULL, cleanup);
dirs[i]->depth = i;
dirs[i]->index_in_parent = 0;
if (i == 0)
dirs[i]->host_parent = NULL;
else
dirs[i]->host_parent = dirs[i - 1];
}
// Make sure cleared PDEs work as expected.
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0, cleanup);
// Cleared PDEs work as expected for big and small PDEs.
memset(pde_bits, 0xFF, sizeof(pde_bits));
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0 && pde_bits[1] == 0, cleanup);
// Sys and vidmem PDEs, uncached ATS allowed.
phys_allocs[0] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
phys_allocs[0] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBB00A, cleanup);
// Dual PDEs, uncached. We don't use child_dir in the depth 4 checks because
// our policy decides the PDE's PCF without using it.
phys_allocs[0] = &alloc_big_sys;
phys_allocs[1] = &alloc_vid;
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999991C && pde_bits[1] == 0xBBBBBBB01A, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999990C && pde_bits[1] == 0xBBBBBBB00A, cleanup);
phys_allocs[0] = &alloc_big_vid;
phys_allocs[1] = &alloc_sys;
hal->make_pde(pde_bits, phys_allocs, dirs[4], 0);
if (g_uvm_global.ats.enabled)
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB1A && pde_bits[1] == 0x999999999901C, cleanup);
else
TEST_CHECK_GOTO(pde_bits[0] == 0xBBBBBBBB0A && pde_bits[1] == 0x999999999900C, cleanup);
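    // With ATS enabled, UVM selects a different PDE PCF encoding, visible above
    // as the extra 0x10 bit in the expected values.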
    // make_pde() only needs ATS-specific testing when the CPU VA width is
    // smaller than the GPU's.
if (g_uvm_global.ats.enabled && uvm_cpu_num_va_bits() < hal->num_va_bits()) {
phys_allocs[0] = &alloc_sys;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 511;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 511);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[1]->index_in_parent = 1;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 1);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999900C, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x999999999901C, cleanup);
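        // The remaining checks pass a NULL phys alloc to exercise the PCF
        // chosen for invalid (cleared) PDEs, which differs (0x0 vs 0x10)
        // depending on the VA range the child index covers.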
phys_allocs[0] = NULL;
dirs[1]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[0], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 0;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 0);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
dirs[2]->index_in_parent = 2;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 2);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[1]->index_in_parent = 1;
dirs[2]->index_in_parent = 509;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 509);
TEST_CHECK_GOTO(pde_bits[0] == 0x10, cleanup);
dirs[2]->index_in_parent = 510;
hal->make_pde(pde_bits, phys_allocs, dirs[1], 510);
TEST_CHECK_GOTO(pde_bits[0] == 0x0, cleanup);
}
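    // A rough reading of the expected PTE values below (Hopper/GMMU v3 layout):
    // the 4K-aligned physical address stays in place and the low 12 bits pack
    // the valid, aperture, PCF and kind fields.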
    // uncached, i.e., the sysmem data is not cached in the GPU's L2 cache, and
    // access counters are disabled.
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) == 0x999999999968D,
cleanup);
// change to cached.
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED | UVM_MMU_PTE_FLAGS_ACCESS_COUNTERS_DISABLED) ==
0x9999999999685,
cleanup);
// enable access counters.
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE_ATOMIC,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999605,
cleanup);
// remove atomic
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_WRITE,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999645,
cleanup);
// read only
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_SYS,
0x9999999999000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x9999999999665,
cleanup);
// local video
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_VID,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0xBBBBBBB661,
cleanup);
// peer 1
TEST_CHECK_GOTO(hal->make_pte(UVM_APERTURE_PEER_1,
0xBBBBBBB000LL,
UVM_PROT_READ_ONLY,
UVM_MMU_PTE_FLAGS_CACHED) == 0x200000BBBBBBB663,
cleanup);
// sparse
TEST_CHECK_GOTO(hal->make_sparse_pte() == 0x8, cleanup);
// sked reflected
TEST_CHECK_GOTO(hal->make_sked_reflected_pte() == 0xF09, cleanup);
num_page_sizes = get_page_sizes(gpu, page_sizes);
for (i = 0; i < num_page_sizes; i++)
TEST_NV_CHECK_GOTO(entry_test_page_size(gpu, page_sizes[i]), cleanup);
cleanup:
for (i = 0; i < ARRAY_SIZE(dirs); i++)
uvm_kvfree(dirs[i]);
return status;
}

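// Allocate a single 4K PTE in both Maxwell big page modes (64K and 128K) and
// check that the 4K page table ends up as entry 1 of the root with the
// expected ref counts.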
static NV_STATUS alloc_4k_maxwell(uvm_gpu_t *gpu)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvLength size = 4096;
// 64k big page mode
MEM_NV_CHECK_RET(test_page_tree_init(gpu, UVM_PAGE_SIZE_64K, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 1);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_4K);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[1]);
TEST_CHECK_RET(tree.root->entries[1]->ref_count == 1);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
// 128k big page mode
MEM_NV_CHECK_RET(test_page_tree_init(gpu, UVM_PAGE_SIZE_128K, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, 0, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 1);
TEST_CHECK_RET(range.start_index == 0);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_4K);
TEST_CHECK_RET(tree.root->ref_count == 1);
TEST_CHECK_RET(range.table == tree.root->entries[1]);
TEST_CHECK_RET(tree.root->entries[1]->ref_count == 1);
uvm_page_tree_put_ptes(&tree, &range);
UVM_ASSERT(tree.root->ref_count == 0);
uvm_page_tree_deinit(&tree);
return NV_OK;
}

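// Exercise uvm_page_table_range_shrink(): allocate 1-3 pages at the start,
// middle and end of a PDE, shrink the range to every smaller page count
// (shrinking to 0 releases the range entirely), and verify the ref counts.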
static NV_STATUS shrink_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 addr = 0;
NvLength size;
NvU32 num_pages, new_page_count;
int alignment;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, big_page_size, &tree), NV_OK);
for (num_pages = 1; num_pages <= 3; num_pages++) {
for (alignment = 0; alignment <= 2; alignment++) {
size = num_pages * page_size;
// Get the alignment of the range within a PDE
switch (alignment) {
case 0: // Start of the PDE
addr = 0;
break;
case 1: // In the middle of the PDE
addr = page_size;
break;
case 2: // At the end of the PDE
addr = uvm_mmu_pde_coverage(&tree, page_size) - size;
break;
}
for (new_page_count = 0; new_page_count <= num_pages; new_page_count++) {
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, page_size, addr, size, &range), NV_OK);
TEST_CHECK_RET(range.table->ref_count == num_pages);
TEST_CHECK_RET(range.entry_count == num_pages);
TEST_CHECK_RET(range.start_index == addr / page_size);
uvm_page_table_range_shrink(&tree, &range, new_page_count);
if (new_page_count) {
TEST_CHECK_RET(range.table->ref_count == new_page_count);
TEST_CHECK_RET(range.entry_count == new_page_count);
TEST_CHECK_RET(range.start_index == addr / page_size);
uvm_page_tree_put_ptes(&tree, &range);
}
TEST_CHECK_RET(tree.root->ref_count == 0);
}
}
}
uvm_page_tree_deinit(&tree);
return NV_OK;
}

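// Exercise uvm_page_table_range_get_upper(): split off the upper part of a
// range into a second range sharing the same table, check both ranges' entry
// counts and the combined ref count, then release them in either order.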
static NV_STATUS get_upper_test(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_size)
{
uvm_page_tree_t tree;
uvm_page_table_range_t range, upper_range;
NvU64 addr = 0;
NvLength size;
NvU32 num_pages, num_upper_pages;
int alignment, put_upper_first;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, big_page_size, &tree), NV_OK);
for (num_pages = 1; num_pages <= 3; num_pages++) {
for (alignment = 0; alignment <= 2; alignment++) {
size = num_pages * page_size;
// Get the alignment of the range within a PDE
switch (alignment) {
case 0: // Start of the PDE
addr = 0;
break;
case 1: // In the middle of the PDE
addr = page_size;
break;
case 2: // At the end of the PDE
addr = uvm_mmu_pde_coverage(&tree, page_size) - size;
break;
}
for (num_upper_pages = 1; num_upper_pages <= num_pages; num_upper_pages++) {
for (put_upper_first = 0; put_upper_first <= 1; put_upper_first++) {
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, page_size, addr, size, &range), NV_OK);
TEST_CHECK_RET(range.table->ref_count == num_pages);
TEST_CHECK_RET(range.entry_count == num_pages);
TEST_CHECK_RET(range.start_index == addr / page_size);
uvm_page_table_range_get_upper(&tree, &range, &upper_range, num_upper_pages);
TEST_CHECK_RET(range.entry_count == num_pages);
TEST_CHECK_RET(range.start_index == addr / page_size);
TEST_CHECK_RET(upper_range.entry_count == num_upper_pages);
TEST_CHECK_RET(upper_range.start_index == range.start_index + num_pages - num_upper_pages);
TEST_CHECK_RET(range.table->ref_count == num_pages + num_upper_pages);
if (put_upper_first) {
uvm_page_tree_put_ptes(&tree, &upper_range);
TEST_CHECK_RET(range.entry_count == num_pages);
TEST_CHECK_RET(range.start_index == addr / page_size);
TEST_CHECK_RET(range.table->ref_count == num_pages);
uvm_page_tree_put_ptes(&tree, &range);
}
else {
uvm_page_tree_put_ptes(&tree, &range);
TEST_CHECK_RET(upper_range.entry_count == num_upper_pages);
TEST_CHECK_RET(upper_range.start_index == (addr / page_size) + num_pages - num_upper_pages);
TEST_CHECK_RET(range.table->ref_count == num_upper_pages);
uvm_page_tree_put_ptes(&tree, &upper_range);
}
TEST_CHECK_RET(tree.root->ref_count == 0);
}
}
}
}
uvm_page_tree_deinit(&tree);
return NV_OK;
}

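// Fake host and CE HALs: TLB invalidates are recorded by the fake_tlb_*
// helpers and everything else is either a no-op or emulated on the CPU.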
static uvm_host_hal_t fake_host_hal = {
.noop = fake_noop,
.wait_for_idle = fake_wait_for_idle,
.membar_sys = fake_membar,
.membar_gpu = fake_membar,
.tlb_invalidate_all = fake_tlb_invalidate_all,
.tlb_invalidate_va = fake_tlb_invalidate_va,
};

static uvm_ce_hal_t fake_ce_hal = {
.memset_8 = fake_ce_memset_8,
.memcopy = fake_ce_memcopy,
};

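// Set up a fake GPU: fill in the minimal RM info (host/CE classes and
// architecture), run the regular HAL initialization, then swap in the fake
// host and CE HALs above so test pushes are emulated on the CPU.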
static NV_STATUS fake_gpu_init(NvU32 host_class, NvU32 ce_class, NvU32 architecture, uvm_gpu_t *fake_gpu)
{
uvm_parent_gpu_t *fake_parent_gpu = fake_gpu->parent;
fake_parent_gpu->num_retained_gpus = 1;
fake_parent_gpu->rm_info.ceClass = ce_class;
fake_parent_gpu->rm_info.hostClass = host_class;
fake_parent_gpu->rm_info.gpuArch = architecture;
TEST_CHECK_RET(uvm_hal_init_gpu(fake_parent_gpu) == NV_OK);
uvm_hal_init_properties(fake_parent_gpu);
// The PTE allocation code expects the address space tree HAL to be present
// (for example, when checking the addressing capabilities of a GPU).
// The selected page size (64K) should work across all supported GPU
// architectures.
fake_gpu->address_space_tree.hal = fake_parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
fake_parent_gpu->host_hal = &fake_host_hal;
fake_parent_gpu->ce_hal = &fake_ce_hal;
uvm_mmu_init_gpu_chunk_sizes(fake_parent_gpu);
uvm_mmu_init_gpu_peer_addresses(fake_gpu);
return NV_OK;
}

static NV_STATUS fake_gpu_init_maxwell(uvm_gpu_t *fake_gpu)
{
// KEPLER_CHANNEL_GPFIFO_B host class is used for GM10x.
return fake_gpu_init(KEPLER_CHANNEL_GPFIFO_B,
MAXWELL_DMA_COPY_A,
NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM000,
fake_gpu);
}

static NV_STATUS fake_gpu_init_pascal(uvm_gpu_t *fake_gpu)
{
return fake_gpu_init(PASCAL_CHANNEL_GPFIFO_A,
PASCAL_DMA_COPY_A,
NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GP100,
fake_gpu);
}

static NV_STATUS fake_gpu_init_volta(uvm_gpu_t *fake_gpu)
{
return fake_gpu_init(VOLTA_CHANNEL_GPFIFO_A,
VOLTA_DMA_COPY_A,
NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GV100,
fake_gpu);
}

static NV_STATUS fake_gpu_init_ampere(uvm_gpu_t *fake_gpu)
{
return fake_gpu_init(AMPERE_CHANNEL_GPFIFO_A,
AMPERE_DMA_COPY_A,
NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100,
fake_gpu);
}

static NV_STATUS fake_gpu_init_hopper(uvm_gpu_t *fake_gpu)
{
return fake_gpu_init(HOPPER_CHANNEL_GPFIFO_A,
HOPPER_DMA_COPY_A,
NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100,
fake_gpu);
}

static NV_STATUS maxwell_test_page_tree(uvm_gpu_t *maxwell)
{
// create a fake Maxwell GPU for this test.
static const NvU64 big_page_sizes[] = {UVM_PAGE_SIZE_64K, UVM_PAGE_SIZE_128K};
NvU64 i, j, big_page_size, page_size;
TEST_CHECK_RET(fake_gpu_init_maxwell(maxwell) == NV_OK);
MEM_NV_CHECK_RET(allocate_root(maxwell), NV_OK);
MEM_NV_CHECK_RET(alloc_64k_memory_maxwell(maxwell), NV_OK);
MEM_NV_CHECK_RET(alloc_128k_memory_maxwell(maxwell), NV_OK);
MEM_NV_CHECK_RET(alloc_4k_maxwell(maxwell), NV_OK);
TEST_CHECK_RET(entry_test_maxwell(maxwell) == NV_OK);
for (i = 0; i < ARRAY_SIZE(big_page_sizes); i++) {
big_page_size = big_page_sizes[i];
for (j = 0; j < 2; j++) {
page_size = (j == 0) ? UVM_PAGE_SIZE_4K : big_page_size;
MEM_NV_CHECK_RET(shrink_test(maxwell, big_page_size, page_size), NV_OK);
MEM_NV_CHECK_RET(get_upper_test(maxwell, big_page_size, page_size), NV_OK);
MEM_NV_CHECK_RET(test_range_vec(maxwell, big_page_size, page_size), NV_OK);
}
}
return NV_OK;
}

static NV_STATUS pascal_test_page_tree(uvm_gpu_t *pascal)
{
// create a fake Pascal GPU for this test.
NvU32 tlb_batch_saved_max_pages;
NvU32 i;
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;
TEST_CHECK_RET(fake_gpu_init_pascal(pascal) == NV_OK);
num_page_sizes = get_page_sizes(pascal, page_sizes);
UVM_ASSERT(num_page_sizes > 0);
MEM_NV_CHECK_RET(allocate_root(pascal), NV_OK);
MEM_NV_CHECK_RET(alloc_64k_memory(pascal), NV_OK);
MEM_NV_CHECK_RET(alloc_adjacent_64k_memory(pascal), NV_OK);
MEM_NV_CHECK_RET(alloc_adjacent_pde_64k_memory(pascal), NV_OK);
MEM_NV_CHECK_RET(alloc_nearby_pde_64k_memory(pascal), NV_OK);
MEM_NV_CHECK_RET(allocate_then_free_all_16_64k(pascal), NV_OK);
MEM_NV_CHECK_RET(allocate_then_free_8_8_64k(pascal), NV_OK);
MEM_NV_CHECK_RET(get_single_page_2m(pascal), NV_OK);
MEM_NV_CHECK_RET(get_entire_table_4k(pascal), NV_OK);
MEM_NV_CHECK_RET(split_4k_from_2m(pascal), NV_OK);
MEM_NV_CHECK_RET(get_512mb_range(pascal), NV_OK);
MEM_NV_CHECK_RET(get_two_free_apart(pascal), NV_OK);
MEM_NV_CHECK_RET(get_overlapping_dual_pdes(pascal), NV_OK);
MEM_NV_CHECK_RET(split_and_free(pascal), NV_OK);
MEM_NV_CHECK_RET(entry_test_pascal(pascal, entry_test_page_size_pascal), NV_OK);
MEM_NV_CHECK_RET(check_sizes(pascal), NV_OK);
MEM_NV_CHECK_RET(fast_split_normal(pascal), NV_OK);
MEM_NV_CHECK_RET(fast_split_double_backoff(pascal), NV_OK);
MEM_NV_CHECK_RET(test_tlb_invalidates(pascal), NV_OK);
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(pascal, page_sizes, num_page_sizes), NV_OK);
// Run the test again with a bigger limit on max pages
tlb_batch_saved_max_pages = pascal->parent->tlb_batch.max_pages;
pascal->parent->tlb_batch.max_pages = 1024 * 1024;
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(pascal, page_sizes, num_page_sizes), NV_OK);
pascal->parent->tlb_batch.max_pages = tlb_batch_saved_max_pages;
// And with per VA invalidates disabled
pascal->parent->tlb_batch.va_invalidate_supported = false;
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(pascal, page_sizes, num_page_sizes), NV_OK);
pascal->parent->tlb_batch.va_invalidate_supported = true;
for (i = 0; i < num_page_sizes; i++) {
MEM_NV_CHECK_RET(shrink_test(pascal, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
MEM_NV_CHECK_RET(get_upper_test(pascal, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
MEM_NV_CHECK_RET(test_range_vec(pascal, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
}
return NV_OK;
}

static NV_STATUS volta_test_page_tree(uvm_gpu_t *volta)
{
TEST_CHECK_RET(fake_gpu_init_volta(volta) == NV_OK);
MEM_NV_CHECK_RET(entry_test_volta(volta, entry_test_page_size_volta), NV_OK);
return NV_OK;
}

static NV_STATUS ampere_test_page_tree(uvm_gpu_t *ampere)
{
NvU32 i, tlb_batch_saved_max_pages;
NvU64 page_sizes[MAX_NUM_PAGE_SIZES];
size_t num_page_sizes;
TEST_CHECK_RET(fake_gpu_init_ampere(ampere) == NV_OK);
num_page_sizes = get_page_sizes(ampere, page_sizes);
UVM_ASSERT(num_page_sizes > 0);
MEM_NV_CHECK_RET(alloc_512m_memory(ampere), NV_OK);
MEM_NV_CHECK_RET(alloc_adjacent_512m_memory(ampere), NV_OK);
MEM_NV_CHECK_RET(get_single_page_512m(ampere), NV_OK);
MEM_NV_CHECK_RET(get_entire_table_512m(ampere), NV_OK);
// Although there is no support for the 512M page size for managed memory,
// we run tests that split 512M pages into 256x2M pages because UVM handles
// the PTEs for all supported page sizes.
MEM_NV_CHECK_RET(split_2m_from_512m(ampere), NV_OK);
MEM_NV_CHECK_RET(get_2gb_range(ampere), NV_OK);
MEM_NV_CHECK_RET(entry_test_ampere(ampere, entry_test_page_size_ampere), NV_OK);
// TLB invalidate
MEM_NV_CHECK_RET(test_tlb_invalidates(ampere), NV_OK);
// TLB batch invalidate
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(ampere, page_sizes, num_page_sizes), NV_OK);
// Run the test again with a bigger limit on max pages
tlb_batch_saved_max_pages = ampere->parent->tlb_batch.max_pages;
ampere->parent->tlb_batch.max_pages = 1024 * 1024;
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(ampere, page_sizes, num_page_sizes), NV_OK);
ampere->parent->tlb_batch.max_pages = tlb_batch_saved_max_pages;
// And with per VA invalidates disabled
ampere->parent->tlb_batch.va_invalidate_supported = false;
MEM_NV_CHECK_RET(test_tlb_batch_invalidates(ampere, page_sizes, num_page_sizes), NV_OK);
ampere->parent->tlb_batch.va_invalidate_supported = true;
for (i = 0; i < num_page_sizes; i++) {
MEM_NV_CHECK_RET(shrink_test(ampere, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
MEM_NV_CHECK_RET(get_upper_test(ampere, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
MEM_NV_CHECK_RET(test_range_vec(ampere, BIG_PAGE_SIZE_PASCAL, page_sizes[i]), NV_OK);
}
return NV_OK;
}

static NV_STATUS hopper_test_page_tree(uvm_gpu_t *hopper)
{
TEST_CHECK_RET(fake_gpu_init_hopper(hopper) == NV_OK);
MEM_NV_CHECK_RET(entry_test_hopper(hopper, entry_test_page_size_hopper), NV_OK);
MEM_NV_CHECK_RET(alloc_64k_memory_57b_va(hopper), NV_OK);
return NV_OK;
}

NV_STATUS uvm_test_page_tree(UVM_TEST_PAGE_TREE_PARAMS *params, struct file *filp)
{
NV_STATUS status = NV_OK;
uvm_parent_gpu_t *parent_gpu;
uvm_gpu_t *gpu;
parent_gpu = uvm_kvmalloc_zero(sizeof(*parent_gpu));
if (!parent_gpu)
return NV_ERR_NO_MEMORY;
gpu = uvm_kvmalloc_zero(sizeof(*gpu));
if (!gpu) {
uvm_kvfree(parent_gpu);
return NV_ERR_NO_MEMORY;
}
parent_gpu->gpus[0] = gpu;
gpu->parent = parent_gpu;
    // At least test_tlb_invalidates() relies on the global fake TLB invalidate
    // tracking state, so make sure only one test instance can run at a time.
uvm_mutex_lock(&g_uvm_global.global_lock);
// Allocate the fake TLB tracking state. Notably tests still need to enable
// and disable the tracking with explicit fake_tlb_invals_enable/disable()
// calls.
TEST_NV_CHECK_GOTO(fake_tlb_invals_alloc(), done);
    // Prevent the maxwell_test_page_tree test from running on ATS-enabled
    // systems: on a "fake" Maxwell-based ATS system pde_fill() may push more
    // methods than UVM supports, specifically during uvm_page_tree_init(),
    // which eventually calls phys_mem_init(). On Maxwell, upper PDE levels
    // have more than 512 entries.
if (!g_uvm_global.ats.enabled)
TEST_NV_CHECK_GOTO(maxwell_test_page_tree(gpu), done);
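    // Each per-architecture test below reinitializes the same fake GPU with
    // that architecture's host/CE classes via its fake_gpu_init_*() helper.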
TEST_NV_CHECK_GOTO(pascal_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(volta_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(ampere_test_page_tree(gpu), done);
TEST_NV_CHECK_GOTO(hopper_test_page_tree(gpu), done);
done:
fake_tlb_invals_free();
uvm_mutex_unlock(&g_uvm_global.global_lock);
uvm_kvfree(gpu);
uvm_kvfree(parent_gpu);
return status;
}