open-gpu-kernel-modules/kernel-open/nvidia-uvm/uvm_push_test.c

/*******************************************************************************
    Copyright (c) 2015-2021 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include <asm/atomic.h>

#include "uvm_global.h"
#include "uvm_channel.h"
#include "uvm_hal.h"
#include "uvm_mem.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_test_rng.h"
#include "uvm_thread_context.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_gpu_semaphore.h"
#include "uvm_kvmalloc.h"

// Number of pushes that test_push_interleaving_on_gpu() begins and keeps
// un-ended while it begins and ends other pushes on the same channel.
#define TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES 2

// Size by which a push on the given channel is expected to grow when it is
// ended: UVM_PUSH_CE_END_SIZE for CE channels, 0 for any other channel.
static NvU32 get_push_end_size(uvm_channel_t *channel)
{
    return uvm_channel_is_ce(channel) ? UVM_PUSH_CE_END_SIZE : 0;
}


// For every GPU in the VA space and every channel type, begin an empty push,
// end it, and check that ending it grew the push by exactly
// get_push_end_size() for the channel that was picked.
//
// All channel managers are waited on before returning, on success and on
// failure alike.
static NV_STATUS test_push_end_size(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    NvU32 type_index;

    for_each_va_space_gpu(gpu, va_space) {
        for (type_index = 0; type_index < UVM_CHANNEL_TYPE_COUNT; ++type_index) {
            uvm_push_t push;
            uvm_channel_type_t type = type_index;
            NvU32 expected_growth;
            NvU32 size_before_end;
            NvU32 actual_growth;

            status = uvm_push_begin(gpu->channel_manager, type, &push, "type %u\n", (unsigned)type);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Sample the size right before ending so that the difference is
            // only what uvm_push_end() itself added.
            expected_growth = get_push_end_size(push.channel);
            size_before_end = uvm_push_get_size(&push);
            uvm_push_end(&push);

            actual_growth = uvm_push_get_size(&push) - size_before_end;
            if (actual_growth == expected_growth)
                continue;

            UVM_TEST_PRINT("push_end_size incorrect, %u instead of %u for GPU %s\n",
                           actual_growth,
                           expected_growth,
                           uvm_gpu_name(gpu));
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

done:
    for_each_va_space_gpu(gpu, va_space) {
        uvm_channel_manager_wait(gpu->channel_manager);
    }

    return status;
}

// Selects which inline data interface test_push_inline_data_gpu() exercises.
typedef enum {
    // Build the inline data one byte at a time with uvm_push_inline_data_add()
    TEST_INLINE_ADD,
    // Reserve the buffer with uvm_push_inline_data_get() and fill it directly
    TEST_INLINE_GET,
    // Use uvm_push_get_single_inline_buffer() and fill the returned buffer
    TEST_INLINE_SINGLE_BUFFER,
    // Number of test types above; used as the loop bound, not a test type
    TEST_INLINE_MAX,
} test_inline_type_t;

static NV_STATUS test_push_inline_data_gpu(uvm_gpu_t *gpu)
{
    static const size_t test_sizes[] = { 1, 2, 3, 4, 8, 31, 32, 1023, 1024, 1025, UVM_PUSH_INLINE_DATA_MAX_SIZE };
    NV_STATUS status;
    int i, j;
    int test_inline_type;
    uvm_push_t push;
    uvm_mem_t *mem = NULL;
    char *verif;

    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(UVM_PUSH_INLINE_DATA_MAX_SIZE, current->mm, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);

    status = uvm_mem_map_gpu_kernel(mem, gpu);
    TEST_CHECK_GOTO(status == NV_OK, done);

    verif = (char *)uvm_mem_get_cpu_addr_kernel(mem);

    for (test_inline_type = 0; test_inline_type < TEST_INLINE_MAX; ++test_inline_type) {
        for (i = 0; i < ARRAY_SIZE(test_sizes); ++i) {
            size_t test_size = test_sizes[i];
            uvm_push_inline_data_t data;
            size_t inline_data_size = 0;
            uvm_gpu_address_t data_gpu_address;
            char *inline_buf;

            status = uvm_push_begin(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_GPU_INTERNAL,
                                    &push,
                                    "Inline data size %zu",
                                    test_size);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Do a noop first to test inline data starting at different offsets
            gpu->parent->host_hal->noop(&push, roundup(min(test_size, (size_t)4096), UVM_METHOD_SIZE));

            switch (test_inline_type) {
                case TEST_INLINE_ADD:
                    uvm_push_inline_data_begin(&push, &data);
                    for (j = 0; j < test_size; ++j) {
                        char value = 1 + i + j;
                        uvm_push_inline_data_add(&data, &value, 1);
                    }
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    break;
                case TEST_INLINE_GET:
                    uvm_push_inline_data_begin(&push, &data);
                    inline_buf = (char*)uvm_push_inline_data_get(&data, test_size);
                    inline_data_size = uvm_push_inline_data_size(&data);
                    data_gpu_address = uvm_push_inline_data_end(&data);
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
                case TEST_INLINE_SINGLE_BUFFER:
                    inline_buf = (char*)uvm_push_get_single_inline_buffer(&push, test_size, &data_gpu_address);
                    inline_data_size = test_size;
                    for (j = 0; j < test_size; ++j)
                        inline_buf[j] = 1 + i + j;
                    break;
            }


            gpu->parent->ce_hal->memcopy(&push,
                                        uvm_mem_gpu_address_virtual_kernel(mem, gpu),
                                        data_gpu_address,
                                        test_size);
            status = uvm_push_end_and_wait(&push);
            TEST_CHECK_GOTO(status == NV_OK, done);

            TEST_CHECK_GOTO(inline_data_size == test_size, done);

            for (j = 0; j < test_size; ++j) {
                char expected = 1 + i + j;
                if (verif[j] != expected) {
                    UVM_TEST_PRINT("size %zu verif[%d] = %d instead of %d\n", test_size, j, verif[j], expected);
                    status = NV_ERR_INVALID_STATE;
                    goto done;
                }
            }
        }
    }
done:
    uvm_mem_free(mem);

    return status;
}

// Run test_push_inline_data_gpu() on every GPU registered in the VA space.
// Returns NV_OK when the test passed on every GPU; the first failing GPU ends
// the test through TEST_CHECK_RET.
static NV_STATUS test_push_inline_data(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_CHECK_RET(test_push_inline_data_gpu(gpu) == NV_OK);
    }

    return NV_OK;
}

// Test that begins UVM_PUSH_MAX_CONCURRENT_PUSHES number of pushes before
// ending any of them on each GPU.
// Notably starting more than a single push is not safe to do outside of a test
// as if multiple threads tried doing so, it could easily deadlock.
//
// On each GPU the pushes are begun in order, then ended in order and added to
// a tracker that is waited on before moving to the next GPU.
//
// If anything fails while some pushes are begun but not yet ended, the
// leftover pushes are ended during cleanup so that no push is left open when
// the test returns.
static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    NvU32 i;
    // Pushes [num_ended, num_begun) of the current GPU are begun but not ended
    NvU32 num_begun = 0;
    NvU32 num_ended = 0;
    uvm_push_t *pushes;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();

    // As noted above, this test does unsafe things that would be detected by
    // lock tracking, opt-out.
    uvm_thread_context_lock_disable_tracking();

    pushes = uvm_kvmalloc_zero(sizeof(*pushes) * UVM_PUSH_MAX_CONCURRENT_PUSHES);
    if (pushes == NULL) {
        status = NV_ERR_NO_MEMORY;
        goto done;
    }

    for_each_va_space_gpu(gpu, va_space) {
        num_begun = 0;
        num_ended = 0;

        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, push, "concurrent push %u", i);
            TEST_CHECK_GOTO(status == NV_OK, done);
            ++num_begun;
        }
        for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
            uvm_push_t *push = &pushes[i];
            uvm_push_end(push);
            ++num_ended;
            TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, push), done);
        }
        TEST_CHECK_GOTO(tracker.size != 0, done);

        status = uvm_tracker_wait(&tracker);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

done:
    // End any push that was begun but not ended because of an early exit.
    // This has to happen while lock tracking is still disabled. The range is
    // empty on the success path and when the pushes array was not allocated.
    for (i = num_ended; i < num_begun; ++i)
        uvm_push_end(&pushes[i]);

    uvm_thread_context_lock_enable_tracking();

    uvm_tracker_deinit(&tracker);

    uvm_kvfree(pushes);

    return status;
}

static void add_to_counter(void* ptr, int value)
{
    atomic_t *atomic = (atomic_t*) ptr;
    atomic_add(value, atomic);
}

// Push completion callback: atomically adds 1 to the atomic_t counter passed
// as callback data.
static void add_one_to_counter(void* ptr)
{
    const int increment = 1;

    add_to_counter(ptr, increment);
}

// Push completion callback: atomically adds 2 to the atomic_t counter passed
// as callback data.
static void add_two_to_counter(void* ptr)
{
    const int increment = 2;

    add_to_counter(ptr, increment);
}

// Run the push interleaving test on a single GPU.
//
// On one CE channel: begin TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES pushes
// without ending them, begin and end as many pushes as the channel has gpfifo
// entries, and only then end the paused pushes. After waiting for the channel
// manager, check the values written by the GPU and the counter updated by the
// on_complete callbacks of all the pushes.
//
// NOTE(review): the early "goto done" exits taken after some pushes have been
// begun do not end those pushes, and the callbacks reference a counter that
// lives on this function's stack - confirm this is acceptable for failures.
static NV_STATUS test_push_interleaving_on_gpu(uvm_gpu_t* gpu)
{
    NV_STATUS status;
    uvm_channel_t *channel;
    uvm_push_t push;
    NvU32 i;
    NvU32 *host_va;
    NvU64 gpu_va;
    NvU32 observed, expected;
    unsigned int num_non_paused_pushes;
    uvm_push_t pushes_not_ended[TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES];
    // One NvU32 for the shared counter at index 0, followed by one NvU32 per
    // paused push for its magic number.
    const NvLength size = sizeof(NvU32) * (1 + TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES);
    uvm_rm_mem_t *mem = NULL;
    // Incremented by the on_complete callbacks: 2 per paused push, 1 per
    // non-paused push.
    atomic_t on_complete_counter = ATOMIC_INIT(0);

    // This test issues virtual memcopies/memsets, which in SR-IOV heavy cannot
    // be pushed to a proxy channel. Pushing to a UVM internal CE channel works
    // in all scenarios.
    channel = uvm_channel_any_of_type(gpu->channel_manager, UVM_CHANNEL_POOL_TYPE_CE);
    TEST_CHECK_RET(channel != NULL);

    // The test needs more gpfifo entries than paused pushes.
    if (channel->num_gpfifo_entries <= TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES) {
        UVM_TEST_PRINT("Insufficient number of gpfifo entries per channel to run this test. Expected at least %u "
                       "entries, but found %u\n",
                       TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES + 1,
                       channel->num_gpfifo_entries);
        return NV_ERR_INVALID_STATE;
    }
    num_non_paused_pushes = channel->num_gpfifo_entries;

    // The UVM driver only allows push interleaving across separate threads, but
    // it is hard to consistently replicate the interleaving. Instead, we
    // temporarily disable lock tracking, so we can interleave pushes from a
    // single thread.
    uvm_thread_context_lock_disable_tracking();

    status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, size, &mem);
    TEST_CHECK_GOTO(status == NV_OK, done);
    host_va = (NvU32*)uvm_rm_mem_get_cpu_va(mem);
    gpu_va = uvm_rm_mem_get_gpu_va(mem, gpu, uvm_channel_is_proxy(channel));
    memset(host_va, 0, size);

    // Begin a few pushes on the channel, but do not end them yet.
    // Each pushed method sets a magic number on an independent memory location.
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, pushes_not_ended + i, "Set to 0x%x", 0xDEADBEEF + i);
        TEST_CHECK_GOTO(status == NV_OK, done);
        // Slot 0 is the shared counter, so paused push i writes slot i + 1.
        gpu->parent->ce_hal->memset_v_4(pushes_not_ended + i,
                                        gpu_va + sizeof(NvU32) * (i + 1),
                                        0xDEADBEEF + i,
                                        sizeof(NvU32));

        push_info = uvm_push_info_from_push(pushes_not_ended + i);
        push_info->on_complete = add_two_to_counter;
        push_info->on_complete_data = &on_complete_counter;
    }

    // Push N (N = #channel entries) value increments to the same channel.
    for (i = 0; i < num_non_paused_pushes; ++i) {
        uvm_push_info_t *push_info;

        status = uvm_push_begin_on_channel(channel, &push, "inc to %u", i + 1);
        TEST_CHECK_GOTO(status == NV_OK, done);
        gpu->parent->ce_hal->semaphore_reduction_inc(&push, gpu_va, num_non_paused_pushes);

        push_info = uvm_push_info_from_push(&push);
        push_info->on_complete = add_one_to_counter;
        push_info->on_complete_data = &on_complete_counter;

        uvm_push_end(&push);
    }

    // End the pending pushes
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i)
        uvm_push_end(pushes_not_ended + i);

    // When the channel manager becomes idle, the GPU methods have been
    // completed, and the CPU completion callbacks associated with the push
    // have been invoked.
    status = uvm_channel_manager_wait(channel->pool->manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    // The shared counter must have been incremented once per non-paused push.
    observed = host_va[0];
    expected = num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Observed counter %u but expected %u\n", observed, expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    // Each paused push must have written its own magic number.
    for (i = 0; i < TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES; ++i) {
        observed = host_va[i + 1];
        expected = 0xDEADBEEF + i;
        if (observed != expected) {
            UVM_TEST_PRINT("Observed magic number 0x%x but expected 0x%x\n", observed, expected);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

    // Every push must have had its on_complete callback invoked exactly once.
    observed = atomic_read(&on_complete_counter);
    expected = TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES * 2 + num_non_paused_pushes;
    if (observed != expected) {
        UVM_TEST_PRINT("Wrong value of counter incremented by push info callback. Observed %u but expected %u\n",
                       observed,
                       expected);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    uvm_rm_mem_free(mem);
    uvm_thread_context_lock_enable_tracking();

    return status;
}

// Using a single thread, interleave pushes and check that the result is
// consistent with a non-interleaved sequence.
// 1) Begin a few pushes in channel X but do not end them. Each pushed (GPU)
//    method sets an individual value in an independent system memory location.
//    Each push is associated with a push info (CPU) callback that atomically
//    adds 2 to a memory location M
// 2) Begin and end many pushes in the same channel X such that all the gpfifo
//    entries are filled. All the pushed methods do the same thing: atomically
//    increment a given system memory location.
//    Each push is associated with a push info callback that atomically
//    increments the memory location M
// 3) End the pending pushes
//
// The final state should be the same as in the non-interleaved sequence
// (1)-(3)-(2)
//
// Starting more than a single push is not safe to do outside of a test as if
// multiple threads tried doing so, it could easily deadlock.
//
// Runs test_push_interleaving_on_gpu() on each GPU in the VA space, stopping
// at the first GPU that fails and returning its status.
static NV_STATUS test_push_interleaving(uvm_va_space_t *va_space)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;

    // The paused pushes plus at least one more push have to fit within the
    // limit of concurrent pushes.
    BUILD_BUG_ON(TEST_PUSH_INTERLEAVING_NUM_PAUSED_PUSHES >= UVM_PUSH_MAX_CONCURRENT_PUSHES);

    for_each_va_space_gpu(gpu, va_space) {
        status = test_push_interleaving_on_gpu(gpu);
        if (status != NV_OK)
            break;
    }

    return status;
}

// Do a push of exactly UVM_MAX_PUSH_SIZE that acquires a semaphore.
// This is very tightly coupled with the pushbuffer implementation and method
// sizes, which is not ideal, but it makes it possible to test corner cases in
// the pushbuffer management code.
//
// The push is begun on a channel of channel_type, acquires sema_to_acquire
// with the given value, and is padded with a noop so that ending it brings
// its size to UVM_MAX_PUSH_SIZE. The push is ended but not waited on; the
// caller receives it through the push parameter.
//
// NOTE(review): the TEST_CHECK_RET checks run while the push is begun - if
// they return on failure the push is left un-ended; confirm.
static NV_STATUS test_push_exactly_max_push(uvm_gpu_t *gpu,
                                            uvm_push_t *push,
                                            uvm_channel_type_t channel_type,
                                            uvm_gpu_semaphore_t *sema_to_acquire,
                                            NvU32 value)
{
    NV_STATUS status;
    uvm_channel_t *channel;
    NvU32 push_end_size;
    NvU64 semaphore_gpu_va;

    status = uvm_push_begin(gpu->channel_manager, channel_type, push, "Test push");
    if (status != NV_OK)
        return status;

    // A fresh push has room for exactly UVM_MAX_PUSH_SIZE and not more
    TEST_CHECK_RET(uvm_push_has_space(push, UVM_MAX_PUSH_SIZE));
    TEST_CHECK_RET(!uvm_push_has_space(push, UVM_MAX_PUSH_SIZE + 1));

    channel = push->channel;
    push_end_size = get_push_end_size(channel);

    semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(sema_to_acquire, gpu, uvm_channel_is_proxy(channel));
    gpu->parent->host_hal->semaphore_acquire(push, semaphore_gpu_va, value);

    // Pad with a noop so that only push_end_size is left in the push
    gpu->parent->host_hal->noop(push, UVM_MAX_PUSH_SIZE - uvm_push_get_size(push) - push_end_size);

    TEST_CHECK_RET(uvm_push_has_space(push, push_end_size));
    TEST_CHECK_RET(!uvm_push_has_space(push, push_end_size + 1));
    uvm_push_end(push);

    UVM_ASSERT_MSG(uvm_push_get_size(push) == UVM_MAX_PUSH_SIZE, "push_size %u\n", uvm_push_get_size(push));

    return NV_OK;
}

// Count the bits set in the pushbuffer's idle_chunks bitmap among the first
// UVM_PUSHBUFFER_CHUNKS bits.
static NvU32 test_count_idle_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 num_idle = 0;
    NvU32 chunk;

    for (chunk = 0; chunk < UVM_PUSHBUFFER_CHUNKS; ++chunk) {
        if (test_bit(chunk, pushbuffer->idle_chunks))
            ++num_idle;
    }

    return num_idle;
}

// Count the bits set in the pushbuffer's available_chunks bitmap among the
// first UVM_PUSHBUFFER_CHUNKS bits.
static NvU32 test_count_available_chunks(uvm_pushbuffer_t *pushbuffer)
{
    NvU32 num_available = 0;
    NvU32 chunk;

    for (chunk = 0; chunk < UVM_PUSHBUFFER_CHUNKS; ++chunk) {
        if (test_bit(chunk, pushbuffer->available_chunks))
            ++num_available;
    }

    return num_available;
}

// Number of extra UVM_MAX_PUSH_SIZE pushes to do once the pushbuffer is full:
// enough to reuse the whole pushbuffer 4 times, one UVM_MAX_PUSH_SIZE at a
// time.
#define EXTRA_MAX_PUSHES_WHILE_FULL (4 * UVM_PUSHBUFFER_SIZE / UVM_MAX_PUSH_SIZE)

// Test doing pushes of exactly UVM_MAX_PUSH_SIZE size and only allowing them to
// complete one by one.
//
// The test first fills the pushbuffer with pushes of UVM_MAX_PUSH_SIZE that
// are all blocked acquiring a semaphore, then does
// EXTRA_MAX_PUSHES_WHILE_FULL more pushes, bumping the semaphore payload by
// one before each of them and checking that the pushbuffer reports no space
// beforehand.
//
// On exit the semaphore payload is set to the number of pushes done so that
// every pending push can complete before the tracker is waited on. The
// semaphore is only touched during cleanup if its allocation succeeded.
static NV_STATUS test_max_pushes_on_gpu_and_channel_type(uvm_gpu_t *gpu, uvm_channel_type_t channel_type)
{
    NV_STATUS status;

    uvm_tracker_t tracker;
    uvm_gpu_semaphore_t sema;
    bool sema_allocated = false;
    NvU32 total_push_size = 0;
    NvU32 push_count = 0;
    NvU32 i;

    uvm_tracker_init(&tracker);

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);
    sema_allocated = true;

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    // Fill the pushbuffer. None of these pushes can complete yet as each one
    // acquires a semaphore value that is only released later.
    while (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
        uvm_push_t push;

        ++push_count;

        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        total_push_size += uvm_push_get_size(&push);
        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

    if (total_push_size != UVM_PUSHBUFFER_SIZE) {
        UVM_TEST_PRINT("Unexpected space in the pushbuffer, total push %u\n", total_push_size);
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

    TEST_CHECK_GOTO(test_count_available_chunks(gpu->channel_manager->pushbuffer) == 0, done);
    TEST_CHECK_GOTO(test_count_idle_chunks(gpu->channel_manager->pushbuffer) == 0, done);

    for (i = 0; i < EXTRA_MAX_PUSHES_WHILE_FULL; ++i) {
        uvm_push_t push;

        // There should be no space for another push until the sema is
        // incremented. Incrementing the same allows a single push to complete
        // freeing exactly UVM_MAX_PUSH_SIZE space.
        if (uvm_pushbuffer_has_space(gpu->channel_manager->pushbuffer)) {
            UVM_TEST_PRINT("Unexpected space in the pushbuffer for iter %u\n", i);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }

        uvm_gpu_semaphore_set_payload(&sema, i + 1);

        ++push_count;

        // Take UVM_MAX_PUSH_SIZE space. This should leave no space left again.
        status = test_push_exactly_max_push(gpu, &push, channel_type, &sema, push_count);
        TEST_CHECK_GOTO(status == NV_OK, done);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);
    }

done:
    // Release every pending push before waiting on the tracker. When the
    // semaphore allocation failed nothing was pushed and the semaphore must
    // not be used.
    if (sema_allocated)
        uvm_gpu_semaphore_set_payload(&sema, push_count);

    uvm_tracker_wait_deinit(&tracker);

    if (sema_allocated)
        uvm_gpu_semaphore_free(&sema);

    return status;
}

// Run the max-size pushes test on the given GPU. Only the
// UVM_CHANNEL_TYPE_GPU_INTERNAL channel type is exercised here.
static NV_STATUS test_max_pushes_on_gpu(uvm_gpu_t *gpu)
{


    TEST_NV_CHECK_RET(test_max_pushes_on_gpu_and_channel_type(gpu, UVM_CHANNEL_TYPE_GPU_INTERNAL));

    return NV_OK;
}

// Test doing UVM_PUSHBUFFER_CHUNKS independent pushes expecting each one to use
// a different chunk in the pushbuffer.
//
// Each push acquires a semaphore value that is not released until all the
// pushes have been done, so none of them can complete while the number of
// idle chunks is being checked. Once the semaphore is released and the
// channel manager is idle, all chunks are expected to be idle again.
//
// The semaphore is only touched during cleanup if its allocation succeeded.
static NV_STATUS test_idle_chunks_on_gpu(uvm_gpu_t *gpu)
{
    NV_STATUS status;

    uvm_gpu_semaphore_t sema;
    bool sema_allocated = false;
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    NvU32 i;

    status = uvm_gpu_semaphore_alloc(gpu->semaphore_pool, &sema);
    TEST_CHECK_GOTO(status == NV_OK, done);
    sema_allocated = true;

    uvm_gpu_semaphore_set_payload(&sema, 0);

    // Need to wait for all channels to completely idle so that the pushbuffer
    // is in completely idle state when we begin.
    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    for (i = 0; i < UVM_PUSHBUFFER_CHUNKS; ++i) {
        NvU64 semaphore_gpu_va;
        uvm_push_t push;

        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Push using chunk %u", i);
        TEST_CHECK_GOTO(status == NV_OK, done);

        semaphore_gpu_va = uvm_gpu_semaphore_get_gpu_va(&sema, gpu, uvm_channel_is_proxy(push.channel));
        gpu->parent->host_hal->semaphore_acquire(&push, semaphore_gpu_va, i + 1);
        uvm_push_end(&push);

        TEST_NV_CHECK_GOTO(uvm_tracker_add_push(&tracker, &push), done);

        // Every push done so far is expected to be occupying its own chunk
        if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS - i - 1) {
            UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u instead of %u\n",
                           test_count_idle_chunks(gpu->channel_manager->pushbuffer), UVM_PUSHBUFFER_CHUNKS - i - 1);
            uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
            status = NV_ERR_INVALID_STATE;
            goto done;
        }
    }

    // Release all the pushes at once
    uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);

    status = uvm_channel_manager_wait(gpu->channel_manager);
    TEST_CHECK_GOTO(status == NV_OK, done);

    if (test_count_idle_chunks(gpu->channel_manager->pushbuffer) != UVM_PUSHBUFFER_CHUNKS) {
        UVM_TEST_PRINT("Unexpected count of idle chunks in the pushbuffer %u\n",
                       test_count_idle_chunks(gpu->channel_manager->pushbuffer));
        uvm_pushbuffer_print(gpu->channel_manager->pushbuffer);
        status = NV_ERR_INVALID_STATE;
        goto done;
    }

done:
    // Release any push still blocked on the semaphore before waiting on the
    // tracker. When the semaphore allocation failed nothing was pushed and
    // the semaphore must not be used.
    if (sema_allocated)
        uvm_gpu_semaphore_set_payload(&sema, UVM_PUSHBUFFER_CHUNKS + 1);

    uvm_tracker_wait(&tracker);

    if (sema_allocated)
        uvm_gpu_semaphore_free(&sema);

    uvm_tracker_deinit(&tracker);

    return status;
}

// Run the pushbuffer tests (max-size pushes, then idle chunk accounting) on
// every GPU registered in the VA space.
static NV_STATUS test_pushbuffer(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        TEST_NV_CHECK_RET(test_max_pushes_on_gpu(gpu));
        TEST_NV_CHECK_RET(test_idle_chunks_on_gpu(gpu));
    }
    return NV_OK;
}

// State shared between test_timestamp_on_gpu() and the timestamp_on_complete()
// push completion callback.
typedef struct
{
    // Location returned by uvm_push_timestamp() for the push being tested.
    // Note the spelling of the field name ("timestmap").
    NvU64 *timestmap_in_pushbuffer;
    // Value read from *timestmap_in_pushbuffer by timestamp_on_complete()
    NvU64 timestamp;
} timestamp_test_t;

// Push completion callback for the timestamp test: copies the timestamp
// referenced by the timestamp_test_t passed as callback data into its
// timestamp field.
static void timestamp_on_complete(void *void_data)
{
    timestamp_test_t *data = (timestamp_test_t *)void_data;

    // Only touch the test data when no global error has been set: with a
    // global error the callback might be called from teardown, where the
    // reference to the test data is no longer valid.
    if (uvm_global_get_status() == NV_OK)
        data->timestamp = *data->timestmap_in_pushbuffer;
}

// Do a series of pushes on the given GPU, each requesting a timestamp, and
// check that every timestamp observed by the completion callback is non-zero
// and greater than the one from the previous push.
static NV_STATUS test_timestamp_on_gpu(uvm_gpu_t *gpu)
{
    const NvU32 num_iterations = 10;
    timestamp_test_t test_data = {0};
    NvU64 last_stamp = 0;
    NvU32 i;

    for (i = 0; i < num_iterations; ++i) {
        NV_STATUS status;
        uvm_push_t push;
        uvm_push_info_t *push_info;

        status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, &push, "Releasing a timestamp");
        if (status != NV_OK)
            return status;

        test_data.timestmap_in_pushbuffer = uvm_push_timestamp(&push);

        // Have the completion callback copy the timestamp into test_data
        push_info = uvm_push_info_from_push(&push);
        push_info->on_complete = timestamp_on_complete;
        push_info->on_complete_data = &test_data;
        uvm_push_end(&push);

        // Synchronize the channel manager to make sure the on_complete
        // callbacks have a chance to run.
        status = uvm_channel_manager_wait(gpu->channel_manager);
        TEST_CHECK_RET(status == NV_OK);

        TEST_CHECK_RET(test_data.timestamp != 0);
        TEST_CHECK_RET(test_data.timestamp > last_stamp);
        last_stamp = test_data.timestamp;
    }

    return NV_OK;
}

// Run test_timestamp_on_gpu() on every GPU registered in the VA space.
static NV_STATUS test_timestamp(uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space)
        TEST_CHECK_RET(test_timestamp_on_gpu(gpu) == NV_OK);

    return NV_OK;
}

// Copy the contents of src into dst with a push on a channel of the given
// type, and wait for the push to complete.
//
// At least one of src and dst has to be vidmem.
//
// For CPU_TO_GPU and GPU_TO_CPU the copy is a single memcopy of src->size
// using kernel virtual addresses, pushed on the GPU backing dst and src
// respectively. For any other type the push is a GPU to GPU push from the GPU
// backing src to the GPU backing dst, and the copy is done one chunk of
// src->chunk_size at a time.
//
// Returns the status of beginning the push if that fails, otherwise the
// status of uvm_push_end_and_wait().
static NV_STATUS sync_memcopy(uvm_channel_type_t type, uvm_mem_t *dst, uvm_mem_t *src)
{
    uvm_push_t push;
    uvm_gpu_address_t dst_va;
    uvm_gpu_address_t src_va;
    uvm_gpu_t *gpu;
    NV_STATUS status;

    UVM_ASSERT(uvm_mem_is_vidmem(src) || uvm_mem_is_vidmem(dst));

    if (type == UVM_CHANNEL_TYPE_CPU_TO_GPU || type == UVM_CHANNEL_TYPE_GPU_TO_CPU) {
        gpu = (type == UVM_CHANNEL_TYPE_CPU_TO_GPU) ? dst->backing_gpu : src->backing_gpu;

        // Pass the description through "%s" rather than as the format string
        status = uvm_push_begin(gpu->channel_manager, type, &push, "%s", uvm_channel_type_to_string(type));
        if (status != NV_OK)
            return status;

        dst_va = uvm_mem_gpu_address_virtual_kernel(dst, gpu);
        src_va = uvm_mem_gpu_address_virtual_kernel(src, gpu);
        gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, src->size);
    }
    else {
        unsigned i;
        const NvU32 chunk_size = src->chunk_size;

        UVM_ASSERT((src->size % chunk_size) == 0);

        gpu = src->backing_gpu;
        status = uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                           dst->backing_gpu,
                                           &push,
                                           "%s",
                                           uvm_channel_type_to_string(type));

        // Methods must not be pushed to, nor the push ended, if it could not
        // be begun.
        if (status != NV_OK)
            return status;

        for (i = 0; i < src->size / chunk_size; i++) {
            dst_va = uvm_mem_gpu_address_copy(dst, gpu, i * chunk_size, chunk_size);
            src_va = uvm_mem_gpu_address_copy(src, gpu, i * chunk_size, chunk_size);
            gpu->parent->ce_hal->memcopy(&push, dst_va, src_va, chunk_size);
        }
    }

    return uvm_push_end_and_wait(&push);
}

// Tell whether the GPU to GPU copy test can be run between gpu_a and gpu_b:
// they have to be different GPUs, the VA space has to allow gpu_a to copy
// from gpu_b, and they must not be indirect peers.
static bool can_do_peer_copies(uvm_va_space_t *va_space, uvm_gpu_t *gpu_a, uvm_gpu_t *gpu_b)
{
    if (gpu_a == gpu_b)
        return false;

    if (!uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_a->id)], gpu_b->id))
        return false;

    // The ability to copy is expected to be symmetric
    UVM_ASSERT(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu_b->id)], gpu_a->id));

    // TODO: Bug 2028875. Indirect peers are not supported for now.
    return !uvm_gpus_are_indirect_peers(gpu_a, gpu_b);
}

// Test the GPU to GPU push interface by transferring data between each
// permutation of GPU peers.
//
// The test is waived (returns NV_OK without doing anything) when no pair of
// GPUs in the VA space can do peer copies.
//
// For each ordered pair (gpu_a, gpu_b) that can do peer copies, a pattern is
// copied host -> gpu_a -> gpu_b -> host, with the host buffer cleared after
// the first copy so that the final comparison can only pass if the data went
// through both GPUs.
static NV_STATUS test_push_gpu_to_gpu(uvm_va_space_t *va_space)
{
    NvU32 i;
    NV_STATUS status;
    uvm_gpu_t *gpu, *gpu_a, *gpu_b;
    // One buffer per processor, indexed by processor id value. The entry at
    // UVM_ID_CPU_VALUE is the host buffer.
    uvm_mem_t *mem[UVM_ID_MAX_PROCESSORS] = {NULL};
    NvU32 *host_ptr;
    const size_t size = 1024 * 1024;
    bool waive = true;

    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (can_do_peer_copies(va_space, gpu_a, gpu_b)) {
                waive = false;
                break;
            }
        }
    }

    if (waive)
        return NV_OK;

    // Alloc and initialize host buffer
    status = uvm_mem_alloc_sysmem_and_map_cpu_kernel(size, current->mm, &mem[UVM_ID_CPU_VALUE]);
    TEST_CHECK_GOTO(status == NV_OK, done);

    host_ptr = (NvU32 *)uvm_mem_get_cpu_addr_kernel(mem[UVM_ID_CPU_VALUE]);

    for (i = 0; i < size / sizeof(NvU32); ++i)
        host_ptr[i] = i + 1;

    // Allocate vidmem on each GPU, and map the host buffer
    for_each_va_space_gpu(gpu, va_space) {
        status = uvm_mem_alloc_vidmem(size, gpu, &mem[uvm_id_value(gpu->id)]);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[uvm_id_value(gpu->id)], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);

        status = uvm_mem_map_gpu_kernel(mem[UVM_ID_CPU_VALUE], gpu);
        TEST_CHECK_GOTO(status == NV_OK, done);
    }

    // Copy buffer between each pair of GPU peers, in both directions
    for_each_va_space_gpu(gpu_a, va_space) {
        for_each_va_space_gpu(gpu_b, va_space) {
            if (!can_do_peer_copies(va_space, gpu_a, gpu_b))
                continue;

            // Copy from CPU to the first GPU, and then zero out the host copy
            status = sync_memcopy(UVM_CHANNEL_TYPE_CPU_TO_GPU,
                                  mem[uvm_id_value(gpu_a->id)],
                                  mem[UVM_ID_CPU_VALUE]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // size is in bytes: clear the whole host buffer so that no stale
            // pattern can survive and make the check below pass.
            memset(host_ptr, 0, size);

            // Copy from the first GPU to the second GPU
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_GPU,
                                  mem[uvm_id_value(gpu_b->id)],
                                  mem[uvm_id_value(gpu_a->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            // Copy from the second GPU back to the host, and check result
            status = sync_memcopy(UVM_CHANNEL_TYPE_GPU_TO_CPU,
                                  mem[UVM_ID_CPU_VALUE],
                                  mem[uvm_id_value(gpu_b->id)]);
            TEST_CHECK_GOTO(status == NV_OK, done);

            for (i = 0; i < size / sizeof(NvU32); ++i) {
                if (host_ptr[i] != i + 1) {
                    UVM_TEST_PRINT("host_ptr[%u] = %u instead of %u when copying between %s and %s\n",
                                   i,
                                   host_ptr[i],
                                   i + 1,
                                   uvm_gpu_name(gpu_a),
                                   uvm_gpu_name(gpu_b));
                    status = NV_ERR_INVALID_STATE;
                    TEST_CHECK_GOTO(status == NV_OK, done);
                }
            }
        }
    }

 done:
    for_each_va_space_gpu(gpu, va_space)
        uvm_mem_free(mem[uvm_id_value(gpu->id)]);

    uvm_mem_free(mem[UVM_ID_CPU_VALUE]);

    return status;
}

// Entry point of the push sanity test.
//
// Runs the push tests in sequence on the VA space associated with filp,
// stopping at the first one that fails and returning its status. The
// timestamp test is skipped when params->skipTimestampTest is set.
NV_STATUS uvm_test_push_sanity(UVM_TEST_PUSH_SANITY_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    // Take the global lock as some of the tests rely on being the
    // only thread doing pushes and could deadlock otherwise.
    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_va_space_down_read_rm(va_space);

    // Each test only runs if all the previous ones passed
    status = test_push_end_size(va_space);

    if (status == NV_OK)
        status = test_push_inline_data(va_space);

    if (status == NV_OK)
        status = test_concurrent_pushes(va_space);

    if (status == NV_OK)
        status = test_push_interleaving(va_space);

    if (status == NV_OK)
        status = test_push_gpu_to_gpu(va_space);

    if (status == NV_OK)
        status = test_pushbuffer(va_space);

    if (status == NV_OK && !params->skipTimestampTest)
        status = test_timestamp(va_space);

    uvm_va_space_up_read_rm(va_space);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return status;
}