open-gpu-kernel-modules/kernel-open/nvidia-uvm/uvm_pushbuffer.c

/*******************************************************************************
    Copyright (c) 2015-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_api.h"
#include "uvm_pushbuffer.h"
#include "uvm_channel.h"
#include "uvm_global.h"
#include "uvm_lock.h"
#include "uvm_procfs.h"
#include "uvm_push.h"
#include "uvm_kvmalloc.h"
#include "uvm_gpu.h"
#include "uvm_common.h"
#include "uvm_linux.h"

// Print pushbuffer state into the seq_file s if one is provided, or with
// UVM_DBG_PRINT() if not. Forward-declared here because the procfs read
// handler below calls it before its definition later in this file.
static void uvm_pushbuffer_print_common(uvm_pushbuffer_t *pushbuffer, struct seq_file *s);

// procfs read handler for the pushbuffer info file.
//
// The pushbuffer to dump is taken from the seq_file's private pointer. The
// global PM lock is only try-locked for read: if it cannot be acquired the
// read fails with -EAGAIN instead of blocking. Returns 0 once the state has
// been printed.
static int nv_procfs_read_pushbuffer_info(struct seq_file *s, void *v)
{
    uvm_pushbuffer_t *pushbuffer = (uvm_pushbuffer_t *)s->private;

    if (uvm_down_read_trylock(&g_uvm_global.pm.lock)) {
        uvm_pushbuffer_print_common(pushbuffer, s);
        uvm_up_read(&g_uvm_global.pm.lock);
        return 0;
    }

    return -EAGAIN;
}

// Wrapper that invokes nv_procfs_read_pushbuffer_info() through
// UVM_ENTRY_RET().
// NOTE(review): UVM_ENTRY_RET()'s expansion is not visible in this file;
// presumably it performs UVM's entry-point handling and returns the wrapped
// call's result - confirm against the macro definition.
static int nv_procfs_read_pushbuffer_info_entry(struct seq_file *s, void *v)
{
    UVM_ENTRY_RET(nv_procfs_read_pushbuffer_info(s, v));
}

// NOTE(review): the macro expansion is not visible in this file; presumably
// it defines the pushbuffer_info_entry object that create_procfs() hands to
// NV_CREATE_PROC_FILE() - confirm against the macro definition.
UVM_DEFINE_SINGLE_PROCFS_FILE(pushbuffer_info_entry);

// Create the "pushbuffer" procfs file under the owning GPU's procfs
// directory.
//
// The file is for debug only, so nothing is created (and NV_OK is returned)
// when procfs debug support is disabled. Returns NV_ERR_OPERATING_SYSTEM if
// the file could not be created.
static NV_STATUS create_procfs(uvm_pushbuffer_t *pushbuffer)
{
    uvm_gpu_t *gpu = pushbuffer->channel_manager->gpu;

    if (uvm_procfs_is_debug_enabled()) {
        pushbuffer->procfs.info_file = NV_CREATE_PROC_FILE("pushbuffer",
                                                           gpu->procfs.dir,
                                                           pushbuffer_info_entry,
                                                           pushbuffer);
        if (!pushbuffer->procfs.info_file)
            return NV_ERR_OPERATING_SYSTEM;
    }

    return NV_OK;
}

// Allocate and initialize a pushbuffer for the given channel manager.
//
// On success the new pushbuffer is stored in *pushbuffer_out and NV_OK is
// returned. On failure *pushbuffer_out is left untouched, anything that was
// set up is torn down with uvm_pushbuffer_destroy(), and the failing status
// is returned (NV_ERR_NO_MEMORY if the tracking structure itself cannot be
// allocated).
NV_STATUS uvm_pushbuffer_create(uvm_channel_manager_t *channel_manager, uvm_pushbuffer_t **pushbuffer_out)
{
    uvm_gpu_t *gpu = channel_manager->gpu;
    uvm_pushbuffer_t *pushbuffer;
    NvU64 pushbuffer_alignment;
    NV_STATUS status;
    int chunk_index;

    pushbuffer = uvm_kvmalloc_zero(sizeof(*pushbuffer));
    if (!pushbuffer)
        return NV_ERR_NO_MEMORY;

    pushbuffer->channel_manager = channel_manager;

    uvm_spin_lock_init(&pushbuffer->lock, UVM_LOCK_ORDER_LEAF);

    // At most UVM_PUSHBUFFER_CHUNKS pushes can be in progress concurrently;
    // the semaphore enforces that limit.
    uvm_sema_init(&pushbuffer->concurrent_pushes_sema, UVM_PUSHBUFFER_CHUNKS, UVM_LOCK_ORDER_PUSH);

    UVM_ASSERT(channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_SYS ||
               channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_VID);

    // The allocation is aligned to UVM_PUSHBUFFER_SIZE, which is a power of 2
    // smaller than 1TB (2^40). Together these guarantee that the whole
    // pushbuffer lies within a single 1TB segment, so the Esched/PBDMA
    // segment base can be programmed once at channel initialization and never
    // changed during the channel's lifetime.
    BUILD_BUG_ON_NOT_POWER_OF_2(UVM_PUSHBUFFER_SIZE);
    BUILD_BUG_ON(UVM_PUSHBUFFER_SIZE >= (1ull << 40));

    // The test knob requests a 1TB alignment instead of the default one.
    pushbuffer_alignment = gpu->uvm_test_force_upper_pushbuffer_segment ? (1ull << 40) : UVM_PUSHBUFFER_SIZE;

    status = uvm_rm_mem_alloc_and_map_cpu(gpu,
                                          (channel_manager->conf.pushbuffer_loc == UVM_BUFFER_LOCATION_SYS) ?
                                              UVM_RM_MEM_TYPE_SYS:
                                              UVM_RM_MEM_TYPE_GPU,
                                          UVM_PUSHBUFFER_SIZE,
                                          pushbuffer_alignment,
                                          &pushbuffer->memory);
    if (status != NV_OK)
        goto error;

    // The last byte of the pushbuffer must be below the host's VA limit.
    UVM_ASSERT((uvm_pushbuffer_get_gpu_va_base(pushbuffer) + UVM_PUSHBUFFER_SIZE - 1) < gpu->parent->max_host_va);

    // Every chunk starts out both idle and available, with nothing pending.
    bitmap_fill(pushbuffer->idle_chunks, UVM_PUSHBUFFER_CHUNKS);
    bitmap_fill(pushbuffer->available_chunks, UVM_PUSHBUFFER_CHUNKS);

    for (chunk_index = 0; chunk_index < UVM_PUSHBUFFER_CHUNKS; ++chunk_index)
        INIT_LIST_HEAD(&pushbuffer->chunks[chunk_index].pending_gpfifos);

    status = create_procfs(pushbuffer);
    if (status != NV_OK)
        goto error;

    *pushbuffer_out = pushbuffer;

    return NV_OK;

error:
    uvm_pushbuffer_destroy(pushbuffer);
    return status;
}

// Return the chunk corresponding to the lowest bit set in mask, or NULL if
// none of the first UVM_PUSHBUFFER_CHUNKS bits is set.
//
// The pushbuffer lock must be held.
static uvm_pushbuffer_chunk_t *get_chunk_in_mask(uvm_pushbuffer_t *pushbuffer, unsigned long *mask)
{
    NvU32 first_set;

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    first_set = find_first_bit(mask, UVM_PUSHBUFFER_CHUNKS);

    return (first_set == UVM_PUSHBUFFER_CHUNKS) ? NULL : &pushbuffer->chunks[first_set];
}

// Return the first chunk marked in the available_chunks bitmap, or NULL if
// there is none.
static uvm_pushbuffer_chunk_t *get_available_chunk(uvm_pushbuffer_t *pushbuffer)
{
    unsigned long *available_mask = pushbuffer->available_chunks;

    return get_chunk_in_mask(pushbuffer, available_mask);
}

// Return the first chunk marked in the idle_chunks bitmap, or NULL if there
// is none.
static uvm_pushbuffer_chunk_t *get_idle_chunk(uvm_pushbuffer_t *pushbuffer)
{
    unsigned long *idle_mask = pushbuffer->idle_chunks;

    return get_chunk_in_mask(pushbuffer, idle_mask);
}

// Return the position of chunk within pushbuffer->chunks. Asserts that the
// resulting index is below UVM_PUSHBUFFER_CHUNKS.
static NvU32 chunk_get_index(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
{
    NvU32 chunk_index = (NvU32)(chunk - &pushbuffer->chunks[0]);

    UVM_ASSERT(chunk_index < UVM_PUSHBUFFER_CHUNKS);

    return chunk_index;
}

// Return the byte offset of the start of chunk from the start of the
// pushbuffer: the chunk's index times UVM_PUSHBUFFER_CHUNK_SIZE.
static NvU32 chunk_get_offset(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
{
    NvU32 chunk_index = chunk_get_index(pushbuffer, chunk);

    return chunk_index * UVM_PUSHBUFFER_CHUNK_SIZE;
}

// Set the bit for chunk in the given chunk bitmap.
//
// Uses the non-atomic __set_bit(); the pushbuffer lock must be held.
static void set_chunk(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk, unsigned long *mask)
{
    NvU32 bit;

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    bit = chunk_get_index(pushbuffer, chunk);
    __set_bit(bit, mask);
}

// Clear the bit for chunk in the given chunk bitmap.
//
// Uses the non-atomic __clear_bit(); the pushbuffer lock must be held.
static void clear_chunk(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk, unsigned long *mask)
{
    NvU32 bit;

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    bit = chunk_get_index(pushbuffer, chunk);
    __clear_bit(bit, mask);
}

// Choose a chunk for a new push: an idle chunk if one exists, otherwise any
// available chunk. Returns NULL when neither bitmap has a bit set.
//
// The pushbuffer lock must be held.
static uvm_pushbuffer_chunk_t *pick_chunk(uvm_pushbuffer_t *pushbuffer)
{
    uvm_pushbuffer_chunk_t *candidate;

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    candidate = get_idle_chunk(pushbuffer);
    if (candidate != NULL)
        return candidate;

    return get_available_chunk(pushbuffer);
}

// Make a single attempt to claim a chunk for push.
//
// Under the pushbuffer lock, picks a chunk and, if one was found, records
// push as its current push and clears its idle and available bits.
// *chunk_out always receives the result (NULL when nothing could be
// claimed). Returns true if a chunk was claimed.
static bool try_claim_chunk(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_pushbuffer_chunk_t **chunk_out)
{
    uvm_pushbuffer_chunk_t *claimed;

    uvm_spin_lock(&pushbuffer->lock);

    claimed = pick_chunk(pushbuffer);
    if (claimed != NULL) {
        claimed->current_push = push;
        clear_chunk(pushbuffer, claimed, pushbuffer->idle_chunks);
        clear_chunk(pushbuffer, claimed, pushbuffer->available_chunks);
    }

    uvm_spin_unlock(&pushbuffer->lock);

    *chunk_out = claimed;

    return claimed != NULL;
}

// Return the CPU address at which the next push in chunk starts: the
// pushbuffer's CPU mapping plus the chunk's offset plus the chunk's
// next_push_start. Asserts that the address is NvU32-aligned.
static NvU32 *chunk_get_next_push_start_addr(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
{
    char *cpu_base = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
    char *push_start = cpu_base + chunk_get_offset(pushbuffer, chunk) + chunk->next_push_start;

    UVM_ASSERT(((NvU64)push_start) % sizeof(NvU32) == 0);

    return (NvU32*)push_start;
}

// Claim a chunk for push, spinning until one becomes claimable or a channel
// error is detected.
//
// On NV_OK, *chunk_out is the claimed chunk. On any other status no chunk is
// claimed and *chunk_out is NULL (as left by the last failed
// try_claim_chunk() attempt).
static NV_STATUS claim_chunk(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_pushbuffer_chunk_t **chunk_out)
{
    NV_STATUS status = NV_OK;
    uvm_channel_manager_t *channel_manager = pushbuffer->channel_manager;
    uvm_spin_loop_t spin;

    // Fast path: a chunk is immediately claimable.
    if (try_claim_chunk(pushbuffer, push, chunk_out))
        return NV_OK;

    uvm_channel_manager_update_progress(channel_manager);

    uvm_spin_loop_init(&spin);

    // The status check must come first: once an error has been detected we
    // must not attempt another claim. Otherwise a successful claim would be
    // returned together with an error status, and since the caller only
    // handles the failure by releasing the semaphore, the chunk would remain
    // claimed (current_push set, available/idle bits cleared) forever.
    while (status == NV_OK && !try_claim_chunk(pushbuffer, push, chunk_out)) {
        UVM_SPIN_LOOP(&spin);
        status = uvm_channel_manager_check_errors(channel_manager);
        uvm_channel_manager_update_progress(channel_manager);
    }

    return status;
}

// Reserve pushbuffer space for push.
//
// Takes the concurrent-pushes semaphore, then claims a chunk. On success
// push->begin and push->next both point at the start of the reserved space
// and the semaphore stays held until uvm_pushbuffer_end_push(). On failure
// the semaphore is released here and the error from claiming is returned.
NV_STATUS uvm_pushbuffer_begin_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
{
    uvm_pushbuffer_chunk_t *chunk;
    NV_STATUS status;

    UVM_ASSERT(pushbuffer);
    UVM_ASSERT(push);

    // The matching up happens in uvm_pushbuffer_end_push().
    uvm_down(&pushbuffer->concurrent_pushes_sema);

    status = claim_chunk(pushbuffer, push, &chunk);
    if (status == NV_OK) {
        UVM_ASSERT(chunk);

        push->begin = chunk_get_next_push_start_addr(pushbuffer, chunk);
        push->next = push->begin;

        return NV_OK;
    }

    // No push will follow, so end_push() will never release the semaphore.
    uvm_up(&pushbuffer->concurrent_pushes_sema);

    return status;
}

// Return the entry at the head of the chunk's pending GPFIFO list, or NULL
// if the list is empty.
static uvm_gpfifo_entry_t *chunk_get_first_gpfifo(uvm_pushbuffer_chunk_t *chunk)
{
    uvm_gpfifo_entry_t *head_entry;

    head_entry = list_first_entry_or_null(&chunk->pending_gpfifos, uvm_gpfifo_entry_t, pending_list_node);

    return head_entry;
}

// Return the entry at the tail of the chunk's pending GPFIFO list, or NULL
// if the list is empty.
static uvm_gpfifo_entry_t *chunk_get_last_gpfifo(uvm_pushbuffer_chunk_t *chunk)
{
    uvm_gpfifo_entry_t *tail_entry;

    tail_entry = list_last_entry_or_null(&chunk->pending_gpfifos, uvm_gpfifo_entry_t, pending_list_node);

    return tail_entry;
}

// Get the cpu put within the chunk (in range [0, UVM_PUSHBUFFER_CHUNK_SIZE]).
//
// This is the chunk-relative end offset of the last pending GPFIFO entry, or
// 0 when nothing is pending. The pushbuffer lock must be held.
static NvU32 chunk_get_cpu_put(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
{
    uvm_gpfifo_entry_t *last = chunk_get_last_gpfifo(chunk);

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    if (last == NULL)
        return 0;

    return last->pushbuffer_offset + last->pushbuffer_size - chunk_get_offset(pushbuffer, chunk);
}

// Get the gpu get within the chunk (in range [0, UVM_PUSHBUFFER_CHUNK_SIZE)).
//
// This is the chunk-relative start offset of the first pending GPFIFO entry,
// or 0 when nothing is pending. The pushbuffer lock must be held.
static NvU32 chunk_get_gpu_get(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
{
    uvm_gpfifo_entry_t *first = chunk_get_first_gpfifo(chunk);

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    if (first == NULL)
        return 0;

    return first->pushbuffer_offset - chunk_get_offset(pushbuffer, chunk);
}

// Re-evaluate whether chunk can accept another push, based on the GPFIFO
// entries still pending in it.
//
// gpu_get is the chunk-relative start of the first pending entry and cpu_put
// the chunk-relative end of the last pending entry (both 0 when nothing is
// pending). Depending on where free space of at least UVM_MAX_PUSH_SIZE
// exists, the chunk's bit is set in available_chunks (and in idle_chunks when
// nothing is pending at all) and next_push_start is set to where the next
// push should begin. This function only ever sets bits; if no case applies
// the chunk's state is left unchanged.
//
// The pushbuffer lock must be held.
static void update_chunk(uvm_pushbuffer_t *pushbuffer, uvm_pushbuffer_chunk_t *chunk)
{
    NvU32 gpu_get = chunk_get_gpu_get(pushbuffer, chunk);
    NvU32 cpu_put = chunk_get_cpu_put(pushbuffer, chunk);

    uvm_assert_spinlock_locked(&pushbuffer->lock);

    if (gpu_get == cpu_put) {
        // cpu_put can be equal to gpu_get both when the chunk is full and empty. We
        // can tell apart the cases by checking whether the pending GPFIFOs list is
        // empty.
        if (!list_empty(&chunk->pending_gpfifos))
            return;

        // Chunk completely idle
        set_chunk(pushbuffer, chunk, pushbuffer->idle_chunks);
        set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
        // With an empty pending list both helpers return 0.
        UVM_ASSERT_MSG(cpu_put == 0, "cpu put %u\n", cpu_put);

        // For a completely idle chunk, always start at the very beginning. This
        // helps avoid the waste that can happen at the very end of the chunk
        // described at the top of uvm_pushbuffer.h.
        chunk->next_push_start = 0;
    }
    else if (gpu_get > cpu_put) {
        // Put is behind get: the only free space is the gap between them.
        if (gpu_get - cpu_put >= UVM_MAX_PUSH_SIZE) {
            // Enough space between put and get
            set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
            chunk->next_push_start = cpu_put;
        }
    }
    else if (UVM_PUSHBUFFER_CHUNK_SIZE >= cpu_put + UVM_MAX_PUSH_SIZE) {
        // The two earlier branches handled == and >, so get is below put here.
        UVM_ASSERT_MSG(gpu_get < cpu_put, "gpu_get %u cpu_put %u\n", gpu_get, cpu_put);

        // Enough space at the end
        set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
        chunk->next_push_start = cpu_put;
    }
    else if (gpu_get >= UVM_MAX_PUSH_SIZE) {
        UVM_ASSERT_MSG(gpu_get < cpu_put, "gpu_get %u cpu_put %u\n", gpu_get, cpu_put);

        // Enough space at the beginning
        set_chunk(pushbuffer, chunk, pushbuffer->available_chunks);
        chunk->next_push_start = 0;
    }
}

// Tear down a pushbuffer: remove its procfs file, free its backing memory
// and free the tracking structure. A NULL pushbuffer is ignored.
//
// Also used by uvm_pushbuffer_create() to unwind a partially constructed
// pushbuffer.
void uvm_pushbuffer_destroy(uvm_pushbuffer_t *pushbuffer)
{
    if (pushbuffer != NULL) {
        proc_remove(pushbuffer->procfs.info_file);

        uvm_rm_mem_free(pushbuffer->memory);
        uvm_kvfree(pushbuffer);
    }
}

// Return the chunk containing the given byte offset into the pushbuffer.
// Asserts that offset is below UVM_PUSHBUFFER_SIZE.
static uvm_pushbuffer_chunk_t *offset_to_chunk(uvm_pushbuffer_t *pushbuffer, NvU32 offset)
{
    NvU32 chunk_index;

    UVM_ASSERT(offset < UVM_PUSHBUFFER_SIZE);

    chunk_index = offset / UVM_PUSHBUFFER_CHUNK_SIZE;

    return &pushbuffer->chunks[chunk_index];
}

// Return the chunk holding the pushbuffer range of a GPFIFO entry, looked up
// from the entry's first byte. Asserts that the entry's last byte falls in
// the same chunk.
static uvm_pushbuffer_chunk_t *gpfifo_to_chunk(uvm_pushbuffer_t *pushbuffer, uvm_gpfifo_entry_t *gpfifo)
{
    uvm_pushbuffer_chunk_t *first_byte_chunk;

    first_byte_chunk = offset_to_chunk(pushbuffer, gpfifo->pushbuffer_offset);

    UVM_ASSERT(offset_to_chunk(pushbuffer, gpfifo->pushbuffer_offset + gpfifo->pushbuffer_size - 1) == first_byte_chunk);

    return first_byte_chunk;
}

// Mark a GPFIFO entry as completed.
//
// Runs the entry's on_complete callback (if any) and clears it, then, under
// the pushbuffer lock, removes the entry from its chunk's pending list. If
// the entry was the first or last one pending in the chunk, the chunk's
// state is recomputed, unless a push is currently in progress in that chunk.
// Only UVM_GPFIFO_ENTRY_TYPE_NORMAL entries are expected.
void uvm_pushbuffer_mark_completed(uvm_pushbuffer_t *pushbuffer, uvm_gpfifo_entry_t *gpfifo)
{
    uvm_push_info_t *push_info = gpfifo->push_info;
    uvm_pushbuffer_chunk_t *chunk;
    bool was_first_or_last;

    UVM_ASSERT(gpfifo->type == UVM_GPFIFO_ENTRY_TYPE_NORMAL);

    chunk = gpfifo_to_chunk(pushbuffer, gpfifo);

    // The callback is invoked before the pushbuffer lock is taken.
    if (push_info->on_complete != NULL)
        push_info->on_complete(push_info->on_complete_data);

    push_info->on_complete = NULL;
    push_info->on_complete_data = NULL;

    uvm_spin_lock(&pushbuffer->lock);

    // This has to be determined before the entry is unlinked. Only the two
    // ends of the pending list feed into the chunk's get/put computation.
    was_first_or_last = (gpfifo == chunk_get_first_gpfifo(chunk)) || (gpfifo == chunk_get_last_gpfifo(chunk));

    list_del(&gpfifo->pending_list_node);

    // If current_push is not NULL, updating the chunk is delayed till
    // uvm_pushbuffer_end_push() is called for that push.
    if (chunk->current_push == NULL && was_first_or_last)
        update_chunk(pushbuffer, chunk);

    uvm_spin_unlock(&pushbuffer->lock);
}

// Return the byte offset of the start of push from the start of the
// pushbuffer's CPU mapping. Asserts that the offset is NvU32-aligned.
NvU32 uvm_pushbuffer_get_offset_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
{
    char *cpu_base = (char *)uvm_rm_mem_get_cpu_va(pushbuffer->memory);
    NvU32 offset = (char*)push->begin - cpu_base;

    UVM_ASSERT((((NvU64)offset) % sizeof(NvU32)) == 0);

    return offset;
}

// Return the GPU virtual address of the start of push: the pushbuffer's GPU
// VA (as seen by the push's GPU, taking into account whether the push's
// channel is a proxy channel) plus the push's offset within the pushbuffer.
NvU64 uvm_pushbuffer_get_gpu_va_for_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push)
{
    uvm_gpu_t *gpu = uvm_push_get_gpu(push);
    bool is_proxy_channel = uvm_channel_is_proxy(push->channel);
    NvU64 gpu_va = uvm_rm_mem_get_gpu_va(pushbuffer->memory, gpu, is_proxy_channel);

    gpu_va += uvm_pushbuffer_get_offset_for_push(pushbuffer, push);

    return gpu_va;
}

// Finish a push that was started with uvm_pushbuffer_begin_push().
//
// Under the pushbuffer lock, appends gpfifo to the pending list of the chunk
// it lives in, recomputes the chunk's state and clears the chunk's
// current_push (which must be push). Finally releases the concurrent-pushes
// semaphore taken in uvm_pushbuffer_begin_push().
//
// The pool of the push's channel must be locked by the caller.
void uvm_pushbuffer_end_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm_gpfifo_entry_t *gpfifo)
{
    uvm_pushbuffer_chunk_t *chunk;

    chunk = gpfifo_to_chunk(pushbuffer, gpfifo);

    uvm_channel_pool_assert_locked(push->channel->pool);

    uvm_spin_lock(&pushbuffer->lock);

    // The new entry has to be on the pending list before the chunk's state is
    // recomputed, so that it is accounted for.
    list_add_tail(&gpfifo->pending_list_node, &chunk->pending_gpfifos);
    update_chunk(pushbuffer, chunk);

    UVM_ASSERT(chunk->current_push == push);
    chunk->current_push = NULL;

    uvm_spin_unlock(&pushbuffer->lock);

    // This function is called with the channel lock held, but the concurrent
    // pushes sema has a higher lock order. Rather than complicating the code
    // structure, the sema is released out of order.
    uvm_up_out_of_order(&pushbuffer->concurrent_pushes_sema);
}

// Return whether a chunk (idle or available) could currently be picked for a
// new push. The answer is a snapshot taken under the pushbuffer lock, which
// is released before returning.
bool uvm_pushbuffer_has_space(uvm_pushbuffer_t *pushbuffer)
{
    uvm_pushbuffer_chunk_t *candidate;

    uvm_spin_lock(&pushbuffer->lock);
    candidate = pick_chunk(pushbuffer);
    uvm_spin_unlock(&pushbuffer->lock);

    return candidate != NULL;
}

// Print the pushbuffer state through UVM_SEQ_OR_DBG_PRINT(): the GPU name,
// whether the pushbuffer has space, and one line per chunk with its put, get,
// next push start and available/idle bits.
//
// The per-chunk lines are produced with the pushbuffer lock held. The "has
// space" value is sampled earlier, in a separate critical section.
void uvm_pushbuffer_print_common(uvm_pushbuffer_t *pushbuffer, struct seq_file *s)
{
    NvU32 chunk_index;
    bool has_space;

    UVM_SEQ_OR_DBG_PRINT(s, "Pushbuffer for GPU %s\n", uvm_gpu_name(pushbuffer->channel_manager->gpu));

    // uvm_pushbuffer_has_space() takes the pushbuffer lock itself, so it has
    // to be called before the lock is acquired below.
    has_space = uvm_pushbuffer_has_space(pushbuffer);
    UVM_SEQ_OR_DBG_PRINT(s, " has space: %d\n", has_space);

    uvm_spin_lock(&pushbuffer->lock);

    for (chunk_index = 0; chunk_index < UVM_PUSHBUFFER_CHUNKS; ++chunk_index) {
        uvm_pushbuffer_chunk_t *chunk = &pushbuffer->chunks[chunk_index];
        NvU32 cpu_put = chunk_get_cpu_put(pushbuffer, chunk);
        NvU32 gpu_get = chunk_get_gpu_get(pushbuffer, chunk);
        int is_available = test_bit(chunk_index, pushbuffer->available_chunks) ? 1 : 0;
        int is_idle = test_bit(chunk_index, pushbuffer->idle_chunks) ? 1 : 0;

        UVM_SEQ_OR_DBG_PRINT(s, " chunk %u put %u get %u next %u available %d idle %d\n",
                chunk_index,
                cpu_put, gpu_get, chunk->next_push_start,
                is_available,
                is_idle);
    }

    uvm_spin_unlock(&pushbuffer->lock);
}

// Print the pushbuffer state with no seq_file, i.e. through the
// UVM_DBG_PRINT() path of uvm_pushbuffer_print_common().
void uvm_pushbuffer_print(uvm_pushbuffer_t *pushbuffer)
{
    // A void function must not return an expression in ISO C, so this is a
    // plain call rather than "return f();".
    uvm_pushbuffer_print_common(pushbuffer, NULL);
}

// Return the GPU virtual address of the start of the pushbuffer, as reported
// by uvm_rm_mem_get_gpu_uvm_va() for the channel manager's GPU.
NvU64 uvm_pushbuffer_get_gpu_va_base(uvm_pushbuffer_t *pushbuffer)
{
    uvm_gpu_t *gpu = pushbuffer->channel_manager->gpu;

    return uvm_rm_mem_get_gpu_uvm_va(pushbuffer->memory, gpu);
}