/******************************************************************************* Copyright (c) 2015-2023 NVIDIA Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *******************************************************************************/ #include "uvm_channel.h" #include "uvm_api.h" #include "uvm_common.h" #include "uvm_global.h" #include "uvm_hal.h" #include "uvm_procfs.h" #include "uvm_push.h" #include "uvm_gpu_semaphore.h" #include "uvm_lock.h" #include "uvm_kvmalloc.h" #include "nv_uvm_types.h" #include "nv_uvm_interface.h" #include "clb06f.h" #include "uvm_conf_computing.h" static unsigned uvm_channel_num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT; #define UVM_CHANNEL_GPFIFO_LOC_DEFAULT "auto" static char *uvm_channel_gpfifo_loc = UVM_CHANNEL_GPFIFO_LOC_DEFAULT; #define UVM_CHANNEL_GPPUT_LOC_DEFAULT "auto" static char *uvm_channel_gpput_loc = UVM_CHANNEL_GPPUT_LOC_DEFAULT; #define UVM_CHANNEL_PUSHBUFFER_LOC_DEFAULT "auto" static char *uvm_channel_pushbuffer_loc = UVM_CHANNEL_PUSHBUFFER_LOC_DEFAULT; module_param(uvm_channel_num_gpfifo_entries, uint, S_IRUGO); module_param(uvm_channel_gpfifo_loc, charp, S_IRUGO); module_param(uvm_channel_gpput_loc, charp, S_IRUGO); module_param(uvm_channel_pushbuffer_loc, charp, S_IRUGO); static NV_STATUS manager_create_procfs_dirs(uvm_channel_manager_t *manager); static NV_STATUS manager_create_procfs(uvm_channel_manager_t *manager); static NV_STATUS channel_create_procfs(uvm_channel_t *channel); typedef enum { // Only remove completed GPFIFO entries from the pushbuffer UVM_CHANNEL_UPDATE_MODE_COMPLETED, // Remove all remaining GPFIFO entries from the pushbuffer, regardless of // whether they're actually done yet. UVM_CHANNEL_UPDATE_MODE_FORCE_ALL } uvm_channel_update_mode_t; bool uvm_channel_pool_uses_mutex(uvm_channel_pool_t *pool) { // Work submission to proxy channels in SR-IOV heavy entails calling RM API // that acquires a mutex, so the proxy channel pool must use a mutex. if (uvm_channel_pool_is_proxy(pool)) return true; // When Confidential Computing is enabled push submission requires call to // CSL routines which acquire the CSL context mutex lock. Moreover, WLC // submission uses UVM_SPIN_LOOP, which can call 'schedule', to wait for // LCIC completion. Indirect submission is synchronous, calling // uvm_push_wait which again uses UVM_SPIN_LOOP. if (uvm_conf_computing_mode_enabled(pool->manager->gpu)) return true; // Unless the mutex is required, the spinlock is preferred when work // submission is expected to take little time. return false; } static void channel_pool_lock_init(uvm_channel_pool_t *pool) { uvm_lock_order_t order = UVM_LOCK_ORDER_CHANNEL; if (uvm_conf_computing_mode_enabled(pool->manager->gpu) && uvm_channel_pool_is_wlc(pool)) order = UVM_LOCK_ORDER_WLC_CHANNEL; if (uvm_channel_pool_uses_mutex(pool)) uvm_mutex_init(&pool->mutex, order); else uvm_spin_lock_init(&pool->spinlock, order); } static void channel_pool_lock(uvm_channel_pool_t *pool) { if (uvm_channel_pool_uses_mutex(pool)) uvm_mutex_lock(&pool->mutex); else uvm_spin_lock(&pool->spinlock); } static void channel_pool_unlock(uvm_channel_pool_t *pool) { if (uvm_channel_pool_uses_mutex(pool)) uvm_mutex_unlock(&pool->mutex); else uvm_spin_unlock(&pool->spinlock); } // Update channel progress, completing up to max_to_complete entries static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel, NvU32 max_to_complete, uvm_channel_update_mode_t mode) { NvU32 gpu_get; NvU32 cpu_put; NvU32 completed_count = 0; NvU32 pending_gpfifos; NvU64 completed_value = uvm_channel_update_completed_value(channel); channel_pool_lock(channel->pool); // Completed value should never exceed the queued value UVM_ASSERT_MSG_RELEASE(completed_value <= channel->tracking_sem.queued_value, "GPU %s channel %s unexpected completed_value 0x%llx > queued_value 0x%llx\n", channel->pool->manager->gpu->parent->name, channel->name, completed_value, channel->tracking_sem.queued_value); cpu_put = channel->cpu_put; gpu_get = channel->gpu_get; while (gpu_get != cpu_put && completed_count < max_to_complete) { uvm_gpfifo_entry_t *entry = &channel->gpfifo_entries[gpu_get]; if (mode == UVM_CHANNEL_UPDATE_MODE_COMPLETED && entry->tracking_semaphore_value > completed_value) break; if (entry->type == UVM_GPFIFO_ENTRY_TYPE_NORMAL) { uvm_pushbuffer_mark_completed(channel, entry); list_add_tail(&entry->push_info->available_list_node, &channel->available_push_infos); } gpu_get = (gpu_get + 1) % channel->num_gpfifo_entries; ++completed_count; } channel->gpu_get = gpu_get; channel_pool_unlock(channel->pool); if (cpu_put >= gpu_get) pending_gpfifos = cpu_put - gpu_get; else pending_gpfifos = channel->num_gpfifo_entries - gpu_get + cpu_put; return pending_gpfifos; } NvU32 uvm_channel_update_progress(uvm_channel_t *channel) { // By default, don't complete too many entries at a time to spread the cost // of doing so across callers and avoid potentially holding a spin lock for // too long. return uvm_channel_update_progress_with_max(channel, 8, UVM_CHANNEL_UPDATE_MODE_COMPLETED); } // Update progress for all pending GPFIFO entries. This might take a longer time // and should be only used in exceptional circumstances like when a channel // error is encountered. Otherwise, uvm_chanel_update_progress() should be used. static NvU32 channel_update_progress_all(uvm_channel_t *channel, uvm_channel_update_mode_t mode) { return uvm_channel_update_progress_with_max(channel, channel->num_gpfifo_entries, mode); } NvU32 uvm_channel_update_progress_all(uvm_channel_t *channel) { return channel_update_progress_all(channel, UVM_CHANNEL_UPDATE_MODE_COMPLETED); } NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager) { NvU32 pending_gpfifos = 0; uvm_channel_pool_t *pool; uvm_for_each_pool(pool, channel_manager) { uvm_channel_t *channel; uvm_for_each_channel_in_pool(channel, pool) pending_gpfifos += uvm_channel_update_progress(channel); } return pending_gpfifos; } static NvU32 channel_get_available_gpfifo_entries(uvm_channel_t *channel) { NvU32 available = channel->num_gpfifo_entries; uvm_channel_pool_assert_locked(channel->pool); // Remove sentinel entry available -= 1; // Remove entries of ongoing pushes available -= channel->current_gpfifo_count; // Remove pending entries if (channel->cpu_put >= channel->gpu_get) available -= (channel->cpu_put - channel->gpu_get); else available -= (channel->cpu_put + channel->num_gpfifo_entries - channel->gpu_get); UVM_ASSERT(available < channel->num_gpfifo_entries); return available; } NvU32 uvm_channel_get_available_gpfifo_entries(uvm_channel_t *channel) { NvU32 available; channel_pool_lock(channel->pool); available = channel_get_available_gpfifo_entries(channel); channel_pool_unlock(channel->pool); return available; } static bool try_claim_channel_locked(uvm_channel_t *channel, NvU32 num_gpfifo_entries) { bool claimed = false; UVM_ASSERT(num_gpfifo_entries > 0); UVM_ASSERT(num_gpfifo_entries < channel->num_gpfifo_entries); uvm_channel_pool_assert_locked(channel->pool); if (channel_get_available_gpfifo_entries(channel) >= num_gpfifo_entries) { channel->current_gpfifo_count += num_gpfifo_entries; claimed = true; } return claimed; } static bool try_claim_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries) { bool claimed; channel_pool_lock(channel->pool); claimed = try_claim_channel_locked(channel, num_gpfifo_entries); channel_pool_unlock(channel->pool); return claimed; } static void unlock_channel_for_push(uvm_channel_t *channel) { NvU32 index; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); if (!uvm_conf_computing_mode_enabled(gpu)) return; index = uvm_channel_index_in_pool(channel); uvm_channel_pool_assert_locked(channel->pool); UVM_ASSERT(test_bit(index, channel->pool->push_locks)); __clear_bit(index, channel->pool->push_locks); uvm_up_out_of_order(&channel->pool->push_sem); } static bool is_channel_locked_for_push(uvm_channel_t *channel) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); if (uvm_conf_computing_mode_enabled(gpu)) return test_bit(uvm_channel_index_in_pool(channel), channel->pool->push_locks); // For CE and proxy channels, we always return that the channel is locked, // which has no functional impact in the UVM channel code-flow, this is only // used on UVM_ASSERTs. return true; } static void lock_channel_for_push(uvm_channel_t *channel) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); NvU32 index = uvm_channel_index_in_pool(channel); UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); uvm_channel_pool_assert_locked(channel->pool); UVM_ASSERT(!test_bit(index, channel->pool->push_locks)); __set_bit(index, channel->pool->push_locks); } static bool test_claim_and_lock_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); NvU32 index = uvm_channel_index_in_pool(channel); UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); uvm_channel_pool_assert_locked(channel->pool); if (!test_bit(index, channel->pool->push_locks) && try_claim_channel_locked(channel, num_gpfifo_entries)) { lock_channel_for_push(channel); return true; } return false; } // Reserve a channel in the specified pool. The channel is locked until the push // ends static NV_STATUS channel_reserve_and_lock_in_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out) { uvm_channel_t *channel; uvm_spin_loop_t spin; NvU32 index; UVM_ASSERT(pool); UVM_ASSERT(uvm_conf_computing_mode_enabled(pool->manager->gpu)); // This semaphore is uvm_up() in unlock_channel_for_push() as part of the // uvm_channel_end_push() routine. uvm_down(&pool->push_sem); // At least one channel is unlocked. We check if any unlocked channel is // available, i.e., if it has free GPFIFO entries. channel_pool_lock(pool); for_each_clear_bit(index, pool->push_locks, pool->num_channels) { channel = &pool->channels[index]; if (try_claim_channel_locked(channel, 1)) { lock_channel_for_push(channel); goto done; } } channel_pool_unlock(pool); // No channels are available. Update and check errors on all channels until // one becomes available. uvm_spin_loop_init(&spin); while (1) { uvm_for_each_channel_in_pool(channel, pool) { NV_STATUS status; uvm_channel_update_progress(channel); index = uvm_channel_index_in_pool(channel); channel_pool_lock(pool); if (test_claim_and_lock_channel(channel, 1)) goto done; channel_pool_unlock(pool); status = uvm_channel_check_errors(channel); if (status != NV_OK) { uvm_up(&pool->push_sem); return status; } UVM_SPIN_LOOP(&spin); } } done: channel_pool_unlock(pool); *channel_out = channel; return NV_OK; } // Reserve a channel in the specified pool static NV_STATUS channel_reserve_in_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out) { uvm_channel_t *channel; uvm_spin_loop_t spin; UVM_ASSERT(pool); if (uvm_conf_computing_mode_enabled(pool->manager->gpu)) return channel_reserve_and_lock_in_pool(pool, channel_out); uvm_for_each_channel_in_pool(channel, pool) { // TODO: Bug 1764953: Prefer idle/less busy channels if (try_claim_channel(channel, 1)) { *channel_out = channel; return NV_OK; } } uvm_spin_loop_init(&spin); while (1) { uvm_for_each_channel_in_pool(channel, pool) { NV_STATUS status; uvm_channel_update_progress(channel); if (try_claim_channel(channel, 1)) { *channel_out = channel; return NV_OK; } status = uvm_channel_check_errors(channel); if (status != NV_OK) return status; UVM_SPIN_LOOP(&spin); } } UVM_ASSERT_MSG(0, "Cannot get here?!\n"); return NV_ERR_GENERIC; } NV_STATUS uvm_channel_reserve_type(uvm_channel_manager_t *manager, uvm_channel_type_t type, uvm_channel_t **channel_out) { uvm_channel_pool_t *pool = manager->pool_to_use.default_for_type[type]; UVM_ASSERT(pool != NULL); UVM_ASSERT(type < UVM_CHANNEL_TYPE_COUNT); return channel_reserve_in_pool(pool, channel_out); } NV_STATUS uvm_channel_reserve_gpu_to_gpu(uvm_channel_manager_t *manager, uvm_gpu_t *dst_gpu, uvm_channel_t **channel_out) { const NvU32 dst_gpu_index = uvm_id_gpu_index(dst_gpu->id); uvm_channel_pool_t *pool = manager->pool_to_use.gpu_to_gpu[dst_gpu_index]; // If there is no recommended pool for the given GPU pair, use default if (pool == NULL) pool = manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_GPU]; UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE); return channel_reserve_in_pool(pool, channel_out); } NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager) { NV_STATUS status = NV_OK; uvm_spin_loop_t spin; if (uvm_channel_manager_update_progress(manager) == 0) return uvm_channel_manager_check_errors(manager); uvm_spin_loop_init(&spin); while (uvm_channel_manager_update_progress(manager) > 0 && status == NV_OK) { UVM_SPIN_LOOP(&spin); status = uvm_channel_manager_check_errors(manager); } return status; } static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel) { uvm_push_info_t *push_info; channel_pool_lock(channel->pool); push_info = list_first_entry_or_null(&channel->available_push_infos, uvm_push_info_t, available_list_node); UVM_ASSERT(push_info != NULL); UVM_ASSERT(push_info->on_complete == NULL && push_info->on_complete_data == NULL); list_del(&push_info->available_list_node); channel_pool_unlock(channel->pool); return push_info - channel->push_infos; } static void channel_semaphore_gpu_encrypt_payload(uvm_push_t *push, NvU64 semaphore_va) { NvU32 iv_index; uvm_gpu_address_t notifier_gpu_va; uvm_gpu_address_t auth_tag_gpu_va; uvm_gpu_address_t semaphore_gpu_va; uvm_gpu_address_t encrypted_payload_gpu_va; uvm_gpu_t *gpu = push->gpu; uvm_channel_t *channel = push->channel; uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore; UvmCslIv *iv_cpu_addr = semaphore->conf_computing.ivs; NvU32 payload_size = sizeof(*semaphore->payload); NvU32 *last_pushed_notifier = &semaphore->conf_computing.last_pushed_notifier; UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); UVM_ASSERT(uvm_channel_is_ce(channel)); encrypted_payload_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.encrypted_payload, gpu, false); notifier_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.notifier, gpu, false); auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(semaphore->conf_computing.auth_tag, gpu, false); semaphore_gpu_va = uvm_gpu_address_virtual(semaphore_va); iv_index = ((*last_pushed_notifier + 2) / 2) % channel->num_gpfifo_entries; uvm_conf_computing_log_gpu_encryption(channel, &iv_cpu_addr[iv_index]); gpu->parent->ce_hal->memset_4(push, notifier_gpu_va, ++(*last_pushed_notifier), sizeof(*last_pushed_notifier)); gpu->parent->ce_hal->encrypt(push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va); gpu->parent->ce_hal->memset_4(push, notifier_gpu_va, ++(*last_pushed_notifier), sizeof(*last_pushed_notifier)); } // Auxiliary buffer only used by SEC2 channels for CPU computation of the method // stream signature. Note that it is required that this inline pushbuffer buffer // is reserved at push->begin. static void push_reserve_csl_sign_buf(uvm_push_t *push) { uvm_gpu_address_t dummy; NvU32 *buf; buf = uvm_push_get_single_inline_buffer(push, UVM_CONF_COMPUTING_SIGN_BUF_MAX_SIZE, UVM_METHOD_SIZE, &dummy); // Offset UVM_METHOD_SIZE from buf due to the NOP method. UVM_ASSERT((buf - UVM_METHOD_SIZE / sizeof(*buf)) == push->begin); } NV_STATUS uvm_channel_begin_push(uvm_channel_t *channel, uvm_push_t *push) { NV_STATUS status; uvm_channel_manager_t *manager; uvm_gpu_t *gpu; UVM_ASSERT(channel); UVM_ASSERT(push); manager = channel->pool->manager; gpu = uvm_channel_get_gpu(channel); // Only SEC2 and WLC with set up fixed schedule can use direct push // submission. All other cases (including WLC pre-schedule) need to // reserve a launch channel that will be used to submit this push // indirectly. if (uvm_conf_computing_mode_enabled(gpu) && uvm_channel_is_ce(channel) && !(uvm_channel_is_wlc(channel) && uvm_channel_manager_is_wlc_ready(manager))) { uvm_channel_type_t indirect_channel_type = uvm_channel_manager_is_wlc_ready(manager) ? UVM_CHANNEL_TYPE_WLC : UVM_CHANNEL_TYPE_SEC2; status = uvm_channel_reserve_type(manager, indirect_channel_type, &push->launch_channel); if (status != NV_OK) return status; } // When the Confidential Computing feature is enabled, the channel's lock // should have already been acquired in uvm_channel_reserve() or // channel_reserve_and_lock_in_pool(). UVM_ASSERT(is_channel_locked_for_push(channel)); push->channel = channel; push->channel_tracking_value = 0; status = uvm_pushbuffer_begin_push(manager->pushbuffer, push); if (status != NV_OK) return status; push->push_info_index = channel_get_available_push_info_index(channel); if (uvm_channel_is_sec2(push->channel)) push_reserve_csl_sign_buf(push); else if (uvm_channel_is_wlc(push->channel) && uvm_channel_manager_is_wlc_ready(manager)) uvm_conf_computing_acquire_encryption_iv(push->channel, &push->launch_iv); return NV_OK; } static void internal_channel_submit_work(uvm_push_t *push, NvU32 push_size, NvU32 new_gpu_put) { NvU64 *gpfifo_entry; NvU64 pushbuffer_va; uvm_channel_t *channel = push->channel; uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel); uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); BUILD_BUG_ON(sizeof(*gpfifo_entry) != NVB06F_GP_ENTRY__SIZE); UVM_ASSERT(!uvm_channel_is_proxy(channel)); gpfifo_entry = (NvU64*)channel->channel_info.gpFifoEntries + channel->cpu_put; pushbuffer_va = uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push); if (uvm_conf_computing_mode_enabled(gpu)) { void *unprotected_pb = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push); UVM_ASSERT(uvm_channel_is_sec2(channel)); // Copy push data to unprotected sysmem, it has already been signed. memcpy(unprotected_pb, push->begin, push_size); } gpu->parent->host_hal->set_gpfifo_entry(gpfifo_entry, pushbuffer_va, push_size, UVM_GPFIFO_SYNC_PROCEED); // Need to make sure all the pushbuffer and the GPFIFO entries writes // complete before updating GPPUT. We also don't want any reads to be moved // after the GPPut write as the GPU might modify the data they read as soon // as the GPPut write happens. mb(); gpu->parent->host_hal->write_gpu_put(channel, new_gpu_put); } static void proxy_channel_submit_work(uvm_push_t *push, NvU32 push_size) { NV_STATUS status; uvm_channel_t *channel = push->channel; UVM_ASSERT(uvm_channel_is_proxy(channel)); status = nvUvmInterfacePagingChannelPushStream(channel->proxy.handle, (char *) push->begin, push_size); if (status != NV_OK) { uvm_push_info_t *push_info = uvm_push_info_from_push(push); // If the RM invocation fails, there is no clean recovery available // (for example, the vGPU plugin may have crashed), so swallow the error // but print a descriptive message about the failed push. UVM_ASSERT_MSG(status == NV_OK, "nvUvmInterfacePagingChannelPushStream() failed: %s, GPU %s, push '%s' started at %s:%d in %s()\n", nvstatusToString(status), uvm_gpu_name(uvm_channel_get_gpu(channel)), push_info->description, push_info->filename, push_info->line, push_info->function); } } static void do_semaphore_release(uvm_push_t *push, NvU64 semaphore_va, NvU32 new_payload) { uvm_gpu_t *gpu = uvm_push_get_gpu(push); if (uvm_channel_is_ce(push->channel)) gpu->parent->ce_hal->semaphore_release(push, semaphore_va, new_payload); else gpu->parent->sec2_hal->semaphore_release(push, semaphore_va, new_payload); } static void uvm_channel_tracking_semaphore_release(uvm_push_t *push, NvU64 semaphore_va, NvU32 new_payload) { // We used to skip the membar or use membar GPU for the semaphore release // for a few pushes, but that doesn't provide sufficient ordering guarantees // in some cases (e.g. ga100 with an LCE with PCEs from both HSHUBs) for the // semaphore writes. To be safe, just always uses a membar sys for now. // TODO: Bug 3770539: Optimize membars used by end of push semaphore releases (void)uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); (void)uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); do_semaphore_release(push, semaphore_va, new_payload); // When the Confidential Computing feature is enabled, additional work // needs to be scheduled to get an encrypted shadow copy in unprotected // sysmem. This allows UVM to later decrypt it and observe the new // semaphore value. if (uvm_conf_computing_mode_enabled(push->gpu) && uvm_channel_is_ce(push->channel)) channel_semaphore_gpu_encrypt_payload(push, semaphore_va); } static uvm_channel_t *get_paired_channel(uvm_channel_t *channel) { unsigned index; uvm_channel_pool_t *paired_pool; uvm_channel_type_t paired_channel_type; UVM_ASSERT(channel); UVM_ASSERT(uvm_channel_is_wlc(channel) || uvm_channel_is_lcic(channel)); index = uvm_channel_index_in_pool(channel); paired_channel_type = uvm_channel_is_wlc(channel) ? UVM_CHANNEL_TYPE_LCIC : UVM_CHANNEL_TYPE_WLC; paired_pool = channel->pool->manager->pool_to_use.default_for_type[paired_channel_type]; return paired_pool->channels + index; } static uvm_channel_t *wlc_get_paired_lcic(uvm_channel_t *wlc_channel) { UVM_ASSERT(wlc_channel); UVM_ASSERT(uvm_channel_is_wlc(wlc_channel)); return get_paired_channel(wlc_channel); } static uvm_channel_t *lcic_get_paired_wlc(uvm_channel_t *lcic_channel) { UVM_ASSERT(lcic_channel); UVM_ASSERT(uvm_channel_is_lcic(lcic_channel)); return get_paired_channel(lcic_channel); } static void internal_channel_submit_work_wlc(uvm_push_t *push) { uvm_channel_t *wlc_channel = push->channel; uvm_channel_t *lcic_channel = wlc_get_paired_lcic(wlc_channel); UvmCslIv *iv_cpu_addr = lcic_channel->tracking_sem.semaphore.conf_computing.ivs; NvU32 *last_pushed_notifier; NvU32 iv_index; uvm_spin_loop_t spin; UVM_ASSERT(lcic_channel); // Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2 // and a WLC doorbell ring is enough to start work. UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin); // Executing WLC adds an extra job to LCIC ++lcic_channel->tracking_sem.queued_value; UVM_ASSERT_MSG(uvm_push_get_size(push) == UVM_MAX_WLC_PUSH_SIZE, "WLC push submission size mismatch, expected: %u, got: %u", UVM_MAX_WLC_PUSH_SIZE, uvm_push_get_size(push)); // Handles the CPU part of the setup for the LCIC to be able to do GPU // encryption of its tracking semaphore value. See setup_lcic_schedule(). last_pushed_notifier = &lcic_channel->tracking_sem.semaphore.conf_computing.last_pushed_notifier; *lcic_channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu = ++(*last_pushed_notifier); *lcic_channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu = ++(*last_pushed_notifier); iv_index = (*last_pushed_notifier / 2) % lcic_channel->num_gpfifo_entries; uvm_conf_computing_log_gpu_encryption(lcic_channel, &iv_cpu_addr[iv_index]); // Move push data uvm_conf_computing_cpu_encrypt(wlc_channel, wlc_channel->conf_computing.static_pb_unprotected_sysmem_cpu, push->begin, &push->launch_iv, UVM_MAX_WLC_PUSH_SIZE, wlc_channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu); // Make sure all encrypted data is observable before ringing the doorbell. wmb(); // Ring the WLC doorbell to start processing the above push UVM_GPU_WRITE_ONCE(*wlc_channel->channel_info.workSubmissionOffset, wlc_channel->channel_info.workSubmissionToken); } static void internal_channel_submit_work_indirect_wlc(uvm_push_t *push, NvU32 old_cpu_put, NvU32 new_gpu_put) { uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(push->channel); uvm_gpu_t *gpu = uvm_push_get_gpu(push); uvm_push_t indirect_push; NV_STATUS status; NvU64 gpfifo_entry; void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push); NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push); void *push_enc_auth_tag; uvm_gpu_address_t push_enc_auth_tag_gpu; NvU64 gpfifo_gpu_va = push->channel->channel_info.gpFifoGpuVa + old_cpu_put * sizeof(gpfifo_entry); UVM_ASSERT(uvm_channel_is_ce(push->channel)); UVM_ASSERT(uvm_channel_is_wlc(push->launch_channel)); // WLC submissions are done under channel lock, so there should be no // contention to get the right submission order. UVM_ASSERT(push->channel->conf_computing.gpu_put == old_cpu_put); // This can never stall or return error. WLC launch after WLC channels are // initialized uses private static pb space and it neither needs the general // PB space, nor it counts towards max concurrent pushes. status = uvm_push_begin_on_reserved_channel(push->launch_channel, &indirect_push, "Worklaunch to '%s' via '%s'", push->channel->name, push->launch_channel->name); UVM_ASSERT(status == NV_OK); // Move over the pushbuffer data // WLC channels use a static preallocated space for launch auth tags push_enc_auth_tag = indirect_push.channel->conf_computing.launch_auth_tag_cpu; push_enc_auth_tag_gpu = uvm_gpu_address_virtual(indirect_push.channel->conf_computing.launch_auth_tag_gpu_va); uvm_conf_computing_cpu_encrypt(indirect_push.channel, push_enc_cpu, push->begin, NULL, uvm_push_get_size(push), push_enc_auth_tag); uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); gpu->parent->ce_hal->decrypt(&indirect_push, uvm_gpu_address_virtual(uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push)), uvm_gpu_address_virtual(push_enc_gpu), uvm_push_get_size(push), push_enc_auth_tag_gpu); gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_entry, uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push), uvm_push_get_size(push), UVM_GPFIFO_SYNC_PROCEED); gpu->parent->ce_hal->memset_8(&indirect_push, uvm_gpu_address_virtual(gpfifo_gpu_va), gpfifo_entry, sizeof(gpfifo_entry)); uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); do_semaphore_release(&indirect_push, push->channel->channel_info.gpPutGpuVa, new_gpu_put); uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); do_semaphore_release(&indirect_push, push->channel->channel_info.workSubmissionOffsetGpuVa, push->channel->channel_info.workSubmissionToken); // Ignore return value of push_wait. It can only fail with channel error // which will be detected when waiting for the primary push. (void)uvm_push_end_and_wait(&indirect_push); push->channel->conf_computing.gpu_put = new_gpu_put; } static void update_gpput_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, NvU32 new_gpu_put) { uvm_gpu_t *gpu = uvm_push_get_gpu(sec2_push); void *gpput_auth_tag_cpu, *gpput_enc_cpu; uvm_gpu_address_t gpput_auth_tag_gpu, gpput_enc_gpu; NvU32 gpput_scratchpad[UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT/sizeof(new_gpu_put)]; UVM_ASSERT(uvm_channel_is_sec2(sec2_push->channel)); gpput_enc_cpu = uvm_push_get_single_inline_buffer(sec2_push, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT, &gpput_enc_gpu); gpput_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &gpput_auth_tag_gpu); // Update GPPUT. The update needs 4B write to specific offset, // however we can only do 16B aligned decrypt writes. // A poison value is written to all other locations, this is ignored in // most locations and overwritten by HW for GPGET location memset(gpput_scratchpad, 0, sizeof(gpput_scratchpad)); UVM_ASSERT(sizeof(*gpput_scratchpad) == sizeof(new_gpu_put)); gpput_scratchpad[(channel->channel_info.gpPutGpuVa % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT) / sizeof(*gpput_scratchpad)] = new_gpu_put; // Set value of GPGET to be the same as GPPUT. It will be overwritten by // HW next time GET value changes. UVM never reads GPGET. // However, RM does read it when freeing a channel. When this function // is called from 'channel_manager_stop_wlc' we set the value of GPGET // to the same value as GPPUT. Mismatch between these two values makes // RM wait for any "pending" tasks, leading to significant delays in the // channel teardown sequence. UVM_ASSERT(channel->channel_info.gpPutGpuVa / UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT == channel->channel_info.gpGetGpuVa / UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT); gpput_scratchpad[(channel->channel_info.gpGetGpuVa % UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT) / sizeof(*gpput_scratchpad)] = new_gpu_put; uvm_conf_computing_cpu_encrypt(sec2_push->channel, gpput_enc_cpu, gpput_scratchpad, NULL, sizeof(gpput_scratchpad), gpput_auth_tag_cpu); gpu->parent->sec2_hal->decrypt(sec2_push, UVM_ALIGN_DOWN(channel->channel_info.gpPutGpuVa, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT), gpput_enc_gpu.address, sizeof(gpput_scratchpad), gpput_auth_tag_gpu.address); } static void set_gpfifo_via_sec2(uvm_push_t *sec2_push, uvm_channel_t *channel, NvU32 put, NvU64 value) { uvm_gpu_t *gpu = uvm_push_get_gpu(sec2_push); void *gpfifo_auth_tag_cpu, *gpfifo_enc_cpu; uvm_gpu_address_t gpfifo_auth_tag_gpu, gpfifo_enc_gpu; NvU64 gpfifo_gpu = channel->channel_info.gpFifoGpuVa + put * sizeof(value); NvU64 gpfifo_scratchpad[2]; UVM_ASSERT(uvm_channel_is_sec2(sec2_push->channel)); gpfifo_enc_cpu = uvm_push_get_single_inline_buffer(sec2_push, sizeof(gpfifo_scratchpad), UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT, &gpfifo_enc_gpu); gpfifo_auth_tag_cpu = uvm_push_get_single_inline_buffer(sec2_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &gpfifo_auth_tag_gpu); if (IS_ALIGNED(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT)) { gpfifo_scratchpad[0] = value; // Set the value of the odd entry to noop. // It will be overwritten when the next entry is submitted. gpu->parent->host_hal->set_gpfifo_noop(&gpfifo_scratchpad[1]); } else { uvm_gpfifo_entry_t *previous_gpfifo; UVM_ASSERT(put > 0); previous_gpfifo = &channel->gpfifo_entries[put - 1]; if (previous_gpfifo->type == UVM_GPFIFO_ENTRY_TYPE_CONTROL) { gpfifo_scratchpad[0] = previous_gpfifo->control_value; } else { uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel); NvU64 prev_pb_va = uvm_pushbuffer_get_gpu_va_base(pushbuffer) + previous_gpfifo->pushbuffer_offset; // Reconstruct the previous gpfifo entry. UVM_GPFIFO_SYNC_WAIT is // used only in static WLC schedule. // Overwriting the previous entry with the same value doesn't hurt, // whether the previous entry has been processed or not gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_scratchpad[0], prev_pb_va, previous_gpfifo->pushbuffer_size, UVM_GPFIFO_SYNC_PROCEED); } gpfifo_scratchpad[1] = value; } uvm_conf_computing_cpu_encrypt(sec2_push->channel, gpfifo_enc_cpu, gpfifo_scratchpad, NULL, sizeof(gpfifo_scratchpad), gpfifo_auth_tag_cpu); gpu->parent->sec2_hal->decrypt(sec2_push, UVM_ALIGN_DOWN(gpfifo_gpu, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT), gpfifo_enc_gpu.address, sizeof(gpfifo_scratchpad), gpfifo_auth_tag_gpu.address); } static NV_STATUS internal_channel_submit_work_indirect_sec2(uvm_push_t *push, NvU32 old_cpu_put, NvU32 new_gpu_put) { uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(push->channel); uvm_gpu_t *gpu = uvm_push_get_gpu(push); uvm_push_t indirect_push; NV_STATUS status; NvU64 gpfifo_entry; void *push_enc_cpu = uvm_pushbuffer_get_unprotected_cpu_va_for_push(pushbuffer, push); NvU64 push_enc_gpu = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push); void *push_auth_tag_cpu; uvm_gpu_address_t push_auth_tag_gpu; uvm_spin_loop_t spin; UVM_ASSERT(uvm_channel_is_ce(push->channel)); UVM_ASSERT(uvm_channel_is_sec2(push->launch_channel)); // If the old_cpu_put is not equal to the last gpu put, other pushes are // pending that need to be submitted. That push/es' submission will update // the gpu_put pointer to expected value. UVM_SPIN_WHILE(push->channel->conf_computing.gpu_put != old_cpu_put, &spin); // This can never stall or return error. SEC2 launch used during init has // plenty of PB space available before it needs to check for push // completion/channel status. WLC launch after WLC channels are initialized // uses private static pb space and needs neither general PB space, nor it // counts towards max concurrent pushes. status = uvm_push_begin_on_reserved_channel(push->launch_channel, &indirect_push, "Worklaunch to '%s' via '%s'", push->channel->name, push->launch_channel->name); if (status != NV_OK) return status; // Move over the pushbuffer data push_auth_tag_cpu = uvm_push_get_single_inline_buffer(&indirect_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &push_auth_tag_gpu); uvm_conf_computing_cpu_encrypt(indirect_push.channel, push_enc_cpu, push->begin, NULL, uvm_push_get_size(push), push_auth_tag_cpu); uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); gpu->parent->sec2_hal->decrypt(&indirect_push, uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push), push_enc_gpu, uvm_push_get_size(push), push_auth_tag_gpu.address); gpu->parent->host_hal->set_gpfifo_entry(&gpfifo_entry, uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push), uvm_push_get_size(push), UVM_GPFIFO_SYNC_PROCEED); set_gpfifo_via_sec2(&indirect_push, push->channel, old_cpu_put, gpfifo_entry); update_gpput_via_sec2(&indirect_push, push->channel, new_gpu_put); // Ring the doorbell uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); do_semaphore_release(&indirect_push, push->channel->channel_info.workSubmissionOffsetGpuVa, push->channel->channel_info.workSubmissionToken); status = uvm_push_end_and_wait(&indirect_push); if (status != NV_OK) return status; push->channel->conf_computing.gpu_put = new_gpu_put; return status; } // When the Confidential Computing feature is enabled, the CPU is unable to // access and read the pushbuffer. This is because it is located in the CPR of // vidmem in this configuration. This function allows UVM to retrieve the // content of the pushbuffer in an encrypted form for later decryption, hence, // simulating the original access pattern. E.g, reading timestamp semaphores. // See also: decrypt_push(). static void encrypt_push(uvm_push_t *push) { NvU64 push_protected_gpu_va; NvU64 push_unprotected_gpu_va; uvm_gpu_address_t auth_tag_gpu_va; uvm_channel_t *channel = push->channel; uvm_push_crypto_bundle_t *crypto_bundle; uvm_gpu_t *gpu = uvm_push_get_gpu(push); NvU32 push_size = uvm_push_get_size(push); uvm_push_info_t *push_info = uvm_push_info_from_push(push); uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel); unsigned auth_tag_offset = UVM_CONF_COMPUTING_AUTH_TAG_SIZE * push->push_info_index; if (!uvm_conf_computing_mode_enabled(gpu)) return; if (!push_info->on_complete) return; if (!uvm_channel_is_ce(channel)) return; if (push_size == 0) return; UVM_ASSERT(!uvm_channel_is_wlc(channel)); UVM_ASSERT(!uvm_channel_is_lcic(channel)); UVM_ASSERT(channel->conf_computing.push_crypto_bundles != NULL); crypto_bundle = channel->conf_computing.push_crypto_bundles + push->push_info_index; auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(channel->conf_computing.push_crypto_bundle_auth_tags, gpu, false); auth_tag_gpu_va.address += auth_tag_offset; crypto_bundle->push_size = push_size; push_protected_gpu_va = uvm_pushbuffer_get_gpu_va_for_push(pushbuffer, push); push_unprotected_gpu_va = uvm_pushbuffer_get_unprotected_gpu_va_for_push(pushbuffer, push); uvm_conf_computing_log_gpu_encryption(channel, &crypto_bundle->iv); gpu->parent->ce_hal->encrypt(push, uvm_gpu_address_virtual_unprotected(push_unprotected_gpu_va), uvm_gpu_address_virtual(push_protected_gpu_va), push_size, auth_tag_gpu_va); } void uvm_channel_end_push(uvm_push_t *push) { uvm_channel_t *channel = push->channel; uvm_channel_manager_t *channel_manager = channel->pool->manager; uvm_pushbuffer_t *pushbuffer = channel_manager->pushbuffer; uvm_gpfifo_entry_t *entry; NvU64 semaphore_va; NvU64 new_tracking_value; NvU32 new_payload; NvU32 push_size; NvU32 cpu_put; NvU32 new_cpu_put; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); bool needs_sec2_work_submit = false; channel_pool_lock(channel->pool); encrypt_push(push); new_tracking_value = ++channel->tracking_sem.queued_value; new_payload = (NvU32)new_tracking_value; semaphore_va = uvm_channel_tracking_semaphore_get_gpu_va(channel); uvm_channel_tracking_semaphore_release(push, semaphore_va, new_payload); if (uvm_channel_is_wlc(channel) && uvm_channel_manager_is_wlc_ready(channel_manager)) { uvm_channel_t *paired_lcic = wlc_get_paired_lcic(channel); gpu->parent->ce_hal->semaphore_reduction_inc(push, paired_lcic->channel_info.gpPutGpuVa, paired_lcic->num_gpfifo_entries - 1); uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); do_semaphore_release(push, paired_lcic->channel_info.workSubmissionOffsetGpuVa, paired_lcic->channel_info.workSubmissionToken); if (uvm_push_get_size(push) < UVM_MAX_WLC_PUSH_SIZE) { // The UVM_MAX_WLC_PUSH_SIZE is set to fit indirect work launch // pushes. However, direct pushes to WLC can be smaller than this // size. This is used e.g. by indirect submission of control // gpfifo entries. gpu->parent->host_hal->noop(push, UVM_MAX_WLC_PUSH_SIZE - uvm_push_get_size(push)); } } push_size = uvm_push_get_size(push); UVM_ASSERT_MSG(push_size <= UVM_MAX_PUSH_SIZE, "push size %u\n", push_size); cpu_put = channel->cpu_put; new_cpu_put = (cpu_put + 1) % channel->num_gpfifo_entries; entry = &channel->gpfifo_entries[cpu_put]; entry->tracking_semaphore_value = new_tracking_value; entry->pushbuffer_offset = uvm_pushbuffer_get_offset_for_push(pushbuffer, push); entry->pushbuffer_size = push_size; // Indirect submission via SEC2/WLC needs pushes to be aligned for // encryption/decryption. The pushbuffer_size of this push // influences starting address of the next push. if (uvm_conf_computing_mode_enabled(gpu)) entry->pushbuffer_size = UVM_ALIGN_UP(push_size, UVM_CONF_COMPUTING_BUF_ALIGNMENT); entry->push_info = &channel->push_infos[push->push_info_index]; entry->type = UVM_GPFIFO_ENTRY_TYPE_NORMAL; UVM_ASSERT(channel->current_gpfifo_count > 0); --channel->current_gpfifo_count; if (uvm_channel_is_proxy(channel)) { proxy_channel_submit_work(push, push_size); } else if (uvm_channel_is_wlc(channel) && uvm_channel_manager_is_wlc_ready(channel_manager)) { internal_channel_submit_work_wlc(push); } else if (uvm_conf_computing_mode_enabled(gpu) && uvm_channel_is_ce(channel)) { if (uvm_channel_manager_is_wlc_ready(channel_manager)) { internal_channel_submit_work_indirect_wlc(push, cpu_put, new_cpu_put); } else { // submitting via SEC2 starts a push, postpone until this push is // ended needs_sec2_work_submit = true; } } else { internal_channel_submit_work(push, push_size, new_cpu_put); } channel->cpu_put = new_cpu_put; uvm_pushbuffer_end_push(pushbuffer, push, entry); // The moment the channel is unlocked uvm_channel_update_progress_with_max() // may notice the GPU work to be completed and hence all state tracking the // push must be updated before that. Notably uvm_pushbuffer_end_push() has // to be called first. unlock_channel_for_push(channel); channel_pool_unlock(channel->pool); // This memory barrier is borrowed from CUDA, as it supposedly fixes perf // issues on some systems. Comment from CUDA: "fixes throughput-related // performance problems, e.g. bugs 626179, 593841. This may be related to // bug 124888, which GL works around by doing a clflush" wmb(); if (needs_sec2_work_submit) { NV_STATUS status = internal_channel_submit_work_indirect_sec2(push, cpu_put, new_cpu_put); // This codepath should only be used during initialization and thus // NEVER return an error. UVM_ASSERT(status == NV_OK); } push->push_info_index = channel->num_gpfifo_entries; push->channel_tracking_value = new_tracking_value; } static void submit_ctrl_gpfifo(uvm_channel_t *channel, uvm_gpfifo_entry_t *entry, NvU32 new_cpu_put) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); NvU32 cpu_put = channel->cpu_put; NvU64 *gpfifo_entry; UVM_ASSERT(entry == &channel->gpfifo_entries[cpu_put]); if (uvm_conf_computing_mode_enabled(gpu) && uvm_channel_is_ce(channel)) return; gpfifo_entry = (NvU64*)channel->channel_info.gpFifoEntries + cpu_put; *gpfifo_entry = entry->control_value; // Need to make sure all the GPFIFO entries writes complete before updating // GPPUT. We also don't want any reads to be moved after the GPPut write as // the GPU might modify the data they read as soon as the GPPut write // happens. mb(); gpu->parent->host_hal->write_gpu_put(channel, new_cpu_put); } static NV_STATUS submit_ctrl_gpfifo_indirect(uvm_channel_t *channel, uvm_gpfifo_entry_t *entry, NvU32 old_cpu_put, NvU32 new_gpu_put) { uvm_push_t indirect_push; NV_STATUS status = NV_OK; uvm_spin_loop_t spin; uvm_channel_type_t indirect_channel_type = uvm_channel_manager_is_wlc_ready(channel->pool->manager) ? UVM_CHANNEL_TYPE_WLC : UVM_CHANNEL_TYPE_SEC2; UVM_ASSERT(uvm_channel_is_ce(channel)); // If the old_cpu_put is not equal to the last gpu put, // Another push(es) is pending that needs to be submitted. // That push/es' submission will update the gpu_put pointer // to expected value. UVM_SPIN_WHILE(channel->conf_computing.gpu_put != old_cpu_put, &spin); status = uvm_push_begin(channel->pool->manager, indirect_channel_type, &indirect_push, "GPFIFO submit to '%s' via '%s'", channel->name, uvm_channel_type_to_string(indirect_channel_type)); if (status != NV_OK) return status; if (uvm_channel_is_sec2(indirect_push.channel)) { set_gpfifo_via_sec2(&indirect_push, channel, old_cpu_put, entry->control_value); update_gpput_via_sec2(&indirect_push, channel, new_gpu_put); } else { uvm_gpu_t *gpu = uvm_push_get_gpu(&indirect_push); NvU64 gpfifo_gpu_va = channel->channel_info.gpFifoGpuVa + (old_cpu_put * sizeof(entry->control_value)); gpu->parent->ce_hal->memset_8(&indirect_push, uvm_gpu_address_virtual(gpfifo_gpu_va), entry->control_value, sizeof(entry->control_value)); uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); do_semaphore_release(&indirect_push, channel->channel_info.gpPutGpuVa, new_gpu_put); } uvm_push_set_flag(&indirect_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); do_semaphore_release(&indirect_push, channel->channel_info.workSubmissionOffsetGpuVa, channel->channel_info.workSubmissionToken); status = uvm_push_end_and_wait(&indirect_push); if (status != NV_OK) return status; channel->conf_computing.gpu_put = new_gpu_put; return NV_OK; } // The caller must submit a normal GPFIFO entry with a semaphore release // following the control GPFIFO, refer to uvm_channel_write_ctrl_gpfifo() for an // example. static void write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_entry_value) { uvm_gpfifo_entry_t *entry; NvU32 cpu_put; NvU32 new_cpu_put; bool needs_indirect_submit = false; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); channel_pool_lock(channel->pool); cpu_put = channel->cpu_put; new_cpu_put = (cpu_put + 1) % channel->num_gpfifo_entries; entry = &channel->gpfifo_entries[cpu_put]; memset(entry, 0, sizeof(*entry)); entry->type = UVM_GPFIFO_ENTRY_TYPE_CONTROL; entry->control_value = ctrl_fifo_entry_value; // Control GPFIFO entries are followed by a semaphore_release push in UVM. // We add the tracking semaphore value of the next GPFIFO entry, // potentially the associated semaphore release push. Even if a different // GPFIFO entry sneaks in, the purposes of signaling that this control // GPFIFO entry has been processed is accomplished. entry->tracking_semaphore_value = channel->tracking_sem.queued_value + 1; UVM_ASSERT(channel->current_gpfifo_count > 1); --channel->current_gpfifo_count; submit_ctrl_gpfifo(channel, entry, new_cpu_put); if (uvm_conf_computing_mode_enabled(gpu) && uvm_channel_is_ce(channel)) needs_indirect_submit = true; channel->cpu_put = new_cpu_put; // The moment the channel is unlocked uvm_channel_update_progress_with_max() // may notice the GPU work to be completed and hence all state tracking the // push must be updated before that. Note that we do not call // unlock_channel_for_push() because a control GPFIFO is followed by a // semaphore release, where the channel is unlocked. channel_pool_unlock(channel->pool); if (needs_indirect_submit) { NV_STATUS status = submit_ctrl_gpfifo_indirect(channel, entry, cpu_put, new_cpu_put); // All failures are globally fatal. There's nothing we do to recover. UVM_ASSERT_RELEASE(status == NV_OK); } // This memory barrier is borrowed from CUDA, as it supposedly fixes perf // issues on some systems. Comment from CUDA: "fixes throughput-related // performance problems, e.g. bugs 626179, 593841. This may be related to // bug 124888, which GL works around by doing a clflush" wmb(); } NV_STATUS uvm_channel_write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_entry_value) { NV_STATUS status; uvm_gpu_t *gpu = channel->pool->manager->gpu; uvm_push_t push; UVM_ASSERT(!uvm_channel_is_proxy(channel)); // WLC/LCIC channels can only process custom gpfifo entries before // their schedule is set up. UVM_ASSERT(!uvm_channel_is_lcic(channel) || !uvm_channel_manager_is_wlc_ready(channel->pool->manager)); UVM_ASSERT(!uvm_channel_is_wlc(channel) || !uvm_channel_manager_is_wlc_ready(channel->pool->manager)); // We reserve two GPFIFO entries, i.e., the control GPFIFO entry and the // subsequent semaphore_release push. There is, however, a potential case // for GPFIFO control submission starvation. This may happen because a // GPFIFO control submission requires two available GPFIFO entries. If you // have a full GPFIFO ring buffer that frees one entry at a time, while // there is another thread consuming this recently released entry at the // same rate, a concurrent thread trying to reserve two entries for a GPFIFO // control submission may starve. We could handle this by imposing minimal // release entries in uvm_channel.c:uvm_channel_update_progress(). Instead, // we don't deal with this potential starvation case because: // - Control GPFIFO are rarely used. // - By default, we release up to 8 GPFIFO entries at a time, except if the // release rate is constrained by lengthy pushbuffers -- another rare // situation. // - It would add unnecessary complexity to channel_update_progress(). status = uvm_channel_reserve(channel, 2); if (status != NV_OK) return status; write_ctrl_gpfifo(channel, ctrl_fifo_entry_value); status = uvm_push_begin_on_reserved_channel(channel, &push, "write_ctrl_GPFIFO"); if (status != NV_OK) { UVM_ERR_PRINT("Failed to begin push on channel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); return status; } // This is an empty push, the push's embedded semaphore_release signals that // the GPFIFO control entry has been processed. uvm_push_end(&push); return NV_OK; } static NV_STATUS channel_reserve_and_lock(uvm_channel_t *channel, NvU32 num_gpfifo_entries) { uvm_spin_loop_t spin; uvm_channel_pool_t *pool = channel->pool; // This semaphore is uvm_up() in unlock_channel_for_push() as part of the // uvm_channel_end_push() routine. Note that different than in // channel_reserve_and_lock_in_pool, we cannot pick an unlocked channel from // the pool, even when there is one available and *channel is locked. // Not a concern given that uvm_channel_reserve() is not the common-case for // channel reservation, and only used for channel initialization, GPFIFO // control work submission, and testing. uvm_down(&pool->push_sem); channel_pool_lock(pool); if (test_claim_and_lock_channel(channel, num_gpfifo_entries)) goto out; channel_pool_unlock(pool); uvm_spin_loop_init(&spin); while (1) { NV_STATUS status; uvm_channel_update_progress(channel); channel_pool_lock(pool); if (test_claim_and_lock_channel(channel, num_gpfifo_entries)) goto out; channel_pool_unlock(pool); status = uvm_channel_check_errors(channel); if (status != NV_OK) { uvm_up(&pool->push_sem); return status; } UVM_SPIN_LOOP(&spin); } out: channel_pool_unlock(pool); return NV_OK; } NV_STATUS uvm_channel_reserve(uvm_channel_t *channel, NvU32 num_gpfifo_entries) { NV_STATUS status = NV_OK; uvm_spin_loop_t spin; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); if (uvm_conf_computing_mode_enabled(gpu)) return channel_reserve_and_lock(channel, num_gpfifo_entries); if (try_claim_channel(channel, num_gpfifo_entries)) return NV_OK; uvm_channel_update_progress(channel); uvm_spin_loop_init(&spin); while (!try_claim_channel(channel, num_gpfifo_entries) && status == NV_OK) { UVM_SPIN_LOOP(&spin); status = uvm_channel_check_errors(channel); uvm_channel_update_progress(channel); } return status; } // Get the first pending GPFIFO entry, if any. // This doesn't stop the entry from being reused. static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *channel) { uvm_gpfifo_entry_t *entry = NULL; NvU32 pending_count = channel_update_progress_all(channel, UVM_CHANNEL_UPDATE_MODE_COMPLETED); if (pending_count == 0) return NULL; channel_pool_lock(channel->pool); if (channel->gpu_get != channel->cpu_put) entry = &channel->gpfifo_entries[channel->gpu_get]; channel_pool_unlock(channel->pool); return entry; } NV_STATUS uvm_channel_get_status(uvm_channel_t *channel) { uvm_gpu_t *gpu; NvNotification *errorNotifier; if (uvm_channel_is_proxy(channel)) errorNotifier = channel->proxy.channel_info.shadowErrorNotifier; else errorNotifier = channel->channel_info.errorNotifier; if (errorNotifier->status == 0) return NV_OK; // In case we hit a channel error, check the ECC error notifier as well so // that a more precise ECC error can be returned in case there is indeed an // ECC error. // // Notably this might be racy depending on the ordering of the notifications, // but we can't always call RM to service interrupts from this context. gpu = uvm_channel_get_gpu(channel); if (gpu->ecc.enabled && *gpu->ecc.error_notifier) return NV_ERR_ECC_ERROR; return NV_ERR_RC_ERROR; } uvm_gpfifo_entry_t *uvm_channel_get_fatal_entry(uvm_channel_t *channel) { UVM_ASSERT(uvm_channel_get_status(channel) != NV_OK); return uvm_channel_get_first_pending_entry(channel); } NV_STATUS uvm_channel_check_errors(uvm_channel_t *channel) { uvm_gpfifo_entry_t *fatal_entry; NV_STATUS status = uvm_channel_get_status(channel); if (status == NV_OK) return NV_OK; UVM_ERR_PRINT("Detected a channel error, channel %s GPU %s\n", channel->name, uvm_gpu_name(uvm_channel_get_gpu(channel))); fatal_entry = uvm_channel_get_fatal_entry(channel); if (fatal_entry != NULL) { if (fatal_entry->type == UVM_GPFIFO_ENTRY_TYPE_NORMAL) { uvm_push_info_t *push_info = fatal_entry->push_info; UVM_ERR_PRINT("Channel error likely caused by push '%s' started at %s:%d in %s()\n", push_info->description, push_info->filename, push_info->line, push_info->function); } else { UVM_ASSERT(!uvm_channel_is_proxy(channel)); UVM_ERR_PRINT("Channel error likely caused by GPFIFO control entry, data: 0x%llx, gpu_get: %d\n", fatal_entry->control_value, channel->gpu_get); } } uvm_global_set_fatal_error(status); return status; } NV_STATUS uvm_channel_manager_check_errors(uvm_channel_manager_t *channel_manager) { uvm_channel_pool_t *pool; NV_STATUS status = uvm_global_get_status(); if (status != NV_OK) return status; uvm_for_each_pool(pool, channel_manager) { uvm_channel_t *channel; uvm_for_each_channel_in_pool(channel, pool) { status = uvm_channel_check_errors(channel); if (status != NV_OK) return status; } } return status; } bool uvm_channel_is_value_completed(uvm_channel_t *channel, NvU64 value) { return uvm_gpu_tracking_semaphore_is_value_completed(&channel->tracking_sem, value); } NvU64 uvm_channel_update_completed_value(uvm_channel_t *channel) { return uvm_gpu_tracking_semaphore_update_completed_value(&channel->tracking_sem); } static NV_STATUS csl_init(uvm_channel_t *channel) { NV_STATUS status; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); uvm_mutex_init(&channel->csl.ctx_lock, UVM_LOCK_ORDER_LEAF); status = uvm_rm_locked_call(nvUvmInterfaceCslInitContext(&channel->csl.ctx, channel->handle)); if (status == NV_OK) { channel->csl.is_ctx_initialized = true; } else { UVM_DBG_PRINT("nvUvmInterfaceCslInitContext() failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); } return status; } static void csl_destroy(uvm_channel_t *channel) { if (!channel->csl.is_ctx_initialized) return; uvm_assert_mutex_unlocked(&channel->csl.ctx_lock); UVM_ASSERT(!is_channel_locked_for_push(channel)); uvm_rm_locked_call_void(nvUvmInterfaceDeinitCslContext(&channel->csl.ctx)); channel->csl.is_ctx_initialized = false; } static void free_conf_computing_buffers(uvm_channel_t *channel) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); UVM_ASSERT(uvm_channel_is_ce(channel)); uvm_rm_mem_free(channel->conf_computing.static_pb_protected_vidmem); uvm_rm_mem_free(channel->conf_computing.static_pb_unprotected_sysmem); uvm_rm_mem_free(channel->conf_computing.static_notifier_unprotected_sysmem); uvm_rm_mem_free(channel->conf_computing.push_crypto_bundle_auth_tags); uvm_kvfree(channel->conf_computing.static_pb_protected_sysmem); uvm_kvfree(channel->conf_computing.push_crypto_bundles); channel->conf_computing.static_pb_protected_vidmem = NULL; channel->conf_computing.static_pb_unprotected_sysmem = NULL; channel->conf_computing.static_notifier_unprotected_sysmem = NULL; channel->conf_computing.push_crypto_bundle_auth_tags = NULL; channel->conf_computing.static_pb_protected_sysmem = NULL; channel->conf_computing.push_crypto_bundles = NULL; uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.encrypted_payload); uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.notifier); uvm_rm_mem_free(channel->tracking_sem.semaphore.conf_computing.auth_tag); uvm_kvfree(channel->tracking_sem.semaphore.conf_computing.ivs); channel->tracking_sem.semaphore.conf_computing.encrypted_payload = NULL; channel->tracking_sem.semaphore.conf_computing.notifier = NULL; channel->tracking_sem.semaphore.conf_computing.auth_tag = NULL; channel->tracking_sem.semaphore.conf_computing.ivs = NULL; } static NV_STATUS alloc_conf_computing_buffers_semaphore(uvm_channel_t *channel) { uvm_gpu_semaphore_t *semaphore = &channel->tracking_sem.semaphore; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); NV_STATUS status; UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); UVM_ASSERT(uvm_channel_is_ce(channel)); status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(semaphore->conf_computing.last_pushed_notifier), UVM_CONF_COMPUTING_BUF_ALIGNMENT, &semaphore->conf_computing.notifier); if (status != NV_OK) return status; status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(*channel->tracking_sem.semaphore.payload), UVM_CONF_COMPUTING_BUF_ALIGNMENT, &semaphore->conf_computing.encrypted_payload); if (status != NV_OK) return status; status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_BUF_ALIGNMENT, &semaphore->conf_computing.auth_tag); if (status != NV_OK) return status; semaphore->conf_computing.ivs = uvm_kvmalloc_zero(sizeof(*semaphore->conf_computing.ivs) * channel->num_gpfifo_entries); if (!semaphore->conf_computing.ivs) return NV_ERR_NO_MEMORY; return status; } static NV_STATUS alloc_conf_computing_buffers_wlc(uvm_channel_t *channel) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); size_t aligned_wlc_push_size = UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT); NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE * 2, PAGE_SIZE, &channel->conf_computing.static_pb_unprotected_sysmem); if (status != NV_OK) return status; // Both pushes will be targets for SEC2 decrypt operations and have to // be aligned for SEC2. The first push location will also be a target // for CE decrypt operation and has to be aligned for CE decrypt. status = uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT) * 2, UVM_CONF_COMPUTING_BUF_ALIGNMENT, &channel->conf_computing.static_pb_protected_vidmem); if (status != NV_OK) return status; channel->conf_computing.static_pb_unprotected_sysmem_cpu = uvm_rm_mem_get_cpu_va(channel->conf_computing.static_pb_unprotected_sysmem); channel->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu = (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size; // The location below is only used for launch pushes but reuses // the same sysmem allocation channel->conf_computing.launch_auth_tag_cpu = (char*)channel->conf_computing.static_pb_unprotected_sysmem_cpu + aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE; channel->conf_computing.launch_auth_tag_gpu_va = uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_unprotected_sysmem, gpu) + aligned_wlc_push_size + UVM_CONF_COMPUTING_AUTH_TAG_SIZE; channel->conf_computing.static_pb_protected_sysmem = uvm_kvmalloc(UVM_MAX_WLC_PUSH_SIZE + UVM_PAGE_SIZE_4K); if (!channel->conf_computing.static_pb_protected_sysmem) return NV_ERR_NO_MEMORY; return status; } static NV_STATUS alloc_conf_computing_buffers_lcic(uvm_channel_t *channel) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); const size_t notifier_size = sizeof(*channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu); NV_STATUS status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, notifier_size * 2, UVM_CONF_COMPUTING_BUF_ALIGNMENT, &channel->conf_computing.static_notifier_unprotected_sysmem); if (status != NV_OK) return status; status = uvm_rm_mem_alloc(gpu, UVM_RM_MEM_TYPE_GPU, UVM_LCIC_PUSH_SIZE, UVM_CONF_COMPUTING_BUF_ALIGNMENT, &channel->conf_computing.static_pb_protected_vidmem); if (status != NV_OK) return status; channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu = uvm_rm_mem_get_cpu_va(channel->conf_computing.static_notifier_unprotected_sysmem); channel->conf_computing.static_notifier_exit_unprotected_sysmem_cpu = channel->conf_computing.static_notifier_entry_unprotected_sysmem_cpu + 1; channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va = uvm_rm_mem_get_gpu_va(channel->conf_computing.static_notifier_unprotected_sysmem, gpu, false); channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va = channel->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va; channel->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va.address += notifier_size; return status; } static NV_STATUS alloc_conf_computing_buffers(uvm_channel_t *channel) { NV_STATUS status; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); UVM_ASSERT(uvm_channel_is_ce(channel)); status = alloc_conf_computing_buffers_semaphore(channel); if (status != NV_OK) return status; if (uvm_channel_is_wlc(channel)) { status = alloc_conf_computing_buffers_wlc(channel); } else if (uvm_channel_is_lcic(channel)) { status = alloc_conf_computing_buffers_lcic(channel); } else { void *push_crypto_bundles = uvm_kvmalloc_zero(sizeof(*channel->conf_computing.push_crypto_bundles) * channel->num_gpfifo_entries); if (push_crypto_bundles == NULL) return NV_ERR_NO_MEMORY; channel->conf_computing.push_crypto_bundles = push_crypto_bundles; status = uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, channel->num_gpfifo_entries * UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_BUF_ALIGNMENT, &channel->conf_computing.push_crypto_bundle_auth_tags); } return status; } static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel) { uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); UVM_ASSERT(pool->num_channels > 0); if (channel->tracking_sem.queued_value > 0) { // The channel should have been idled before being destroyed, unless an // error was triggered. We need to check both error cases (global and // channel) to handle the UVM_TEST_CHANNEL_SANITY unit test. if (uvm_global_get_status() == NV_OK && uvm_channel_get_status(channel) == NV_OK) UVM_ASSERT(uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem)); // Remove all remaining GPFIFOs from their pushbuffer chunk, since the // pushbuffer has a longer lifetime. channel_update_progress_all(channel, UVM_CHANNEL_UPDATE_MODE_FORCE_ALL); } proc_remove(channel->procfs.pushes); proc_remove(channel->procfs.info); proc_remove(channel->procfs.dir); uvm_kvfree(channel->push_acquire_infos); uvm_kvfree(channel->push_infos); uvm_kvfree(channel->gpfifo_entries); if (uvm_conf_computing_mode_enabled(gpu)) { csl_destroy(channel); if (uvm_channel_is_ce(channel)) free_conf_computing_buffers(channel); } if (uvm_channel_is_proxy(channel)) uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle)); else uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle)); uvm_gpu_tracking_semaphore_free(&channel->tracking_sem); UVM_ASSERT(list_empty(&channel->tools.channel_list_node)); UVM_ASSERT(channel->tools.pending_event_count == 0); pool->num_channels--; } static unsigned channel_pool_type_num_gpfifo_entries(uvm_channel_manager_t *manager, uvm_channel_pool_type_t pool_type) { switch (pool_type) { case UVM_CHANNEL_POOL_TYPE_CE: case UVM_CHANNEL_POOL_TYPE_CE_PROXY: return manager->conf.num_gpfifo_entries; case UVM_CHANNEL_POOL_TYPE_SEC2: return manager->conf.num_gpfifo_entries; case UVM_CHANNEL_POOL_TYPE_WLC: { // WLC benefits from larger number of entries since more available // entries result in less frequent calls to // uvm_channel_update_progress 16 is the maximum size that can // re-use static pb preallocated memory when uploading the WLC // schedule. return 16; } case UVM_CHANNEL_POOL_TYPE_LCIC: { // Every channel needs at least 3 entries; 1 for sentinel and 2 more // for submitting GPFIFO control entries. The number also has to be // power of 2, as the HW stores the size as log2 value. // LCIC does not accept external pushes, uvm_channel_update_progress // is not a concern. return 4; } default: UVM_ASSERT_MSG(0, "Unhandled pool type: %d", pool_type); return 0; } } // Returns the TSG for a given channel. static uvmGpuTsgHandle channel_get_tsg(uvm_channel_t *channel) { unsigned tsg_index = 0; uvm_channel_pool_t *pool = channel->pool; if (uvm_channel_pool_is_wlc(pool) || uvm_channel_pool_is_lcic(pool)) { if (uvm_channel_pool_is_lcic(pool)) { channel = lcic_get_paired_wlc(channel); pool = channel->pool; } tsg_index = uvm_channel_index_in_pool(channel); } UVM_ASSERT(tsg_index < pool->num_tsgs); return pool->tsg_handles[tsg_index]; } static NV_STATUS internal_channel_create(uvm_channel_t *channel) { NV_STATUS status; UvmGpuChannelAllocParams channel_alloc_params; UvmGpuChannelInfo *channel_info = &channel->channel_info; uvm_channel_manager_t *manager = channel->pool->manager; uvm_gpu_t *gpu = manager->gpu; memset(&channel_alloc_params, 0, sizeof(channel_alloc_params)); channel_alloc_params.numGpFifoEntries = channel_pool_type_num_gpfifo_entries(manager, channel->pool->pool_type); channel_alloc_params.gpFifoLoc = manager->conf.gpfifo_loc; channel_alloc_params.gpPutLoc = manager->conf.gpput_loc; if (uvm_channel_is_sec2(channel)) { UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu)); // SEC2 channels' GPPUT and GPFIFO must be allocated in sysmem. channel_alloc_params.gpFifoLoc = UVM_BUFFER_LOCATION_SYS; channel_alloc_params.gpPutLoc = UVM_BUFFER_LOCATION_SYS; } status = uvm_rm_locked_call(nvUvmInterfaceChannelAllocate(channel_get_tsg(channel), &channel_alloc_params, &channel->handle, channel_info)); if (status != NV_OK) { UVM_ERR_PRINT("nvUvmInterfaceChannelAllocate() failed: %s, GPU %s, type %s\n", nvstatusToString(status), uvm_gpu_name(gpu), uvm_channel_pool_type_to_string(channel->pool->pool_type)); return status; } snprintf(channel->name, sizeof(channel->name), "ID %u:%u (0x%x:0x%x) %s %u", channel_info->hwRunlistId, channel_info->hwChannelId, channel_info->hwRunlistId, channel_info->hwChannelId, uvm_channel_is_sec2(channel) ? "SEC2" : uvm_channel_is_wlc(channel) ? "WLC" : uvm_channel_is_lcic(channel) ? "LCIC" : "CE", channel->pool->engine_index); return NV_OK; } static NV_STATUS proxy_channel_create(uvm_channel_t *channel, unsigned ce_index) { NV_STATUS status; unsigned proxy_index; UvmGpuPagingChannelAllocParams channel_alloc_params; uvm_channel_manager_t *manager = channel->pool->manager; uvm_gpu_t *gpu = manager->gpu; UVM_ASSERT(uvm_channel_is_proxy(channel)); memset(&channel_alloc_params, 0, sizeof(channel_alloc_params)); channel_alloc_params.engineIndex = ce_index; status = uvm_rm_locked_call(nvUvmInterfacePagingChannelAllocate(uvm_gpu_device_handle(gpu), &channel_alloc_params, &channel->proxy.handle, &channel->proxy.channel_info)); if (status != NV_OK) { UVM_ERR_PRINT("nvUvmInterfacePagingChannelAllocate() failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); return status; } proxy_index = uvm_channel_index_in_pool(channel); snprintf(channel->name, sizeof(channel->name), "Proxy %u CE %u", proxy_index, ce_index); return NV_OK; } static NV_STATUS channel_create(uvm_channel_pool_t *pool, uvm_channel_t *channel) { NV_STATUS status; uvm_channel_manager_t *manager = pool->manager; uvm_gpu_t *gpu = manager->gpu; uvm_gpu_semaphore_pool_t *semaphore_pool = gpu->semaphore_pool; unsigned int i; UVM_ASSERT(channel != NULL); channel->pool = pool; pool->num_channels++; INIT_LIST_HEAD(&channel->available_push_infos); channel->tools.pending_event_count = 0; INIT_LIST_HEAD(&channel->tools.channel_list_node); if (uvm_conf_computing_mode_enabled(gpu) && uvm_channel_is_ce(channel)) semaphore_pool = gpu->secure_semaphore_pool; status = uvm_gpu_tracking_semaphore_alloc(semaphore_pool, &channel->tracking_sem); if (status != NV_OK) { UVM_ERR_PRINT("uvm_gpu_tracking_semaphore_alloc() failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); goto error; } if (uvm_channel_is_proxy(channel)) status = proxy_channel_create(channel, pool->engine_index); else status = internal_channel_create(channel); if (status != NV_OK) goto error; channel->num_gpfifo_entries = channel_pool_type_num_gpfifo_entries(manager, pool->pool_type); channel->gpfifo_entries = uvm_kvmalloc_zero(sizeof(*channel->gpfifo_entries) * channel->num_gpfifo_entries); if (channel->gpfifo_entries == NULL) { status = NV_ERR_NO_MEMORY; goto error; } if (uvm_conf_computing_mode_enabled(gpu)) { status = csl_init(channel); if (status != NV_OK) goto error; if (uvm_channel_is_ce(channel)) { // Must happen after the channel's num_gpfifo_entries is known, as // UVM needs one IV slot per GPFIFO entry. status = alloc_conf_computing_buffers(channel); if (status != NV_OK) goto error; } } channel->push_infos = uvm_kvmalloc_zero(sizeof(*channel->push_infos) * channel->num_gpfifo_entries); if (channel->push_infos == NULL) { status = NV_ERR_NO_MEMORY; goto error; } if (uvm_push_info_is_tracking_acquires()) { channel->push_acquire_infos = uvm_kvmalloc_zero(sizeof(*channel->push_acquire_infos) * channel->num_gpfifo_entries); if (channel->push_acquire_infos == NULL) { status = NV_ERR_NO_MEMORY; goto error; } } for (i = 0; i < channel->num_gpfifo_entries; i++) list_add_tail(&channel->push_infos[i].available_list_node, &channel->available_push_infos); status = channel_create_procfs(channel); if (status != NV_OK) goto error; return NV_OK; error: channel_destroy(pool, channel); return status; } NvU64 uvm_channel_tracking_semaphore_get_gpu_va_in_channel(uvm_channel_t *semaphore_channel, uvm_channel_t *access_channel) { uvm_gpu_semaphore_t *semaphore = &semaphore_channel->tracking_sem.semaphore; uvm_gpu_t *gpu = uvm_channel_get_gpu(access_channel); return uvm_gpu_semaphore_get_gpu_va(semaphore, gpu, uvm_channel_is_proxy(access_channel)); } static NV_STATUS channel_init(uvm_channel_t *channel) { uvm_push_t push; uvm_gpu_t *gpu = uvm_channel_get_gpu(channel); NV_STATUS status; NvU32 num_entries = 1; if (uvm_gpu_has_pushbuffer_segments(gpu)) num_entries++; status = uvm_channel_reserve(channel, num_entries); if (status != NV_OK) return status; if (uvm_gpu_has_pushbuffer_segments(gpu)) { NvU64 gpfifo_entry; uvm_pushbuffer_t *pushbuffer = uvm_channel_get_pushbuffer(channel); NvU64 pb_base = uvm_pushbuffer_get_gpu_va_base(pushbuffer); if (uvm_channel_is_sec2(channel)) pb_base = uvm_pushbuffer_get_sec2_gpu_va_base(pushbuffer); else if (channel->conf_computing.static_pb_protected_vidmem) pb_base = uvm_rm_mem_get_gpu_uvm_va(channel->conf_computing.static_pb_protected_vidmem, gpu); gpu->parent->host_hal->set_gpfifo_pushbuffer_segment_base(&gpfifo_entry, pb_base); write_ctrl_gpfifo(channel, gpfifo_entry); } status = uvm_push_begin_on_reserved_channel(channel, &push, "Init channel"); if (status != NV_OK) { UVM_ERR_PRINT("Failed to begin push on channel: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); return status; } if (uvm_channel_is_ce(channel)) gpu->parent->ce_hal->init(&push); else gpu->parent->sec2_hal->init(&push); gpu->parent->host_hal->init(&push); status = uvm_push_end_and_wait(&push); if (status != NV_OK) { UVM_ERR_PRINT("Channel '%s' init failed: %s, GPU %s\n", channel->name, nvstatusToString(status), uvm_gpu_name(gpu)); } return status; } static bool channel_manager_uses_proxy_pool(uvm_channel_manager_t *manager) { return uvm_gpu_is_virt_mode_sriov_heavy(manager->gpu); } // Number of channels to create in a pool of the given type. // // TODO: Bug 1764958: Tweak this function after benchmarking real workloads. static unsigned channel_pool_type_num_channels(uvm_channel_pool_type_t pool_type) { // TODO: Bug 3387454: The vGPU plugin implementation supports a single // proxy channel per GPU if (pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY) return 1; // Not all GPU architectures support more than 1 channel per TSG. Since SEC2 // is not in UVM critical path for performance, we conservatively create a // pool/TSG with a single channel. if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2) return 1; if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC || pool_type == UVM_CHANNEL_POOL_TYPE_LCIC) return UVM_PUSH_MAX_CONCURRENT_PUSHES; return 2; } // Number of TSGs to create in a pool of a given type. static unsigned channel_pool_type_num_tsgs(uvm_channel_pool_type_t pool_type) { // For WLC and LCIC channels, we create one TSG per WLC/LCIC channel pair. // The TSG is stored in the WLC pool. if (pool_type == UVM_CHANNEL_POOL_TYPE_WLC) return channel_pool_type_num_channels(pool_type); else if (pool_type == UVM_CHANNEL_POOL_TYPE_LCIC) return 0; return 1; } static UVM_GPU_CHANNEL_ENGINE_TYPE pool_type_to_engine_type(uvm_channel_pool_type_t pool_type) { if (pool_type == UVM_CHANNEL_POOL_TYPE_SEC2) return UVM_GPU_CHANNEL_ENGINE_TYPE_SEC2; return UVM_GPU_CHANNEL_ENGINE_TYPE_CE; } static void tsg_destroy(uvm_channel_pool_t *pool, uvmGpuTsgHandle tsg_handle) { UVM_ASSERT(pool->num_tsgs > 0); uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle)); pool->num_tsgs--; } static NV_STATUS tsg_create(uvm_channel_pool_t *pool, uvmGpuTsgHandle *tsg_handle) { NV_STATUS status; UvmGpuTsgAllocParams tsg_alloc_params; uvm_gpu_t *gpu = pool->manager->gpu; pool->num_tsgs++; tsg_alloc_params.engineType = pool_type_to_engine_type(pool->pool_type); tsg_alloc_params.engineIndex = pool->engine_index; status = uvm_rm_locked_call(nvUvmInterfaceTsgAllocate(gpu->rm_address_space, &tsg_alloc_params, tsg_handle)); if (status != NV_OK) { UVM_ERR_PRINT("nvUvmInterfaceTsgAllocate() failed: %s, GPU %s, type %s\n", nvstatusToString(status), uvm_gpu_name(gpu), uvm_channel_pool_type_to_string(pool->pool_type)); goto error; } return NV_OK; error: tsg_destroy(pool, *tsg_handle); return status; } static void channel_pool_destroy(uvm_channel_pool_t *pool) { UVM_ASSERT(pool->manager->num_channel_pools > 0); while (pool->num_channels > 0) channel_destroy(pool, pool->channels + pool->num_channels - 1); uvm_kvfree(pool->channels); pool->channels = NULL; while (pool->num_tsgs > 0) tsg_destroy(pool, *(pool->tsg_handles + pool->num_tsgs - 1)); uvm_kvfree(pool->tsg_handles); pool->tsg_handles = NULL; pool->manager->num_channel_pools--; } static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager, uvm_channel_pool_type_t pool_type, unsigned engine_index, uvm_channel_pool_t **pool_out) { NV_STATUS status; unsigned i; unsigned num_channels; unsigned num_tsgs; uvm_channel_pool_t *pool; UVM_ASSERT(uvm_pool_type_is_valid(pool_type)); pool = channel_manager->channel_pools + channel_manager->num_channel_pools; channel_manager->num_channel_pools++; pool->manager = channel_manager; pool->engine_index = engine_index; pool->pool_type = pool_type; num_tsgs = channel_pool_type_num_tsgs(pool_type); if (num_tsgs != 0) { pool->tsg_handles = uvm_kvmalloc_zero(sizeof(*pool->tsg_handles) * num_tsgs); if (!pool->tsg_handles) { status = NV_ERR_NO_MEMORY; goto error; } for (i = 0; i < num_tsgs; i++) { uvmGpuTsgHandle *tsg_handle = pool->tsg_handles + i; status = tsg_create(pool, tsg_handle); if (status != NV_OK) goto error; } } channel_pool_lock_init(pool); num_channels = channel_pool_type_num_channels(pool_type); UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL); if (uvm_conf_computing_mode_enabled(channel_manager->gpu)) { // Use different order lock for SEC2 and WLC channels. // This allows reserving a SEC2 or WLC channel for indirect work // submission while holding a reservation for a channel. uvm_lock_order_t order = uvm_channel_pool_is_sec2(pool) ? UVM_LOCK_ORDER_CSL_SEC2_PUSH : (uvm_channel_pool_is_wlc(pool) ? UVM_LOCK_ORDER_CSL_WLC_PUSH : UVM_LOCK_ORDER_CSL_PUSH); uvm_sema_init(&pool->push_sem, num_channels, order); } pool->channels = uvm_kvmalloc_zero(sizeof(*pool->channels) * num_channels); if (!pool->channels) { status = NV_ERR_NO_MEMORY; goto error; } for (i = 0; i < num_channels; i++) { uvm_channel_t *channel = pool->channels + i; status = channel_create(pool, channel); if (status != NV_OK) goto error; status = channel_init(channel); if (status != NV_OK) goto error; } *pool_out = pool; return NV_OK; error: channel_pool_destroy(pool); return status; } static bool ce_usable_for_channel_type(uvm_channel_type_t type, const UvmGpuCopyEngineCaps *cap) { if (!cap->supported || cap->grce) return false; switch (type) { case UVM_CHANNEL_TYPE_CPU_TO_GPU: case UVM_CHANNEL_TYPE_GPU_TO_CPU: return cap->sysmem; case UVM_CHANNEL_TYPE_GPU_INTERNAL: case UVM_CHANNEL_TYPE_MEMOPS: return true; case UVM_CHANNEL_TYPE_GPU_TO_GPU: return cap->p2p; default: UVM_ASSERT_MSG(false, "Unexpected channel type 0x%x\n", type); return false; } } static unsigned ce_usage_count(NvU32 ce, const unsigned *preferred_ce) { unsigned i; unsigned count = 0; UVM_ASSERT(ce < UVM_COPY_ENGINE_COUNT_MAX); for (i = 0; i < UVM_CHANNEL_TYPE_CE_COUNT; i++) { if (ce == preferred_ce[i]) count++; } return count; } // Returns negative if the first CE should be considered better than the second static int compare_ce_for_channel_type(const UvmGpuCopyEngineCaps *ce_caps, uvm_channel_type_t type, NvU32 ce_index0, NvU32 ce_index1, NvU32 *preferred_ce) { unsigned ce0_usage, ce1_usage; const UvmGpuCopyEngineCaps *cap0 = ce_caps + ce_index0; const UvmGpuCopyEngineCaps *cap1 = ce_caps + ce_index1; UVM_ASSERT(ce_usable_for_channel_type(type, cap0)); UVM_ASSERT(ce_usable_for_channel_type(type, cap1)); UVM_ASSERT(ce_index0 < UVM_COPY_ENGINE_COUNT_MAX); UVM_ASSERT(ce_index1 < UVM_COPY_ENGINE_COUNT_MAX); UVM_ASSERT(ce_index0 != ce_index1); switch (type) { case UVM_CHANNEL_TYPE_CPU_TO_GPU: // For CPU to GPU fast sysmem read is the most important if (cap0->sysmemRead != cap1->sysmemRead) return cap1->sysmemRead - cap0->sysmemRead; // Prefer not to take up the CEs for nvlink P2P if (cap0->nvlinkP2p != cap1->nvlinkP2p) return cap0->nvlinkP2p - cap1->nvlinkP2p; break; case UVM_CHANNEL_TYPE_GPU_TO_CPU: // For GPU to CPU fast sysmem write is the most important if (cap0->sysmemWrite != cap1->sysmemWrite) return cap1->sysmemWrite - cap0->sysmemWrite; // Prefer not to take up the CEs for nvlink P2P if (cap0->nvlinkP2p != cap1->nvlinkP2p) return cap0->nvlinkP2p - cap1->nvlinkP2p; break; case UVM_CHANNEL_TYPE_GPU_TO_GPU: // Prefer the LCE with the most PCEs { int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask); if (pce_diff != 0) return pce_diff; } break; case UVM_CHANNEL_TYPE_GPU_INTERNAL: // We want the max possible bandwidth for CEs used for GPU_INTERNAL, // for now assume that the number of PCEs is a good measure. // TODO: Bug 1735254: Add a direct CE query for local FB bandwidth { int pce_diff = (int)hweight32(cap1->cePceMask) - (int)hweight32(cap0->cePceMask); if (pce_diff != 0) return pce_diff; } // Leave P2P CEs to the GPU_TO_GPU channel type, when possible if (cap0->nvlinkP2p != cap1->nvlinkP2p) return cap0->nvlinkP2p - cap1->nvlinkP2p; break; case UVM_CHANNEL_TYPE_MEMOPS: // For MEMOPS we mostly care about latency which should be better // with less used CEs (although we only know about our own usage and // not system-wide) so just break out to get the default ordering // which prioritizes usage count. break; default: UVM_ASSERT_MSG(false, "Unexpected channel type 0x%x\n", type); return 0; } // By default, prefer less used CEs (within the UVM driver at least) ce0_usage = ce_usage_count(ce_index0, preferred_ce); ce1_usage = ce_usage_count(ce_index1, preferred_ce); if (ce0_usage != ce1_usage) return ce0_usage - ce1_usage; // And CEs that don't share PCEs if (cap0->shared != cap1->shared) return cap0->shared - cap1->shared; // Last resort, just order by index return ce_index0 - ce_index1; } // Identify usable CEs, and select the preferred CE for a given channel type. static NV_STATUS pick_ce_for_channel_type(uvm_channel_manager_t *manager, const UvmGpuCopyEngineCaps *ce_caps, uvm_channel_type_t type, unsigned *preferred_ce) { NvU32 i; NvU32 best_ce = UVM_COPY_ENGINE_COUNT_MAX; UVM_ASSERT(type < UVM_CHANNEL_TYPE_CE_COUNT); for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) { const UvmGpuCopyEngineCaps *cap = ce_caps + i; if (!ce_usable_for_channel_type(type, cap)) continue; __set_bit(i, manager->ce_mask); if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) { best_ce = i; continue; } if (compare_ce_for_channel_type(ce_caps, type, i, best_ce, preferred_ce) < 0) best_ce = i; } if (best_ce == UVM_COPY_ENGINE_COUNT_MAX) { UVM_ERR_PRINT("Failed to find a suitable CE for channel type %s\n", uvm_channel_type_to_string(type)); return NV_ERR_NOT_SUPPORTED; } preferred_ce[type] = best_ce; return NV_OK; } static NV_STATUS channel_manager_pick_copy_engines(uvm_channel_manager_t *manager, unsigned *preferred_ce) { NV_STATUS status; unsigned i; UvmGpuCopyEnginesCaps *ces_caps; uvm_channel_type_t types[] = {UVM_CHANNEL_TYPE_CPU_TO_GPU, UVM_CHANNEL_TYPE_GPU_TO_CPU, UVM_CHANNEL_TYPE_GPU_INTERNAL, UVM_CHANNEL_TYPE_GPU_TO_GPU, UVM_CHANNEL_TYPE_MEMOPS}; ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps)); if (!ces_caps) return NV_ERR_NO_MEMORY; status = uvm_rm_locked_call(nvUvmInterfaceQueryCopyEnginesCaps(uvm_gpu_device_handle(manager->gpu), ces_caps)); if (status != NV_OK) goto out; // The order of picking CEs for each type matters as it's affected by the // usage count of each CE and it increases every time a CE is selected. // MEMOPS has the least priority as it only cares about low usage of the // CE to improve latency for (i = 0; i < ARRAY_SIZE(types); ++i) { status = pick_ce_for_channel_type(manager, ces_caps->copyEngineCaps, types[i], preferred_ce); if (status != NV_OK) goto out; } out: uvm_kvfree(ces_caps); return status; } // Return the pool corresponding to the given CE index // // This function cannot be used to access the proxy pool in SR-IOV heavy. static uvm_channel_pool_t *channel_manager_ce_pool(uvm_channel_manager_t *manager, NvU32 ce) { uvm_channel_pool_t *pool; UVM_ASSERT(test_bit(ce, manager->ce_mask)); // The index of the pool associated with 'ce' is the number of usable CEs // in [0, ce) pool = manager->channel_pools + bitmap_weight(manager->ce_mask, ce); UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE); UVM_ASSERT(pool->engine_index == ce); return pool; } void uvm_channel_manager_set_p2p_ce(uvm_channel_manager_t *manager, uvm_gpu_t *peer, NvU32 optimal_ce) { const NvU32 peer_gpu_index = uvm_id_gpu_index(peer->id); UVM_ASSERT(manager->gpu != peer); UVM_ASSERT(optimal_ce < UVM_COPY_ENGINE_COUNT_MAX); manager->pool_to_use.gpu_to_gpu[peer_gpu_index] = channel_manager_ce_pool(manager, optimal_ce); } static bool is_string_valid_location(const char *loc) { return strcmp(uvm_channel_gpfifo_loc, "sys") == 0 || strcmp(uvm_channel_gpfifo_loc, "vid") == 0 || strcmp(uvm_channel_gpfifo_loc, "auto") == 0; } static UVM_BUFFER_LOCATION string_to_buffer_location(const char *loc) { UVM_ASSERT(is_string_valid_location(loc)); if (strcmp(loc, "sys") == 0) return UVM_BUFFER_LOCATION_SYS; else if (strcmp(loc, "vid") == 0) return UVM_BUFFER_LOCATION_VID; else return UVM_BUFFER_LOCATION_DEFAULT; } static const char *buffer_location_to_string(UVM_BUFFER_LOCATION loc) { if (loc == UVM_BUFFER_LOCATION_SYS) return "sys"; else if (loc == UVM_BUFFER_LOCATION_VID) return "vid"; else if (loc == UVM_BUFFER_LOCATION_DEFAULT) return "auto"; UVM_ASSERT_MSG(false, "Invalid buffer locationvalue %d\n", loc); return NULL; } static void init_channel_manager_conf(uvm_channel_manager_t *manager) { const char *gpfifo_loc_value; const char *gpput_loc_value; const char *pushbuffer_loc_value; uvm_gpu_t *gpu = manager->gpu; // 1- Number of GPFIFO entries manager->conf.num_gpfifo_entries = uvm_channel_num_gpfifo_entries; if (uvm_channel_num_gpfifo_entries < UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN) manager->conf.num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN; else if (uvm_channel_num_gpfifo_entries > UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX) manager->conf.num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX; if (!is_power_of_2(manager->conf.num_gpfifo_entries)) manager->conf.num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT; if (manager->conf.num_gpfifo_entries != uvm_channel_num_gpfifo_entries) { pr_info("Invalid value for uvm_channel_num_gpfifo_entries = %u, using %u instead\n", uvm_channel_num_gpfifo_entries, manager->conf.num_gpfifo_entries); } // 2- Allocation locations if (uvm_conf_computing_mode_is_hcc(gpu)) { UVM_ASSERT(gpu->mem_info.size > 0); // When the Confidential Computing feature is enabled, the GPU is // guaranteed to have a frame buffer and the hardware enforces the // following apertures for all channel types but SEC2. We create SEC2 // channels by overriding their GPPUT and GPFIFO locations and ignoring // the location from the channel manager. SEC2 channels are used to // bootstrap secure work submission. To setup secure work launch, data // transfers from unprotected main memory to protected vidmem are // performed by the SEC2 engine and the driver must push work to SEC2 // channels using unprotected locations. manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_VID; manager->conf.gpput_loc = UVM_BUFFER_LOCATION_VID; // Pushbuffer is located in unprotected sysmem. This is the location // that gets mapped to both CPU and GPU. It is populated either by // signed pushes (SEC2 channels), or encrypted pushes (CE channels). manager->conf.pushbuffer_loc = UVM_BUFFER_LOCATION_SYS; return; } // Override if the GPU doesn't have memory if (gpu->mem_info.size == 0) { manager->conf.pushbuffer_loc = UVM_BUFFER_LOCATION_SYS; manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_SYS; manager->conf.gpput_loc = UVM_BUFFER_LOCATION_SYS; return; } manager->conf.pushbuffer_loc = UVM_BUFFER_LOCATION_SYS; pushbuffer_loc_value = uvm_channel_pushbuffer_loc; if (!is_string_valid_location(pushbuffer_loc_value)) { pushbuffer_loc_value = UVM_CHANNEL_PUSHBUFFER_LOC_DEFAULT; pr_info("Invalid value for uvm_channel_pushbuffer_loc = %s, using %s instead\n", uvm_channel_pushbuffer_loc, pushbuffer_loc_value); } // Override the default value if requested by the user if (strcmp(pushbuffer_loc_value, "vid") == 0) { // aarch64 requires memset_io/memcpy_io instead of memset/memcpy for // mapped GPU memory. The existing push paths only use memset/memcpy, // so force the location to sys for now. // TODO: Bug 2904133: Remove the following "if" after the bug is fixed. if (NVCPU_IS_AARCH64) { pr_info("uvm_channel_pushbuffer_loc = %s is not supported on AARCH64, using sys instead\n", pushbuffer_loc_value); manager->conf.pushbuffer_loc = UVM_BUFFER_LOCATION_SYS; } else { manager->conf.pushbuffer_loc = UVM_BUFFER_LOCATION_VID; } } // 3- GPFIFO/GPPut location // Only support the knobs for GPFIFO/GPPut on Volta+ if (!gpu->parent->gpfifo_in_vidmem_supported) { if (manager->conf.gpput_loc == UVM_BUFFER_LOCATION_SYS) { pr_info("CAUTION: allocating GPPut in sysmem is NOT supported and may crash the system, using %s instead\n", buffer_location_to_string(UVM_BUFFER_LOCATION_DEFAULT)); } manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_DEFAULT; manager->conf.gpput_loc = UVM_BUFFER_LOCATION_DEFAULT; return; } gpfifo_loc_value = uvm_channel_gpfifo_loc; if (!is_string_valid_location(gpfifo_loc_value)) { gpfifo_loc_value = UVM_CHANNEL_GPFIFO_LOC_DEFAULT; pr_info("Invalid value for uvm_channel_gpfifo_loc = %s, using %s instead\n", uvm_channel_gpfifo_loc, gpfifo_loc_value); } gpput_loc_value = uvm_channel_gpput_loc; if (!is_string_valid_location(gpput_loc_value)) { gpput_loc_value = UVM_CHANNEL_GPPUT_LOC_DEFAULT; pr_info("Invalid value for uvm_channel_gpput_loc = %s, using %s instead\n", uvm_channel_gpput_loc, gpput_loc_value); } // On coherent platforms where the GPU does not cache sysmem but the CPU // caches vidmem (and sysmem), we place GPFIFO and GPPUT on sysmem to avoid // cache thrash. The memory access latency is reduced, despite the required // access through the bus, because no cache coherence message is exchanged. if (uvm_gpu_is_coherent(gpu->parent)) { manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_SYS; // On GPUs with limited ESCHED addressing range, e.g., Volta on P9, RM // cannot guarantee that USERD/GPPUT physical address is accessible by // ESCHED. We set GPPUT location to vidmem where physical addresses are // all accessible by ESCHED. We use the max_host_va as a proxy for the // PA limitation, since all architectures with 40b VA limits also have // 40b PA limits. manager->conf.gpput_loc = gpu->parent->max_host_va == (1ull << 40) ? UVM_BUFFER_LOCATION_VID : UVM_BUFFER_LOCATION_SYS; } else { // By default we place GPFIFO and GPPUT on vidmem as it potentially has // lower latency. manager->conf.gpfifo_loc = UVM_BUFFER_LOCATION_VID; manager->conf.gpput_loc = UVM_BUFFER_LOCATION_VID; } // Override defaults if (string_to_buffer_location(gpfifo_loc_value) != UVM_BUFFER_LOCATION_DEFAULT) manager->conf.gpfifo_loc = string_to_buffer_location(gpfifo_loc_value); if (string_to_buffer_location(gpput_loc_value) != UVM_BUFFER_LOCATION_DEFAULT) manager->conf.gpput_loc = string_to_buffer_location(gpput_loc_value); } // Returns the maximum number of pools that are needed in the current // configuration. The implementation may choose to create a smaller number of // pools. static unsigned channel_manager_get_max_pools(uvm_channel_manager_t *manager) { unsigned num_channel_pools; // Create one CE channel pool per usable CE num_channel_pools = bitmap_weight(manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX); // CE proxy channel pool. if (uvm_gpu_uses_proxy_channel_pool(manager->gpu)) num_channel_pools++; // SEC2 pool, WLC pool, LCIC pool if (uvm_conf_computing_mode_enabled(manager->gpu)) num_channel_pools += 3; return num_channel_pools; } static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce) { unsigned ce; // A pool is created for each usable CE, even if it has not been selected as // the preferred CE for any type, because as more information is discovered // (for example, a pair of peer GPUs is added) we may start using the // previously idle pools. for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) { NV_STATUS status; unsigned type; uvm_channel_pool_t *pool = NULL; status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool); if (status != NV_OK) return status; for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) { // Set pool type if it hasn't been set before. if (preferred_ce[type] == ce && manager->pool_to_use.default_for_type[type] == NULL) manager->pool_to_use.default_for_type[type] = pool; } } return NV_OK; } static NV_STATUS setup_wlc_schedule(uvm_channel_t *wlc) { uvm_gpu_t *gpu = uvm_channel_get_gpu(wlc); NvU64 protected_vidmem = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_protected_vidmem, gpu); NvU64 unprotected_sysmem_gpu = uvm_rm_mem_get_gpu_uvm_va(wlc->conf_computing.static_pb_unprotected_sysmem, gpu); void *unprotected_sysmem_cpu = wlc->conf_computing.static_pb_unprotected_sysmem_cpu; NvU64 tag_offset = (uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_auth_tag_cpu - (uintptr_t)wlc->conf_computing.static_pb_unprotected_sysmem_cpu; NvU64 *wlc_gpfifo_entries; uvm_push_t wlc_decrypt_push, sec2_push; NvU32 decrypt_push_size; int i; NV_STATUS status = NV_OK; // "gpfifo" is the representation of GPFIFO copied to gpFifoGpu const size_t gpfifo_size = wlc->num_gpfifo_entries * sizeof(*wlc_gpfifo_entries); void *gpfifo_unprotected_cpu = unprotected_sysmem_cpu; NvU64 gpfifo_unprotected_gpu = unprotected_sysmem_gpu; // "run_push" represents mutable push location used by WLC uvm_gpu_address_t run_push_protected_gpu = uvm_gpu_address_virtual(protected_vidmem); uvm_gpu_address_t run_push_unprotected_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu); uvm_gpu_address_t run_push_unprotected_auth_tag_gpu = uvm_gpu_address_virtual(unprotected_sysmem_gpu + tag_offset); // "decrypt_push" represents WLC decrypt push, constructed using fake_push. // Copied to wlc_pb_base + UVM_MAX_WLC_PUSH_SIZE, as the second of the two // pushes that make the WLC fixed schedule. NvU64 decrypt_push_protected_gpu = UVM_ALIGN_UP(protected_vidmem + UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_SEC2_BUF_ALIGNMENT); NvU64 decrypt_push_unprotected_gpu = unprotected_sysmem_gpu + gpfifo_size; void *decrypt_push_unprotected_cpu = (char*)gpfifo_unprotected_cpu + gpfifo_size; // Tags for upload via SEC2 void *decrypt_push_auth_tag, *gpfifo_auth_tag; uvm_gpu_address_t decrypt_push_auth_tag_gpu, gpfifo_auth_tag_gpu; BUILD_BUG_ON(sizeof(*wlc_gpfifo_entries) != sizeof(*wlc->channel_info.gpFifoEntries)); UVM_ASSERT(uvm_channel_is_wlc(wlc)); UVM_ASSERT(tag_offset == UVM_ALIGN_UP(UVM_MAX_WLC_PUSH_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT)); // WLC schedule consists of two parts, the number of entries needs to be even. // This also guarantees that the size is 16B aligned UVM_ASSERT(IS_ALIGNED(wlc->num_gpfifo_entries, 2)); wlc_gpfifo_entries = uvm_kvmalloc(gpfifo_size); if (!wlc_gpfifo_entries) return NV_ERR_NO_MEMORY; // WLC can only process one job at a time. // Prune any initialization entries and block all but one (+1 for sentinel) uvm_channel_update_progress(wlc); if (!try_claim_channel(wlc, wlc->num_gpfifo_entries - 2)) { status = NV_ERR_INVALID_STATE; goto free_gpfifo_entries; } // WLC schedule has two steps: // 1.) Decrypt from gpu_unprotected_base to wlc_pb_base // Increment LCIC PUT // 2.) Execute push at wlc_pb_base // The first one is a push, the second one is just a GPFIFO entry status = uvm_push_begin_fake(gpu, &wlc_decrypt_push); if (status != NV_OK) goto free_gpfifo_entries; // Begin WLC DECRYPT push uvm_push_set_flag(&wlc_decrypt_push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); gpu->parent->ce_hal->decrypt(&wlc_decrypt_push, run_push_protected_gpu, run_push_unprotected_gpu, UVM_MAX_WLC_PUSH_SIZE, run_push_unprotected_auth_tag_gpu); uvm_hal_wfi_membar(&wlc_decrypt_push, UVM_MEMBAR_NONE); decrypt_push_size = uvm_push_get_size(&wlc_decrypt_push); // The code below reuses static unprotected sysmem buffer as a temporary // storage for uploading the schedule. Check that everything fits. UVM_ASSERT(gpfifo_size + decrypt_push_size <= UVM_MAX_WLC_PUSH_SIZE); // GPFIFO schedule should alternate between the decrypt routine // we don't know if initialization used even or odd number of // GPFIFO entries so the exact pattern depends on the value of "PUT" for (i = 0; i < wlc->num_gpfifo_entries; ++i) { if (i % 2 == wlc->cpu_put % 2) { gpu->parent->host_hal->set_gpfifo_entry(wlc_gpfifo_entries + i, decrypt_push_protected_gpu, decrypt_push_size, UVM_GPFIFO_SYNC_PROCEED); } else { gpu->parent->host_hal->set_gpfifo_entry(wlc_gpfifo_entries + i, run_push_protected_gpu.address, UVM_MAX_WLC_PUSH_SIZE, UVM_GPFIFO_SYNC_WAIT); } } // The schedule is prepared. Upload to vidmem status = uvm_push_begin(wlc->pool->manager, UVM_CHANNEL_TYPE_SEC2, &sec2_push, "Upload WLC schedule for: %s", wlc->name); if (status != NV_OK) goto end_wlc_dec_push; decrypt_push_auth_tag = uvm_push_get_single_inline_buffer(&sec2_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &decrypt_push_auth_tag_gpu); gpfifo_auth_tag = uvm_push_get_single_inline_buffer(&sec2_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &gpfifo_auth_tag_gpu); // Upload WLC pushbuffer uvm_conf_computing_cpu_encrypt(sec2_push.channel, decrypt_push_unprotected_cpu, wlc_decrypt_push.begin, NULL, decrypt_push_size, decrypt_push_auth_tag); gpu->parent->sec2_hal->decrypt(&sec2_push, decrypt_push_protected_gpu, decrypt_push_unprotected_gpu, decrypt_push_size, decrypt_push_auth_tag_gpu.address); // Upload WLC GPFIFO uvm_conf_computing_cpu_encrypt(sec2_push.channel, gpfifo_unprotected_cpu, wlc_gpfifo_entries, NULL, gpfifo_size, gpfifo_auth_tag); gpu->parent->sec2_hal->decrypt(&sec2_push, wlc->channel_info.gpFifoGpuVa, gpfifo_unprotected_gpu, gpfifo_size, gpfifo_auth_tag_gpu.address); // Prime the WLC by setting "PUT" two steps ahead. Reuse the current // cpu_put value that was used during channel initialization. // Don't update wlc->cpu_put, it will be used to track submitted pushes // as any other channel. update_gpput_via_sec2(&sec2_push, wlc, (wlc->cpu_put + 2) % wlc->num_gpfifo_entries); status = uvm_push_end_and_wait(&sec2_push); end_wlc_dec_push: uvm_push_end_fake(&wlc_decrypt_push); free_gpfifo_entries: uvm_kvfree(wlc_gpfifo_entries); return status; } static NV_STATUS setup_lcic_schedule(uvm_channel_t *paired_wlc, uvm_channel_t *lcic) { uvm_gpu_t *gpu = uvm_channel_get_gpu(lcic); NvU64 lcic_pb_base = uvm_rm_mem_get_gpu_uvm_va(lcic->conf_computing.static_pb_protected_vidmem, gpu); // Reuse WLC sysmem allocation NvU64 gpu_unprotected = uvm_rm_mem_get_gpu_uvm_va(paired_wlc->conf_computing.static_pb_unprotected_sysmem, gpu); char *cpu_unprotected = paired_wlc->conf_computing.static_pb_unprotected_sysmem_cpu; uvm_gpu_semaphore_t *lcic_gpu_semaphore = &lcic->tracking_sem.semaphore; uvm_gpu_address_t notifier_src_entry_addr = lcic->conf_computing.static_notifier_entry_unprotected_sysmem_gpu_va; uvm_gpu_address_t notifier_src_exit_addr = lcic->conf_computing.static_notifier_exit_unprotected_sysmem_gpu_va; uvm_gpu_address_t notifier_dst_addr = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.notifier, gpu, false); uvm_gpu_address_t encrypted_payload_gpu_va = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.encrypted_payload, gpu, false); uvm_gpu_address_t semaphore_gpu_va = uvm_gpu_address_virtual(uvm_channel_tracking_semaphore_get_gpu_va(lcic)); uvm_gpu_address_t auth_tag_gpu_va = uvm_rm_mem_get_gpu_va(lcic_gpu_semaphore->conf_computing.auth_tag, gpu, false); NvU32 payload_size = sizeof(*lcic->tracking_sem.semaphore.payload); NvU32 notifier_size = sizeof(*lcic->conf_computing.static_notifier_entry_unprotected_sysmem_cpu); NvU64 *lcic_gpfifo_entries; uvm_push_t lcic_push, sec2_push; NvU32 lcic_push_size; int i; NV_STATUS status; const size_t gpfifo_size = lcic->num_gpfifo_entries * sizeof(*lcic_gpfifo_entries); char *gpfifo_unprotected_cpu = cpu_unprotected; NvU64 gpfifo_unprotected_gpu = gpu_unprotected; char *lcic_push_unprotected_cpu = gpfifo_unprotected_cpu + gpfifo_size; NvU64 lcic_push_unprotected_gpu = gpfifo_unprotected_gpu + gpfifo_size; NvU64 lcic_push_protected_gpu = lcic_pb_base; char *lcic_push_enc_tag, *gpfifo_enc_tag; uvm_gpu_address_t lcic_push_enc_tag_gpu, gpfifo_enc_tag_gpu; BUILD_BUG_ON(sizeof(*lcic_gpfifo_entries) != sizeof(*lcic->channel_info.gpFifoEntries)); UVM_ASSERT(uvm_channel_is_wlc(paired_wlc)); UVM_ASSERT(uvm_channel_is_lcic(lcic)); lcic_gpfifo_entries = uvm_kvmalloc(gpfifo_size); if (!lcic_gpfifo_entries) return NV_ERR_NO_MEMORY; // LCIC can not process outside jobs. // Prune any initialization entries and // block all gpfifo entries (-1 for sentinel) uvm_channel_update_progress(lcic); if (!try_claim_channel(lcic, lcic->num_gpfifo_entries - 1)) { status = NV_ERR_INVALID_STATE; goto free_gpfifo_entries; } status = uvm_push_begin_fake(gpu, &lcic_push); if (status != NV_OK) goto free_gpfifo_entries; // LCIC schedule is simple: // 1.) wait for engine idle // 2.) advance the WLC PUT by 2 // 3.) release driver semaphore uvm_hal_wfi_membar(&lcic_push, UVM_MEMBAR_NONE); gpu->parent->ce_hal->semaphore_reduction_inc(&lcic_push, paired_wlc->channel_info.gpPutGpuVa, paired_wlc->num_gpfifo_entries - 1); gpu->parent->ce_hal->semaphore_reduction_inc(&lcic_push, paired_wlc->channel_info.gpPutGpuVa, paired_wlc->num_gpfifo_entries - 1); gpu->parent->ce_hal->semaphore_reduction_inc(&lcic_push, semaphore_gpu_va.address, 0xffffffff); gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_entry_addr, notifier_size); gpu->parent->ce_hal->encrypt(&lcic_push, encrypted_payload_gpu_va, semaphore_gpu_va, payload_size, auth_tag_gpu_va); gpu->parent->ce_hal->memcopy(&lcic_push, notifier_dst_addr, notifier_src_exit_addr, notifier_size); // End LCIC push lcic_push_size = uvm_push_get_size(&lcic_push); // We're reusing pre-allocated structures from WLC, make sure we fit. UVM_ASSERT(lcic_push_size == UVM_LCIC_PUSH_SIZE); UVM_ASSERT(lcic_push_size + gpfifo_size <= UVM_MAX_WLC_PUSH_SIZE); // Set all entries to execute the above push for (i = 0; i < lcic->num_gpfifo_entries; ++i) gpu->parent->host_hal->set_gpfifo_entry(lcic_gpfifo_entries + i, lcic_push_protected_gpu, lcic_push_size, UVM_GPFIFO_SYNC_PROCEED); // Upload the prepared schedule using SEC2 status = uvm_push_begin(lcic->pool->manager, UVM_CHANNEL_TYPE_SEC2, &sec2_push, "Upload LCIC schedule for: %s", lcic->name); if (status != NV_OK) goto end_lcic_push; lcic_push_enc_tag = uvm_push_get_single_inline_buffer(&sec2_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &lcic_push_enc_tag_gpu); gpfifo_enc_tag = uvm_push_get_single_inline_buffer(&sec2_push, UVM_CONF_COMPUTING_AUTH_TAG_SIZE, UVM_CONF_COMPUTING_AUTH_TAG_ALIGNMENT, &gpfifo_enc_tag_gpu); // Upload LCIC pushbuffer uvm_conf_computing_cpu_encrypt(sec2_push.channel, lcic_push_unprotected_cpu, lcic_push.begin, NULL, lcic_push_size, lcic_push_enc_tag); gpu->parent->sec2_hal->decrypt(&sec2_push, lcic_push_protected_gpu, lcic_push_unprotected_gpu, lcic_push_size, lcic_push_enc_tag_gpu.address); // Upload LCIC GPFIFO uvm_conf_computing_cpu_encrypt(sec2_push.channel, gpfifo_unprotected_cpu, lcic_gpfifo_entries, NULL, gpfifo_size, gpfifo_enc_tag); gpu->parent->sec2_hal->decrypt(&sec2_push, lcic->channel_info.gpFifoGpuVa, gpfifo_unprotected_gpu, gpfifo_size, gpfifo_enc_tag_gpu.address); status = uvm_push_end_and_wait(&sec2_push); end_lcic_push: uvm_push_end_fake(&lcic_push); free_gpfifo_entries: uvm_kvfree(lcic_gpfifo_entries); return status; } static NV_STATUS channel_manager_setup_wlc_lcic(uvm_channel_pool_t *wlc_pool, uvm_channel_pool_t *lcic_pool) { NvU32 i; UVM_ASSERT(wlc_pool->manager == lcic_pool->manager); UVM_ASSERT(wlc_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] != NULL); UVM_ASSERT(lcic_pool->manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] == NULL); UVM_ASSERT(wlc_pool->num_channels == lcic_pool->num_channels); for (i = 0; i < wlc_pool->num_channels; ++i) { uvm_channel_t *wlc = wlc_pool->channels + i; uvm_channel_t *lcic = lcic_pool->channels + i; NV_STATUS status; status = setup_wlc_schedule(wlc); if (status != NV_OK) return status; status = setup_lcic_schedule(wlc, lcic); if (status != NV_OK) return status; } return NV_OK; } static NV_STATUS channel_manager_create_conf_computing_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce) { NV_STATUS status; unsigned wlc_lcic_ce_index; uvm_channel_pool_t *sec2_pool = NULL; uvm_channel_pool_t *wlc_pool = NULL; uvm_channel_pool_t *lcic_pool = NULL; if (!uvm_conf_computing_mode_enabled(manager->gpu)) return NV_OK; status = uvm_rm_mem_alloc(manager->gpu, UVM_RM_MEM_TYPE_SYS, sizeof(UvmCslIv), UVM_CONF_COMPUTING_BUF_ALIGNMENT, &manager->gpu->conf_computing.iv_rm_mem); if (status != NV_OK) return status; // Create SEC2 pool. This needs to be done first, initialization of // other channels needs SEC2. status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_SEC2, 0, &sec2_pool); if (status != NV_OK) return status; manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_SEC2] = sec2_pool; // Use the same CE as CPU TO GPU channels for WLC/LCIC // Both need to use the same engine for the fixed schedule to work. // TODO: Bug 3981928: [hcc][uvm] Optimize parameters of WLC/LCIC secure // work launch // Find a metric to select the best CE to use wlc_lcic_ce_index = preferred_ce[UVM_CHANNEL_TYPE_CPU_TO_GPU]; // Create WLC/LCIC pools. This should be done early, CE channels use // them for secure launch. The WLC pool must be created before the LCIC. status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_WLC, wlc_lcic_ce_index, &wlc_pool); if (status != NV_OK) return status; manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] = wlc_pool; status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_LCIC, wlc_lcic_ce_index, &lcic_pool); if (status != NV_OK) return status; status = channel_manager_setup_wlc_lcic(wlc_pool, lcic_pool); if (status != NV_OK) return status; // The LCIC pool must be assigned after the call to // channel_manager_setup_wlc_lcic(). It determines WLC and LCIC channels // are ready to be used for secure work submission. manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = lcic_pool; return NV_OK; } static NV_STATUS channel_manager_create_pools(uvm_channel_manager_t *manager) { NV_STATUS status; uvm_channel_type_t type; unsigned max_channel_pools; unsigned preferred_ce[UVM_CHANNEL_TYPE_CE_COUNT]; for (type = 0; type < ARRAY_SIZE(preferred_ce); type++) preferred_ce[type] = UVM_COPY_ENGINE_COUNT_MAX; status = channel_manager_pick_copy_engines(manager, preferred_ce); if (status != NV_OK) return status; max_channel_pools = channel_manager_get_max_pools(manager); manager->channel_pools = uvm_kvmalloc_zero(sizeof(*manager->channel_pools) * max_channel_pools); if (!manager->channel_pools) return NV_ERR_NO_MEMORY; status = channel_manager_create_conf_computing_pools(manager, preferred_ce); if (status != NV_OK) return status; status = channel_manager_create_ce_pools(manager, preferred_ce); if (status != NV_OK) return status; // In SR-IOV heavy, add an additional, single-channel, pool that is // dedicated to the MEMOPS type. if (uvm_gpu_uses_proxy_channel_pool(manager->gpu)) { uvm_channel_pool_t *proxy_pool = NULL; uvm_channel_type_t channel_type = uvm_channel_proxy_channel_type(); status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE_PROXY, preferred_ce[channel_type], &proxy_pool); if (status != NV_OK) return status; manager->pool_to_use.default_for_type[channel_type] = proxy_pool; } return NV_OK; } NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **channel_manager_out) { NV_STATUS status = NV_OK; uvm_channel_manager_t *channel_manager; channel_manager = uvm_kvmalloc_zero(sizeof(*channel_manager)); if (!channel_manager) return NV_ERR_NO_MEMORY; channel_manager->gpu = gpu; init_channel_manager_conf(channel_manager); status = uvm_pushbuffer_create(channel_manager, &channel_manager->pushbuffer); if (status != NV_OK) goto error; status = manager_create_procfs_dirs(channel_manager); if (status != NV_OK) goto error; status = channel_manager_create_pools(channel_manager); if (status != NV_OK) goto error; status = manager_create_procfs(channel_manager); if (status != NV_OK) goto error; *channel_manager_out = channel_manager; return status; error: uvm_channel_manager_destroy(channel_manager); return status; } static void channel_manager_destroy_pools(uvm_channel_manager_t *manager) { uvm_rm_mem_free(manager->gpu->conf_computing.iv_rm_mem); manager->gpu->conf_computing.iv_rm_mem = NULL; while (manager->num_channel_pools > 0) channel_pool_destroy(manager->channel_pools + manager->num_channel_pools - 1); uvm_kvfree(manager->channel_pools); } // Because the WLC at rest state has PUT = GET + 2, there's always pending work // This is what enables the driver to submit work just by ringing a doorbell. // However, this also means that the pending work has to be removed before // the channel is passed to RM for deallocation. static void channel_manager_stop_wlc(uvm_channel_manager_t *manager) { uvm_channel_pool_t *wlc_pool = manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC]; uvm_channel_pool_t *lcic_pool = manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC]; uvm_channel_t *channel; uvm_push_t push; NV_STATUS status; uvm_for_each_channel_in_pool(channel, lcic_pool) { uvm_spin_loop_t spin; // Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2 // and a WLC doorbell ring is enough to start work. UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem), &spin); } status = uvm_push_begin(manager, UVM_CHANNEL_TYPE_SEC2, &push, "Stop WLC channels"); if (status != NV_OK) { UVM_ERR_PRINT_NV_STATUS("Failed to begin stop push for WLC", status); return; } uvm_for_each_channel_in_pool(channel, wlc_pool) { // Every gpfifo entry advances the gpu put of WLC by two so the current // value is: (cpu_put * 2) % num_gpfifo_entries and it's ahead of the // get pointer by 2. update_gpput_via_sec2(&push, channel, (channel->cpu_put * 2 - 2) % channel->num_gpfifo_entries); } status = uvm_push_end_and_wait(&push); if (status != NV_OK) UVM_ERR_PRINT_NV_STATUS("Failed to end stop push for WLC", status); manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_WLC] = NULL; manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_LCIC] = NULL; } void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager) { if (channel_manager == NULL) return; proc_remove(channel_manager->procfs.pending_pushes); if (uvm_channel_manager_is_wlc_ready(channel_manager)) channel_manager_stop_wlc(channel_manager); channel_manager_destroy_pools(channel_manager); proc_remove(channel_manager->procfs.channels_dir); uvm_pushbuffer_destroy(channel_manager->pushbuffer); uvm_kvfree(channel_manager); } bool uvm_channel_is_privileged(uvm_channel_t *channel) { if (uvm_gpu_is_virt_mode_sriov_heavy(uvm_channel_get_gpu(channel))) return uvm_channel_is_proxy(channel); return true; } // Return the first channel pool of the given type(s) starting at begin_pool // (included). // // The pool type mask must be a non empty mask of uvm_channel_pool_type_t // values. static uvm_channel_pool_t *channel_pool_first_from(uvm_channel_manager_t *manager, uvm_channel_pool_t *begin_pool, NvU32 pool_type_mask) { uvm_channel_pool_t *curr_pool, *end_pool; UVM_ASSERT(manager->channel_pools != NULL); UVM_ASSERT(begin_pool != NULL); UVM_ASSERT(begin_pool >= manager->channel_pools); UVM_ASSERT(pool_type_mask > 0); UVM_ASSERT(pool_type_mask <= UVM_CHANNEL_POOL_TYPE_MASK); end_pool = manager->channel_pools + manager->num_channel_pools; UVM_ASSERT(begin_pool <= end_pool); for (curr_pool = begin_pool; curr_pool != end_pool; curr_pool++) { if (curr_pool->pool_type & pool_type_mask) return curr_pool; } return NULL; } uvm_channel_pool_t *uvm_channel_pool_first(uvm_channel_manager_t *manager, NvU32 pool_type_mask) { return channel_pool_first_from(manager, manager->channel_pools, pool_type_mask); } uvm_channel_pool_t *uvm_channel_pool_next(uvm_channel_manager_t *manager, uvm_channel_pool_t *pool, NvU32 pool_type_mask) { return channel_pool_first_from(manager, pool + 1, pool_type_mask); } uvm_channel_t *uvm_channel_any_of_type(uvm_channel_manager_t *manager, NvU32 pool_type_mask) { uvm_channel_pool_t *pool = uvm_channel_pool_first(manager, pool_type_mask); if (pool == NULL) return NULL; UVM_ASSERT(pool->channels); return pool->channels; } const char *uvm_channel_type_to_string(uvm_channel_type_t channel_type) { BUILD_BUG_ON(UVM_CHANNEL_TYPE_COUNT != 8); switch (channel_type) { UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_CPU_TO_GPU); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_GPU_TO_CPU); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_GPU_INTERNAL); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_MEMOPS); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_GPU_TO_GPU); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_SEC2); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_WLC); UVM_ENUM_STRING_CASE(UVM_CHANNEL_TYPE_LCIC); UVM_ENUM_STRING_DEFAULT(); } } const char *uvm_channel_pool_type_to_string(uvm_channel_pool_type_t channel_pool_type) { BUILD_BUG_ON(UVM_CHANNEL_POOL_TYPE_COUNT != 5); switch (channel_pool_type) { UVM_ENUM_STRING_CASE(UVM_CHANNEL_POOL_TYPE_CE); UVM_ENUM_STRING_CASE(UVM_CHANNEL_POOL_TYPE_CE_PROXY); UVM_ENUM_STRING_CASE(UVM_CHANNEL_POOL_TYPE_SEC2); UVM_ENUM_STRING_CASE(UVM_CHANNEL_POOL_TYPE_WLC); UVM_ENUM_STRING_CASE(UVM_CHANNEL_POOL_TYPE_LCIC); UVM_ENUM_STRING_DEFAULT(); } } static const char *get_gpfifo_location_string(uvm_channel_t *channel) { // SEC2 channels override the channel manager location for GPFIFO. if (uvm_channel_is_sec2(channel)) return buffer_location_to_string(UVM_BUFFER_LOCATION_SYS); return buffer_location_to_string(channel->pool->manager->conf.gpfifo_loc); } static const char *get_gpput_location_string(uvm_channel_t *channel) { // SEC2 channels override the channel manager location for GPPUT. if (uvm_channel_is_sec2(channel)) return buffer_location_to_string(UVM_BUFFER_LOCATION_SYS); return buffer_location_to_string(channel->pool->manager->conf.gpput_loc); } static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s) { UVM_SEQ_OR_DBG_PRINT(s, "Channel %s\n", channel->name); channel_pool_lock(channel->pool); UVM_SEQ_OR_DBG_PRINT(s, "completed %llu\n", uvm_channel_update_completed_value(channel)); UVM_SEQ_OR_DBG_PRINT(s, "queued %llu\n", channel->tracking_sem.queued_value); UVM_SEQ_OR_DBG_PRINT(s, "GPFIFO count %u\n", channel->num_gpfifo_entries); UVM_SEQ_OR_DBG_PRINT(s, "GPFIFO location %s\n", get_gpfifo_location_string(channel)); UVM_SEQ_OR_DBG_PRINT(s, "GPPUT location %s\n", get_gpput_location_string(channel)); UVM_SEQ_OR_DBG_PRINT(s, "get %u\n", channel->gpu_get); UVM_SEQ_OR_DBG_PRINT(s, "put %u\n", channel->cpu_put); UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA 0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel)); UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload); channel_pool_unlock(channel->pool); } static void channel_print_push_acquires(uvm_push_acquire_info_t *push_acquire_info, struct seq_file *seq) { NvU32 i; NvU32 valid_entries; UVM_ASSERT(uvm_push_info_is_tracking_acquires()); UVM_ASSERT(push_acquire_info); if (push_acquire_info->num_values == 0) return; valid_entries = min(push_acquire_info->num_values, (NvU32)UVM_PUSH_ACQUIRE_INFO_MAX_ENTRIES); for (i = 0; i < valid_entries; ++i) { bool is_proxy = push_acquire_info->values[i].is_proxy; UVM_SEQ_OR_DBG_PRINT(seq, "%s (gpu %u, channel %d:%u, value %llu)", i == 0? " acquiring values" : "", uvm_id_value(push_acquire_info->values[i].gpu_id), is_proxy? -1 : push_acquire_info->values[i].runlist_id, is_proxy? push_acquire_info->values[i].proxy.pool_index : push_acquire_info->values[i].channel_id, push_acquire_info->values[i].value); } if (push_acquire_info->num_values > valid_entries) UVM_SEQ_OR_DBG_PRINT(seq, " (missing %u entries)", push_acquire_info->num_values - valid_entries); UVM_SEQ_OR_DBG_PRINT(seq, "\n"); } // Print all pending pushes and up to finished_pushes_count completed if their // GPFIFO entries haven't been reused yet. static void channel_print_pushes(uvm_channel_t *channel, NvU32 finished_pushes_count, struct seq_file *seq) { NvU32 gpu_get; NvU32 cpu_put; NvU64 completed_value = uvm_channel_update_completed_value(channel); channel_pool_lock(channel->pool); cpu_put = channel->cpu_put; for (gpu_get = channel->gpu_get; gpu_get != cpu_put; gpu_get = (gpu_get + 1) % channel->num_gpfifo_entries) { uvm_gpfifo_entry_t *entry = &channel->gpfifo_entries[gpu_get]; uvm_push_info_t *push_info = entry->push_info; uvm_push_acquire_info_t *push_acquire_info = NULL; if (entry->tracking_semaphore_value + finished_pushes_count <= completed_value) continue; if (entry->type == UVM_GPFIFO_ENTRY_TYPE_CONTROL) { UVM_ASSERT(!uvm_channel_is_proxy(channel)); UVM_SEQ_OR_DBG_PRINT(seq, " control GPFIFO entry - data: 0x%llx, gpu_get: %d\n", entry->control_value, gpu_get); } else { // Obtain the value acquire tracking information from the push_info // index if (uvm_push_info_is_tracking_acquires()) { NvU32 push_info_index = push_info - channel->push_infos; UVM_ASSERT(push_info_index < channel->num_gpfifo_entries); push_acquire_info = &channel->push_acquire_infos[push_info_index]; } UVM_SEQ_OR_DBG_PRINT(seq, " %s push '%s' started at %s:%d in %s() releasing value %llu%s", entry->tracking_semaphore_value <= completed_value ? "finished" : "pending", push_info->description, push_info->filename, push_info->line, push_info->function, entry->tracking_semaphore_value, !push_acquire_info || push_acquire_info->num_values == 0 ? "\n" : ""); if (push_acquire_info) channel_print_push_acquires(push_acquire_info, seq); } } channel_pool_unlock(channel->pool); } void uvm_channel_print_pending_pushes(uvm_channel_t *channel) { channel_print_pushes(channel, 0, NULL); } static void channel_manager_print_pending_pushes(uvm_channel_manager_t *manager, struct seq_file *seq) { uvm_channel_pool_t *pool; uvm_for_each_pool(pool, manager) { uvm_channel_t *channel; uvm_for_each_channel_in_pool(channel, pool) { UVM_SEQ_OR_DBG_PRINT(seq, "Channel %s, pending pushes:\n", channel->name); channel_print_pushes(channel, 0, seq); } } } static NV_STATUS manager_create_procfs_dirs(uvm_channel_manager_t *manager) { uvm_gpu_t *gpu = manager->gpu; // The channel manager procfs files are debug only if (!uvm_procfs_is_debug_enabled()) return NV_OK; manager->procfs.channels_dir = NV_CREATE_PROC_DIR("channels", gpu->procfs.dir); if (manager->procfs.channels_dir == NULL) return NV_ERR_OPERATING_SYSTEM; return NV_OK; } static int nv_procfs_read_manager_pending_pushes(struct seq_file *s, void *v) { uvm_channel_manager_t *manager = (uvm_channel_manager_t *)s->private; if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) return -EAGAIN; channel_manager_print_pending_pushes(manager, s); uvm_up_read(&g_uvm_global.pm.lock); return 0; } static int nv_procfs_read_manager_pending_pushes_entry(struct seq_file *s, void *v) { UVM_ENTRY_RET(nv_procfs_read_manager_pending_pushes(s, v)); } UVM_DEFINE_SINGLE_PROCFS_FILE(manager_pending_pushes_entry); static NV_STATUS manager_create_procfs(uvm_channel_manager_t *manager) { uvm_gpu_t *gpu = manager->gpu; // The channel manager procfs files are debug only if (!uvm_procfs_is_debug_enabled()) return NV_OK; manager->procfs.pending_pushes = NV_CREATE_PROC_FILE("pending_pushes", gpu->procfs.dir, manager_pending_pushes_entry, manager); if (manager->procfs.pending_pushes == NULL) return NV_ERR_OPERATING_SYSTEM; return NV_OK; } static int nv_procfs_read_channel_info(struct seq_file *s, void *v) { uvm_channel_t *channel = (uvm_channel_t *)s->private; if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) return -EAGAIN; uvm_channel_print_info(channel, s); uvm_up_read(&g_uvm_global.pm.lock); return 0; } static int nv_procfs_read_channel_info_entry(struct seq_file *s, void *v) { UVM_ENTRY_RET(nv_procfs_read_channel_info(s, v)); } UVM_DEFINE_SINGLE_PROCFS_FILE(channel_info_entry); static int nv_procfs_read_channel_pushes(struct seq_file *s, void *v) { uvm_channel_t *channel = (uvm_channel_t *)s->private; if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) return -EAGAIN; // Include up to 5 finished pushes for some context channel_print_pushes(channel, 5, s); uvm_up_read(&g_uvm_global.pm.lock); return 0; } static int nv_procfs_read_channel_pushes_entry(struct seq_file *s, void *v) { UVM_ENTRY_RET(nv_procfs_read_channel_pushes(s, v)); } UVM_DEFINE_SINGLE_PROCFS_FILE(channel_pushes_entry); static NV_STATUS channel_create_procfs(uvm_channel_t *channel) { char dirname[16]; uvm_channel_manager_t *manager = channel->pool->manager; // The channel procfs files are debug only if (!uvm_procfs_is_debug_enabled()) return NV_OK; // For internal channels, the directory name contains the HW IDs. Those are // not available for proxy channels, so use -1: instead. if (uvm_channel_is_proxy(channel)) snprintf(dirname, sizeof(dirname), "-1:%u", uvm_channel_index_in_pool(channel)); else snprintf(dirname, sizeof(dirname), "%u:%u", channel->channel_info.hwRunlistId, channel->channel_info.hwChannelId); channel->procfs.dir = NV_CREATE_PROC_DIR(dirname, manager->procfs.channels_dir); if (channel->procfs.dir == NULL) return NV_ERR_OPERATING_SYSTEM; channel->procfs.info = NV_CREATE_PROC_FILE("info", channel->procfs.dir, channel_info_entry, channel); if (channel->procfs.info == NULL) return NV_ERR_OPERATING_SYSTEM; channel->procfs.pushes = NV_CREATE_PROC_FILE("pushes", channel->procfs.dir, channel_pushes_entry, channel); if (channel->procfs.pushes == NULL) return NV_ERR_OPERATING_SYSTEM; return NV_OK; }