/* * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include "nv-kthread-q.h" #include "nv-list-helpers.h" #include #include #include #include #include #if defined(NV_LINUX_BUG_H_PRESENT) #include #else #include #endif // Today's implementation is a little simpler and more limited than the // API description allows for in nv-kthread-q.h. Details include: // // 1. Each nv_kthread_q instance is a first-in, first-out queue. // // 2. Each nv_kthread_q instance is serviced by exactly one kthread. // // You can create any number of queues, each of which gets its own // named kernel thread (kthread). You can then insert arbitrary functions // into the queue, and those functions will be run in the context of the // queue's kthread. #ifndef WARN // Only *really* old kernels (2.6.9) end up here. Just use a simple printk // to implement this, because such kernels won't be supported much longer. #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ printk(KERN_ERR format); \ unlikely(__ret_warn_on); \ }) #endif #define NVQ_WARN(fmt, ...) \ do { \ if (in_interrupt()) { \ WARN(1, "nv_kthread_q: [in interrupt]: " fmt, \ ##__VA_ARGS__); \ } \ else { \ WARN(1, "nv_kthread_q: task: %s: " fmt, \ current->comm, \ ##__VA_ARGS__); \ } \ } while (0) static int _main_loop(void *args) { nv_kthread_q_t *q = (nv_kthread_q_t *)args; nv_kthread_q_item_t *q_item = NULL; unsigned long flags; while (1) { // Normally this thread is never interrupted. However, // down_interruptible (instead of down) is called here, // in order to avoid being classified as a potentially // hung task, by the kernel watchdog. while (down_interruptible(&q->q_sem)) NVQ_WARN("Interrupted during semaphore wait\n"); if (atomic_read(&q->main_loop_should_exit)) break; spin_lock_irqsave(&q->q_lock, flags); // The q_sem semaphore prevents us from getting here unless there is // at least one item in the list, so an empty list indicates a bug. if (unlikely(list_empty(&q->q_list_head))) { spin_unlock_irqrestore(&q->q_lock, flags); NVQ_WARN("_main_loop: Empty queue: q: 0x%p\n", q); continue; } // Consume one item from the queue q_item = list_first_entry(&q->q_list_head, nv_kthread_q_item_t, q_list_node); list_del_init(&q_item->q_list_node); spin_unlock_irqrestore(&q->q_lock, flags); // Run the item q_item->function_to_run(q_item->function_args); // Make debugging a little simpler by clearing this between runs: q_item = NULL; } while (!kthread_should_stop()) schedule(); return 0; } void nv_kthread_q_stop(nv_kthread_q_t *q) { // check if queue has been properly initialized if (unlikely(!q->q_kthread)) return; nv_kthread_q_flush(q); // If this assertion fires, then a caller likely either broke the API rules, // by adding items after calling nv_kthread_q_stop, or possibly messed up // with inadequate flushing of self-rescheduling q_items. if (unlikely(!list_empty(&q->q_list_head))) NVQ_WARN("list not empty after flushing\n"); if (likely(!atomic_read(&q->main_loop_should_exit))) { atomic_set(&q->main_loop_should_exit, 1); // Wake up the kthread so that it can see that it needs to stop: up(&q->q_sem); kthread_stop(q->q_kthread); q->q_kthread = NULL; } } // When CONFIG_VMAP_STACK is defined, the kernel thread stack allocator used by // kthread_create_on_node relies on a 2 entry, per-core cache to minimize // vmalloc invocations. The cache is NUMA-unaware, so when there is a hit, the // stack location ends up being a function of the core assigned to the current // thread, instead of being a function of the specified NUMA node. The cache was // added to the kernel in commit ac496bf48d97f2503eaa353996a4dd5e4383eaf0 // ("fork: Optimize task creation by caching two thread stacks per CPU if // CONFIG_VMAP_STACK=y") // // To work around the problematic cache, we create up to three kernel threads // -If the first thread's stack is resident on the preferred node, return this // thread. // -Otherwise, create a second thread. If its stack is resident on the // preferred node, stop the first thread and return this one. // -Otherwise, create a third thread. The stack allocator does not find a // cached stack, and so falls back to vmalloc, which takes the NUMA hint into // consideration. The first two threads are then stopped. // // When CONFIG_VMAP_STACK is not defined, the first kernel thread is returned. // // This function is never invoked when there is no NUMA preference (preferred // node is NUMA_NO_NODE). static struct task_struct *thread_create_on_node(int (*threadfn)(void *data), nv_kthread_q_t *q, int preferred_node, const char *q_name) { unsigned i, j; const static unsigned attempts = 3; struct task_struct *thread[3]; for (i = 0;; i++) { struct page *stack; thread[i] = kthread_create_on_node(threadfn, q, preferred_node, q_name); if (unlikely(IS_ERR(thread[i]))) { // Instead of failing, pick the previous thread, even if its // stack is not allocated on the preferred node. if (i > 0) i--; break; } // vmalloc is not used to allocate the stack, so simply return the // thread, even if its stack may not be allocated on the preferred node if (!is_vmalloc_addr(thread[i]->stack)) break; // Ran out of attempts - return thread even if its stack may not be // allocated on the preferred node if (i == (attempts - 1)) break; // Get the NUMA node where the first page of the stack is resident. If // it is the preferred node, select this thread. stack = vmalloc_to_page(thread[i]->stack); if (page_to_nid(stack) == preferred_node) break; } for (j = i; j > 0; j--) kthread_stop(thread[j - 1]); return thread[i]; } int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node) { memset(q, 0, sizeof(*q)); INIT_LIST_HEAD(&q->q_list_head); spin_lock_init(&q->q_lock); sema_init(&q->q_sem, 0); if (preferred_node == NV_KTHREAD_NO_NODE) { q->q_kthread = kthread_create(_main_loop, q, q_name); } else { q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name); } if (IS_ERR(q->q_kthread)) { int err = PTR_ERR(q->q_kthread); // Clear q_kthread before returning so that nv_kthread_q_stop() can be // safely called on it making error handling easier. q->q_kthread = NULL; return err; } wake_up_process(q->q_kthread); return 0; } int nv_kthread_q_init(nv_kthread_q_t *q, const char *qname) { return nv_kthread_q_init_on_node(q, qname, NV_KTHREAD_NO_NODE); } // Returns true (non-zero) if the item was actually scheduled, and false if the // item was already pending in a queue. static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item) { unsigned long flags; int ret = 1; spin_lock_irqsave(&q->q_lock, flags); if (likely(list_empty(&q_item->q_list_node))) list_add_tail(&q_item->q_list_node, &q->q_list_head); else ret = 0; spin_unlock_irqrestore(&q->q_lock, flags); if (likely(ret)) up(&q->q_sem); return ret; } void nv_kthread_q_item_init(nv_kthread_q_item_t *q_item, nv_q_func_t function_to_run, void *function_args) { INIT_LIST_HEAD(&q_item->q_list_node); q_item->function_to_run = function_to_run; q_item->function_args = function_args; } // Returns true (non-zero) if the q_item got scheduled, false otherwise. int nv_kthread_q_schedule_q_item(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item) { if (unlikely(atomic_read(&q->main_loop_should_exit))) { NVQ_WARN("Not allowed: nv_kthread_q_schedule_q_item was " "called with a non-alive q: 0x%p\n", q); return 0; } return _raw_q_schedule(q, q_item); } static void _q_flush_function(void *args) { struct completion *completion = (struct completion *)args; complete(completion); } static void _raw_q_flush(nv_kthread_q_t *q) { nv_kthread_q_item_t q_item; DECLARE_COMPLETION_ONSTACK(completion); nv_kthread_q_item_init(&q_item, _q_flush_function, &completion); _raw_q_schedule(q, &q_item); // Wait for the flush item to run. Once it has run, then all of the // previously queued items in front of it will have run, so that means // the flush is complete. wait_for_completion(&completion); } void nv_kthread_q_flush(nv_kthread_q_t *q) { if (unlikely(atomic_read(&q->main_loop_should_exit))) { NVQ_WARN("Not allowed: nv_kthread_q_flush was called after " "nv_kthread_q_stop. q: 0x%p\n", q); return; } // This 2x flush is not a typing mistake. The queue really does have to be // flushed twice, in order to take care of the case of a q_item that // reschedules itself. _raw_q_flush(q); _raw_q_flush(q); }