open-gpu-kernel-modules/kernel-open/nvidia/nv-ibmnpu.c

/*
 * SPDX-FileCopyrightText: Copyright (c) 2017-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * nv-ibmnpu.c - interface with the ibmnpu (IBM NVLink Processing Unit) "module"
 */
#include "nv-linux.h"

#if defined(NVCPU_PPC64LE)
#include "nv-ibmnpu.h"
#include "nv-rsync.h"

/*
 * Temporary query to get the L1D cache block size directly from the device
 * tree for the offline cache flush workaround, since the ppc64_caches symbol
 * is unavailable to us.
 */
const NvU32 P9_L1D_CACHE_DEFAULT_BLOCK_SIZE = 0x80;

static NvU32 nv_ibm_get_cpu_l1d_cache_block_size(void)
{
    const __be32 *block_size_prop;

    /*
     * Attempt to look up the block size from device tree. If unavailable, just
     * return the default that we see on these systems.
     */
    struct device_node *cpu = of_find_node_by_type(NULL, "cpu");
    if (!cpu)
    {
        return P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
    }

    block_size_prop = of_get_property(cpu, "d-cache-block-size", NULL);
    if (!block_size_prop)
    {
        return P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
    }

    return be32_to_cpu(*block_size_prop);
}

/*
 * GPU device memory can be exposed to the kernel as NUMA node memory via the
 * IBMNPU devices associated with the GPU. The platform firmware will specify
 * the parameters of where the memory lives in the system address space via
 * firmware properties on the IBMNPU devices. These properties specify what
 * memory can be accessed through the IBMNPU device, and the driver can online
 * a GPU device's memory into the range accessible by its associated IBMNPU
 * devices.
 *
 * This function calls over to the IBMNPU driver to query the parameters from
 * firmware, and validates that the resulting parameters are acceptable.
 */
static void nv_init_ibmnpu_numa_info(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    nv_npu_numa_info_t *npu_numa_info = &nvl->npu->numa_info;
    struct pci_dev *npu_dev = nvl->npu->devs[0];
    NvU64 spa, gpa, aper_size;

    /*
     * Terminology:
     * - system physical address (spa): 47-bit NVIDIA physical address, which
     *      is the CPU real address with the NVLink address compression scheme
     *      already applied in firmware.
     * - guest physical address (gpa): 56-bit physical address as seen by the
     *      operating system. This is the base address that we should use for
     *      onlining device memory.
     */
    nvl->numa_info.node_id = ibmnpu_device_get_memory_config(npu_dev, &spa, &gpa,
                                                             &aper_size);
    if (nvl->numa_info.node_id == NUMA_NO_NODE)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv, "No NUMA memory aperture found\n");
        return;
    }

    /* Validate that the compressed system physical address is not too wide */
    if (spa & (~(BIT_ULL(nv_volta_dma_addr_size) - 1)))
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
            "Invalid NUMA memory system pa 0x%llx"
            " on IBM-NPU device %04x:%02x:%02x.%u\n",
            spa, NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
            NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
        goto invalid_numa_config;
    }

    /*
     * Validate that the guest physical address is aligned to 128GB.
     * This alignment requirement comes from the Volta address space
     * size on POWER9.
     */
    if (!IS_ALIGNED(gpa, BIT_ULL(nv_volta_addr_space_width)))
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
            "Invalid alignment in NUMA memory guest pa 0x%llx"
            " on IBM-NPU device %04x:%02x:%02x.%u\n",
            gpa, NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
            NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
        goto invalid_numa_config;
    }

    /* Validate that the aperture can map all of the device's framebuffer */
    if (aper_size < nv->fb->size)
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
            "Insufficient NUMA memory aperture size 0x%llx"
            " on IBM-NPU device %04x:%02x:%02x.%u (0x%llx required)\n",
            aper_size, NV_PCI_DOMAIN_NUMBER(npu_dev),
            NV_PCI_BUS_NUMBER(npu_dev), NV_PCI_SLOT_NUMBER(npu_dev),
            PCI_FUNC(npu_dev->devfn), nv->fb->size);
        goto invalid_numa_config;
    }

    npu_numa_info->compr_sys_phys_addr = spa;
    npu_numa_info->guest_phys_addr = gpa;

    if (NVreg_EnableUserNUMAManagement)
    {
        NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE);
    }
    else
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv, "User-mode NUMA onlining disabled.\n");
        nvl->numa_info.node_id = NUMA_NO_NODE;
    }

    NV_DEV_PRINTF(NV_DBG_SETUP, nv, "NUMA memory aperture: "
        "[spa = 0x%llx, gpa = 0x%llx, aper_size = 0x%llx]\n",
        spa, gpa, aper_size);

    /* Get the CPU's L1D cache block size for offlining cache flush */
    npu_numa_info->l1d_cache_block_size = nv_ibm_get_cpu_l1d_cache_block_size();

    return;

invalid_numa_config:
    NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
        "NUMA memory aperture disabled due to invalid firmware configuration\n");
    nvl->numa_info.node_id = NUMA_NO_NODE;
}

void nv_init_ibmnpu_info(nv_state_t *nv)
{
#if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    struct pci_dev *npu_dev = pnv_pci_get_npu_dev(nvl->pci_dev, 0);
    NvU8 dev_count;

    if (!npu_dev)
    {
        return;
    }

    if (os_alloc_mem((void **)&nvl->npu, sizeof(nv_ibmnpu_info_t)) != NV_OK)
    {
        return;
    }

    os_mem_set(nvl->npu, 0, sizeof(nv_ibmnpu_info_t));

    /* Find any other IBMNPU devices attached to this GPU */
    for (nvl->npu->devs[0] = npu_dev, dev_count = 1;
         dev_count < NV_MAX_ATTACHED_IBMNPUS; dev_count++)
    {
        nvl->npu->devs[dev_count] = pnv_pci_get_npu_dev(nvl->pci_dev, dev_count);
        if (!nvl->npu->devs[dev_count])
        {
            break;
        }
    }

    nvl->npu->dev_count = dev_count;

    /*
     * If we run out of space for IBMNPU devices, NV_MAX_ATTACHED_IBMNPUS will
     * need to be bumped.
     */
    WARN_ON((dev_count == NV_MAX_ATTACHED_IBMNPUS) &&
            pnv_pci_get_npu_dev(nvl->pci_dev, dev_count));

    ibmnpu_device_get_genregs_info(npu_dev, &nvl->npu->genregs);

    if (nvl->npu->genregs.size > 0)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "IBM-NPU device %04x:%02x:%02x.%u associated with GPU "
            " has a generation register space 0x%llx-0x%llx\n",
            NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
            NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn),
            nvl->npu->genregs.start_addr,
            nvl->npu->genregs.start_addr + nvl->npu->genregs.size - 1);
    }
    else
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "IBM-NPU device %04x:%02x:%02x.%u associated with GPU "
            "does not support generation registers\n",
            NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
            NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
    }

    nv_init_ibmnpu_numa_info(nv);
#endif
}

void nv_destroy_ibmnpu_info(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu != NULL)
    {
        os_free_mem(nvl->npu);
        nvl->npu = NULL;
    }
}

int nv_init_ibmnpu_devices(nv_state_t *nv)
{
    NvU8 i;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (!nvl->npu)
    {
        return 0;
    }

    for (i = 0; i < nvl->npu->dev_count; i++)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "Initializing IBM-NPU device %04x:%02x:%02x.%u\n",
            NV_PCI_DOMAIN_NUMBER(nvl->npu->devs[i]),
            NV_PCI_BUS_NUMBER(nvl->npu->devs[i]),
            NV_PCI_SLOT_NUMBER(nvl->npu->devs[i]),
            PCI_FUNC(nvl->npu->devs[i]->devfn));

        if (ibmnpu_init_device(nvl->npu->devs[i]) != NVL_SUCCESS)
        {
            nv_unregister_ibmnpu_devices(nv);
            return -EIO;
        }

        nvl->npu->initialized_dev_count++;
    }

    return 0;
}

void nv_unregister_ibmnpu_devices(nv_state_t *nv)
{
    NvU8 i;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (!nvl->npu)
    {
        return;
    }

    for (i = 0; i < nvl->npu->initialized_dev_count; i++)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "Unregistering IBM-NPU device %04x:%02x:%02x.%u\n",
            NV_PCI_DOMAIN_NUMBER(nvl->npu->devs[i]),
            NV_PCI_BUS_NUMBER(nvl->npu->devs[i]),
            NV_PCI_SLOT_NUMBER(nvl->npu->devs[i]),
            PCI_FUNC(nvl->npu->devs[i]->devfn));

        ibmnpu_unregister_device(nvl->npu->devs[i]);
    }

    nvl->npu->initialized_dev_count = 0;
}

NV_STATUS NV_API_CALL nv_get_ibmnpu_genreg_info(nv_state_t *nv, NvU64 *addr,
                                                NvU64 *size, void **device)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL || nvl->npu->genregs.size == 0)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    if (addr)
    {
        *addr = nvl->npu->genregs.start_addr;
    }

    if (size)
    {
        *size = nvl->npu->genregs.size;
    }

    if (device)
    {
        *device = (void*)nvl->npu->devs[0];
    }

    return NV_OK;
}

NV_STATUS NV_API_CALL nv_get_ibmnpu_relaxed_ordering_mode(nv_state_t *nv,
                                                          NvBool *mode)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL || nvl->npu->genregs.size == 0)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    *mode = nv_get_rsync_relaxed_ordering_mode(nv);

    return NV_OK;
}

void NV_API_CALL nv_wait_for_ibmnpu_rsync(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL || nvl->npu->genregs.size == 0)
    {
        return;
    }

    nv_wait_for_rsync(nv);
}

int nv_get_ibmnpu_chip_id(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL)
    {
        return -1;
    }

    return ibmnpu_device_get_chip_id(nvl->npu->devs[0]);
}

void NV_API_CALL nv_ibmnpu_cache_flush_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU64 offset, cbsize;

    /*
     * The range is commonly an ioremap()ed mapping of the GPU's ATS range and
     * needs to be compared against the created mappings. Alternatively, kernel
     * page tables can be dumped through sysfs if CONFIG_PPC_PTDUMP is enabled.
     */
    NV_DEV_PRINTF(NV_DBG_INFO, nv,
            "Flushing CPU virtual range [0x%llx, 0x%llx)\n",
            cpu_virtual, cpu_virtual + size);

    cbsize = nvl->npu->numa_info.l1d_cache_block_size;

    asm volatile("sync; isync" ::: "memory");

    /* Force eviction of any cache lines from the NUMA-onlined region. */
    for (offset = 0; offset < size; offset += cbsize)
    {
        asm volatile("dcbf %0,%1" :: "r" (cpu_virtual), "r" (offset) : "memory");

        /* Reschedule if necessary to avoid lockup warnings */
        cond_resched();
    }

    asm volatile("sync; isync" ::: "memory");
}

#else

void nv_init_ibmnpu_info(nv_state_t *nv)
{
}

void nv_destroy_ibmnpu_info(nv_state_t *nv)
{
}

int nv_init_ibmnpu_devices(nv_state_t *nv)
{
    return 0;
}

void nv_unregister_ibmnpu_devices(nv_state_t *nv)
{
}

NV_STATUS NV_API_CALL nv_get_ibmnpu_genreg_info(nv_state_t *nv, NvU64 *addr,
                                                NvU64 *size, void **device)
{
    return NV_ERR_NOT_SUPPORTED;
}

NV_STATUS NV_API_CALL nv_get_ibmnpu_relaxed_ordering_mode(nv_state_t *nv,
                                                          NvBool *mode)
{
    return NV_ERR_NOT_SUPPORTED;
}

void NV_API_CALL nv_wait_for_ibmnpu_rsync(nv_state_t *nv)
{
}

int nv_get_ibmnpu_chip_id(nv_state_t *nv)
{
    return -1;
}

void NV_API_CALL nv_ibmnpu_cache_flush_range(nv_state_t *nv, NvU64 virtual, NvU64 size)
{
}

void nv_ibmnpu_cache_flush_numa_region(nv_state_t *nv)
{
}

#endif