open-gpu-kernel-modules/kernel-open/nvidia/linux_nvswitch.c

/*
 * SPDX-FileCopyrightText: Copyright (c) 2016-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "linux_nvswitch.h"

#include <linux/version.h>

#include "conftest.h"
#include "nvlink_errors.h"
#include "nvlink_linux.h"
#include "nvCpuUuid.h"
#include "nv-time.h"
#include "nvlink_caps.h"

#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/moduleparam.h>
#include <linux/ctype.h>
#include <linux/wait.h>
#include <linux/jiffies.h>

#include "ioctl_nvswitch.h"

//
// Lookup table pairing NVLink library status codes with the Linux errno
// value reported for them. It is scanned linearly by nvswitch_map_status().
//
static const struct
{
    NvlStatus status;
    int err;
} nvswitch_status_map[] = {
    { NVL_ERR_GENERIC,                  -EIO        },
    { NVL_NO_MEM,                       -ENOMEM     },
    { NVL_BAD_ARGS,                     -EINVAL     },
    { NVL_ERR_INVALID_STATE,            -EIO        },
    { NVL_ERR_NOT_SUPPORTED,            -EOPNOTSUPP },
    { NVL_NOT_FOUND,                    -EINVAL     },
    { NVL_ERR_STATE_IN_USE,             -EBUSY      },
    { NVL_ERR_NOT_IMPLEMENTED,          -ENOSYS     },
    { NVL_ERR_INSUFFICIENT_PERMISSIONS, -EPERM      },
    { NVL_ERR_OPERATING_SYSTEM,         -EIO        },
    { NVL_MORE_PROCESSING_REQUIRED,     -EAGAIN     },
    { NVL_SUCCESS,                       0          },
};

//
// Translate an NVLink library status code into a Linux errno value.
//
// nvswitch_status_map is scanned for the first entry whose status equals
// either the given value or its negation, and that entry's errno is
// returned. A status with no table entry maps to -EIO.
//
int
nvswitch_map_status
(
    NvlStatus status
)
{
    const NvU32 entry_count = sizeof(nvswitch_status_map) / sizeof(nvswitch_status_map[0]);
    NvU32 idx;

    for (idx = 0; idx < entry_count; idx++)
    {
        const NvlStatus known = nvswitch_status_map[idx].status;

        if ((known == status) || (known == -status))
        {
            return nvswitch_status_map[idx].err;
        }
    }

    // No table entry matched.
    return -EIO;
}

// Fall back to SA_SHIRQ when the kernel headers do not define IRQF_SHARED.
#if !defined(IRQF_SHARED)
#define IRQF_SHARED SA_SHIRQ
#endif

// Fetch the inode pointer stored in a struct file.
#define NV_FILE_INODE(file) (file)->f_inode

// PCI driver callbacks, declared here so nvswitch_pci_driver can reference them.
static int nvswitch_probe(struct pci_dev *, const struct pci_device_id *);
static void nvswitch_remove(struct pci_dev *);

//
// PCI match table: NVIDIA vendor ID with any device and subsystem IDs, and a
// class value of (PCI_CLASS_BRIDGE_OTHER << 8) with every class mask bit set.
// The empty entry terminates the table.
//
static struct pci_device_id nvswitch_pci_table[] =
{
    {
        .vendor      = PCI_VENDOR_ID_NVIDIA,
        .device      = PCI_ANY_ID,
        .subvendor   = PCI_ANY_ID,
        .subdevice   = PCI_ANY_ID,
        .class       = (PCI_CLASS_BRIDGE_OTHER << 8),
        .class_mask  = ~0
    },
    {}
};

//
// PCI driver descriptor. Note that nvswitch_remove() is installed as both
// the .remove and the .shutdown handler.
//
static struct pci_driver nvswitch_pci_driver =
{
    .name           = NVSWITCH_DRIVER_NAME,
    .id_table       = nvswitch_pci_table,
    .probe          = nvswitch_probe,
    .remove         = nvswitch_remove,
    .shutdown       = nvswitch_remove
};

//
// nvidia_nvswitch_mknod uses minor number 255 to create the nvidia-nvswitchctl
// node. Hence, if NVSWITCH_CTL_MINOR is changed, then NV_NVSWITCH_CTL_MINOR
// should be updated as well. See nvidia-modprobe-utils.h
//
// NVSWITCH_MINOR_COUNT is one more than the control minor, i.e. it covers
// minors 0 through NVSWITCH_CTL_MINOR inclusive.
//
#define NVSWITCH_CTL_MINOR 255
#define NVSWITCH_MINOR_COUNT (NVSWITCH_CTL_MINOR + 1)

// Length of a regkey value string: a 32-bit hex value including its 0x
// prefix (10 characters).
#define NVSWITCH_REGKEY_VALUE_LEN 10

// Module parameter: regkey string, registered with permission bits 0.
static char *NvSwitchRegDwords;
module_param(NvSwitchRegDwords, charp, 0);
MODULE_PARM_DESC(NvSwitchRegDwords, "NvSwitch regkey");

// Module parameter: comma-separated list of UUIDs (see the description
// string below), registered with permission bits 0.
static char *NvSwitchBlacklist;
module_param(NvSwitchBlacklist, charp, 0);
MODULE_PARM_DESC(NvSwitchBlacklist, "NvSwitchBlacklist=uuid[,uuid...]");

//
// Locking:
//   We handle nvswitch driver locking in the OS layer. The nvswitch lib
//   layer does not have its own locking. It relies on the OS layer for
//   atomicity.
//
//   All locking is done with sleep locks. We use threaded MSI interrupts to
//   facilitate this.
//
//   When handling a request from a user context we use the interruptible
//   version to enable a quick ^C return if there is lock contention.
//
//   nvswitch.driver_mutex is used to protect driver's global state, "struct
//   NVSWITCH". The driver_mutex is taken during .probe, .remove, .open,
//   .close, and nvswitch-ctl .ioctl operations.
//
//   nvswitch_dev.device_mutex is used to protect per-device state, "struct
//   NVSWITCH_DEV", once a device is opened. The device_mutex is taken during
//   .ioctl, .poll and other background tasks.
//
//   The kernel guarantees that .close won't happen while .ioctl and .poll
//   are going on and without successful .open one can't execute any file ops.
//   This behavior guarantees correctness of the locking model.
//
//   If .close were to hold a lock that is also taken by threaded tasks such
//   as the interrupt handler, the driver would deadlock while trying to stop
//   those tasks. For example, when threaded interrupts are enabled,
//   free_irq() calls kthread_stop() to flush pending interrupt tasks. The
//   locking model makes sure that such deadlock cases don't happen.
//
// Lock ordering:
//   nvswitch.driver_mutex
//   nvswitch_dev.device_mutex
//
// Note:
//   Due to bug 2856314, nvswitch_dev.device_mutex is taken when calling
//   nvswitch_post_init_device() in nvswitch_probe().
//

// Per-chip driver state is defined in linux_nvswitch.h

// Global driver state (single instance: `nvswitch`, zero-initialized below).
typedef struct
{
    NvBool initialized;
    struct cdev cdev;           // NOTE(review): presumably the character device for per-switch nodes -- confirm
    struct cdev cdev_ctl;       // NOTE(review): presumably the character device for the control node -- confirm
    dev_t devno;
    atomic_t count;
    struct mutex driver_mutex;  // Protects this global state; see the locking notes in this file
    struct list_head devices;   // List of NVSWITCH_DEV entries, linked through their list_node member
} NVSWITCH;

static NVSWITCH nvswitch = {0};

// NvSwitch event: per-open-file event state consumed by nvswitch_device_poll().
typedef struct nvswitch_event_t
{
    wait_queue_head_t wait_q_event;     // Wait queue handed to poll_wait()
    NvBool            event_pending;    // Set when an event is pending; cleared by poll once reported
} nvswitch_event_t;

// Per-open-file private data, allocated in nvswitch_device_open() and freed
// in nvswitch_device_release().
typedef struct nvswitch_file_private
{
    NVSWITCH_DEV     *nvswitch_dev;     // Device this open file refers to
    nvswitch_event_t file_event;        // Event state reported through poll
    struct
    {
        /* A duped file descriptor for fabric_mgmt capability */
        int fabric_mgmt;                // Initialized to -1 at open
    } capability_fds;
} nvswitch_file_private_t;

// Store / fetch the nvswitch_file_private_t pointer kept in a file's
// private_data field.
#define NVSWITCH_SET_FILE_PRIVATE(filp, data) ((filp)->private_data = (data))
#define NVSWITCH_GET_FILE_PRIVATE(filp) ((nvswitch_file_private_t *)(filp)->private_data)

// Handlers referenced by device_fops.
static int nvswitch_device_open(struct inode *inode, struct file *file);
static int nvswitch_device_release(struct inode *inode, struct file *file);
static unsigned int nvswitch_device_poll(struct file *file, poll_table *wait);
static int nvswitch_device_ioctl(struct inode *inode,
                                 struct file *file,
                                 unsigned int cmd,
                                 unsigned long arg);
static long nvswitch_device_unlocked_ioctl(struct file *file,
                                           unsigned int cmd,
                                           unsigned long arg);

// Handlers referenced by ctl_fops.
static int nvswitch_ctl_ioctl(struct inode *inode,
                              struct file *file,
                              unsigned int cmd,
                              unsigned long arg);
static long nvswitch_ctl_unlocked_ioctl(struct file *file,
                                        unsigned int cmd,
                                        unsigned long arg);

//
// File operations for an NvSwitch device node: open, release, poll and
// ioctl. The legacy .ioctl entry is only populated when
// NV_FILE_OPERATIONS_HAS_IOCTL is defined.
//
struct file_operations device_fops =
{
    .owner = THIS_MODULE,
#if defined(NV_FILE_OPERATIONS_HAS_IOCTL)
    .ioctl = nvswitch_device_ioctl,
#endif
    .unlocked_ioctl = nvswitch_device_unlocked_ioctl,
    .open    = nvswitch_device_open,
    .release = nvswitch_device_release,
    .poll    = nvswitch_device_poll
};

//
// File operations for the control node: ioctl only. The legacy .ioctl entry
// is only populated when NV_FILE_OPERATIONS_HAS_IOCTL is defined.
//
struct file_operations ctl_fops =
{
    .owner = THIS_MODULE,
#if defined(NV_FILE_OPERATIONS_HAS_IOCTL)
    .ioctl = nvswitch_ctl_ioctl,
#endif
    .unlocked_ioctl = nvswitch_ctl_unlocked_ioctl,
};

// Forward declarations of helpers that are called before their definitions.
static int nvswitch_initialize_device_interrupt(NVSWITCH_DEV *nvswitch_dev);
static void nvswitch_shutdown_device_interrupt(NVSWITCH_DEV *nvswitch_dev);
static void nvswitch_load_bar_info(NVSWITCH_DEV *nvswitch_dev);
static void nvswitch_task_dispatch(NVSWITCH_DEV *nvswitch_dev);

//
// Report whether the library's fabric state for this device is
// NVSWITCH_DEVICE_FABRIC_STATE_BLACKLISTED.
//
// If the fabric state cannot be read, the failure is logged and NV_FALSE is
// returned.
//
static NvBool
nvswitch_is_device_blacklisted
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    NVSWITCH_DEVICE_FABRIC_STATE fabric_state = 0;
    NvlStatus read_status;

    read_status = nvswitch_lib_read_fabric_state(nvswitch_dev->lib_device,
                                                 &fabric_state, NULL, NULL);
    if (read_status == NVL_SUCCESS)
    {
        return fabric_state == NVSWITCH_DEVICE_FABRIC_STATE_BLACKLISTED;
    }

    printk(KERN_INFO "%s: Failed to read fabric state, %x\n", nvswitch_dev->name, read_status);

    return NV_FALSE;
}

//
// Stop the per-device background task.
//
// The ready flag is cleared first: nvswitch_task_dispatch() returns early
// when it reads 0, and the same flag is the exit condition of its wait on
// wait_q_shutdown. The waiter is then woken, and finally the task queue is
// stopped.
//
static void
nvswitch_deinit_background_tasks
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    NV_ATOMIC_SET(nvswitch_dev->task_q_ready, 0);

    // Kick a dispatcher that may be sleeping in its timed wait.
    wake_up(&nvswitch_dev->wait_q_shutdown);

    nv_kthread_q_stop(&nvswitch_dev->task_q);
}

//
// Create the per-device task queue and schedule the first run of
// nvswitch_task_dispatch() on it.
//
// Returns 0 on success, the nv_kthread_q_init() error if the queue could not
// be created, or -ENODEV (after tearing the queue down again) if the first
// item could not be scheduled.
//
static int
nvswitch_init_background_tasks
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    int status = nv_kthread_q_init(&nvswitch_dev->task_q, nvswitch_dev->sname);

    if (status != 0)
    {
        printk(KERN_ERR "%s: Failed to create task queue\n", nvswitch_dev->name);
        return status;
    }

    // Mark the queue ready before the dispatcher can run; it bails out
    // immediately when this flag reads 0.
    NV_ATOMIC_SET(nvswitch_dev->task_q_ready, 1);

    nv_kthread_q_item_init(&nvswitch_dev->task_item,
                           (nv_q_func_t) &nvswitch_task_dispatch,
                           nvswitch_dev);

    if (nv_kthread_q_schedule_q_item(&nvswitch_dev->task_q,
                                     &nvswitch_dev->task_item))
    {
        return 0;
    }

    printk(KERN_ERR "%s: Failed to schedule an item\n",nvswitch_dev->name);

    // Undo the queue creation above.
    nvswitch_deinit_background_tasks(nvswitch_dev);

    return -ENODEV;
}

static NVSWITCH_DEV*
nvswitch_find_device(int minor)
{
    struct list_head *cur;
    NVSWITCH_DEV *nvswitch_dev = NULL;

    list_for_each(cur, &nvswitch.devices)
    {
        nvswitch_dev = list_entry(cur, NVSWITCH_DEV, list_node);
        if (nvswitch_dev->minor == minor)
        {
            return nvswitch_dev;
        }
    }

    return NULL;
}

static int
nvswitch_find_minor(void)
{
    struct list_head *cur;
    NVSWITCH_DEV *nvswitch_dev;
    int minor;
    int minor_in_use;

    for (minor = 0; minor < NVSWITCH_DEVICE_INSTANCE_MAX; minor++)
    {
        minor_in_use = 0;

        list_for_each(cur, &nvswitch.devices)
        {
            nvswitch_dev = list_entry(cur, NVSWITCH_DEV, list_node);
            if (nvswitch_dev->minor == minor)
            {
                minor_in_use = 1;
                break;
            }
        }

        if (!minor_in_use)
        {
            return minor;
        }
    }

    return NVSWITCH_DEVICE_INSTANCE_MAX;
}

//
// Register an I2C adapter for each port in the library's valid-ports mask
// and record it on the device's i2c_adapter_list.
//
// Returns 0 when I2C is unsupported or after walking the mask (ports whose
// adapter or list entry could not be created are skipped), and -ENODEV if
// the valid-ports mask could not be obtained.
//
static int
nvswitch_init_i2c_adapters
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    nvswitch_i2c_adapter_entry *list_item;
    struct i2c_adapter *i2c_adap;
    NvU32 port, ports_mask;
    NvlStatus mask_status;

    if (!nvswitch_lib_is_i2c_supported(nvswitch_dev->lib_device))
    {
        return 0;
    }

    mask_status = nvswitch_lib_get_valid_ports_mask(nvswitch_dev->lib_device,
                                                    &ports_mask);
    if (mask_status != NVL_SUCCESS)
    {
        printk(KERN_ERR "Failed to get valid I2C ports mask.\n");
        return -ENODEV;
    }

    FOR_EACH_INDEX_IN_MASK(32, port, ports_mask)
    {
        i2c_adap = nvswitch_i2c_add_adapter(nvswitch_dev, port);
        if (i2c_adap == NULL)
        {
            continue;
        }

        list_item = nvswitch_os_malloc(sizeof(*list_item));
        if (list_item == NULL)
        {
            // Without a list entry the adapter could not be torn down
            // later, so remove it again right away.
            printk(KERN_ERR "Failed to create I2C adapter entry.\n");
            nvswitch_i2c_del_adapter(i2c_adap);
            continue;
        }

        list_item->adapter = i2c_adap;

        list_add_tail(&list_item->entry, &nvswitch_dev->i2c_adapter_list);
    }
    FOR_EACH_INDEX_IN_MASK_END;

    return 0;
}

//
// Remove every I2C adapter recorded on the device's i2c_adapter_list and
// free the corresponding list entries.
//
static void
nvswitch_deinit_i2c_adapters
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    nvswitch_i2c_adapter_entry *item;
    nvswitch_i2c_adapter_entry *item_next;

    // The "safe" iterator is required because entries are unlinked and
    // freed while walking the list.
    list_for_each_entry_safe(item, item_next, &nvswitch_dev->i2c_adapter_list, entry)
    {
        nvswitch_i2c_del_adapter(item->adapter);
        list_del(&item->entry);
        nvswitch_os_free(item);
    }
}

//
// Register and initialize one NvSwitch device with the library and set up
// its interrupt.
//
// Steps: register the device, load BAR info, initialize the device, cache
// its UUID / BIOS version / physical ID, initialize the interrupt, and
// finally enable interrupts unless the device is blacklisted.
//
// Returns 0 on success (including the blacklisted case, where the device is
// left registered), -ENODEV if library registration or initialization fails,
// or the error from nvswitch_initialize_device_interrupt(). On failure after
// registration the device is unregistered and lib_device is reset to NULL.
//
static int
nvswitch_init_device
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    struct pci_dev *pdev = nvswitch_dev->pci_dev;
    NvlStatus lib_status;
    int err;

    INIT_LIST_HEAD(&nvswitch_dev->i2c_adapter_list);

    lib_status = nvswitch_lib_register_device(NV_PCI_DOMAIN_NUMBER(pdev),
                                              NV_PCI_BUS_NUMBER(pdev),
                                              NV_PCI_SLOT_NUMBER(pdev),
                                              PCI_FUNC(pdev->devfn),
                                              pdev->device,
                                              pdev,
                                              nvswitch_dev->minor,
                                              &nvswitch_dev->lib_device);
    if (lib_status != NVL_SUCCESS)
    {
        printk(KERN_ERR "%s: Failed to register device : %d\n",
               nvswitch_dev->name,
               lib_status);
        return -ENODEV;
    }

    nvswitch_load_bar_info(nvswitch_dev);

    lib_status = nvswitch_lib_initialize_device(nvswitch_dev->lib_device);
    if (lib_status != NVL_SUCCESS)
    {
        printk(KERN_ERR "%s: Failed to initialize device : %d\n",
               nvswitch_dev->name,
               lib_status);
        err = -ENODEV;
        goto unregister_device;
    }

    nvswitch_lib_get_uuid(nvswitch_dev->lib_device, &nvswitch_dev->uuid);

    // BIOS version and physical ID are optional: fall back to placeholder
    // values when the library cannot provide them.
    if (nvswitch_lib_get_bios_version(nvswitch_dev->lib_device,
                                      &nvswitch_dev->bios_ver) != NVL_SUCCESS)
    {
        nvswitch_dev->bios_ver = 0;
    }

    if (nvswitch_lib_get_physid(nvswitch_dev->lib_device,
                                &nvswitch_dev->phys_id) != NVL_SUCCESS)
    {
        nvswitch_dev->phys_id = NVSWITCH_INVALID_PHYS_ID;
    }

    err = nvswitch_initialize_device_interrupt(nvswitch_dev);
    if (err != 0)
    {
        printk(KERN_ERR "%s: Failed to initialize interrupt : %d\n",
               nvswitch_dev->name,
               err);
        goto shutdown_device;
    }

    if (!nvswitch_is_device_blacklisted(nvswitch_dev))
    {
        nvswitch_lib_enable_interrupts(nvswitch_dev->lib_device);
    }
    else
    {
        // Keep device registered for HAL access and Fabric State updates
        printk(KERN_ERR "%s: Blacklisted nvswitch device\n", nvswitch_dev->name);
    }

    return 0;

shutdown_device:
    nvswitch_lib_shutdown_device(nvswitch_dev->lib_device);

unregister_device:
    nvswitch_lib_unregister_device(nvswitch_dev->lib_device);
    nvswitch_dev->lib_device = NULL;

    return err;
}

//
// Second-stage device initialization: set up the I2C adapters, then run the
// library's post-init step.
//
// Returns 0 on success, the (negative) error from
// nvswitch_init_i2c_adapters(), or -ENODEV if the library post-init fails.
//
static int
nvswitch_post_init_device
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    int err = nvswitch_init_i2c_adapters(nvswitch_dev);

    if (err < 0)
    {
        return err;
    }

    if (nvswitch_lib_post_init_device(nvswitch_dev->lib_device) != NVL_SUCCESS)
    {
        return -ENODEV;
    }

    return 0;
}

//
// Post-init step for a blacklisted device: forwards to
// nvswitch_lib_post_init_blacklist_device() with the device's library handle.
//
static void
nvswitch_post_init_blacklisted
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    nvswitch_lib_post_init_blacklist_device(nvswitch_dev->lib_device);
}

//
// Tear down a device: disable interrupts in the library, shut down the OS
// interrupt, shut the device down, unregister it from the library, and
// clear the cached lib_device handle.
//
static void
nvswitch_deinit_device
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    nvswitch_lib_disable_interrupts(nvswitch_dev->lib_device);

    nvswitch_shutdown_device_interrupt(nvswitch_dev);

    nvswitch_lib_shutdown_device(nvswitch_dev->lib_device);

    nvswitch_lib_unregister_device(nvswitch_dev->lib_device);
    // The handle is no longer valid once unregistered.
    nvswitch_dev->lib_device = NULL;
}

//
// Reset the event state of a freshly allocated file-private structure: no
// event pending and an initialized (empty) wait queue.
//
static void
nvswitch_init_file_event
(
    nvswitch_file_private_t *private
)
{
    nvswitch_event_t *event = &private->file_event;

    init_waitqueue_head(&event->wait_q_event);
    event->event_pending = NV_FALSE;
}

//
// Basic device open to support IOCTL interface
//
static int
nvswitch_device_open
(
    struct inode *inode,
    struct file *file
)
{
    NVSWITCH_DEV *nvswitch_dev;
    int rc = 0;
    nvswitch_file_private_t *private = NULL;

    //
    // Get the major/minor device
    // We might want this for routing requests to multiple nvswitches
    //
    printk(KERN_INFO "nvidia-nvswitch%d: open (major=%d)\n",
           MINOR(inode->i_rdev),
           MAJOR(inode->i_rdev));

    rc = mutex_lock_interruptible(&nvswitch.driver_mutex);
    if (rc)
    {
        return rc;
    }

    nvswitch_dev = nvswitch_find_device(MINOR(inode->i_rdev));
    if (!nvswitch_dev)
    {
        rc = -ENODEV;
        goto done;
    }

    if (nvswitch_is_device_blacklisted(nvswitch_dev))
    {
        rc = -ENODEV;
        goto done;
    }

    private = nvswitch_os_malloc(sizeof(*private));
    if (private == NULL)
    {
        rc = -ENOMEM;
        goto done;
    }

    private->nvswitch_dev = nvswitch_dev;

    nvswitch_init_file_event(private);

    private->capability_fds.fabric_mgmt = -1;
    NVSWITCH_SET_FILE_PRIVATE(file, private);

    NV_ATOMIC_INC(nvswitch_dev->ref_count);

done:
    mutex_unlock(&nvswitch.driver_mutex);

    return rc;
}

//
// Basic device release to support IOCTL interface
//
static int
nvswitch_device_release
(
    struct inode *inode,
    struct file *file
)
{
    nvswitch_file_private_t *private = NVSWITCH_GET_FILE_PRIVATE(file);
    NVSWITCH_DEV *nvswitch_dev = private->nvswitch_dev;

    printk(KERN_INFO "nvidia-nvswitch%d: release (major=%d)\n",
           MINOR(inode->i_rdev),
           MAJOR(inode->i_rdev));

    mutex_lock(&nvswitch.driver_mutex);

    nvswitch_lib_remove_client_events(nvswitch_dev->lib_device, (void *)private);

    //
    // If there are no outstanding references and the device is marked
    // unusable, free it.
    //
    if (NV_ATOMIC_DEC_AND_TEST(nvswitch_dev->ref_count) &&
        nvswitch_dev->unusable)
    {
        kfree(nvswitch_dev);
    }

    if (private->capability_fds.fabric_mgmt > 0)
    {
        nvlink_cap_release(private->capability_fds.fabric_mgmt);
        private->capability_fds.fabric_mgmt = -1;
    }

    nvswitch_os_free(file->private_data);
    NVSWITCH_SET_FILE_PRIVATE(file, NULL);

    mutex_unlock(&nvswitch.driver_mutex);

    return 0;
}

//
// Poll handler for an NvSwitch device file.
//
// Returns a poll event mask:
//   POLLERR           - the wait for device_mutex was interrupted, or no
//                       client event is registered for this file
//   POLLHUP           - the device has been marked unusable (stale fd)
//   POLLPRI | POLLIN  - an event is pending; the pending flag is cleared
//   0                 - nothing to report yet; the caller has been added to
//                       the file's event wait queue
//
// The return value is an event mask, not an errno. A negative error code
// returned through the unsigned return type would be read by the caller as
// a mask with nearly every event bit set, so an interrupted lock wait is
// reported as POLLERR.
//
static unsigned int
nvswitch_device_poll
(
    struct file *file,
    poll_table *wait
)
{
    nvswitch_file_private_t *private = NVSWITCH_GET_FILE_PRIVATE(file);
    NVSWITCH_DEV *nvswitch_dev = private->nvswitch_dev;
    unsigned int mask = 0;
    NvlStatus status;
    struct NVSWITCH_CLIENT_EVENT *client_event;

    if (mutex_lock_interruptible(&nvswitch_dev->device_mutex))
    {
        return POLLERR;
    }

    if (nvswitch_dev->unusable)
    {
        printk(KERN_INFO "%s: a stale fd detected\n", nvswitch_dev->name);
        mask = POLLHUP;
        goto done;
    }

    // Only files that registered for events may poll.
    status = nvswitch_lib_get_client_event(nvswitch_dev->lib_device,
                                           (void *) private, &client_event);
    if (status != NVL_SUCCESS)
    {
        printk(KERN_INFO "%s: no events registered for fd\n", nvswitch_dev->name);
        mask = POLLERR;
        goto done;
    }

    poll_wait(file, &private->file_event.wait_q_event, wait);

    if (private->file_event.event_pending)
    {
        mask = POLLPRI | POLLIN;
        // The event is consumed by reporting it.
        private->file_event.event_pending = NV_FALSE;
    }

done:
    mutex_unlock(&nvswitch_dev->device_mutex);

    return mask;
}

// Bookkeeping for one ioctl call: the kernel-side copy of the user's
// parameter structure and its size as encoded in the ioctl command.
typedef struct {
    void *kernel_params;                // Kernel copy of ioctl parameters
    unsigned long kernel_params_size;   // Size of ioctl params according to user
} IOCTL_STATE;

//
// Clean up any dynamically allocated memory for ioctl state.
//
// Frees the kernel parameter buffer and resets the pointer to NULL so the
// state can be cleaned up again without a double free.
//
static void
nvswitch_ioctl_state_cleanup
(
    IOCTL_STATE *state
)
{
    kfree(state->kernel_params);
    state->kernel_params = NULL;
}

//
// Initialize buffer state for an ioctl.
//
// Allocates a zeroed kernel buffer of the size encoded in the ioctl command
// and, when the command's direction includes _IOC_WRITE, copies the user's
// parameter structure into it. Only the top-level params structure is
// handled; nested data pointers are not followed.
//
// The resulting state is used by the ioctl handler and by the _sync and
// _cleanup calls.
//
// Returns 0 on success (including the zero-size case, where nothing is
// allocated), -ENOMEM if allocation fails, or -EFAULT if the copy from user
// space fails. On failure no buffer is left allocated.
//
static int
nvswitch_ioctl_state_start(IOCTL_STATE *state, int cmd, unsigned long user_arg)
{
    state->kernel_params = NULL;
    state->kernel_params_size = _IOC_SIZE(cmd);

    if (state->kernel_params_size == 0)
    {
        return 0;
    }

    state->kernel_params = kzalloc(state->kernel_params_size, GFP_KERNEL);
    if (state->kernel_params == NULL)
    {
        return -ENOMEM;
    }

    // Copy params to kernel buffers.  Simple _IOR() ioctls can skip this step.
    if ((_IOC_DIR(cmd) & _IOC_WRITE) &&
        copy_from_user(state->kernel_params,
                       (const void *)user_arg,
                       state->kernel_params_size))
    {
        nvswitch_ioctl_state_cleanup(state);
        return -EFAULT;
    }

    return 0;
}

//
// Synchronize any ioctl output in the kernel buffer back to the user-mode
// buffer.
//
// Nothing is copied when the ioctl carries no parameters or its direction
// does not include _IOC_READ. Returns 0 on success or when there is nothing
// to copy, and -EFAULT if the copy to user space fails.
//
static int
nvswitch_ioctl_state_sync
(
    IOCTL_STATE *state,
    int cmd,
    unsigned long user_arg
)
{
    if (state->kernel_params_size == 0)
    {
        return 0;
    }

    if ((_IOC_DIR(cmd) & _IOC_READ) == 0)
    {
        return 0;
    }

    // Copy params structure back to user mode
    if (copy_to_user((void *)user_arg,
                     state->kernel_params,
                     state->kernel_params_size))
    {
        return -EFAULT;
    }

    return 0;
}

//
// ioctl handler for an NvSwitch device file.
//
// Under the device's device_mutex: rejects stale or blacklisted devices,
// copies the user's parameters into the kernel, hands the command to
// nvswitch_lib_ctrl(), and copies results back on success.
//
// Returns 0 on success; -EINVAL for a command of the wrong ioctl type; the
// mutex_lock_interruptible() error if the lock wait was interrupted;
// -ENODEV for an unusable or blacklisted device; otherwise the error from
// parameter marshalling or the errno mapped from the library status.
//
static int
nvswitch_device_ioctl
(
    struct inode *inode,
    struct file *file,
    unsigned int cmd,
    unsigned long arg
)
{
    nvswitch_file_private_t *fpriv = NVSWITCH_GET_FILE_PRIVATE(file);
    NVSWITCH_DEV *dev = fpriv->nvswitch_dev;
    IOCTL_STATE ioctl_state = {0};
    NvlStatus lib_status;
    int err;

    if (_IOC_TYPE(cmd) != NVSWITCH_DEV_IO_TYPE)
    {
        return -EINVAL;
    }

    err = mutex_lock_interruptible(&dev->device_mutex);
    if (err != 0)
    {
        return err;
    }

    if (dev->unusable)
    {
        printk(KERN_INFO "%s: a stale fd detected\n", dev->name);
        err = -ENODEV;
        goto unlock;
    }

    if (nvswitch_is_device_blacklisted(dev))
    {
        printk(KERN_INFO "%s: ioctl attempted on blacklisted device\n", dev->name);
        err = -ENODEV;
        goto unlock;
    }

    err = nvswitch_ioctl_state_start(&ioctl_state, cmd, arg);
    if (err != 0)
    {
        goto unlock;
    }

    lib_status = nvswitch_lib_ctrl(dev->lib_device,
                                   _IOC_NR(cmd),
                                   ioctl_state.kernel_params,
                                   ioctl_state.kernel_params_size,
                                   file->private_data);

    // Results are only copied back when the library call succeeded.
    err = nvswitch_map_status(lib_status);
    if (err == 0)
    {
        err = nvswitch_ioctl_state_sync(&ioctl_state, cmd, arg);
    }

    nvswitch_ioctl_state_cleanup(&ioctl_state);

unlock:
    mutex_unlock(&dev->device_mutex);

    return err;
}

//
// .unlocked_ioctl entry point for device files: forwards to
// nvswitch_device_ioctl(), supplying the inode taken from the file.
//
static long
nvswitch_device_unlocked_ioctl
(
    struct file *file,
    unsigned int cmd,
    unsigned long arg
)
{
    return nvswitch_device_ioctl(NV_FILE_INODE(file), file, cmd, arg);
}

//
// Handle CTRL_NVSWITCH_CHECK_VERSION: compare the user-supplied version
// string against the kernel's.
//
// p->is_compatible is set to 1 only when the library reports success. A
// -NVL_ERR_NOT_SUPPORTED result is logged as a version mismatch and still
// returns 0 (with is_compatible left at 0); any other failure is returned
// as a mapped errno.
//
static int
nvswitch_ctl_check_version(NVSWITCH_CHECK_VERSION_PARAMS *p)
{
    NvlStatus lib_status;

    p->is_compatible = 0;

    // The user string is untrusted: force NUL termination before use.
    p->user.version[NVSWITCH_VERSION_STRING_LENGTH - 1] = '\0';

    lib_status = nvswitch_lib_check_api_version(p->user.version, p->kernel.version,
                                                NVSWITCH_VERSION_STRING_LENGTH);
    if (lib_status == NVL_SUCCESS)
    {
        p->is_compatible = 1;
        return 0;
    }

    if (lib_status != -NVL_ERR_NOT_SUPPORTED)
    {
        // An unexpected failure
        return nvswitch_map_status(lib_status);
    }

    printk(KERN_ERR "nvidia-nvswitch: Version mismatch, "
           "kernel version %s user version %s\n",
           p->kernel.version, p->user.version);

    return 0;
}

//
// Handle CTRL_NVSWITCH_GET_DEVICES: fill p->info[] with the minor number and
// PCI location of every device on the global device list, and store the
// number of entries written in p->deviceCount.
//
static void
nvswitch_ctl_get_devices(NVSWITCH_GET_DEVICES_PARAMS *p)
{
    struct list_head *node;
    int count = 0;

    BUILD_BUG_ON(NVSWITCH_DEVICE_INSTANCE_MAX != NVSWITCH_MAX_DEVICES);

    list_for_each(node, &nvswitch.devices)
    {
        NVSWITCH_DEV *dev = list_entry(node, NVSWITCH_DEV, list_node);

        p->info[count].deviceInstance = dev->minor;
        p->info[count].pciDomain = NV_PCI_DOMAIN_NUMBER(dev->pci_dev);
        p->info[count].pciBus = NV_PCI_BUS_NUMBER(dev->pci_dev);
        p->info[count].pciDevice = NV_PCI_SLOT_NUMBER(dev->pci_dev);
        p->info[count].pciFunction = PCI_FUNC(dev->pci_dev->devfn);

        count++;
    }

    p->deviceCount = count;
}

//
// Handle CTRL_NVSWITCH_GET_DEVICES_V2: like nvswitch_ctl_get_devices(), but
// each entry also carries the device UUID, physical ID and, when the device
// still has a library handle, its fabric state (read under the device's
// device_mutex). The entry count is stored in p->deviceCount.
//
static void
nvswitch_ctl_get_devices_v2(NVSWITCH_GET_DEVICES_V2_PARAMS *p)
{
    struct list_head *node;
    int count = 0;

    BUILD_BUG_ON(NVSWITCH_DEVICE_INSTANCE_MAX != NVSWITCH_MAX_DEVICES);

    list_for_each(node, &nvswitch.devices)
    {
        NVSWITCH_DEV *dev = list_entry(node, NVSWITCH_DEV, list_node);

        p->info[count].deviceInstance = dev->minor;
        memcpy(&p->info[count].uuid, &dev->uuid, sizeof(dev->uuid));
        p->info[count].pciDomain = NV_PCI_DOMAIN_NUMBER(dev->pci_dev);
        p->info[count].pciBus = NV_PCI_BUS_NUMBER(dev->pci_dev);
        p->info[count].pciDevice = NV_PCI_SLOT_NUMBER(dev->pci_dev);
        p->info[count].pciFunction = PCI_FUNC(dev->pci_dev->devfn);
        p->info[count].physId = dev->phys_id;

        if (dev->lib_device != NULL)
        {
            // A failed read is deliberately ignored; the state fields then
            // keep whatever the caller's buffer already held.
            mutex_lock(&dev->device_mutex);
            (void)nvswitch_lib_read_fabric_state(dev->lib_device,
                                                 &p->info[count].deviceState,
                                                 &p->info[count].deviceReason,
                                                 &p->info[count].driverState);
            mutex_unlock(&dev->device_mutex);
        }

        count++;
    }

    p->deviceCount = count;
}

// Evaluates to 0 when `size` equals sizeof(type), and to -EINVAL otherwise.
// `size` is parenthesized so that an expression argument is compared as a
// whole rather than being split by operator precedence.
#define NVSWITCH_CTL_CHECK_PARAMS(type, size) ((sizeof(type) == (size)) ? 0 : -EINVAL)

//
// Dispatch a control-node command to its handler.
//
// For each known command the size of the caller's parameter buffer is first
// checked against the expected parameter structure; a mismatch yields
// -EINVAL and the handler is not run. Unknown commands also yield -EINVAL.
// Otherwise returns the handler's result (0 for handlers returning void).
//
static int
nvswitch_ctl_cmd_dispatch
(
    unsigned int cmd,
    void *params,
    unsigned int param_size
)
{
    int status;

    switch (cmd)
    {
        case CTRL_NVSWITCH_CHECK_VERSION:
            status = NVSWITCH_CTL_CHECK_PARAMS(NVSWITCH_CHECK_VERSION_PARAMS,
                                               param_size);
            if (status != 0)
            {
                return status;
            }
            return nvswitch_ctl_check_version(params);

        case CTRL_NVSWITCH_GET_DEVICES:
            status = NVSWITCH_CTL_CHECK_PARAMS(NVSWITCH_GET_DEVICES_PARAMS,
                                               param_size);
            if (status == 0)
            {
                nvswitch_ctl_get_devices(params);
            }
            return status;

        case CTRL_NVSWITCH_GET_DEVICES_V2:
            status = NVSWITCH_CTL_CHECK_PARAMS(NVSWITCH_GET_DEVICES_V2_PARAMS,
                                               param_size);
            if (status == 0)
            {
                nvswitch_ctl_get_devices_v2(params);
            }
            return status;

        default:
            return -EINVAL;
    }
}

//
// ioctl handler for the control node.
//
// Under driver_mutex: copies the user's parameters into the kernel,
// dispatches the command, and copies results back on success.
//
// Returns 0 on success; -EINVAL for a command of the wrong ioctl type; the
// mutex_lock_interruptible() error if the lock wait was interrupted;
// otherwise the error from parameter marshalling or from the dispatcher.
//
static int
nvswitch_ctl_ioctl
(
    struct inode *inode,
    struct file *file,
    unsigned int cmd,
    unsigned long arg
)
{
    IOCTL_STATE ioctl_state = {0};
    int err;

    if (_IOC_TYPE(cmd) != NVSWITCH_CTL_IO_TYPE)
    {
        return -EINVAL;
    }

    err = mutex_lock_interruptible(&nvswitch.driver_mutex);
    if (err != 0)
    {
        return err;
    }

    err = nvswitch_ioctl_state_start(&ioctl_state, cmd, arg);
    if (err != 0)
    {
        goto unlock;
    }

    err = nvswitch_ctl_cmd_dispatch(_IOC_NR(cmd),
                                    ioctl_state.kernel_params,
                                    ioctl_state.kernel_params_size);
    if (err == 0)
    {
        err = nvswitch_ioctl_state_sync(&ioctl_state, cmd, arg);
    }

    nvswitch_ioctl_state_cleanup(&ioctl_state);

unlock:
    mutex_unlock(&nvswitch.driver_mutex);

    return err;
}

//
// .unlocked_ioctl entry point for the control node: forwards to
// nvswitch_ctl_ioctl(), supplying the inode taken from the file.
//
static long
nvswitch_ctl_unlocked_ioctl
(
    struct file *file,
    unsigned int cmd,
    unsigned long arg
)
{
    return nvswitch_ctl_ioctl(NV_FILE_INODE(file), file, cmd, arg);
}

//
// Primary interrupt handler: decides whether the threaded handler
// (nvswitch_isr_thread) needs to run.
//
// MSI: always wakes the thread. Pin-based: asks the library whether an
// interrupt is pending; if so, interrupts are disabled and the thread is
// woken, otherwise IRQ_NONE is returned. Any other IRQ mechanism is
// reported as an error and yields IRQ_NONE.
//
static irqreturn_t
nvswitch_isr_pending
(
    int   irq,
    void *arg
)
{
    NVSWITCH_DEV *dev = (NVSWITCH_DEV *)arg;
    NvlStatus check_status;

    //
    // On silicon MSI must be enabled.  Since interrupts will not be shared
    // with MSI, we can simply signal the thread.
    //
    if (dev->irq_mechanism == NVSWITCH_IRQ_MSI)
    {
        return IRQ_WAKE_THREAD;
    }

    if (dev->irq_mechanism != NVSWITCH_IRQ_PIN)
    {
        pr_err("nvidia-nvswitch: unsupported IRQ mechanism in ISR\n");
        NVSWITCH_OS_ASSERT(0);

        return IRQ_NONE;
    }

    //
    // We do not take mutex in the interrupt context. The interrupt
    // check is safe to driver state.
    //
    check_status = nvswitch_lib_check_interrupts(dev->lib_device);

    // Wake interrupt thread if there is an interrupt pending
    if (check_status == -NVL_MORE_PROCESSING_REQUIRED)
    {
        nvswitch_lib_disable_interrupts(dev->lib_device);
        return IRQ_WAKE_THREAD;
    }

    // PCI errors are handled elsewhere; anything else that is not success
    // is unexpected.
    if ((check_status != -NVL_PCI_ERROR) && (check_status != NVL_SUCCESS))
    {
        pr_err("nvidia-nvswitch: unrecoverable error in ISR\n");
        NVSWITCH_OS_ASSERT(0);
    }

    return IRQ_NONE;
}

//
// Threaded interrupt handler: services interrupts through the library while
// holding the device's device_mutex, wakes waiters on wait_q_errors, and for
// pin-based interrupts re-enables interrupts afterwards. A service failure
// triggers a WARN and an error message. Always returns IRQ_HANDLED.
//
static irqreturn_t
nvswitch_isr_thread
(
    int   irq,
    void *arg
)
{
    NVSWITCH_DEV *dev = (NVSWITCH_DEV *)arg;
    NvlStatus service_status;

    mutex_lock(&dev->device_mutex);

    service_status = nvswitch_lib_service_interrupts(dev->lib_device);

    wake_up(&dev->wait_q_errors);

    // Pin-based interrupts were disabled by the primary handler before it
    // woke this thread.
    if (dev->irq_mechanism == NVSWITCH_IRQ_PIN)
    {
        nvswitch_lib_enable_interrupts(dev->lib_device);
    }

    mutex_unlock(&dev->device_mutex);

    if (WARN_ON(service_status != NVL_SUCCESS))
    {
        printk(KERN_ERR "%s: Interrupts disabled to avoid a storm\n",
               dev->name);
    }

    return IRQ_HANDLED;
}

//
// Run one pass of the library's deferred task dispatcher and re-queue the
// work item for the next pass.
//
// Does nothing when task_q_ready reads as 0. Otherwise the dispatcher is
// invoked under device_mutex; its return value is treated as a delay in
// nanoseconds, converted to jiffies, and used as the timeout for a wait on
// wait_q_shutdown that ends early once task_q_ready becomes 0.
//
static void
nvswitch_task_dispatch
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    NvU64 delay_nsec;
    NvU64 delay_jiffies;
    NvS64 wait_rc;

    if (NV_ATOMIC_READ(nvswitch_dev->task_q_ready) == 0)
    {
        return;
    }

    mutex_lock(&nvswitch_dev->device_mutex);
    delay_nsec = nvswitch_lib_deferred_task_dispatcher(nvswitch_dev->lib_device);
    mutex_unlock(&nvswitch_dev->device_mutex);

    delay_jiffies = usecs_to_jiffies(delay_nsec / NSEC_PER_USEC);

    wait_rc = wait_event_interruptible_timeout(nvswitch_dev->wait_q_shutdown,
                              (NV_ATOMIC_READ(nvswitch_dev->task_q_ready) == 0),
                              delay_jiffies);

    //
    // The "interruptible" wait is used so this kernel thread does not add to
    // the load average (/proc/loadavg) or trigger softlockup warnings by
    // lingering in an uninterruptible state. An actual interruption is
    // unexpected and worth debugging, hence the WARN.
    //
    WARN_ON(wait_rc < 0);

    //
    // A positive result means the shutdown condition became true before the
    // timeout: do not re-queue. A timeout (0) or interruption (< 0) means
    // the task should run again.
    //
    if (wait_rc > 0)
    {
        return;
    }

    if (nv_kthread_q_schedule_q_item(&nvswitch_dev->task_q,
                                     &nvswitch_dev->task_item))
    {
        return;
    }

    printk(KERN_ERR "%s: Failed to re-schedule background task\n",
           nvswitch_dev->name);
}

//
// PCI probe callback: bring up one NVSwitch device.
//
// Runs entirely under nvswitch.driver_mutex. Allocates a minor number and an
// NVSWITCH_DEV, enables the PCI device, claims and maps BAR0, initializes
// the device, starts the background tasks, and finally publishes the device
// (drvdata, procfs entry, nvswitch.devices list, nvswitch.count).
//
// Returns 0 on success or a negative errno. On failure, everything acquired
// so far is released in reverse order through the goto labels at the bottom.
//
static int
nvswitch_probe
(
    struct pci_dev *pci_dev,
    const struct pci_device_id *id_table
)
{
    NVSWITCH_DEV *nvswitch_dev = NULL;
    int rc = 0;
    int minor;

    // Reject device IDs the library does not recognize before touching any state.
    if (!nvswitch_lib_validate_device_id(pci_dev->device))
    {
        return -EINVAL;
    }

    printk(KERN_INFO "nvidia-nvswitch: Probing device %04x:%02x:%02x.%x, "
           "Vendor Id = 0x%x, Device Id = 0x%x, Class = 0x%x \n",
           NV_PCI_DOMAIN_NUMBER(pci_dev),
           NV_PCI_BUS_NUMBER(pci_dev),
           NV_PCI_SLOT_NUMBER(pci_dev),
           PCI_FUNC(pci_dev->devfn),
           pci_dev->vendor,
           pci_dev->device,
           pci_dev->class);

    mutex_lock(&nvswitch.driver_mutex);

    // A result at or beyond NVSWITCH_DEVICE_INSTANCE_MAX is treated as "no minor available".
    minor = nvswitch_find_minor();
    if (minor >= NVSWITCH_DEVICE_INSTANCE_MAX)
    {
        rc = -ERANGE;
        goto find_minor_failed;
    }

    nvswitch_dev = kzalloc(sizeof(*nvswitch_dev), GFP_KERNEL);
    if (NULL == nvswitch_dev)
    {
        rc = -ENOMEM;
        goto kzalloc_failed;
    }

    mutex_init(&nvswitch_dev->device_mutex);
    init_waitqueue_head(&nvswitch_dev->wait_q_errors);
    init_waitqueue_head(&nvswitch_dev->wait_q_shutdown);

    // Both names embed the minor number; "name" is used for log messages and PCI regions.
    snprintf(nvswitch_dev->name, sizeof(nvswitch_dev->name),
        NVSWITCH_DRIVER_NAME "%d", minor);

    snprintf(nvswitch_dev->sname, sizeof(nvswitch_dev->sname),
        NVSWITCH_SHORT_NAME "%d", minor);

    rc = pci_enable_device(pci_dev);
    if (rc)
    {
        printk(KERN_ERR "%s: Failed to enable PCI device : %d\n",
               nvswitch_dev->name,
               rc);
        goto pci_enable_device_failed;
    }

    pci_set_master(pci_dev);

    rc = pci_request_regions(pci_dev, nvswitch_dev->name);
    if (rc)
    {
        printk(KERN_ERR "%s: Failed to request memory regions : %d\n",
               nvswitch_dev->name,
               rc);
        goto pci_request_regions_failed;
    }

    nvswitch_dev->bar0 = pci_iomap(pci_dev, 0, 0);
    if (!nvswitch_dev->bar0)
    {
        rc = -ENOMEM;
        printk(KERN_ERR "%s: Failed to map BAR0 region : %d\n",
               nvswitch_dev->name,
               rc);
        goto pci_iomap_failed;
    }

    nvswitch_dev->pci_dev = pci_dev;
    nvswitch_dev->minor = minor;

    rc = nvswitch_init_device(nvswitch_dev);
    if (rc)
    {
        printk(KERN_ERR "%s: Failed to initialize device : %d\n",
               nvswitch_dev->name,
               rc);
        goto init_device_failed;
    }

    // Blacklisted devices get their own post-init and skip the regular one below.
    if (nvswitch_is_device_blacklisted(nvswitch_dev))
    {
        nvswitch_post_init_blacklisted(nvswitch_dev);
        goto blacklisted;
    }

    //
    // device_mutex held here because post_init entries may call soeService_HAL()
    // with IRQs on. see bug 2856314 for more info
    //
    mutex_lock(&nvswitch_dev->device_mutex);
    rc = nvswitch_post_init_device(nvswitch_dev);
    mutex_unlock(&nvswitch_dev->device_mutex);
    if (rc)
    {
        printk(KERN_ERR "%s:Failed during device post init : %d\n",
               nvswitch_dev->name, rc);
        goto post_init_device_failed;
    }

blacklisted:
    rc = nvswitch_init_background_tasks(nvswitch_dev);
    if (rc)
    {
        printk(KERN_ERR "%s: Failed to initialize background tasks : %d\n",
               nvswitch_dev->name,
               rc);
        goto init_background_task_failed;
    }

    // From here on the device is visible to the rest of the driver.
    pci_set_drvdata(pci_dev, nvswitch_dev);

    nvswitch_procfs_device_add(nvswitch_dev);

    list_add_tail(&nvswitch_dev->list_node, &nvswitch.devices);

    NV_ATOMIC_INC(nvswitch.count);

    mutex_unlock(&nvswitch.driver_mutex);

    return 0;

    // Error unwinding: each label releases what was acquired just before the matching goto.
init_background_task_failed:
post_init_device_failed:
    nvswitch_deinit_device(nvswitch_dev);

init_device_failed:
    pci_iounmap(pci_dev, nvswitch_dev->bar0);

pci_iomap_failed:
    pci_release_regions(pci_dev);

pci_request_regions_failed:
#ifdef CONFIG_PCI
    pci_clear_master(pci_dev);
#endif
    pci_disable_device(pci_dev);

pci_enable_device_failed:
    kfree(nvswitch_dev);

kzalloc_failed:
find_minor_failed:
    mutex_unlock(&nvswitch.driver_mutex);

    return rc;
}

//
// PCI remove callback: tear down one NVSwitch device.
//
// Runs under nvswitch.driver_mutex. A device whose drvdata is NULL is
// ignored. Otherwise the device is marked unusable, unlinked from the driver
// bookkeeping, and its I2C adapters, background tasks, device state, BAR0
// mapping, PCI regions and procfs entry are released.
//
void
nvswitch_remove
(
    struct pci_dev *pci_dev
)
{
    NVSWITCH_DEV *nvswitch_dev;

    mutex_lock(&nvswitch.driver_mutex);

    nvswitch_dev = pci_get_drvdata(pci_dev);

    // No drvdata means there is nothing registered for this pci_dev to tear down.
    if (nvswitch_dev == NULL)
    {
        goto done;
    }

    printk(KERN_INFO "%s: removing device %04x:%02x:%02x.%x\n",
           nvswitch_dev->name,
           NV_PCI_DOMAIN_NUMBER(pci_dev),
           NV_PCI_BUS_NUMBER(pci_dev),
           NV_PCI_SLOT_NUMBER(pci_dev),
           PCI_FUNC(pci_dev->devfn));

    //
    // Synchronize with device operations such as .ioctls/.poll, and then mark
    // the device unusable.
    //
    mutex_lock(&nvswitch_dev->device_mutex);
    nvswitch_dev->unusable = NV_TRUE;
    mutex_unlock(&nvswitch_dev->device_mutex);

    NV_ATOMIC_DEC(nvswitch.count);

    list_del(&nvswitch_dev->list_node);

    nvswitch_deinit_i2c_adapters(nvswitch_dev);

    // All adapters are expected to be gone after the deinit call above.
    WARN_ON(!list_empty(&nvswitch_dev->i2c_adapter_list));

    pci_set_drvdata(pci_dev, NULL);

    nvswitch_deinit_background_tasks(nvswitch_dev);

    nvswitch_deinit_device(nvswitch_dev);

    pci_iounmap(pci_dev, nvswitch_dev->bar0);

    pci_release_regions(pci_dev);

#ifdef CONFIG_PCI
    pci_clear_master(pci_dev);
#endif

    pci_disable_device(pci_dev);

    nvswitch_procfs_device_remove(nvswitch_dev);

    // Free nvswitch_dev only if it is not in use.
    // NOTE(review): when ref_count is non-zero the structure is left
    // allocated here; presumably the last holder frees it -- confirm.
    if (NV_ATOMIC_READ(nvswitch_dev->ref_count) == 0)
    {
        kfree(nvswitch_dev);
    }

done:
    mutex_unlock(&nvswitch.driver_mutex);

    return;
}

//
// Fill in the library's nvlink_pci_info entry for BAR0.
//
// The bus address is read from PCI config space (two dwords when the BAR is
// flagged as a 64-bit memory BAR); the CPU-side start, size and mapped
// pointer come from the kernel's resource information and nvswitch_dev->bar0.
//
static void
nvswitch_load_bar_info
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    struct pci_dev *pci_dev = nvswitch_dev->pci_dev;
    nvlink_pci_info *info;
    NvU32 cfg_dword = 0;

    nvswitch_lib_get_device_info(nvswitch_dev->lib_device, &info);

    // Kernel view of the BAR.
    info->bars[0].baseAddr = NV_PCI_RESOURCE_START(pci_dev, 0);
    info->bars[0].barSize  = NV_PCI_RESOURCE_SIZE(pci_dev, 0);
    info->bars[0].pBar     = nvswitch_dev->bar0;

    // Bus view of the BAR, straight from config space.
    info->bars[0].offset = NVRM_PCICFG_BAR_OFFSET(0);
    pci_read_config_dword(pci_dev, info->bars[0].offset, &cfg_dword);
    info->bars[0].busAddress = (cfg_dword & PCI_BASE_ADDRESS_MEM_MASK);

    if (NV_PCI_RESOURCE_FLAGS(pci_dev, 0) & PCI_BASE_ADDRESS_MEM_TYPE_64)
    {
        // Upper 32 bits live in the next config dword.
        pci_read_config_dword(pci_dev, info->bars[0].offset + 4, &cfg_dword);
        info->bars[0].busAddress |= (((NvU64)cfg_dword) << 32);
    }
}

//
// MSI-X setup stub. Always fails with -EINVAL, so the caller's "rc == 0"
// check never selects MSI-X.
//
static int
_nvswitch_initialize_msix_interrupt
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    // Not supported (bug 3018806)
    return -EINVAL;
}

//
// Enable MSI on the device.
//
// Returns 0 on success or the non-zero result of pci_enable_msi(). When the
// kernel is built without CONFIG_PCI_MSI this always returns -EINVAL.
//
static int
_nvswitch_initialize_msi_interrupt
(
    NVSWITCH_DEV *nvswitch_dev
)
{
#ifdef CONFIG_PCI_MSI
    return pci_enable_msi(nvswitch_dev->pci_dev);
#else
    return -EINVAL;
#endif
}

//
// Collect the interrupt mechanisms the device offers into *irq_caps.
//
// Bits (indexed by NVSWITCH_IRQ_*) are set for MSI-X and MSI when the
// matching PCI capability is found, and for pin IRQ when the library reports
// it should be used. Bits are only ever set, never cleared, so the caller
// must zero *irq_caps first.
//
// Returns 0 on success, -EINVAL when either argument is NULL.
//
static int
_nvswitch_get_irq_caps(NVSWITCH_DEV *nvswitch_dev, unsigned long *irq_caps)
{
    struct pci_dev *pci_dev;

    if ((nvswitch_dev == NULL) || (irq_caps == NULL))
    {
        return -EINVAL;
    }

    pci_dev = nvswitch_dev->pci_dev;

    if (pci_find_capability(pci_dev, PCI_CAP_ID_MSIX))
    {
        set_bit(NVSWITCH_IRQ_MSIX, irq_caps);
    }

    if (pci_find_capability(pci_dev, PCI_CAP_ID_MSI))
    {
        set_bit(NVSWITCH_IRQ_MSI, irq_caps);
    }

    if (nvswitch_lib_use_pin_irq(nvswitch_dev->lib_device))
    {
        set_bit(NVSWITCH_IRQ_PIN, irq_caps);
    }

    return 0;
}

//
// Select an interrupt mechanism for the device and install the handlers.
//
// Mechanisms are tried in the order MSI-X, MSI, PCI pin; the first one that
// is both advertised in irq_caps and successfully initialized is recorded in
// nvswitch_dev->irq_mechanism. The threaded IRQ is then requested with
// nvswitch_isr_pending as the hard handler and nvswitch_isr_thread as the
// thread function.
//
// Returns 0 on success, -EINVAL when no mechanism is usable or capabilities
// cannot be read, or the error from request_threaded_irq().
//
static int
nvswitch_initialize_device_interrupt
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    struct pci_dev *pci_dev = nvswitch_dev->pci_dev;
    int flags = 0;
    unsigned long irq_caps = 0;
    int rc;

    if (_nvswitch_get_irq_caps(nvswitch_dev, &irq_caps))
    {
        pr_err("%s: failed to retrieve device interrupt capabilities\n",
               nvswitch_dev->name);
        return -EINVAL;
    }

    nvswitch_dev->irq_mechanism = NVSWITCH_IRQ_NONE;

    // First choice: MSI-X.
    if (test_bit(NVSWITCH_IRQ_MSIX, &irq_caps))
    {
        rc = _nvswitch_initialize_msix_interrupt(nvswitch_dev);
        if (!rc)
        {
            nvswitch_dev->irq_mechanism = NVSWITCH_IRQ_MSIX;
            pr_info("%s: using MSI-X\n", nvswitch_dev->name);
        }
    }

    // Second choice: MSI, only if nothing has been selected yet.
    if (nvswitch_dev->irq_mechanism == NVSWITCH_IRQ_NONE
        && test_bit(NVSWITCH_IRQ_MSI, &irq_caps))
    {
        rc = _nvswitch_initialize_msi_interrupt(nvswitch_dev);
        if (!rc)
        {
            nvswitch_dev->irq_mechanism = NVSWITCH_IRQ_MSI;
            pr_info("%s: using MSI\n", nvswitch_dev->name);
        }
    }

    // Last choice: pin interrupt, requested as a shared line.
    if (nvswitch_dev->irq_mechanism == NVSWITCH_IRQ_NONE
        && test_bit(NVSWITCH_IRQ_PIN, &irq_caps))
    {
        flags |= IRQF_SHARED;
        nvswitch_dev->irq_mechanism = NVSWITCH_IRQ_PIN;
        pr_info("%s: using PCI pin\n", nvswitch_dev->name);
    }

    // Nothing usable: report what the device advertised to aid debugging.
    if (nvswitch_dev->irq_mechanism == NVSWITCH_IRQ_NONE)
    {
        pr_err("%s: No supported interrupt mechanism was found. This device supports:\n",
               nvswitch_dev->name);

        if (test_bit(NVSWITCH_IRQ_MSIX, &irq_caps))
            pr_err("%s: MSI-X\n", nvswitch_dev->name);
        if (test_bit(NVSWITCH_IRQ_MSI, &irq_caps))
            pr_err("%s: MSI\n", nvswitch_dev->name);
        if (test_bit(NVSWITCH_IRQ_PIN, &irq_caps))
             pr_err("%s: PCI Pin\n", nvswitch_dev->name);

        return -EINVAL;
    }

    rc = request_threaded_irq(pci_dev->irq,
                              nvswitch_isr_pending,
                              nvswitch_isr_thread,
                              flags, nvswitch_dev->sname,
                              nvswitch_dev);
    if (rc)
    {
        // Undo the MSI enable performed above so the failure leaves no state behind.
#ifdef CONFIG_PCI_MSI
        if (nvswitch_dev->irq_mechanism == NVSWITCH_IRQ_MSI)
        {
            pci_disable_msi(pci_dev);
        }
#endif
        printk(KERN_ERR "%s: failed to get IRQ\n",
               nvswitch_dev->name);

        return rc;
    }

    return 0;
}

//
// Release the device's IRQ and, when MSI was the selected mechanism (and the
// kernel has CONFIG_PCI_MSI), disable MSI again.
//
void
nvswitch_shutdown_device_interrupt
(
    NVSWITCH_DEV *nvswitch_dev
)
{
    struct pci_dev *pci_dev = nvswitch_dev->pci_dev;

    // nvswitch_dev is the cookie that was passed to request_threaded_irq().
    free_irq(pci_dev->irq, nvswitch_dev);
#ifdef CONFIG_PCI_MSI
    if (nvswitch_dev->irq_mechanism == NVSWITCH_IRQ_MSI)
    {
        pci_disable_msi(pci_dev);
    }
#endif
}

//
// Remove the control character device registered by nvswitch_ctl_init().
//
static void
nvswitch_ctl_exit
(
    void
)
{
    cdev_del(&nvswitch.cdev_ctl);
}

static int
nvswitch_ctl_init
(
    int major
)
{
    int rc = 0;
    dev_t nvswitch_ctl = MKDEV(major, NVSWITCH_CTL_MINOR);

    cdev_init(&nvswitch.cdev_ctl, &ctl_fops);

    nvswitch.cdev_ctl.owner = THIS_MODULE;

    rc = cdev_add(&nvswitch.cdev_ctl, nvswitch_ctl, 1);
    if (rc < 0)
    {
        printk(KERN_ERR "nvidia-nvswitch: Unable to create cdev ctl\n");
        return rc;
    }

    return 0;
}

//
// Initialize nvswitch driver SW state.  This is currently called
// from the RM as a backdoor interface, and not by the Linux device
// manager
//
int
nvswitch_init
(
    void
)
{
    int rc;

    if (nvswitch.initialized)
    {
        printk(KERN_ERR "nvidia-nvswitch: Interface already initialized\n");
        return -EBUSY;
    }

    BUILD_BUG_ON(NVSWITCH_DEVICE_INSTANCE_MAX >= NVSWITCH_MINOR_COUNT);

    mutex_init(&nvswitch.driver_mutex);

    INIT_LIST_HEAD(&nvswitch.devices);

    rc = alloc_chrdev_region(&nvswitch.devno,
                             0,
                             NVSWITCH_MINOR_COUNT,
                             NVSWITCH_DRIVER_NAME);
    if (rc < 0)
    {
        printk(KERN_ERR "nvidia-nvswitch: Unable to create cdev region\n");
        goto alloc_chrdev_region_fail;
    }

    printk(KERN_ERR, "nvidia-nvswitch: Major: %d Minor: %d\n",
           MAJOR(nvswitch.devno),
           MINOR(nvswitch.devno));

    cdev_init(&nvswitch.cdev, &device_fops);
    nvswitch.cdev.owner = THIS_MODULE;
    rc = cdev_add(&nvswitch.cdev, nvswitch.devno, NVSWITCH_DEVICE_INSTANCE_MAX);
    if (rc < 0)
    {
        printk(KERN_ERR "nvidia-nvswitch: Unable to create cdev\n");
        goto cdev_add_fail;
    }

    rc = nvswitch_procfs_init();
    if (rc < 0)
    {
        goto nvswitch_procfs_init_fail;
    }

    rc = pci_register_driver(&nvswitch_pci_driver);
    if (rc < 0)
    {
        printk(KERN_ERR "nvidia-nvswitch: Failed to register driver : %d\n", rc);
        goto pci_register_driver_fail;
    }

    rc = nvswitch_ctl_init(MAJOR(nvswitch.devno));
    if (rc < 0)
    {
        goto nvswitch_ctl_init_fail;
    }

    nvswitch.initialized = NV_TRUE;

    return 0;

nvswitch_ctl_init_fail:
    pci_unregister_driver(&nvswitch_pci_driver);

pci_register_driver_fail:
nvswitch_procfs_init_fail:
    cdev_del(&nvswitch.cdev);

cdev_add_fail:
    unregister_chrdev_region(nvswitch.devno, NVSWITCH_MINOR_COUNT);

alloc_chrdev_region_fail:

    return rc;
}

//
// Clean up driver state on exit.  Currently called from RM backdoor call,
// and not by the Linux device manager.
//
// Does nothing unless nvswitch.initialized is set. Otherwise undoes the
// steps of nvswitch_init() in reverse order and clears the initialized flag.
//
void
nvswitch_exit
(
    void
)
{
    if (NV_FALSE == nvswitch.initialized)
    {
        return;
    }

    nvswitch_ctl_exit();

    pci_unregister_driver(&nvswitch_pci_driver);

    nvswitch_procfs_exit();

    cdev_del(&nvswitch.cdev);

    unregister_chrdev_region(nvswitch.devno, NVSWITCH_MINOR_COUNT);

    // Every device should have been removed when the PCI driver was unregistered.
    WARN_ON(!list_empty(&nvswitch.devices));

    nvswitch.initialized = NV_FALSE;
}

//
// Get the current platform time as a single nanosecond count.
//
// The timestamp is taken with ktime_get_raw_ts64() and converted with
// timespec64_to_ns().
//
// NOTE(review): this was previously described as time since the Unix epoch
// (midnight UTC of January 1, 1970). ktime_get_raw_ts64() is documented as
// the raw monotonic clock rather than wall-clock time, so the value should
// not be interpreted as a calendar timestamp -- confirm against callers.
//
NvU64
nvswitch_os_get_platform_time
(
    void
)
{
    struct timespec64 ts;

    ktime_get_raw_ts64(&ts);
    return (NvU64) timespec64_to_ns(&ts);
}

//
// printf-style logging entry point for the NVSwitch library.
//
// log_level (NVSWITCH_DBG_LEVEL_*) selects the kernel log level prefix;
// unknown levels use KERN_DEFAULT. The prefix and the caller's format are
// joined into a local buffer of NVSWITCH_LOG_BUFFER_SIZE bytes (longer
// formats are truncated by snprintf) and handed to vprintk() together with
// the caller's variadic arguments.
//
void
nvswitch_os_print
(
    const int  log_level,
    const char *fmt,
    ...
)
{
    const char *level_prefix;
    char        prefixed_fmt[NVSWITCH_LOG_BUFFER_SIZE];
    va_list     args;

    switch (log_level)
    {
        case NVSWITCH_DBG_LEVEL_MMIO:
            level_prefix = KERN_DEBUG;
            break;
        case NVSWITCH_DBG_LEVEL_INFO:
        case NVSWITCH_DBG_LEVEL_SETUP:
            level_prefix = KERN_INFO;
            break;
        case NVSWITCH_DBG_LEVEL_WARN:
            level_prefix = KERN_WARNING;
            break;
        case NVSWITCH_DBG_LEVEL_ERROR:
            level_prefix = KERN_ERR;
            break;
        default:
            level_prefix = KERN_DEFAULT;
            break;
    }

    snprintf(prefixed_fmt, sizeof(prefixed_fmt), "%s%s", level_prefix, fmt);

    va_start(args, fmt);
    vprintk(prefixed_fmt, args);
    va_end(args);
}

//
// Platform override hook. On Linux this only reports that the driver is not
// running on RTL simulation by storing NV_FALSE in *rtlsim; os_handle is
// unused. rtlsim must not be NULL.
//
void
nvswitch_os_override_platform
(
    void *os_handle,
    NvBool *rtlsim
)
{
    // Never run on RTL
    *rtlsim = NV_FALSE;
}

//
// Binary registry reads are not implemented on Linux: always returns
// -NVL_ERR_NOT_SUPPORTED and leaves data untouched.
//
NvlStatus
nvswitch_os_read_registery_binary
(
    void *os_handle,
    const char *name,
    NvU8 *data,
    NvU32 length
)
{
    return -NVL_ERR_NOT_SUPPORTED;
}

//
// Return the current value of nvswitch.count, which probe increments and
// remove decrements.
//
NvU32
nvswitch_os_get_device_count
(
    void
)
{
    return NV_ATOMIC_READ(nvswitch.count);
}

//
// A helper to convert a string to an unsigned int.
//
// The string should be NULL terminated.
// Only works with base16 values; an optional "0x"/"0X" prefix is accepted.
//
// Returns 0 and stores the value in *data on success. Returns -EINVAL (with
// *data set to 0 when data is non-NULL) if either pointer is NULL, the
// string contains a non-hex character, contains no hex digits at all, or
// the value does not fit in 32 bits.
//
static int
nvswitch_os_strtouint
(
    char *str,
    unsigned int *data
)
{
    const char *p;
    unsigned long long val = 0;
    int have_digit = 0;

    if (!str || !data)
    {
        return -EINVAL;
    }

    *data = 0;
    p = str;

    // Skip an optional "0x" prefix (p[1] is only read when p[0] is '0').
    if ((p[0] == '0') && (tolower((unsigned char)p[1]) == 'x'))
    {
        p += 2;
    }

    for (; *p != '\0'; p++)
    {
        const int c = tolower((unsigned char)*p);
        unsigned int digit;

        if (c >= '0' && c <= '9')
        {
            digit = (unsigned int)(c - '0');
        }
        else if (c >= 'a' && c <= 'f')
        {
            digit = (unsigned int)(c - 'a' + 10);
        }
        else
        {
            return -EINVAL;
        }

        //
        // Range-check after every digit. Checking only at the end lets the
        // 64-bit accumulator wrap on very long inputs and accept garbage.
        //
        val = (val << 4) | digit;
        if (val > 0xFFFFFFFFULL)
        {
            return -EINVAL;
        }

        have_digit = 1;
    }

    // "" and a bare "0x" carry no value.
    if (!have_digit)
    {
        return -EINVAL;
    }

    *data = (unsigned int)val;

    return 0;
}

//
// Look up a dword registry key in the NvSwitchRegDwords string.
//
// The string is a list of "key=value" entries separated by ';', with values
// in hexadecimal (as parsed by nvswitch_os_strtouint()). The key must match
// a whole entry name: it has to start at the beginning of the string or
// right after ';' / whitespace, and be followed (optionally after
// whitespace) by '='. This prevents a lookup of "Foo" from picking up the
// value of "Foo2=..." or "XFoo=...".
//
// *data is zeroed first and only updated on success. Returns NVL_SUCCESS or
// -NVL_ERR_GENERIC when the key is absent or its value is empty, longer
// than NVSWITCH_REGKEY_VALUE_LEN, or not valid hex. os_handle is unused.
//
NvlStatus
nvswitch_os_read_registry_dword
(
    void *os_handle,
    const char *name,
    NvU32 *data
)
{
    char *regkey, *regkey_val_start, *regkey_val_end;
    char regkey_val[NVSWITCH_REGKEY_VALUE_LEN + 1];
    NvU32 regkey_val_len = 0;
    size_t name_len;

    *data = 0;

    if (!NvSwitchRegDwords)
    {
        return -NVL_ERR_GENERIC;
    }

    name_len = strlen(name);
    if (name_len == 0)
    {
        return -NVL_ERR_GENERIC;
    }

    // Walk every occurrence of the name until one is a complete key.
    regkey_val_start = NULL;
    for (regkey = strstr(NvSwitchRegDwords, name);
         regkey != NULL;
         regkey = strstr(regkey + 1, name))
    {
        char *after_name = regkey + name_len;

        // Must begin an entry, not sit in the middle of another key.
        if ((regkey != NvSwitchRegDwords) &&
            (regkey[-1] != ';') &&
            !isspace((unsigned char)regkey[-1]))
        {
            continue;
        }

        while (isspace((unsigned char)*after_name))
        {
            after_name++;
        }

        // Must be the whole key, not a prefix of a longer one.
        if (*after_name == '=')
        {
            regkey_val_start = after_name + 1;
            break;
        }
    }

    if (!regkey_val_start)
    {
        return -NVL_ERR_GENERIC;
    }

    // The value runs up to the next ';' or the end of the string.
    regkey_val_end = strchr(regkey_val_start, ';');
    if (!regkey_val_end)
    {
        regkey_val_end = regkey_val_start + strlen(regkey_val_start);
    }

    regkey_val_len = regkey_val_end - regkey_val_start;
    if (regkey_val_len > NVSWITCH_REGKEY_VALUE_LEN || regkey_val_len == 0)
    {
        return -NVL_ERR_GENERIC;
    }

    memcpy(regkey_val, regkey_val_start, regkey_val_len);
    regkey_val[regkey_val_len] = '\0';

    if (nvswitch_os_strtouint(regkey_val, data) != 0)
    {
        return -NVL_ERR_GENERIC;
    }

    return NVL_SUCCESS;
}

//
// True for ' ' and for the contiguous control range '\t' through '\r'
// (tab, newline, vertical tab, form feed, carriage return).
//
static NvBool
_nvswitch_is_space(const char ch)
{
    if (ch == ' ')
    {
        return 1;
    }

    return ((ch >= '\t') && (ch <= '\r'));
}

//
// Return a newly allocated copy of `in` with every character accepted by
// _nvswitch_is_space() removed. The result is NUL-terminated.
//
// Returns NULL if the allocation fails. The caller owns the buffer and must
// release it with nvswitch_os_free().
//
static char *
_nvswitch_remove_spaces(const char *in)
{
    // Worst case nothing is removed, so strlen + terminator always suffices.
    unsigned int len = nvswitch_os_strlen(in) + 1;
    const char *src;
    char *out, *dst;

    out = nvswitch_os_malloc(len);
    if (out == NULL)
    {
        return NULL;
    }

    dst = out;
    for (src = in; *src != '\0'; src++)
    {
        if (_nvswitch_is_space(*src))
        {
            continue;
        }

        *dst++ = *src;
    }
    *dst = '\0';

    return out;
}

/*
 * Check whether the given UUID appears in the NvSwitchBlacklist registry
 * parameter.
 *
 * The parameter is treated as a comma-separated list of UUID strings;
 * whitespace anywhere in it is ignored. The UUID is converted to its string
 * form and compared against each entry with strcmp().
 *
 * Returns NV_TRUE on a match. Returns NV_FALSE when there is no match, the
 * parameter is unset, the UUID cannot be converted, or memory for the
 * working copy cannot be allocated.
 */
NvBool
nvswitch_os_is_uuid_in_blacklist
(
    NvUuid *uuid
)
{
    NvU8 uuid_string[NVSWITCH_UUID_STRING_LENGTH];
    NvBool found = NV_FALSE;
    char *list;
    char *cursor;
    char *token;

    if (NvSwitchBlacklist == NULL)
    {
        return NV_FALSE;
    }

    if (nvswitch_uuid_to_string(uuid, uuid_string, NVSWITCH_UUID_STRING_LENGTH) == 0)
    {
        return NV_FALSE;
    }

    // strsep() modifies its input, so work on a private, space-free copy.
    list = _nvswitch_remove_spaces(NvSwitchBlacklist);
    if (list == NULL)
    {
        return NV_FALSE;
    }

    cursor = list;
    for (token = strsep(&cursor, ",");
         token != NULL;
         token = strsep(&cursor, ","))
    {
        if (strcmp(token, uuid_string) == 0)
        {
            found = NV_TRUE;
            break;
        }
    }

    nvswitch_os_free(list);
    return found;
}


//
// Allocate physically contiguous pages covering at least `size` bytes.
//
// The allocation uses GFP_KERNEL, plus GFP_DMA32 when force_dma32 is set,
// with the page order computed by get_order(size). On success the kernel
// virtual address is stored in *virt_addr.
//
// Returns NVL_SUCCESS, -NVL_BAD_ARGS if virt_addr is NULL, or -NVL_NO_MEM if
// the pages could not be allocated. os_handle is unused.
//
NvlStatus
nvswitch_os_alloc_contig_memory
(
    void *os_handle,
    void **virt_addr,
    NvU32 size,
    NvBool force_dma32
)
{
    NvU32 gfp_flags = GFP_KERNEL;
    unsigned long nv_gfp_addr = 0;

    if (virt_addr == NULL)
    {
        return -NVL_BAD_ARGS;
    }

    if (force_dma32)
    {
        gfp_flags |= GFP_DMA32;
    }

    NV_GET_FREE_PAGES(nv_gfp_addr, get_order(size), gfp_flags);
    if (nv_gfp_addr == 0)
    {
        pr_err("nvidia-nvswitch: unable to allocate kernel memory\n");
        return -NVL_NO_MEM;
    }

    *virt_addr = (void *)nv_gfp_addr;

    return NVL_SUCCESS;
}

//
// Free pages obtained from nvswitch_os_alloc_contig_memory(). `size` is used
// to recompute the page order via get_order(), so it must be the size that
// was passed at allocation time. os_handle is unused.
//
void
nvswitch_os_free_contig_memory
(
    void *os_handle,
    void *virt_addr,
    NvU32 size
)
{
    NV_FREE_PAGES((unsigned long)virt_addr, get_order(size));
}

//
// Translate an NVSWITCH_DMA_DIR_* value into the kernel's DMA direction:
// TO_SYSMEM -> DMA_FROM_DEVICE, FROM_SYSMEM -> DMA_TO_DEVICE, and anything
// else -> DMA_BIDIRECTIONAL.
//
static inline int
_nvswitch_to_pci_dma_direction
(
    NvU32 direction
)
{
    int dma_dir = DMA_BIDIRECTIONAL;

    if (direction == NVSWITCH_DMA_DIR_TO_SYSMEM)
    {
        dma_dir = DMA_FROM_DEVICE;
    }
    else if (direction == NVSWITCH_DMA_DIR_FROM_SYSMEM)
    {
        dma_dir = DMA_TO_DEVICE;
    }

    return dma_dir;
}

//
// Create a streaming DMA mapping for `size` bytes at cpu_addr.
//
// os_handle is interpreted as the device's struct pci_dev. The resulting
// DMA address is written to *dma_handle (even if the mapping turns out to be
// invalid).
//
// Returns NVL_SUCCESS, -NVL_BAD_ARGS if any pointer argument is NULL, or
// -NVL_ERR_GENERIC if dma_mapping_error() reports a failed mapping.
//
NvlStatus
nvswitch_os_map_dma_region
(
    void *os_handle,
    void *cpu_addr,
    NvU64 *dma_handle,
    NvU32 size,
    NvU32 direction
)
{
    struct pci_dev *pdev = os_handle;
    int dma_dir;

    if ((pdev == NULL) || (cpu_addr == NULL) || (dma_handle == NULL))
    {
        return -NVL_BAD_ARGS;
    }

    dma_dir = _nvswitch_to_pci_dma_direction(direction);
    *dma_handle = (NvU64)dma_map_single(&pdev->dev, cpu_addr, size, dma_dir);

    if (!dma_mapping_error(&pdev->dev, *dma_handle))
    {
        return NVL_SUCCESS;
    }

    pr_err("nvidia-nvswitch: unable to create PCI DMA mapping\n");
    return -NVL_ERR_GENERIC;
}

//
// Tear down a streaming DMA mapping created by nvswitch_os_map_dma_region().
//
// os_handle is interpreted as the device's struct pci_dev. cpu_addr is only
// checked for NULL; the unmap itself uses dma_handle, size and direction.
//
// Returns NVL_SUCCESS, or -NVL_BAD_ARGS if os_handle or cpu_addr is NULL.
//
NvlStatus
nvswitch_os_unmap_dma_region
(
    void *os_handle,
    void *cpu_addr,
    NvU64 dma_handle,
    NvU32 size,
    NvU32 direction
)
{
    int dma_dir;
    struct pci_dev *pdev = (struct pci_dev *)os_handle;

    if (!pdev || !cpu_addr)
        return -NVL_BAD_ARGS;

    dma_dir = _nvswitch_to_pci_dma_direction(direction);

    dma_unmap_single(&pdev->dev, dma_handle, size, dma_dir);

    return NVL_SUCCESS;
}

//
// Set the device's DMA mask to dma_addr_width bits.
//
// os_handle is interpreted as the device's struct pci_dev.
//
// Returns NVL_SUCCESS, -NVL_BAD_ARGS if os_handle is NULL, or
// -NVL_ERR_GENERIC if dma_set_mask() rejects the mask.
//
NvlStatus
nvswitch_os_set_dma_mask
(
    void *os_handle,
    NvU32 dma_addr_width
)
{
    struct pci_dev *pdev = (struct pci_dev *)os_handle;

    if (!pdev)
        return -NVL_BAD_ARGS;

    if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(dma_addr_width)))
        return -NVL_ERR_GENERIC;

    return NVL_SUCCESS;
}

//
// Synchronize a streaming DMA mapping for CPU access via
// dma_sync_single_for_cpu().
//
// os_handle is interpreted as the device's struct pci_dev.
//
// Returns NVL_SUCCESS, or -NVL_BAD_ARGS if os_handle is NULL.
//
NvlStatus
nvswitch_os_sync_dma_region_for_cpu
(
    void *os_handle,
    NvU64 dma_handle,
    NvU32 size,
    NvU32 direction
)
{
    int dma_dir;
    struct pci_dev *pdev = (struct pci_dev *)os_handle;

    if (!pdev)
        return -NVL_BAD_ARGS;

    dma_dir = _nvswitch_to_pci_dma_direction(direction);

    dma_sync_single_for_cpu(&pdev->dev, dma_handle, size, dma_dir);

    return NVL_SUCCESS;
}

//
// Synchronize a streaming DMA mapping for device access via
// dma_sync_single_for_device().
//
// os_handle is interpreted as the device's struct pci_dev.
//
// Returns NVL_SUCCESS, or -NVL_BAD_ARGS if os_handle is NULL.
//
NvlStatus
nvswitch_os_sync_dma_region_for_device
(
    void *os_handle,
    NvU64 dma_handle,
    NvU32 size,
    NvU32 direction
)
{
    int dma_dir;
    struct pci_dev *pdev = (struct pci_dev *)os_handle;

    if (!pdev)
        return -NVL_BAD_ARGS;

    dma_dir = _nvswitch_to_pci_dma_direction(direction);

    dma_sync_single_for_device(&pdev->dev, dma_handle, size, dma_dir);

    return NVL_SUCCESS;
}

//
// Allocate `size` bytes, choosing the allocator by context and size.
//
//  - Context that may not sleep: kmalloc(NV_GFP_ATOMIC) for sizes up to
//    NVSWITCH_KMALLOC_LIMIT; larger requests fail with NULL.
//  - Context that may sleep: kmalloc(NV_GFP_NO_OOM) for sizes up to the
//    limit, falling back to vmalloc() when that fails or the request is
//    larger than the limit.
//
// Returns the allocation or NULL. Free with _nvswitch_os_free().
//
static inline void *
_nvswitch_os_malloc
(
    NvLength size
)
{
    const int fits_kmalloc = (size <= NVSWITCH_KMALLOC_LIMIT);
    void *ptr = NULL;

    if (!NV_MAY_SLEEP())
    {
        return fits_kmalloc ? kmalloc(size, NV_GFP_ATOMIC) : NULL;
    }

    if (fits_kmalloc)
    {
        ptr = kmalloc(size, NV_GFP_NO_OOM);
        if (ptr != NULL)
        {
            return ptr;
        }
    }

    return vmalloc(size);
}

//
// Allocate `size` bytes via _nvswitch_os_malloc(). When NV_MEM_LOGGER is
// defined, a successful allocation is also recorded with nv_memdbg_add()
// together with the caller's file and line; otherwise file and line are
// unused.
//
// Returns the allocation or NULL.
//
void *
nvswitch_os_malloc_trace
(
    NvLength size,
    const char *file,
    NvU32 line
)
{
    void *ptr = _nvswitch_os_malloc(size);

#if defined(NV_MEM_LOGGER)
    if (ptr != NULL)
    {
        nv_memdbg_add(ptr, size, file, line);
    }
#endif

    return ptr;
}

//
// Release memory from _nvswitch_os_malloc(). NULL is ignored; vmalloc
// addresses go to vfree(), everything else to kfree().
//
static inline void
_nvswitch_os_free
(
    void *ptr
)
{
    if (ptr == NULL)
    {
        return;
    }

    if (!is_vmalloc_addr(ptr))
    {
        kfree(ptr);
        return;
    }

    vfree(ptr);
}

//
// Free memory obtained through nvswitch_os_malloc_trace(). NULL is a no-op.
// When NV_MEM_LOGGER is defined the allocation is first removed from the
// memory debug log.
//
void
nvswitch_os_free
(
    void *ptr
)
{
#if defined (NV_MEM_LOGGER)
    if (ptr == NULL)
    {
        return;
    }

    nv_memdbg_remove(ptr, 0, NULL, 0);
#endif

    _nvswitch_os_free(ptr);
}

//
// Thin wrapper over strlen(). str must be a valid NUL-terminated string.
//
NvLength
nvswitch_os_strlen
(
    const char *str
)
{
    return strlen(str);
}

//
// Thin wrapper over strncpy(). As with strncpy(), dest is not
// NUL-terminated when src holds `length` or more characters.
//
char*
nvswitch_os_strncpy
(
    char *dest,
    const char *src,
    NvLength length
)
{
    return strncpy(dest, src, length);
}

//
// Thin wrapper over strncmp(): compares at most `length` characters.
//
int
nvswitch_os_strncmp
(
    const char *s1,
    const char *s2,
    NvLength length
)
{
    return strncmp(s1, s2, length);
}

//
// Thin wrapper over memset(); returns dest.
//
void *
nvswitch_os_memset
(
    void *dest,
    int value,
    NvLength size
)
{
     return memset(dest, value, size);
}

//
// Thin wrapper over memcpy(); returns dest. As with memcpy(), the regions
// must not overlap.
//
void *
nvswitch_os_memcpy
(
    void *dest,
    const void *src,
    NvLength size
)
{
    return memcpy(dest, src, size);
}

//
// Thin wrapper over memcmp(): compares the first `size` bytes.
//
int
nvswitch_os_memcmp
(
    const void *s1,
    const void *s2,
    NvLength size
)
{
    return memcmp(s1, s2, size);
}

//
// Perform a single volatile 32-bit load from `address`.
//
NvU32
nvswitch_os_mem_read32
(
    const volatile void * address
)
{
    const volatile NvU32 *src = address;

    return *src;
}

//
// Perform a single volatile 32-bit store of `data` to `address`.
//
void
nvswitch_os_mem_write32
(
    volatile void *address,
    NvU32 data
)
{
    volatile NvU32 *dst = address;

    *dst = data;
}

//
// Perform a volatile 64-bit load from `address`.
//
NvU64
nvswitch_os_mem_read64
(
    const volatile void * address
)
{
    const volatile NvU64 *src = address;

    return *src;
}

//
// Perform a volatile 64-bit store of `data` to `address`.
//
void
nvswitch_os_mem_write64
(
    volatile void *address,
    NvU64 data
)
{
    volatile NvU64 *dst = address;

    *dst = data;
}

//
// snprintf() equivalent for the NVSwitch library: formats into dest (at most
// `size` bytes) and returns whatever vsnprintf() returns.
//
int
nvswitch_os_snprintf
(
    char *dest,
    NvLength size,
    const char *fmt,
    ...
)
{
    va_list args;
    int result;

    va_start(args, fmt);
    result = vsnprintf(dest, size, fmt, args);
    va_end(args);

    return result;
}

//
// Thin wrapper over vsnprintf() taking an already-started va_list.
//
int
nvswitch_os_vsnprintf
(
    char *buf,
    NvLength size,
    const char *fmt,
    va_list arglist
)
{
    return vsnprintf(buf, size, fmt, arglist);
}

//
// Assertion reporting helper.
//
// When cond is 0, the printf-style message is formatted into a local buffer
// of NVSWITCH_LOG_BUFFER_SIZE bytes and logged at error level, followed by a
// WARN_ON; both are rate-limited via printk_ratelimit(). dbg_breakpoint() is
// invoked on every failed assertion regardless of the rate limit. Nothing
// happens when cond is non-zero.
//
void
nvswitch_os_assert_log
(
    int cond,
    const char *fmt,
    ...
)
{
    if (cond == 0x0)
    {
        if (printk_ratelimit())
        {
            va_list arglist;
            char fmt_printk[NVSWITCH_LOG_BUFFER_SIZE];

            va_start(arglist, fmt);
            vsnprintf(fmt_printk, sizeof(fmt_printk), fmt, arglist);
            va_end(arglist);

            //
            // The buffer now holds final text, not a format. Print it
            // through "%s" so any '%' it contains is not re-interpreted as
            // a conversion specifier.
            //
            nvswitch_os_print(NVSWITCH_DBG_LEVEL_ERROR, "%s", fmt_printk);
            WARN_ON(1);
        }
        dbg_breakpoint();
    }
}

/*
 * Sleep for specified milliseconds. Yields the CPU to scheduler.
 *
 * The sleep is delegated to nv_sleep_ms(). If it reports anything other
 * than NV_OK, a rate-limited error is logged and a WARN_ON is raised; the
 * failure is not reported to the caller.
 */
void
nvswitch_os_sleep
(
    unsigned int ms
)
{
    NV_STATUS status;
    status = nv_sleep_ms(ms);

    if (status != NV_OK)
    {
        if (printk_ratelimit())
        {
            /* ms is unsigned, so it is printed with %u rather than %d. */
            nvswitch_os_print(NVSWITCH_DBG_LEVEL_ERROR, "NVSwitch: requested"
                              " sleep duration %u msec exceeded %d msec\n",
                              ms, NV_MAX_ISR_DELAY_MS);
            WARN_ON(1);
        }
    }
}

//
// Acquire the fabric management capability for an open file.
//
// capDescriptor is narrowed to int and passed to nvlink_cap_acquire() with
// NVLINK_CAP_FABRIC_MANAGEMENT. On success the returned descriptor is stored
// in the file's private data (capability_fds.fabric_mgmt).
//
// Returns NVL_SUCCESS, -NVL_BAD_ARGS if osPrivate is NULL, or
// -NVL_ERR_OPERATING_SYSTEM if the capability could not be acquired.
//
NvlStatus
nvswitch_os_acquire_fabric_mgmt_cap
(
    void *osPrivate,
    NvU64 capDescriptor
)
{
    nvswitch_file_private_t *private_data = osPrivate;
    int dup_fd;

    if (!private_data)
    {
        return -NVL_BAD_ARGS;
    }

    dup_fd = nvlink_cap_acquire((int)capDescriptor,
                                NVLINK_CAP_FABRIC_MANAGEMENT);
    if (dup_fd >= 0)
    {
        private_data->capability_fds.fabric_mgmt = dup_fd;
        return NVL_SUCCESS;
    }

    return -NVL_ERR_OPERATING_SYSTEM;
}

//
// Report whether the open file holds the fabric management capability.
//
// Returns 1 when osPrivate is non-NULL and its fabric management capability
// fd is non-negative, 0 otherwise.
//
int
nvswitch_os_is_fabric_manager
(
    void *osPrivate
)
{
    const nvswitch_file_private_t *private_data = osPrivate;

    if (private_data == NULL)
    {
        return 0;
    }

    /* A negative fd means the capability was never acquired. */
    return (private_data->capability_fds.fabric_mgmt >= 0) ? 1 : 0;
}

//
// Return the result of NV_IS_SUSER() for the calling context.
// NOTE(review): presumably non-zero when the caller has superuser
// privileges -- confirm against the NV_IS_SUSER definition.
//
int
nvswitch_os_is_admin
(
    void
)
{
    return NV_IS_SUSER();
}

// The three bytes of LINUX_VERSION_CODE, from most to least significant.
#define NV_KERNEL_RELEASE    ((LINUX_VERSION_CODE >> 16) & 0x0ff)
#define NV_KERNEL_VERSION    ((LINUX_VERSION_CODE >> 8)  & 0x0ff)
#define NV_KERNEL_SUBVERSION ((LINUX_VERSION_CODE)       & 0x0ff)

//
// Report the kernel version this module was built against, as encoded in
// LINUX_VERSION_CODE. Each output pointer is optional; NULL pointers are
// skipped. Always returns NVL_SUCCESS.
//
NvlStatus
nvswitch_os_get_os_version
(
    NvU32 *pMajorVer,
    NvU32 *pMinorVer,
    NvU32 *pBuildNum
)
{
    if (pMajorVer != NULL)
    {
        *pMajorVer = NV_KERNEL_RELEASE;
    }

    if (pMinorVer != NULL)
    {
        *pMinorVer = NV_KERNEL_VERSION;
    }

    if (pBuildNum != NULL)
    {
        *pBuildNum = NV_KERNEL_SUBVERSION;
    }

    return NVL_SUCCESS;
}

/*!
 * @brief: OS specific handling to add an event.
 *
 * No OS-side bookkeeping is done on Linux: all arguments are ignored and
 * NVL_SUCCESS is always returned.
 */
NvlStatus
nvswitch_os_add_client_event
(
    void            *osHandle,
    void            *osPrivate,
    NvU32           eventId
)
{
    return NVL_SUCCESS;
}

/*!
 * @brief: OS specific handling to remove all events corresponding to osPrivate.
 *
 * No OS-side bookkeeping is done on Linux: all arguments are ignored and
 * NVL_SUCCESS is always returned.
 */
NvlStatus
nvswitch_os_remove_client_event
(
    void            *osHandle,
    void            *osPrivate
)
{
    return NVL_SUCCESS;
}

/*!
 * @brief: OS specific handling to notify an event.
 *
 * Marks the file's event as pending and wakes any interruptible sleepers on
 * its event wait queue. osHandle and eventId are not used.
 *
 * @returns NVL_SUCCESS, or -NVL_BAD_ARGS when osPrivate is NULL.
 */
NvlStatus
nvswitch_os_notify_client_event
(
    void *osHandle,
    void *osPrivate,
    NvU32 eventId
)
{
    nvswitch_file_private_t *private_data = osPrivate;

    if (!private_data)
    {
        return -NVL_BAD_ARGS;
    }

    /* Set the flag before the wake-up so woken waiters observe it. */
    private_data->file_event.event_pending = NV_TRUE;
    wake_up_interruptible(&private_data->file_event.wait_q_event);

    return NVL_SUCCESS;
}

/*!
 * @brief: Gets OS specific support for the REGISTER_EVENTS ioctl
 *
 * Linux reports both options as unsupported: *many_events and
 * *os_descriptor are set to NV_FALSE. Both pointers must be non-NULL.
 * Always returns NVL_SUCCESS.
 */
NvlStatus
nvswitch_os_get_supported_register_events_params
(
    NvBool *many_events,
    NvBool *os_descriptor
)
{
    *many_events   = NV_FALSE;
    *os_descriptor = NV_FALSE;
    return NVL_SUCCESS;
}