open-gpu-kernel-modules/kernel-open/nvidia-uvm/uvm_ats_faults.c

/*******************************************************************************
    Copyright (c) 2018 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_tools.h"
#include "uvm_va_range.h"
#include "uvm_ats_faults.h"
#include "uvm_migrate_pageable.h"
// TODO: Bug 2103669: Implement a real prefetching policy and remove or adapt
// these experimental parameters. These are intended to help guide that policy.
static unsigned int uvm_exp_perf_prefetch_ats_order_replayable = 0;
module_param(uvm_exp_perf_prefetch_ats_order_replayable, uint, 0644);
MODULE_PARM_DESC(uvm_exp_perf_prefetch_ats_order_replayable,
"Max order of pages (2^N) to prefetch on replayable ATS faults");
static unsigned int uvm_exp_perf_prefetch_ats_order_non_replayable = 0;
module_param(uvm_exp_perf_prefetch_ats_order_non_replayable, uint, 0644);
MODULE_PARM_DESC(uvm_exp_perf_prefetch_ats_order_non_replayable,
"Max order of pages (2^N) to prefetch on non-replayable ATS faults");
// Expand the fault region to the naturally-aligned region with order given by
// the module parameters, clamped to the vma containing fault_addr (if any).
// Note that this means the region contains fault_addr but may not begin at
// fault_addr.
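// For example, with order 9 and 4KiB base pages, a fault at 0x10003000 yields
// the naturally-aligned 2MB candidate region [0x10000000, 0x10200000), which
// is then clamped to [vma->vm_start, vma->vm_end).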
static void expand_fault_region(struct mm_struct *mm,
                                NvU64 fault_addr,
                                uvm_fault_client_type_t client_type,
                                unsigned long *start,
                                unsigned long *size)
{
    struct vm_area_struct *vma;
    unsigned int order;
    unsigned long outer, aligned_start, aligned_size;

    *start = fault_addr;
    *size = PAGE_SIZE;

    if (client_type == UVM_FAULT_CLIENT_TYPE_HUB)
        order = uvm_exp_perf_prefetch_ats_order_non_replayable;
    else
        order = uvm_exp_perf_prefetch_ats_order_replayable;

    if (order == 0)
        return;

    vma = find_vma_intersection(mm, fault_addr, fault_addr + 1);
    if (!vma)
        return;

    UVM_ASSERT(order < BITS_PER_LONG - PAGE_SHIFT);

    aligned_size = (1UL << order) * PAGE_SIZE;
    aligned_start = fault_addr & ~(aligned_size - 1);

    *start = max(vma->vm_start, aligned_start);
    outer = min(vma->vm_end, aligned_start + aligned_size);
    *size = outer - *start;
}

static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
                                       NvU64 fault_addr,
                                       uvm_fault_access_type_t access_type,
                                       uvm_fault_client_type_t client_type)
{
    uvm_va_space_t *va_space = gpu_va_space->va_space;
    struct mm_struct *mm = va_space->va_space_mm.mm;
    bool write = (access_type >= UVM_FAULT_ACCESS_TYPE_WRITE);
    NV_STATUS status;
    NvU64 start;
    NvU64 length;

    // Request uvm_migrate_pageable() to touch the corresponding page after
    // population.
    //
    // Under virtualization ATS provides two translations:
    // 1) guest virtual -> guest physical
    // 2) guest physical -> host physical
    //
    // The overall ATS translation will fault if either of those translations
    // is invalid. The population done below by uvm_migrate_pageable() handles
    // translation #1, but not #2. We don't know if we're running as a guest,
    // but in case we are we can force that translation to be valid by touching
    // the guest physical address from the CPU. If the translation is not valid
    // then the access will cause a hypervisor fault. Note that dma_map_page()
    // can't establish mappings used by GPU ATS SVA translations. GPU accesses
    // to host physical addresses obtained as a result of the address
    // translation request use the CPU address space instead of the IOMMU
    // address space since the translated host physical address isn't
    // necessarily an IOMMU address. The only way to establish the guest
    // physical to host physical mapping in the CPU address space is to touch
    // the page from the CPU.
    //
    // We assume that the hypervisor mappings are all VM_PFNMAP, VM_SHARED, and
    // VM_WRITE, meaning that the mappings are all granted write access on any
    // fault and that the kernel will never revoke them.
    // drivers/vfio/pci/vfio_pci_nvlink2.c enforces this. Thus we can assume
    // that a read fault is always sufficient to also enable write access on
    // the guest translation.
    uvm_migrate_args_t uvm_migrate_args =
    {
        .va_space = va_space,
        .mm = mm,
        .dst_id = gpu_va_space->gpu->parent->id,
        .dst_node_id = -1,
        .populate_permissions = write ? UVM_POPULATE_PERMISSIONS_WRITE : UVM_POPULATE_PERMISSIONS_ANY,
        .touch = true,
        .skip_mapped = true,
        .user_space_start = &start,
        .user_space_length = &length,
    };

    UVM_ASSERT(uvm_ats_can_service_faults(gpu_va_space, mm));

    expand_fault_region(mm, fault_addr, client_type, &uvm_migrate_args.start, &uvm_migrate_args.length);

    // TODO: Bug 2103669: Service more than a single fault at a time
    //
    // We are trying to use the kernel's migrate_vma API (if available) to
    // populate and map the faulting region on the GPU. We want to do this only
    // on first touch, that is, for pages which are not already mapped, so we
    // set skip_mapped to true. For pages already mapped, this will only handle
    // PTE upgrades if needed.
    status = uvm_migrate_pageable(&uvm_migrate_args);
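    // NV_WARN_NOTHING_TO_DO just means there was nothing left to populate or
    // migrate for this region (for example, every page was already resident
    // and mapped), which is not a failure from the fault servicing point of
    // view.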
    if (status == NV_WARN_NOTHING_TO_DO)
        status = NV_OK;

    UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);

    return status;
}

NV_STATUS uvm_ats_service_fault_entry(uvm_gpu_va_space_t *gpu_va_space,
                                      uvm_fault_buffer_entry_t *current_entry,
                                      uvm_ats_fault_invalidate_t *ats_invalidate)
{
    NvU64 gmmu_region_base;
    bool in_gmmu_region;
    NV_STATUS status = NV_OK;
    uvm_fault_access_type_t service_access_type;

    UVM_ASSERT(g_uvm_global.ats.enabled);
    UVM_ASSERT(gpu_va_space->ats.enabled);
    UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
    UVM_ASSERT(current_entry->fault_access_type ==
               uvm_fault_access_type_mask_highest(current_entry->access_type_mask));

    service_access_type = current_entry->fault_access_type;

    // ATS lookups are disabled on all addresses within the same
    // UVM_GMMU_ATS_GRANULARITY as existing GMMU mappings (see documentation in
    // uvm_mmu.h). User mode is supposed to reserve VAs as appropriate to
    // prevent any system memory allocations from falling within the NO_ATS
    // range of other GMMU mappings, so this shouldn't happen during normal
    // operation. However, since this scenario may lead to infinite fault loops,
    // we handle it by canceling the fault.
    //
    // TODO: Bug 2103669: Remove redundant VA range lookups
    gmmu_region_base = UVM_ALIGN_DOWN(current_entry->fault_address, UVM_GMMU_ATS_GRANULARITY);
    in_gmmu_region = !uvm_va_space_range_empty(current_entry->va_space,
                                               gmmu_region_base,
                                               gmmu_region_base + UVM_GMMU_ATS_GRANULARITY - 1);
    if (in_gmmu_region) {
        status = NV_ERR_INVALID_ADDRESS;
    }
    else {
        // TODO: Bug 2103669: Service more than a single fault at a time
        status = uvm_ats_service_fault(gpu_va_space,
                                       current_entry->fault_address,
                                       service_access_type,
                                       current_entry->fault_source.client_type);
    }
    // Do not flag prefetch faults as fatal unless something fatal happened
    if (status == NV_ERR_INVALID_ADDRESS) {
        if (current_entry->fault_access_type != UVM_FAULT_ACCESS_TYPE_PREFETCH) {
            current_entry->is_fatal = true;
            current_entry->fatal_reason = uvm_tools_status_to_fatal_fault_reason(status);

            // Compute cancel mode for replayable faults
            if (current_entry->is_replayable) {
                if (service_access_type == UVM_FAULT_ACCESS_TYPE_READ || in_gmmu_region)
                    current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
                else
                    current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_WRITE_AND_ATOMIC;

                // If there are pending read accesses on the same page, we have
                // to service them before we can cancel the write/atomic faults,
                // so we retry with the read fault access type.
                if (!in_gmmu_region &&
                    current_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ &&
                    uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
                    status = uvm_ats_service_fault(gpu_va_space,
                                                   current_entry->fault_address,
                                                   UVM_FAULT_ACCESS_TYPE_READ,
                                                   current_entry->fault_source.client_type);

                    // If read accesses are also invalid, cancel the fault. If a
                    // different error code is returned, exit.
                    if (status == NV_ERR_INVALID_ADDRESS)
                        current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
                    else if (status != NV_OK)
                        return status;
                }
            }
        }
        else {
            current_entry->is_invalid_prefetch = true;
        }

        // Do not fail overall fault servicing due to logical errors
        status = NV_OK;
    }

    // The Linux kernel never invalidates TLB entries on mapping permission
    // upgrade. This is a problem if the GPU has cached entries with the old
    // permission. The GPU will re-fetch the entry if the PTE is invalid and
    // the page size is not 4K (as is the case on P9). However, if a page gets
    // upgraded from R/O to R/W and the GPU has the PTEs cached with R/O
    // permissions we will enter an infinite loop because we just forward the
    // fault to the Linux kernel and it will see that the permissions in the
    // page table are correct. Therefore, we flush TLB entries on ATS write
    // faults.
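    //
    // The write-fault TLB batch begun here is ended and pushed later by
    // uvm_ats_invalidate_tlbs(), which the fault servicing path is expected to
    // call once it has processed the whole batch of faults.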
    if (!current_entry->is_fatal && current_entry->fault_access_type > UVM_FAULT_ACCESS_TYPE_READ) {
        if (!ats_invalidate->write_faults_in_batch) {
            uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
            ats_invalidate->write_faults_in_batch = true;
        }

        uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch,
                                 current_entry->fault_address,
                                 PAGE_SIZE,
                                 PAGE_SIZE,
                                 UVM_MEMBAR_NONE);
    }

    return status;
}

NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
                                  uvm_ats_fault_invalidate_t *ats_invalidate,
                                  uvm_tracker_t *out_tracker)
{
    NV_STATUS status;
    uvm_push_t push;

    if (!ats_invalidate->write_faults_in_batch)
        return NV_OK;

    UVM_ASSERT(gpu_va_space);
    UVM_ASSERT(gpu_va_space->ats.enabled);

    status = uvm_push_begin(gpu_va_space->gpu->channel_manager,
                            UVM_CHANNEL_TYPE_MEMOPS,
                            &push,
                            "Invalidate ATS entries");

    if (status == NV_OK) {
        uvm_tlb_batch_end(&ats_invalidate->write_faults_tlb_batch, &push, UVM_MEMBAR_NONE);
        uvm_push_end(&push);

        // Add this push to the GPU's tracker so that fault replays/clears can
        // wait on it
        status = uvm_tracker_add_push_safe(out_tracker, &push);
    }

    ats_invalidate->write_faults_in_batch = false;

    return status;
}