/******************************************************************************* Copyright (c) 2015-2022 NVIDIA Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *******************************************************************************/ #include "uvm_linux.h" #include "uvm_common.h" #include "uvm_api.h" #include "uvm_gpu.h" #include "uvm_va_space.h" #include "uvm_va_range.h" #include "uvm_va_block.h" #include "uvm_hal_types.h" #include "uvm_kvmalloc.h" #include "uvm_tools.h" #include "uvm_push.h" #include "uvm_hal.h" #include "uvm_perf_thrashing.h" #include "uvm_perf_prefetch.h" #include "uvm_mem.h" #include "uvm_gpu_access_counters.h" #include "uvm_va_space_mm.h" #include "uvm_test_ioctl.h" typedef enum { BLOCK_PTE_OP_MAP, BLOCK_PTE_OP_REVOKE, BLOCK_PTE_OP_COUNT } block_pte_op_t; static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000; static struct kmem_cache *g_uvm_va_block_cache __read_mostly; static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly; static struct kmem_cache *g_uvm_page_mask_cache __read_mostly; static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly; static int uvm_fault_force_sysmem __read_mostly = 0; module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0."); static int uvm_perf_map_remote_on_eviction __read_mostly = 1; module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO); // Caching is always disabled for mappings to remote memory. The following two // module parameters can be used to force caching for GPU peer/sysmem mappings. // // However, it is important to note that it may not be safe to enable caching // in the general case so the enablement should only be used for experiments. static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0; module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO); MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem, "Force caching for mappings to peer memory. " "This is an experimental parameter that may cause correctness issues if used."); static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0; module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO); MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem, "Force caching for mappings to system memory. 
" "This is an experimental parameter that may cause correctness issues if used."); static void block_deferred_eviction_mappings_entry(void *args); uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block) { #if UVM_IS_CONFIG_HMM() if (va_block->hmm.va_space) return va_block->hmm.va_space; #endif if (va_block->va_range) return va_block->va_range->va_space; return NULL; } uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block) { uvm_va_space_t *va_space; UVM_ASSERT(!uvm_va_block_is_dead(va_block)); va_space = uvm_va_block_get_va_space_maybe_dead(va_block); UVM_ASSERT(va_space); return va_space; } static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); // Local vidmem is always cached if (uvm_id_equal(resident_id, gpu->id)) return UVM_MMU_PTE_FLAGS_CACHED; if (UVM_ID_IS_CPU(resident_id)) return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id)); return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED; } static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); return uvm_va_space_get_gpu(va_space, gpu_id); } static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); return uvm_va_space_processor_name(va_space, id); } static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); return uvm_va_space_processor_has_memory(va_space, id); } static bool is_uvm_fault_force_sysmem_set(void) { // Only enforce this during testing return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0; } static bool va_space_map_remote_on_eviction(uvm_va_space_t *va_space) { return uvm_perf_map_remote_on_eviction && uvm_va_space_has_access_counter_migrations(va_space); } static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block) { // Note that for HMM we always return a pointer to a zero bitmap // (not allocated on the stack) since uvm_lite GPUs are not supported. 
static const uvm_processor_mask_t uvm_lite_gpus = {}; if (uvm_va_block_is_hmm(va_block)) return &uvm_lite_gpus; else return &va_block->va_range->uvm_lite_gpus; } void uvm_va_block_retry_init(uvm_va_block_retry_t *retry) { if (!retry) return; uvm_tracker_init(&retry->tracker); INIT_LIST_HEAD(&retry->used_chunks); INIT_LIST_HEAD(&retry->free_chunks); } static bool block_verify_cpu_chunks(uvm_va_block_t *block) { uvm_cpu_chunk_t *chunk; size_t alloced_pages = 0; NvU64 tracking_virt_addr = block->start; uvm_page_mask_t region_mask; uvm_page_index_t page_index; uvm_va_block_region_t block_region = uvm_va_block_region_from_block(block); for_each_cpu_chunk_in_block(chunk, page_index, block) { NvU64 chunk_virt_addr = uvm_va_block_cpu_page_address(block, page_index); size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk); uvm_page_index_t chunk_page; UVM_ASSERT(tracking_virt_addr <= chunk_virt_addr); if (tracking_virt_addr > chunk_virt_addr) return false; UVM_ASSERT(uvm_va_block_contains_address(block, chunk_virt_addr)); if (!uvm_va_block_contains_address(block, chunk_virt_addr)) return false; alloced_pages += uvm_cpu_chunk_num_pages(chunk); uvm_page_mask_init_from_region(®ion_mask, uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk)), NULL); UVM_ASSERT(uvm_page_mask_intersects(&block->cpu.allocated, ®ion_mask)); if (!uvm_page_mask_intersects(&block->cpu.allocated, ®ion_mask)) return false; tracking_virt_addr = chunk_virt_addr; for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++) { UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk); if (uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) != chunk) return false; } } UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated)); if (alloced_pages != uvm_page_mask_weight(&block->cpu.allocated)) return false; for_each_va_block_page_in_region_mask(page_index, &block->cpu.allocated, block_region) { uvm_cpu_chunk_t *next; uvm_page_index_t next_page_index; chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); UVM_ASSERT(chunk); if (!chunk) return false; next_page_index = uvm_va_block_next_page_in_mask(block_region, &block->cpu.allocated, page_index + uvm_cpu_chunk_num_pages(chunk) - 1); next = uvm_cpu_chunk_next(block, &next_page_index); if (next_page_index < block_region.outer) { UVM_ASSERT(next && uvm_va_block_cpu_page_address(block, page_index) + uvm_cpu_chunk_get_size(chunk) <= uvm_va_block_cpu_page_address(block, next_page_index)); if (!next || (uvm_va_block_cpu_page_address(block, page_index) + uvm_cpu_chunk_get_size(chunk) > uvm_va_block_cpu_page_address(block, next_page_index))) return false; } else { UVM_ASSERT(next == NULL); if (next != NULL) return false; } } return true; } // Frees any left-over free chunks and unpins all the used chunks void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block) { uvm_gpu_t *gpu; uvm_gpu_chunk_t *gpu_chunk; uvm_gpu_chunk_t *next_chunk; if (!retry) return; uvm_tracker_deinit(&retry->tracker); // Free any unused chunks list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) { list_del_init(&gpu_chunk->list); gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL); } // Unpin all the used chunks now that we are done list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) { list_del_init(&gpu_chunk->list); gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); uvm_pmm_gpu_unpin_temp(&gpu->pmm, gpu_chunk, va_block); } } static void 
block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk) { list_add_tail(&gpu_chunk->list, &retry->free_chunks); } static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk) { list_add_tail(&gpu_chunk->list, &retry->used_chunks); } static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size) { uvm_gpu_chunk_t *gpu_chunk; list_for_each_entry(gpu_chunk, &retry->free_chunks, list) { if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) { list_del_init(&gpu_chunk->list); return gpu_chunk; } } return NULL; } // Encapsulates a reference to a physical page belonging to a specific processor // within a VA block. typedef struct { // Processor the page is on uvm_processor_id_t processor; // The page index uvm_page_index_t page_index; } block_phys_page_t; static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index) { return (block_phys_page_t){ processor, page_index }; } NV_STATUS uvm_va_block_init(void) { if (uvm_enable_builtin_tests) g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t); else g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t); if (!g_uvm_va_block_cache) return NV_ERR_NO_MEMORY; g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t); if (!g_uvm_va_block_gpu_state_cache) return NV_ERR_NO_MEMORY; g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t); if (!g_uvm_page_mask_cache) return NV_ERR_NO_MEMORY; g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t); if (!g_uvm_va_block_context_cache) return NV_ERR_NO_MEMORY; return NV_OK; } void uvm_va_block_exit(void) { kmem_cache_destroy_safe(&g_uvm_va_block_context_cache); kmem_cache_destroy_safe(&g_uvm_page_mask_cache); kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache); kmem_cache_destroy_safe(&g_uvm_va_block_cache); } uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm) { uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS); if (block_context) uvm_va_block_context_init(block_context, mm); return block_context; } void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context) { if (va_block_context) kmem_cache_free(g_uvm_va_block_context_cache, va_block_context); } // Convert from page_index to chunk_index. The goal is for each system page in // the region [start, start + size) to be covered by the largest naturally- // aligned user chunk size. 
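//
// As an illustration (a hypothetical layout, assuming PAGE_SIZE == 4K and a
// GPU whose mmu_user_chunk_sizes contains {4K, 64K, 2M}), a region with
// start = 0x0F000 and size = 0x15000 (84K) is covered greedily by the largest
// naturally-aligned chunks that fit:
//
//     [0x0F000, 0x10000)  one 4K chunk   -> chunk index 0    (page 0)
//     [0x10000, 0x20000)  one 64K chunk  -> chunk index 1    (pages 1-16)
//     [0x20000, 0x24000)  four 4K chunks -> chunk indices 2-5 (pages 17-20)
//
// So, for example, page_index 10 maps to chunk index 1 and *out_chunk_size
// would be UVM_CHUNK_SIZE_64K. This is only a worked example, not an
// additional constraint on the GPU's supported chunk sizes.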
size_t uvm_va_block_gpu_chunk_index_range(NvU64 start, NvU64 size, uvm_gpu_t *gpu, uvm_page_index_t page_index, uvm_chunk_size_t *out_chunk_size) { uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes; uvm_chunk_size_t chunk_size, final_chunk_size; size_t num_chunks, num_chunks_total; NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size; UVM_ASSERT(PAGE_ALIGNED(start)); UVM_ASSERT(PAGE_ALIGNED(size)); UVM_ASSERT(size > 0); UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M); UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M)); BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M); // PAGE_SIZE needs to be the lowest natively-supported chunk size in the // mask, since we never deal with chunk sizes smaller than that (although we // may have PTEs mapping pages smaller than that). UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE); // Optimize the ideal Pascal+ case: the whole block is covered by a single // 2M page. if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) { UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M)); final_chunk_size = UVM_CHUNK_SIZE_2M; num_chunks_total = 0; goto out; } // Only one 2M chunk can fit within a VA block on any GPU architecture, so // remove that size from consideration. chunk_sizes &= ~UVM_CHUNK_SIZE_2M; // Next common case: the whole block is aligned and sized to perfectly fit // the largest page size. // // TODO: Bug 1750144: This might not be the common case for HMM. Verify that // this helps performance more than it hurts. final_chunk_size = uvm_chunk_find_last_size(chunk_sizes); if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) { num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size); goto out; } // We didn't hit our special paths. Do it the hard way. num_chunks_total = 0; addr = start + page_index * PAGE_SIZE; end = start + size; final_chunk_size = 0; UVM_ASSERT(addr < end); // The below loop collapses almost completely when chunk_size == PAGE_SIZE // since in that lowest-common-denominator case everything is already // aligned. Skip it and handle that specially after the loop. // // Note that since we removed 2M already above, this loop will only iterate // once on x86 Pascal+ since only 64K is left. chunk_sizes &= ~PAGE_SIZE; // This loop calculates the number of chunks between start and addr by // calculating the number of whole chunks of each size between them, // starting with the largest allowed chunk size. This requires fewer // iterations than if we began from start and kept calculating the next // larger chunk size boundary. for_each_chunk_size_rev(chunk_size, chunk_sizes) { aligned_start = UVM_ALIGN_UP(start, chunk_size); aligned_addr = UVM_ALIGN_DOWN(addr, chunk_size); aligned_end = UVM_ALIGN_DOWN(end, chunk_size); // If addr and start are within the same chunk, try smaller if (aligned_start > aligned_addr) continue; // If addr and end are not in the same chunk, then addr is covered by a // single chunk of the current size. Ignore smaller boundaries between // addr and aligned_addr. if (aligned_addr < aligned_end && final_chunk_size == 0) { addr = aligned_addr; final_chunk_size = chunk_size; } // How many chunks of this size are between start and addr? Note that // this might be 0 since aligned_addr and aligned_start could be in the // same chunk. 
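        // For example (hypothetical values, assuming a 64K iteration with 4K
        // pages): start = 0x0F000 and addr = 0x23000 give
        // aligned_start = 0x10000 and aligned_addr = 0x20000, so exactly one
        // whole 64K chunk lies between them. With addr = 0x13000 instead,
        // aligned_addr would also be 0x10000 and the count for this size
        // would be 0.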
num_chunks = uvm_div_pow2_32(((NvU32)aligned_addr - aligned_start), chunk_size); num_chunks_total += num_chunks; // We've already accounted for these chunks, so "remove" them by // bringing start, addr, and end closer together to calculate the // remaining chunk sizes. temp_size = num_chunks * chunk_size; addr -= temp_size; end -= temp_size; // Once there's no separation between addr and start, and we've // successfully found the right chunk size when taking end into account, // we're done. if (addr == start && final_chunk_size) break; } // Handle PAGE_SIZE cleanup since we skipped it in the loop num_chunks_total += (addr - start) / PAGE_SIZE; if (final_chunk_size == 0) final_chunk_size = PAGE_SIZE; out: if (out_chunk_size) *out_chunk_size = final_chunk_size; return num_chunks_total; } static size_t block_gpu_chunk_index(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index, uvm_chunk_size_t *out_chunk_size) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_chunk_size_t size; uvm_gpu_chunk_t *chunk; size_t index = uvm_va_block_gpu_chunk_index_range(block->start, uvm_va_block_size(block), gpu, page_index, &size); UVM_ASSERT(size >= PAGE_SIZE); if (gpu_state) { UVM_ASSERT(gpu_state->chunks); chunk = gpu_state->chunks[index]; if (chunk) { UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size); UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED); UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE); } } if (out_chunk_size) *out_chunk_size = size; return index; } // Compute the size of the chunk known to start at start_page_index static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index) { uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes; uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes; NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index); NvU64 size = block->end - start + 1; // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x. // Example: 1011000 -> 0001111 start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1)); // Next, compute all sizes (powers of two) which are <= size. pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size); pow2_leq_size |= pow2_leq_size - 1; // Now and them all together to get our list of GPU-supported chunk sizes // which are aligned to start and will fit within size. allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size; // start and size must always be aligned to at least the smallest supported // chunk size (PAGE_SIZE). 
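    // For illustration (hypothetical numbers, assuming 4K pages and
    // chunk_sizes = {4K, 64K, 2M}): start = 0x30000 and size = 0x24000 give
    //     start_alignments = 0x30000 ^ 0x2FFFF = 0x1FFFF   (sizes up to 64K)
    //     pow2_leq_size    = 0x20000 | 0x1FFFF = 0x3FFFF   (sizes up to 128K)
    //     allowed_sizes    = {4K, 64K}
    // so the chunk starting at 0x30000 would be 64K.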
UVM_ASSERT(allowed_sizes >= PAGE_SIZE); // Take the largest allowed size return uvm_chunk_find_last_size(allowed_sizes); } static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu) { return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1; } static size_t block_num_gpu_chunks_range(NvU64 start, NvU64 size, uvm_gpu_t *gpu) { uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1); return uvm_va_block_gpu_chunk_index_range(start, size, gpu, last_page_index, NULL) + 1; } uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address) { size_t chunk_index; uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address); uvm_assert_mutex_locked(&va_block->lock); if (!gpu_state) return NULL; chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL); return gpu_state->chunks[chunk_index]; } NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range, NvU64 start, NvU64 end, uvm_va_block_t **out_block) { uvm_va_block_t *block = NULL; NvU64 size = end - start + 1; NV_STATUS status; UVM_ASSERT(PAGE_ALIGNED(start)); UVM_ASSERT(PAGE_ALIGNED(end + 1)); UVM_ASSERT(PAGE_ALIGNED(size)); UVM_ASSERT(size > 0); UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE); if (va_range) { // Create a UVM managed va_block. UVM_ASSERT(start >= va_range->node.start); UVM_ASSERT(end <= va_range->node.end); UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); } // Blocks can't span a block alignment boundary UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end)); if (uvm_enable_builtin_tests) { uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS); if (block_wrapper) block = &block_wrapper->block; } else { block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS); } if (!block) { status = NV_ERR_NO_MEMORY; goto error; } nv_kref_init(&block->kref); uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK); block->start = start; block->end = end; block->va_range = va_range; uvm_tracker_init(&block->tracker); nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_deferred_eviction_mappings_entry, block); *out_block = block; return NV_OK; error: uvm_va_block_release(block); return status; } static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu) { uvm_cpu_chunk_t *chunk; uvm_page_index_t page_index; for_each_cpu_chunk_in_block(chunk, page_index, block) { NvU64 gpu_mapping_addr; UVM_ASSERT(chunk); gpu_mapping_addr = uvm_cpu_chunk_get_gpu_mapping_addr(block, page_index, chunk, gpu->id); if (gpu_mapping_addr != 0) { uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr); uvm_gpu_unmap_cpu_pages(gpu, gpu_mapping_addr, uvm_cpu_chunk_get_size(chunk)); uvm_cpu_chunk_set_gpu_mapping_addr(block, page_index, chunk, gpu->id, 0); } } } static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu) { NV_STATUS status; uvm_cpu_chunk_t *chunk; NvU64 block_mapping_size = uvm_va_block_size(block); uvm_page_index_t page_index; UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K)); for_each_cpu_chunk_in_block(chunk, page_index, block) { uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_mapping_addr(block, page_index, chunk, gpu->id); UVM_ASSERT_MSG(gpu_mapping_addr == 0, "GPU%u DMA address 0x%llx\n", 
uvm_id_value(gpu->id), gpu_mapping_addr); status = uvm_gpu_map_cpu_pages(gpu, uvm_cpu_chunk_get_cpu_page(block, chunk, page_index), chunk_size, &gpu_mapping_addr); if (status != NV_OK) goto error; uvm_cpu_chunk_set_gpu_mapping_addr(block, page_index, chunk, gpu->id, gpu_mapping_addr); // In some configurations such as SR-IOV heavy, the chunk cannot be // referenced using its physical address. Create a kernel mapping. status = uvm_mmu_sysmem_map(gpu, gpu_mapping_addr, chunk_size); if (status != NV_OK) goto error; status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, uvm_cpu_chunk_get_gpu_mapping_addr(block, page_index, chunk, gpu->id), uvm_va_block_cpu_page_address(block, page_index), chunk_size, block, UVM_ID_CPU); if (status != NV_OK) goto error; } return NV_OK; error: block_gpu_unmap_phys_all_cpu_pages(block, gpu); return status; } static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *local_gpu, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu) { NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu); return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr, block->start + chunk->va_block_page_index * PAGE_SIZE, uvm_gpu_chunk_get_size(chunk), block, local_gpu->id); } static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu, uvm_gpu_chunk_t *chunk, uvm_gpu_t *accessing_gpu) { NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu); uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr); } static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block, uvm_gpu_t *local_gpu, uvm_gpu_t *accessing_gpu) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); size_t num_chunks, i; NV_STATUS status; UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)], accessing_gpu->id)); // If no chunks are allocated currently, the mappings will be created later // at chunk allocation. if (!gpu_state || !gpu_state->chunks) return NV_OK; num_chunks = block_num_gpu_chunks(block, local_gpu); for (i = 0; i < num_chunks; i++) { uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; if (!chunk) continue; status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu); if (status != NV_OK) goto error; status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu); if (status != NV_OK) goto error; } return NV_OK; error: while (i-- > 0) { uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; if (chunk) { // Indirect peer mappings are removed lazily by PMM, so if an error // occurs the mappings established above will be removed when the // chunk is freed later on. We only need to remove the sysmem // reverse mappings. block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu); } } return status; } // Mappings for indirect peers are removed lazily by PMM, but we need to remove // the entries from the reverse map. 
static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block, uvm_gpu_t *local_gpu, uvm_gpu_t *accessing_gpu) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); size_t num_chunks, i; UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)], accessing_gpu->id)); // Exit if no chunks are allocated currently. if (!gpu_state || !gpu_state->chunks) return; num_chunks = block_num_gpu_chunks(block, local_gpu); for (i = 0; i < num_chunks; i++) { uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; if (chunk) block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu); } } // Retrieves the gpu_state for the given GPU, allocating it if it doesn't exist static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu) { NV_STATUS status; uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); if (gpu_state) return gpu_state; gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS); if (!gpu_state) return NULL; gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0])); if (!gpu_state->chunks) goto error; block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state; status = uvm_cpu_chunk_gpu_mapping_alloc(block, gpu->id); if (status != NV_OK) goto error; status = block_gpu_map_phys_all_cpu_pages(block, gpu); if (status != NV_OK) goto error; return gpu_state; error: if (gpu_state) { if (gpu_state->chunks) uvm_kvfree(gpu_state->chunks); uvm_cpu_chunk_gpu_mapping_free(block, gpu->id); kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state); } block->gpus[uvm_id_gpu_index(gpu->id)] = NULL; return NULL; } static void block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index) { uvm_gpu_id_t id; for_each_gpu_id(id) { NvU64 gpu_mapping_addr; uvm_gpu_t *gpu; if (!uvm_va_block_gpu_state_get(block, id)) continue; gpu_mapping_addr = uvm_cpu_chunk_get_gpu_mapping_addr(block, page_index, chunk, id); if (gpu_mapping_addr == 0) continue; gpu = block_get_gpu(block, id); uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr); uvm_gpu_unmap_cpu_pages(gpu, gpu_mapping_addr, uvm_cpu_chunk_get_size(chunk)); uvm_cpu_chunk_set_gpu_mapping_addr(block, page_index, chunk, id, 0); } } static NV_STATUS block_map_cpu_chunk_on_gpus(uvm_va_block_t *block, uvm_page_index_t page_index) { NV_STATUS status; uvm_gpu_id_t id; uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk); uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index); UVM_ASSERT(chunk); // We can't iterate over va_space->registered_gpus because we might be // on the eviction path, which does not have the VA space lock held. We have // the VA block lock held however, so the gpu_states can't change. uvm_assert_mutex_locked(&block->lock); // Only physical chunks can be mapped. 
UVM_ASSERT(uvm_cpu_chunk_is_physical(chunk)); for_each_gpu_id(id) { NvU64 gpu_mapping_addr; uvm_gpu_t *gpu; if (!uvm_va_block_gpu_state_get(block, id)) continue; gpu_mapping_addr = uvm_cpu_chunk_get_gpu_mapping_addr(block, page_index, chunk, id); UVM_ASSERT_MSG(gpu_mapping_addr == 0, "GPU%u DMA address 0x%llx\n", uvm_id_value(id), gpu_mapping_addr); gpu = block_get_gpu(block, id); status = uvm_gpu_map_cpu_pages(gpu, uvm_cpu_chunk_get_cpu_page(block, chunk, chunk_region.first), chunk_size, &gpu_mapping_addr); if (status != NV_OK) goto error; uvm_cpu_chunk_set_gpu_mapping_addr(block, chunk_region.first, chunk, id, gpu_mapping_addr); // In some configurations such as SR-IOV heavy, the chunk cannot be // referenced using its physical address. Create a kernel mapping. status = uvm_mmu_sysmem_map(gpu, gpu_mapping_addr, chunk_size); if (status != NV_OK) goto error; status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, uvm_cpu_chunk_get_gpu_mapping_addr(block, chunk_region.first, chunk, id), uvm_va_block_cpu_page_address(block, chunk_region.first), chunk_size, block, UVM_ID_CPU); if (status != NV_OK) goto error; } return NV_OK; error: block_unmap_cpu_chunk_on_gpus(block, chunk, page_index); return status; } // Create physical mappings to allow other GPUs to access this chunk. static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_gpu_t *accessing_gpu, *remove_gpu; NV_STATUS status; // Unlike block_map_cpu_chunk_on_gpus, this function isn't called on the // eviction path, so we can assume that the VA space is locked. // // TODO: Bug 2007346: In the future we may want to enable eviction to peers, // meaning we may need to allocate peer memory and map it on the // eviction path. That will require making sure that peers can't be // enabled or disabled either in the VA space or globally within this // function. uvm_assert_rwsem_locked(&va_space->lock); uvm_assert_mutex_locked(&block->lock); for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu); if (status != NV_OK) goto error; status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu); if (status != NV_OK) goto error; } return NV_OK; error: for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { if (remove_gpu == accessing_gpu) break; // Indirect peer mappings are removed lazily by PMM, so if an error // occurs the mappings established above will be removed when the // chunk is freed later on. We only need to remove the sysmem // reverse mappings. block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu); } return status; } static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_gpu_t *peer_gpu; uvm_assert_rwsem_locked(&va_space->lock); uvm_assert_mutex_locked(&block->lock); // Indirect peer mappings are removed lazily by PMM, so we only need to // remove the sysmem reverse mappings. for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu); } // Mark a CPU page as dirty. 
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); uvm_cpu_chunk_mark_dirty(chunk, page_index); } // Mark a CPU page as clean. static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); uvm_cpu_chunk_mark_clean(chunk, page_index); } // Check if a CPU page is dirty. static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); return uvm_cpu_chunk_is_dirty(chunk, page_index); } // Allocates the input page in the block, if it doesn't already exist // // Also maps the page for physical access by all GPUs used by the block, which // is required for IOMMU support. // // TODO: Bug 1995015: Optimize this function and its callers to avoid calling for // each page index. static NV_STATUS block_populate_page_cpu(uvm_va_block_t *block, uvm_page_index_t page_index, struct mm_struct *mm) { NV_STATUS status; uvm_cpu_chunk_t *chunk = NULL; uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); if (uvm_page_mask_test(&block->cpu.allocated, page_index)) return NV_OK; UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index)); // Return out of memory error if the tests have requested it. As opposed to // other error injection settings, this one is persistent. if (block_test && block_test->inject_cpu_pages_allocation_error) return NV_ERR_NO_MEMORY; status = uvm_cpu_chunk_alloc(block, page_index, mm, &chunk); if (status != NV_OK) goto error; status = block_map_cpu_chunk_on_gpus(block, page_index); error: if (status != NV_OK && chunk) { uvm_cpu_chunk_remove_from_block(block, chunk, page_index); uvm_cpu_chunk_put(chunk); } return status; } // Try allocating a chunk. If eviction was required, // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was // unlocked and relocked. The caller is responsible for adding the chunk to the // retry used_chunks list. static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block, uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size, uvm_gpu_chunk_t **out_gpu_chunk) { NV_STATUS status = NV_OK; uvm_gpu_chunk_t *gpu_chunk; // First try getting a free chunk from previously-made allocations. gpu_chunk = block_retry_get_free_chunk(retry, gpu, size); if (!gpu_chunk) { uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); if (block_test && block_test->user_pages_allocation_retry_force_count > 0) { // Force eviction by pretending the allocation failed with no memory --block_test->user_pages_allocation_retry_force_count; status = NV_ERR_NO_MEMORY; } else { // Try allocating a new one without eviction status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker); } if (status == NV_ERR_NO_MEMORY) { // If that fails with no memory, try allocating with eviction and // return back to the caller immediately so that the operation can // be restarted. 
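            // The caller-side pattern looks roughly like the sketch below (a
            // simplified illustration; do_operation() stands in for whichever
            // operation ultimately calls block_alloc_gpu_chunk and is not a
            // real function in this file):
            //
            //     uvm_va_block_retry_t retry;
            //     uvm_va_block_retry_init(&retry);
            //     do {
            //         uvm_mutex_lock(&block->lock);
            //         status = do_operation(block, &retry, ...);
            //         uvm_mutex_unlock(&block->lock);
            //     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);
            //     uvm_va_block_retry_deinit(&retry, block);
            //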
uvm_mutex_unlock(&block->lock); status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker); if (status == NV_OK) { block_retry_add_free_chunk(retry, gpu_chunk); status = NV_ERR_MORE_PROCESSING_REQUIRED; } uvm_mutex_lock(&block->lock); return status; } else if (status != NV_OK) { return status; } } *out_gpu_chunk = gpu_chunk; return NV_OK; } static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); if (!gpu_state) return false; return gpu_state->page_table_range_4k.table || gpu_state->page_table_range_big.table || gpu_state->page_table_range_2m.table; } // A helper to get a known-to-be-present GPU VA space given a VA block that's // locked. In order to use this function, the caller must know that at least one // of these conditions is true: // // 1) The VA space lock is held // 2) The VA block has active page tables for the GPU // // If the VA space lock is held (#1), then the gpu_va_space obviously can't go // away. // // On the eviction path, we don't have a lock on the VA space state. However, // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page // tables before destroying the gpu_va_space, we're guaranteed that if this GPU // has page tables (#2), the gpu_va_space can't go away while we're holding the // block lock. static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu) { uvm_gpu_va_space_t *gpu_va_space; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); UVM_ASSERT(gpu); if (!block_gpu_has_page_tables(va_block, gpu)) uvm_assert_rwsem_locked(&va_space->lock); UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id)); gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)]; UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); UVM_ASSERT(gpu_va_space->va_space == va_space); UVM_ASSERT(gpu_va_space->gpu == gpu); return gpu_va_space; } static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu) { uvm_gpu_va_space_t *gpu_va_space; if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M) return false; UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M); gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M); } NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu) { uvm_gpu_va_space_t *gpu_va_space; gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); return gpu_va_space->page_tables.big_page_size; } static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size) { NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size); NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size); // The range must fit within a VA block UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end)); if (outer_addr <= first_addr) return uvm_va_block_region(0, 0); return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE); } static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size) { uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size); return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size); } uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size) { return 
range_big_page_region_all(va_block->start, va_block->end, big_page_size); } size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size) { return range_num_big_pages(va_block->start, va_block->end, big_page_size); } NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size) { NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size); UVM_ASSERT(addr >= va_block->start); UVM_ASSERT(addr < va_block->end); return addr; } uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size) { NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size); // Assume that we don't have to handle multiple big PTEs per system page. // It's not terribly difficult to implement, but we don't currently have a // use case. UVM_ASSERT(big_page_size >= PAGE_SIZE); return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size); } // Returns the big page index (the bit index within // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If // page_index cannot be covered by a big PTE due to alignment or block size, // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned. size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size) { uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size); size_t big_index; // Note that this condition also handles the case of having no big pages in // the block, in which case .first >= .outer. if (page_index < big_region_all.first || page_index >= big_region_all.outer) return MAX_BIG_PAGES_PER_UVM_VA_BLOCK; big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size); UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start); UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1); return big_index; } static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_mask_t *mask_out, const unsigned long *big_ptes_in) { uvm_va_block_region_t big_region; size_t big_page_index; NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); uvm_page_mask_zero(mask_out); for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); uvm_page_mask_region_fill(mask_out, big_region); } } NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index) { if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) return 0; UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU)); // Despite the fact that physical CPU memory can be allocated at sizes // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory // with 4K PTEs. Until the core kernel adds support for PMD mappings, // the return value of this function will remain at PAGE_SIZE. 
return PAGE_SIZE; } NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); size_t big_page_size, big_page_index; if (!gpu_state) return 0; if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) return 0; UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id)); if (gpu_state->pte_is_2m) return UVM_PAGE_SIZE_2M; big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id)); big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size); if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes)) return big_page_size; return UVM_PAGE_SIZE_4K; } // Get the size of the physical allocation backing the page, or 0 if not // resident. Note that this is different from uvm_va_block_page_size_* because // those return the size of the PTE which maps the page index, which may be // smaller than the physical allocation. static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page) { uvm_va_block_gpu_state_t *gpu_state; uvm_chunk_size_t chunk_size; if (UVM_ID_IS_CPU(page.processor)) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index); if (!uvm_page_mask_test(&block->cpu.resident, page.page_index)) return 0; UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU)); return (NvU32)uvm_cpu_chunk_get_size(chunk); } gpu_state = uvm_va_block_gpu_state_get(block, page.processor); if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index)) return 0; UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor)); block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size); return (NvU32)chunk_size; } static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot) { uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX; // ATOMIC and WRITE are synonyms for the CPU if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE) pte_bit_index = UVM_PTE_BITS_CPU_WRITE; else if (prot == UVM_PROT_READ_ONLY) pte_bit_index = UVM_PTE_BITS_CPU_READ; else UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); return pte_bit_index; } static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot) { uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX; if (prot == UVM_PROT_READ_WRITE_ATOMIC) pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC; else if (prot == UVM_PROT_READ_WRITE) pte_bit_index = UVM_PTE_BITS_GPU_WRITE; else if (prot == UVM_PROT_READ_ONLY) pte_bit_index = UVM_PTE_BITS_GPU_READ; else UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot)); return pte_bit_index; } uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) { uvm_va_block_gpu_state_t *gpu_state; if (UVM_ID_IS_CPU(processor)) return &block->cpu.resident; gpu_state = uvm_va_block_gpu_state_get(block, processor); UVM_ASSERT(gpu_state); return &gpu_state->resident; } // Get the page residency mask for a processor // // Notably this will allocate GPU state if not yet present and if that fails // NULL is returned. 
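//
// Callers are therefore expected to treat NULL as an out-of-memory condition,
// along the lines of this sketch (illustrative only; dest_id stands for
// whichever processor the caller targets):
//
//     uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id);
//     if (!resident_mask)
//         return NV_ERR_NO_MEMORY;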
static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor) { uvm_va_block_gpu_state_t *gpu_state; if (UVM_ID_IS_CPU(processor)) return &block->cpu.resident; gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor)); if (!gpu_state) return NULL; return &gpu_state->resident; } static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, uvm_prot_t prot) { uvm_va_block_gpu_state_t *gpu_state; if (UVM_ID_IS_CPU(processor)) return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)]; gpu_state = uvm_va_block_gpu_state_get(block, processor); UVM_ASSERT(gpu_state); return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)]; } const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor) { return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY); } static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); UVM_ASSERT(gpu_state); return &gpu_state->evicted; } static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index) { uvm_processor_id_t id; for_each_id_in_mask(id, &block->resident) { if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index)) return true; } return false; } static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) { uvm_va_block_gpu_state_t *gpu_state; size_t chunk_index; if (UVM_ID_IS_CPU(proc)) return uvm_page_mask_test(&block->cpu.allocated, page_index); gpu_state = uvm_va_block_gpu_state_get(block, proc); if (!gpu_state) return false; chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL); return gpu_state->chunks[chunk_index] != NULL; } static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index) { const uvm_page_mask_t *resident_mask; if (UVM_ID_IS_CPU(proc)) { resident_mask = &block->cpu.resident; } else { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc); if (!gpu_state) return false; resident_mask = &gpu_state->resident; } return uvm_page_mask_test(resident_mask, page_index); } void uvm_va_block_region_authorized_gpus(uvm_va_block_t *va_block, uvm_va_block_region_t region, uvm_prot_t access_permission, uvm_processor_mask_t *authorized_gpus) { uvm_gpu_id_t gpu_id; uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission); uvm_processor_mask_zero(authorized_gpus); // Test all GPUs with mappings on the block for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region)) uvm_processor_mask_set(authorized_gpus, gpu_id); } } void uvm_va_block_region_authorized_processors(uvm_va_block_t *va_block, uvm_va_block_region_t region, uvm_prot_t access_permission, uvm_processor_mask_t *authorized_processors) { uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission); // Compute GPUs uvm_va_block_region_authorized_gpus(va_block, region, access_permission, authorized_processors); // Test CPU if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) && !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) { 
uvm_processor_mask_set(authorized_processors, UVM_ID_CPU); } } void uvm_va_block_page_authorized_gpus(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_prot_t access_permission, uvm_processor_mask_t *authorized_gpus) { uvm_va_block_region_authorized_gpus(va_block, uvm_va_block_region_for_page(page_index), access_permission, authorized_gpus); } void uvm_va_block_page_authorized_processors(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_prot_t access_permission, uvm_processor_mask_t *authorized_processors) { uvm_va_block_region_authorized_processors(va_block, uvm_va_block_region_for_page(page_index), access_permission, authorized_processors); } bool uvm_va_block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block, uvm_va_block_region_t region, uvm_gpu_id_t gpu_id, uvm_prot_t required_prot) { uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot); uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); if (!gpu_state) return false; return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region); } bool uvm_va_block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block, uvm_va_block_region_t region, uvm_processor_id_t processor_id, uvm_prot_t required_prot) { if (UVM_ID_IS_CPU(processor_id)) { uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot); return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region); } else { return uvm_va_block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot); } } bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_gpu_id_t gpu_id, uvm_prot_t required_prot) { return uvm_va_block_is_gpu_authorized_on_whole_region(va_block, uvm_va_block_region_for_page(page_index), gpu_id, required_prot); } bool uvm_va_block_page_is_processor_authorized(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_id_t processor_id, uvm_prot_t required_prot) { return uvm_va_block_is_processor_authorized_on_whole_region(va_block, uvm_va_block_region_for_page(page_index), processor_id, required_prot); } void uvm_va_block_page_resident_gpus(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_mask_t *resident_gpus) { uvm_gpu_id_t id; uvm_processor_mask_zero(resident_gpus); for_each_gpu_id_in_mask(id, &va_block->resident) { if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) { UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index)); uvm_processor_mask_set(resident_gpus, id); } } } void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_mask_t *resident_processors) { uvm_va_block_page_resident_gpus(va_block, page_index, resident_processors); if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) { UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index)); uvm_processor_mask_set(resident_processors, UVM_ID_CPU); } } NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index) { uvm_processor_mask_t resident_processors; uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); return uvm_processor_mask_get_count(&resident_processors); } uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_id_t processor) { return 
uvm_va_block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL); } uvm_processor_id_t uvm_va_block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_id_t processor, const uvm_processor_mask_t *processor_mask) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_processor_mask_t search_mask; uvm_processor_id_t id; if (processor_mask) uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident); else uvm_processor_mask_copy(&search_mask, &va_block->resident); for_each_closest_id(id, &search_mask, processor, va_space) { if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) return id; } return UVM_ID_INVALID; } // We don't track the specific aperture of each mapped page. Instead, we assume // that each virtual mapping from a given processor always targets the closest // processor on which that page is resident (with special rules for UVM-Lite). // // This function verifies that assumption: before a page becomes resident on a // new location, assert that no processor has a valid mapping to a farther // processor on that page. static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency) { uvm_processor_mask_t resident_procs, mapped_procs; uvm_processor_id_t mapped_id, closest_id; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block)); for_each_id_in_mask(mapped_id, &mapped_procs) { if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) continue; uvm_va_block_page_resident_processors(block, page_index, &resident_procs); UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs)); UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency)); uvm_processor_mask_set(&resident_procs, new_residency); closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id); UVM_ASSERT(!uvm_id_equal(closest_id, new_residency)); } return true; } // Returns the processor to which page_index should be mapped on gpu static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index) { uvm_processor_id_t dest_id; // UVM-Lite GPUs can only map pages on the preferred location if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) return uvm_va_range_get_policy(block->va_range)->preferred_location; // Otherwise we always map the closest resident processor dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id); UVM_ASSERT(UVM_ID_IS_VALID(dest_id)); return dest_id; } // Returns the processor to which page_index should be mapped on mapping_id static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block, uvm_processor_id_t mapping_id, uvm_page_index_t page_index) { if (UVM_ID_IS_CPU(mapping_id)) return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id); return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index); } static void block_get_mapped_processors(uvm_va_block_t *block, uvm_processor_id_t resident_id, uvm_page_index_t page_index, uvm_processor_mask_t *mapped_procs) { uvm_processor_id_t mapped_id; uvm_processor_mask_zero(mapped_procs); for_each_id_in_mask(mapped_id, &block->mapped) { if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) { uvm_processor_id_t to_map_id = 
block_get_processor_to_map(block, mapped_id, page_index); if (uvm_id_equal(to_map_id, resident_id)) uvm_processor_mask_set(mapped_procs, mapped_id); } } } // We use block_gpu_get_processor_to_map to find the destination processor of a // given GPU mapping. This function is called when the mapping is established to // sanity check that the destination of the mapping matches the query. static bool block_check_mapping_residency_region(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t mapping_dest, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask) { uvm_page_index_t page_index; for_each_va_block_page_in_region_mask(page_index, page_mask, region) { NvU64 va = uvm_va_block_cpu_page_address(block, page_index); uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index); UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map), "VA 0x%llx on %s: mapping %s, supposed to map %s", va, uvm_gpu_name(gpu), block_processor_name(block, mapping_dest), block_processor_name(block, proc_to_map)); } return true; } static bool block_check_mapping_residency(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t mapping_dest, const uvm_page_mask_t *page_mask) { return block_check_mapping_residency_region(block, gpu, mapping_dest, uvm_va_block_region_from_block(block), page_mask); } // Check that there are no mappings targeting resident_id from any processor in // the block. static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id) { uvm_processor_id_t mapped_id; uvm_page_index_t page_index; for_each_id_in_mask(mapped_id, &block->mapped) { const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id); for_each_va_block_page_in_mask(page_index, map_mask, block) { uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index); UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id)); } } return true; } // Zero all pages of the newly-populated chunk which are not resident anywhere // else in the system, adding that work to the block's tracker. In all cases, // this function adds a dependency on passed in tracker to the block's tracker. static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk, uvm_va_block_region_t chunk_region, uvm_tracker_t *tracker) { uvm_va_block_gpu_state_t *gpu_state; NV_STATUS status; uvm_gpu_address_t memset_addr_base, memset_addr; uvm_push_t push; uvm_gpu_id_t id; uvm_va_block_region_t subregion; uvm_page_mask_t *zero_mask; UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk)); if (chunk->is_zero) return NV_OK; gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS); if (!zero_mask) return NV_ERR_NO_MEMORY; // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the // operation. // // We may over-zero the page with this approach. For example, we might be // populating a 2MB chunk because only a single page within that chunk needs // to be made resident. If we also zero non-resident pages outside of the // strict region, we could waste the effort if those pages are populated on // another processor later and migrated here. // // We zero all non-resident pages in the chunk anyway for two reasons: // // 1) Efficiency. It's better to do all zeros as pipelined transfers once // rather than scatter them around for each populate operation. 
// // 2) Optimizing the common case of block_populate_gpu_chunk being called // for already-populated chunks. If we zero once at initial populate, we // can simply check whether the chunk is present in the array. Otherwise // we'd have to recompute the "is any page resident" mask every time. // Roll up all pages in chunk_region which are resident somewhere uvm_page_mask_zero(zero_mask); for_each_id_in_mask(id, &block->resident) uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id)); // If all pages in the chunk are resident somewhere, we don't need to clear // anything. Just make sure the chunk is tracked properly. if (uvm_page_mask_region_full(zero_mask, chunk_region)) { status = uvm_tracker_add_tracker_safe(&block->tracker, tracker); goto out; } // Complement to get the pages which are not resident anywhere. These // are the pages which must be zeroed. uvm_page_mask_complement(zero_mask, zero_mask); if (uvm_mmu_gpu_needs_static_vidmem_mapping(gpu) || uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu)) memset_addr_base = uvm_gpu_address_virtual_from_vidmem_phys(gpu, chunk->address); else memset_addr_base = uvm_gpu_address_physical(UVM_APERTURE_VID, chunk->address); memset_addr = memset_addr_base; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, tracker, &push, "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)", chunk->address, chunk->address + uvm_gpu_chunk_get_size(chunk), uvm_va_block_region_start(block, chunk_region), uvm_va_block_region_end(block, chunk_region) + 1, block->start, block->end + 1); if (status != NV_OK) goto out; for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) { // Pipeline the memsets since they never overlap with each other uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); // We'll push one membar later for all memsets in this loop uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE; gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion)); } // A membar from this GPU is required between this memset and any PTE write // pointing this or another GPU to this chunk. Otherwise an engine could // read the PTE then access the page before the memset write is visible to // that engine. // // This memset writes GPU memory, so local mappings need only a GPU-local // membar. We can't easily determine here whether a peer GPU will ever map // this page in the future, so always use a sysmembar. uvm_push_end provides // one by default. // // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map // this page. When peer access gets enabled, do a MEMBAR_SYS at that // point. 
uvm_push_end(&push); status = uvm_tracker_add_push_safe(&block->tracker, &push); out: if (zero_mask) kmem_cache_free(g_uvm_page_mask_cache, zero_mask); return status; } static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block, uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, size_t chunk_index, uvm_va_block_region_t chunk_region) { uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); uvm_gpu_chunk_t *chunk = NULL; uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region); uvm_va_block_test_t *block_test = uvm_va_block_get_test(block); NV_STATUS status; if (!gpu_state) return NV_ERR_NO_MEMORY; uvm_assert_mutex_locked(&block->lock); UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu)); UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes); // We zero chunks as necessary at initial population, so if the chunk is // already populated we're done. See the comment in // block_zero_new_gpu_chunk. if (gpu_state->chunks[chunk_index]) return NV_OK; UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region)); status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk); if (status != NV_OK) return status; // In some configurations such as SR-IOV heavy, the chunk cannot be // referenced using its physical address. Create a virtual mapping. status = uvm_mmu_chunk_map(chunk); if (status != NV_OK) goto chunk_free; status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker); if (status != NV_OK) goto chunk_unmap; // It is safe to modify the page index field without holding any PMM locks // because the chunk is pinned, which means that none of the other fields in // the bitmap can change. chunk->va_block_page_index = chunk_region.first; // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at // compile-time that it can store VA Block page indexes. BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE); status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk); if (status != NV_OK) goto chunk_unmap; if (block_test && block_test->inject_populate_error) { block_test->inject_populate_error = false; // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than // causing a fatal OOM failure. status = NV_ERR_MORE_PROCESSING_REQUIRED; goto chunk_unmap_indirect_peers; } // Record the used chunk so that it can be unpinned at the end of the whole // operation. block_retry_add_used_chunk(retry, chunk); gpu_state->chunks[chunk_index] = chunk; return NV_OK; chunk_unmap_indirect_peers: block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk); chunk_unmap: uvm_mmu_chunk_unmap(chunk, &block->tracker); chunk_free: // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it // placed in the block tracker. uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); return status; } // Populate all chunks which cover the given region and page mask. 
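//
// Illustrative walk-through (hypothetical chunk layout, not from the original
// sources): if the GPU backs this block with a 64K chunk followed by a 2M
// chunk and populate_mask only touches pages inside the 2M chunk, the loop
// below skips the 64K chunk and calls block_populate_gpu_chunk() exactly once,
// for the 2M chunk. The real per-index sizes come from block_gpu_chunk_index()
// and block_gpu_chunk_size().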
static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block, uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_va_block_region_t region, const uvm_page_mask_t *populate_mask) { uvm_va_block_region_t chunk_region, check_region; size_t chunk_index; uvm_page_index_t page_index; uvm_chunk_size_t chunk_size; NV_STATUS status; page_index = uvm_va_block_first_page_in_mask(region, populate_mask); if (page_index == region.outer) return NV_OK; chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index); while (1) { check_region = uvm_va_block_region(max(chunk_region.first, region.first), min(chunk_region.outer, region.outer)); page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask); if (page_index != check_region.outer) { status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region); if (status != NV_OK) return status; } if (check_region.outer == region.outer) break; ++chunk_index; chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer); chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE)); } return NV_OK; } static NV_STATUS block_populate_pages(uvm_va_block_t *block, uvm_va_block_retry_t *retry, uvm_va_block_context_t *block_context, uvm_processor_id_t dest_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask) { NV_STATUS status = NV_OK; const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id); uvm_page_index_t page_index; uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask; uvm_memcg_context_t memcg_context; if (!resident_mask) return NV_ERR_NO_MEMORY; if (page_mask) uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask); else uvm_page_mask_complement(populate_page_mask, resident_mask); if (UVM_ID_IS_GPU(dest_id)) return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask); uvm_memcg_context_start(&memcg_context, block_context->mm); for_each_va_block_page_in_region_mask(page_index, populate_page_mask, region) { status = block_populate_page_cpu(block, page_index, block_context->mm); if (status != NV_OK) break; } uvm_memcg_context_end(&memcg_context); return status; } static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); return &va_space->can_copy_from[uvm_id_value(from)]; } static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to) { return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from); } // Get the chunk containing the given page, along with the offset of that page // within the chunk. 
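//
// Worked example (hypothetical numbers): if the chunk's region within the
// block starts at page index 512 and block_page.page_index is 515, the
// returned chunk_offset is (515 - 512) * PAGE_SIZE, i.e. the byte offset of
// that page inside the chunk's allocation.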
static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset) { uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor); uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor); size_t chunk_index; uvm_gpu_chunk_t *chunk; uvm_chunk_size_t chunk_size; UVM_ASSERT(gpu_state); chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size); chunk = gpu_state->chunks[chunk_index]; UVM_ASSERT(chunk); if (chunk_offset) { size_t page_offset = block_page.page_index - uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first; *chunk_offset = page_offset * PAGE_SIZE; } return chunk; } // Get the physical GPU address of a block's page from the POV of the specified GPU // This is the address that should be used for making PTEs for the specified GPU. static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block, block_phys_page_t block_page, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); size_t chunk_offset; uvm_gpu_chunk_t *chunk; UVM_ASSERT(accessing_gpu_state); if (UVM_ID_IS_CPU(block_page.processor)) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index); NvU64 dma_addr = uvm_cpu_chunk_get_gpu_mapping_addr(block, block_page.page_index, chunk, gpu->id); uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), block_page.page_index); // The page should be mapped for physical access already as we do that // eagerly on CPU page population and GPU state alloc. UVM_ASSERT(dma_addr != 0); dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE; return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr); } chunk = block_phys_page_chunk(block, block_page, &chunk_offset); if (uvm_id_equal(block_page.processor, gpu->id)) { return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset); } else { uvm_gpu_phys_address_t phys_addr; uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor); uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu)); phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu); phys_addr.address += chunk_offset; return phys_addr; } } // Get the physical GPU address of a block's page from the POV of the specified // GPU, suitable for accessing the memory from UVM-internal CE channels. // // Notably this may be different from block_phys_page_address() to handle CE // limitations in addressing physical memory directly. static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block, block_phys_page_t block_page, uvm_gpu_t *gpu) { uvm_gpu_t *owning_gpu; size_t chunk_offset; uvm_gpu_chunk_t *chunk; uvm_gpu_address_t copy_addr; uvm_va_space_t *va_space; bool page_in_cpu, page_in_local_gpu; UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor), "from %s to %s\n", block_processor_name(block, gpu->id), block_processor_name(block, block_page.processor)); page_in_cpu = UVM_ID_IS_CPU(block_page.processor); page_in_local_gpu = uvm_id_equal(block_page.processor, gpu->id); // CPU and local GPU accesses can rely on block_phys_page_address, but the // resulting physical address may need to be converted into virtual.
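    // In configurations in which the CE cannot reference memory by its
    // physical address (such as SR-IOV heavy, see the note in
    // block_populate_gpu_chunk()), the conversions below return a virtual
    // address derived from the physical one instead.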
if (page_in_cpu || page_in_local_gpu) { uvm_gpu_phys_address_t gpu_phys_address = block_phys_page_address(block, block_page, gpu); if (page_in_cpu && uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu)) return uvm_gpu_address_virtual_from_sysmem_phys(gpu, gpu_phys_address.address); if (page_in_local_gpu && uvm_mmu_gpu_needs_dynamic_vidmem_mapping(gpu)) return uvm_gpu_address_virtual_from_vidmem_phys(gpu, gpu_phys_address.address); return uvm_gpu_address_from_phys(gpu_phys_address); } va_space = uvm_va_block_get_va_space(block); // See the comments on the peer_identity_mappings_supported assignments in // the HAL for why we disable direct copies between peers. owning_gpu = block_get_gpu(block, block_page.processor); UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu)); chunk = block_phys_page_chunk(block, block_page, &chunk_offset); copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu); copy_addr.address += chunk_offset; return copy_addr; } uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_gpu_t *gpu) { uvm_assert_mutex_locked(&va_block->lock); return block_phys_page_address(va_block, block_phys_page(gpu->id, page_index), gpu); } // Begin a push appropriate for copying data from src_id processor to dst_id processor. // One of src_id and dst_id needs to be a GPU. static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block, uvm_processor_id_t dst_id, uvm_processor_id_t src_id, uvm_tracker_t *tracker, uvm_push_t *push) { uvm_channel_type_t channel_type; uvm_gpu_t *gpu; UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id), "Unexpected copy to self, processor %s\n", block_processor_name(va_block, src_id)); if (UVM_ID_IS_CPU(src_id)) { gpu = block_get_gpu(va_block, dst_id); channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU; } else if (UVM_ID_IS_CPU(dst_id)) { gpu = block_get_gpu(va_block, src_id); channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU; } else { // For GPU to GPU copies, prefer to "push" the data from the source as // that works better at least for P2P over PCI-E. gpu = block_get_gpu(va_block, src_id); channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU; } UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id), "GPU %s dst %s src %s\n", block_processor_name(va_block, gpu->id), block_processor_name(va_block, dst_id), block_processor_name(va_block, src_id)); UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id), "GPU %s dst %s src %s\n", block_processor_name(va_block, gpu->id), block_processor_name(va_block, dst_id), block_processor_name(va_block, src_id)); if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) { uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id); return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager, dst_gpu, tracker, push, "Copy from %s to %s for block [0x%llx, 0x%llx]", block_processor_name(va_block, src_id), block_processor_name(va_block, dst_id), va_block->start, va_block->end); } return uvm_push_begin_acquire(gpu->channel_manager, channel_type, tracker, push, "Copy from %s to %s for block [0x%llx, 0x%llx]", block_processor_name(va_block, src_id), block_processor_name(va_block, dst_id), va_block->start, va_block->end); } // A page is clean iff... 
// the destination is the preferred location and // the source is the CPU and // the destination does not support faults/eviction and // the CPU page is not dirty static bool block_page_is_clean(uvm_va_block_t *block, uvm_processor_id_t dst_id, uvm_processor_id_t src_id, uvm_page_index_t page_index) { return !uvm_va_block_is_hmm(block) && uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) && UVM_ID_IS_CPU(src_id) && !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling && !block_cpu_page_is_dirty(block, page_index); } // When the destination is the CPU... // if the source is the preferred location, mark as clean // otherwise, mark as dirty static void block_update_page_dirty_state(uvm_va_block_t *block, uvm_processor_id_t dst_id, uvm_processor_id_t src_id, uvm_page_index_t page_index) { if (UVM_ID_IS_GPU(dst_id) || uvm_va_block_is_hmm(block)) return; if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location)) block_mark_cpu_page_clean(block, page_index); else block_mark_cpu_page_dirty(block, page_index); } static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id) { uvm_gpu_t *gpu; if (UVM_ID_IS_CPU(id)) return; gpu = block_get_gpu(block, id); // If the block is of the max size and the GPU supports eviction, mark the // root chunk as used in PMM. if (uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && uvm_gpu_supports_eviction(gpu)) { // The chunk has to be there if this GPU is resident UVM_ASSERT(uvm_processor_mask_test(&block->resident, id)); uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]); } } static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) { UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); if (uvm_processor_mask_test_and_set(&block->resident, id)) return; block_mark_memory_used(block, id); } static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id) { uvm_gpu_t *gpu; UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id))); if (!uvm_processor_mask_test_and_clear(&block->resident, id)) return; if (UVM_ID_IS_CPU(id)) return; gpu = block_get_gpu(block, id); // If the block is of the max size and the GPU supports eviction, mark the // root chunk as unused in PMM. if (uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX && uvm_gpu_supports_eviction(gpu)) { // The chunk may not be there any more when residency is cleared. 
uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); if (gpu_state && gpu_state->chunks[0]) uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]); } } typedef enum { BLOCK_TRANSFER_MODE_INTERNAL_MOVE = 1, BLOCK_TRANSFER_MODE_INTERNAL_COPY = 2, BLOCK_TRANSFER_MODE_INTERNAL_MOVE_TO_STAGE = 3, BLOCK_TRANSFER_MODE_INTERNAL_MOVE_FROM_STAGE = 4, BLOCK_TRANSFER_MODE_INTERNAL_COPY_TO_STAGE = 5, BLOCK_TRANSFER_MODE_INTERNAL_COPY_FROM_STAGE = 6 } block_transfer_mode_internal_t; static uvm_va_block_transfer_mode_t get_block_transfer_mode_from_internal(block_transfer_mode_internal_t transfer_mode) { switch (transfer_mode) { case BLOCK_TRANSFER_MODE_INTERNAL_MOVE: case BLOCK_TRANSFER_MODE_INTERNAL_MOVE_TO_STAGE: case BLOCK_TRANSFER_MODE_INTERNAL_MOVE_FROM_STAGE: return UVM_VA_BLOCK_TRANSFER_MODE_MOVE; case BLOCK_TRANSFER_MODE_INTERNAL_COPY: case BLOCK_TRANSFER_MODE_INTERNAL_COPY_TO_STAGE: case BLOCK_TRANSFER_MODE_INTERNAL_COPY_FROM_STAGE: return UVM_VA_BLOCK_TRANSFER_MODE_COPY; } UVM_ASSERT_MSG(0, "Invalid transfer mode %u\n", transfer_mode); return 0; } static bool block_phys_copy_contig_check(uvm_va_block_t *block, uvm_page_index_t page_index, const uvm_gpu_address_t *base_address, uvm_processor_id_t proc_id, uvm_gpu_t *copying_gpu) { uvm_gpu_address_t page_address; uvm_gpu_address_t contig_address = *base_address; contig_address.address += page_index * PAGE_SIZE; page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu); return uvm_gpu_addr_cmp(page_address, contig_address) == 0; } // Check if the VA block has a single physically-contiguous chunk of storage // on the processor. static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_block(block, NULL); if (UVM_ID_IS_GPU(id)) return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0); return chunk && (uvm_va_block_size(block) <= uvm_cpu_chunk_get_size(chunk)); } static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t resident_id) { if (UVM_ID_IS_CPU(resident_id)) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); return uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk)); } else { uvm_chunk_size_t chunk_size; (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size); return uvm_va_block_chunk_region(block, chunk_size, page_index); } } // Copies pages resident on the src_id processor to the dst_id processor // // The function adds the pages that were successfully copied to the output // migrated_pages mask and returns the number of pages in copied_pages. These // fields are reliable even if an error is returned. // // Acquires the block's tracker and adds all of its pushes to the copy_tracker. 
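//
// Copies are coalesced when possible: if both the source and the destination
// storage are physically contiguous, runs of adjacent pages that share the
// same migration cause are pushed as a single memcopy rather than one memcopy
// per page.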
static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_processor_id_t dst_id, uvm_processor_id_t src_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask, const uvm_page_mask_t *prefetch_page_mask, block_transfer_mode_internal_t transfer_mode, uvm_page_mask_t *migrated_pages, NvU32 *copied_pages, uvm_tracker_t *copy_tracker) { NV_STATUS tracker_status, status = NV_OK; uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id); uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id); uvm_gpu_t *copying_gpu = NULL; uvm_push_t push; uvm_page_index_t page_index; uvm_page_index_t contig_start_index = region.outer; uvm_page_index_t last_index = region.outer; uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_between_mask; uvm_range_group_range_t *rgr = NULL; bool rgr_has_changed = false; uvm_make_resident_cause_t cause = block_context->make_resident.cause; uvm_make_resident_cause_t contig_cause = cause; const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask; const bool is_src_phys_contig = is_block_phys_contig(block, src_id); const bool is_dst_phys_contig = is_block_phys_contig(block, dst_id); uvm_gpu_address_t contig_src_address = {0}; uvm_gpu_address_t contig_dst_address = {0}; uvm_va_range_t *va_range = block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); const uvm_va_block_transfer_mode_t block_transfer_mode = get_block_transfer_mode_from_internal(transfer_mode); *copied_pages = 0; if (uvm_id_equal(dst_id, src_id)) return NV_OK; uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask); if (page_mask) uvm_page_mask_and(copy_mask, copy_mask, page_mask); // If there are no pages to be copied, exit early if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask)) return NV_OK; // uvm_range_group_range_iter_first should only be called when the va_space // lock is held, which is always the case unless an eviction is taking // place. if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) { rgr = uvm_range_group_range_iter_first(va_space, uvm_va_block_region_start(block, region), uvm_va_block_region_end(block, region)); rgr_has_changed = true; } for_each_va_block_page_in_region_mask(page_index, copy_mask, region) { NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index); uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index))? UVM_MAKE_RESIDENT_CAUSE_PREFETCH: cause; UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); if (UVM_ID_IS_CPU(dst_id)) { uvm_memcg_context_t memcg_context; // To support staging through CPU, populate CPU pages on demand. // GPU destinations should have their pages populated already, but // that might change if we add staging through GPUs. uvm_memcg_context_start(&memcg_context, block_context->mm); status = block_populate_page_cpu(block, page_index, block_context->mm); uvm_memcg_context_end(&memcg_context); if (status != NV_OK) break; } UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); // If we're not evicting and we're migrating away from the preferred // location, then we should add the range group range to the list of // migrated ranges in the range group.
It's safe to skip this because // the use of range_group's migrated_ranges list is a UVM-Lite // optimization - eviction is not supported on UVM-Lite GPUs. if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) && uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) { // rgr_has_changed is used to minimize the number of times the // migrated_ranges_lock is taken. It is set to false when the range // group range pointed by rgr is added to the migrated_ranges list, // and it is just set back to true when we move to a different // range group range. // The current page could be after the end of rgr. Iterate over the // range group ranges until rgr's end location is greater than or // equal to the current page. while (rgr && rgr->node.end < page_start) { rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region)); rgr_has_changed = true; } // Check whether the current page lies within rgr. A single page // must entirely reside within a range group range. Since we've // incremented rgr until its end is higher than page_start, we now // check if page_start lies within rgr. if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) { uvm_spin_lock(&rgr->range_group->migrated_ranges_lock); if (list_empty(&rgr->range_group_migrated_list_node)) list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges); uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock); rgr_has_changed = false; } } // No need to copy pages that haven't changed. Just clear residency // information if (block_page_is_clean(block, dst_id, src_id, page_index)) continue; if (!copying_gpu) { status = block_copy_begin_push(block, dst_id, src_id, &block->tracker, &push); if (status != NV_OK) break; copying_gpu = uvm_push_get_gpu(&push); // Record all processors involved in the copy uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id); uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id); uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id); // This function is called just once per VA block and needs to // receive the "main" cause for the migration (it mainly checks if // we are in the eviction path). Therefore, we pass cause instead // of contig_cause uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause); } else { uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED); } block_update_page_dirty_state(block, dst_id, src_id, page_index); if (last_index == region.outer) { contig_start_index = page_index; contig_cause = page_cause; // Computing the physical address is a non-trivial operation and // seems to be a performance limiter on systems with 2 or more // NVLINK links. Therefore, for physically-contiguous block // storage, we cache the start address and compute the page address // using the page index. 
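            // Sketch of the arithmetic (hypothetical base address B): page i
            // of physically-contiguous storage is addressed as
            // B + i * PAGE_SIZE, which the block_phys_copy_contig_check()
            // asserts below re-verify against block_phys_page_copy_address().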
if (is_src_phys_contig) contig_src_address = block_phys_page_copy_address(block, block_phys_page(src_id, 0), copying_gpu); if (is_dst_phys_contig) contig_dst_address = block_phys_page_copy_address(block, block_phys_page(dst_id, 0), copying_gpu); } else if ((page_index != last_index + 1) || contig_cause != page_cause) { uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1); size_t contig_region_size = uvm_va_block_region_size(contig_region); UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region)); // If both src and dst are physically-contiguous, consolidate copies // of contiguous pages into a single method. if (is_src_phys_contig && is_dst_phys_contig) { uvm_gpu_address_t src_address = contig_src_address; uvm_gpu_address_t dst_address = contig_dst_address; src_address.address += contig_start_index * PAGE_SIZE; dst_address.address += contig_start_index * PAGE_SIZE; uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, contig_region_size); } uvm_perf_event_notify_migration(&va_space->perf_events, &push, block, dst_id, src_id, uvm_va_block_region_start(block, contig_region), contig_region_size, block_transfer_mode, contig_cause, &block_context->make_resident); contig_start_index = page_index; contig_cause = page_cause; } if (is_src_phys_contig) UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &contig_src_address, src_id, copying_gpu)); if (is_dst_phys_contig) UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &contig_dst_address, dst_id, copying_gpu)); if (!is_src_phys_contig || !is_dst_phys_contig) { uvm_gpu_address_t src_address; uvm_gpu_address_t dst_address; if (is_src_phys_contig) { src_address = contig_src_address; src_address.address += page_index * PAGE_SIZE; } else { src_address = block_phys_page_copy_address(block, block_phys_page(src_id, page_index), copying_gpu); } if (is_dst_phys_contig) { dst_address = contig_dst_address; dst_address.address += page_index * PAGE_SIZE; } else { dst_address = block_phys_page_copy_address(block, block_phys_page(dst_id, page_index), copying_gpu); } uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, PAGE_SIZE); } last_index = page_index; } // Copy the remaining pages if (copying_gpu) { uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1); size_t contig_region_size = uvm_va_block_region_size(contig_region); UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region)); if (is_src_phys_contig && is_dst_phys_contig) { uvm_gpu_address_t src_address = contig_src_address; uvm_gpu_address_t dst_address = contig_dst_address; src_address.address += contig_start_index * PAGE_SIZE; dst_address.address += contig_start_index * PAGE_SIZE; uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE); copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, contig_region_size); } uvm_perf_event_notify_migration(&va_space->perf_events, &push, block, dst_id, src_id, uvm_va_block_region_start(block, contig_region), contig_region_size, block_transfer_mode, contig_cause, &block_context->make_resident); // TODO: Bug 1766424: If the destination is a GPU and the copy was done // by that GPU, use a GPU-local membar if no peer can currently // map this page. When peer access gets enabled, do a MEMBAR_SYS // at that point. 
uvm_push_end(&push); tracker_status = uvm_tracker_add_push_safe(copy_tracker, &push); if (status == NV_OK) status = tracker_status; } // Update VA block status bits // // Only update the bits for the pages that succeeded if (status != NV_OK) uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK)); *copied_pages = uvm_page_mask_weight(copy_mask); if (*copied_pages) { uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask); uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask); block_set_resident_processor(block, dst_id); if (transfer_mode == BLOCK_TRANSFER_MODE_INTERNAL_MOVE_FROM_STAGE) { // Check whether there are any resident pages left on src if (!uvm_page_mask_andnot(src_resident_mask, src_resident_mask, copy_mask)) block_clear_resident_processor(block, src_id); } // If we are staging the copy due to read duplication, we keep the copy there if (transfer_mode == BLOCK_TRANSFER_MODE_INTERNAL_COPY || transfer_mode == BLOCK_TRANSFER_MODE_INTERNAL_COPY_TO_STAGE) uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, copy_mask); if (transfer_mode == BLOCK_TRANSFER_MODE_INTERNAL_COPY_FROM_STAGE) UVM_ASSERT(uvm_page_mask_subset(copy_mask, &block->read_duplicated_pages)); // Any move operation implies that mappings have been removed from all // non-UVM-Lite GPUs if (transfer_mode == BLOCK_TRANSFER_MODE_INTERNAL_MOVE || transfer_mode == BLOCK_TRANSFER_MODE_INTERNAL_MOVE_TO_STAGE) uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, copy_mask); // Record ReadDuplicate events here, after the residency bits have been // updated if (block_transfer_mode == UVM_VA_BLOCK_TRANSFER_MODE_COPY) uvm_tools_record_read_duplicate(block, dst_id, region, copy_mask); // If we are migrating due to an eviction, set the GPU as evicted and // mark the evicted pages. If we are migrating away from the CPU, this // means that those pages are not evicted. if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) { uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(block, src_id); UVM_ASSERT(src_gpu_state); UVM_ASSERT(UVM_ID_IS_CPU(dst_id)); uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask); uvm_processor_mask_set(&block->evicted_gpus, src_id); } else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&block->evicted_gpus, dst_id)) { uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(block, dst_id); UVM_ASSERT(dst_gpu_state); if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, copy_mask)) uvm_processor_mask_clear(&block->evicted_gpus, dst_id); } } return status; } // Copy resident pages to the destination from all source processors in the // src_processor_mask // // The function adds the pages that were successfully copied to the output // migrated_pages mask and returns the number of pages in copied_pages. These // fields are reliable even if an error is returned.
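//
// Source processors are visited in order of increasing distance from dst_id
// (for_each_closest_id), and the iteration stops early once max_pages_to_copy
// pages have been copied.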
static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_processor_id_t dst_id, const uvm_processor_mask_t *src_processor_mask, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask, const uvm_page_mask_t *prefetch_page_mask, block_transfer_mode_internal_t transfer_mode, NvU32 max_pages_to_copy, uvm_page_mask_t *migrated_pages, NvU32 *copied_pages_out, uvm_tracker_t *tracker_out) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_processor_id_t src_id; uvm_processor_mask_t search_mask; uvm_processor_mask_copy(&search_mask, src_processor_mask); *copied_pages_out = 0; for_each_closest_id(src_id, &search_mask, dst_id, va_space) { NV_STATUS status; NvU32 copied_pages_from_src; UVM_ASSERT(!uvm_id_equal(src_id, dst_id)); status = block_copy_resident_pages_between(block, block_context, dst_id, src_id, region, page_mask, prefetch_page_mask, transfer_mode, migrated_pages, &copied_pages_from_src, tracker_out); *copied_pages_out += copied_pages_from_src; UVM_ASSERT(*copied_pages_out <= max_pages_to_copy); if (status != NV_OK) return status; // Break out once we copied max pages already if (*copied_pages_out == max_pages_to_copy) break; } return NV_OK; } static void break_read_duplication_in_region(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_processor_id_t dst_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask) { uvm_processor_id_t id; uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask; uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask); UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id))); // Clear read_duplicated bit for all pages in region uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region); // Clear residency bits for all processors other than dst_id for_each_id_in_mask(id, &block->resident) { uvm_page_mask_t *other_resident_mask; if (uvm_id_equal(id, dst_id)) continue; other_resident_mask = uvm_va_block_resident_mask_get(block, id); if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region)) block_clear_resident_processor(block, id); } } static void block_copy_set_first_touch_residency(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_processor_id_t dst_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask) { uvm_page_index_t page_index; uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask; if (page_mask) uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask); else uvm_page_mask_complement(first_touch_mask, resident_mask); uvm_page_mask_region_clear_outside(first_touch_mask, region); for_each_va_block_page_in_mask(page_index, first_touch_mask, block) { UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index)); UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index)); UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id)); } uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask); if (!uvm_page_mask_empty(resident_mask)) block_set_resident_processor(block, dst_id); // Add them to the output mask, too uvm_page_mask_or(&block_context->make_resident.pages_changed_residency, &block_context->make_resident.pages_changed_residency, first_touch_mask); } // Copy resident pages from other processors to the destination and mark 
any // pages not resident anywhere as resident on the destination. // All the pages on the destination need to be populated by the caller first. // Pages not resident anywhere else need to be zeroed out as well. // // If UVM_VA_BLOCK_TRANSFER_MODE_COPY is passed, processors that already have a // copy of the page will keep it. Conversely, if UVM_VA_BLOCK_TRANSFER_MODE_MOVE // is passed, the page will no longer be resident in any processor other than dst_id. static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_processor_id_t dst_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask, const uvm_page_mask_t *prefetch_page_mask, uvm_va_block_transfer_mode_t transfer_mode) { NV_STATUS status = NV_OK; NV_STATUS tracker_status; uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id); NvU32 missing_pages_count; NvU32 pages_copied; NvU32 pages_copied_to_cpu; uvm_processor_mask_t src_processor_mask; uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask; uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated; uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged; block_transfer_mode_internal_t transfer_mode_internal; uvm_page_mask_zero(migrated_pages); if (page_mask) uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask); else uvm_page_mask_complement(copy_page_mask, resident_mask); missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region); // If nothing needs to be copied, just check if we need to break // read-duplication (i.e. transfer_mode is UVM_VA_BLOCK_TRANSFER_MODE_MOVE) if (missing_pages_count == 0) goto out; // TODO: Bug 1753731: Add P2P2P copies staged through a GPU // TODO: Bug 1753731: When a page is resident in multiple locations due to // read-duplication, spread out the source of the copy so we don't // bottleneck on a single location. uvm_processor_mask_zero(&src_processor_mask); if (!uvm_id_equal(dst_id, UVM_ID_CPU)) { // If the destination is a GPU, first move everything from processors // with copy access supported. Notably this will move pages from the CPU // as well even if later some extra copies from CPU are required for // staged copies. uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident); uvm_processor_mask_clear(&src_processor_mask, dst_id); status = block_copy_resident_pages_mask(block, block_context, dst_id, &src_processor_mask, region, copy_page_mask, prefetch_page_mask, transfer_mode == UVM_VA_BLOCK_TRANSFER_MODE_COPY? BLOCK_TRANSFER_MODE_INTERNAL_COPY: BLOCK_TRANSFER_MODE_INTERNAL_MOVE, missing_pages_count, migrated_pages, &pages_copied, &local_tracker); UVM_ASSERT(missing_pages_count >= pages_copied); missing_pages_count -= pages_copied; if (status != NV_OK) goto out; if (missing_pages_count == 0) goto out; if (pages_copied) uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages); } // Now copy from everywhere else to the CPU. This is both for when the // destination is the CPU (src_processor_mask empty) and for a staged copy // (src_processor_mask containing processors with copy access to dst_id). 
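    // Illustrative example (hypothetical topology): if dst_id is GPU A and the
    // pages are resident on a GPU B that A cannot copy from directly, the pass
    // above copies nothing, this pass stages B's pages to the CPU using a
    // *_TO_STAGE transfer mode, and the block_copy_resident_pages_between()
    // call further below moves the staged pages from the CPU to A using the
    // matching *_FROM_STAGE mode.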
uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask); uvm_processor_mask_clear(&src_processor_mask, dst_id); uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU); uvm_page_mask_zero(staged_pages); if (UVM_ID_IS_CPU(dst_id)) { transfer_mode_internal = transfer_mode == UVM_VA_BLOCK_TRANSFER_MODE_COPY? BLOCK_TRANSFER_MODE_INTERNAL_COPY: BLOCK_TRANSFER_MODE_INTERNAL_MOVE; } else { transfer_mode_internal = transfer_mode == UVM_VA_BLOCK_TRANSFER_MODE_COPY? BLOCK_TRANSFER_MODE_INTERNAL_COPY_TO_STAGE: BLOCK_TRANSFER_MODE_INTERNAL_MOVE_TO_STAGE; } status = block_copy_resident_pages_mask(block, block_context, UVM_ID_CPU, &src_processor_mask, region, copy_page_mask, prefetch_page_mask, transfer_mode_internal, missing_pages_count, staged_pages, &pages_copied_to_cpu, &local_tracker); if (status != NV_OK) goto out; // If destination is the CPU then we copied everything there above if (UVM_ID_IS_CPU(dst_id)) { uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages); missing_pages_count -= pages_copied_to_cpu; goto out; } // Add everything to the block's tracker so that the // block_copy_resident_pages_between() call below will acquire it. status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); if (status != NV_OK) goto out; uvm_tracker_clear(&local_tracker); // Now copy staged pages from the CPU to the destination. status = block_copy_resident_pages_between(block, block_context, dst_id, UVM_ID_CPU, region, staged_pages, prefetch_page_mask, transfer_mode == UVM_VA_BLOCK_TRANSFER_MODE_COPY? BLOCK_TRANSFER_MODE_INTERNAL_COPY_FROM_STAGE: BLOCK_TRANSFER_MODE_INTERNAL_MOVE_FROM_STAGE, migrated_pages, &pages_copied, &local_tracker); UVM_ASSERT(missing_pages_count >= pages_copied); missing_pages_count -= pages_copied; if (status != NV_OK) goto out; // If we get here, that means we were staging the copy through the CPU and // we should copy as many pages from the CPU as we copied to the CPU. UVM_ASSERT(pages_copied == pages_copied_to_cpu); out: // Pages that weren't resident anywhere else were populated at the // destination directly. Mark them as resident now. We only do it if there // have been no errors because we cannot identify which pages failed. if (status == NV_OK && missing_pages_count > 0) block_copy_set_first_touch_residency(block, block_context, dst_id, region, page_mask); // Break read duplication if (transfer_mode == UVM_VA_BLOCK_TRANSFER_MODE_MOVE) { const uvm_page_mask_t *break_read_duplication_mask; if (status == NV_OK) { break_read_duplication_mask = page_mask; } else { // We reuse this mask since copy_page_mask is no longer used in the // function if (page_mask) uvm_page_mask_and(&block_context->make_resident.page_mask, resident_mask, page_mask); else uvm_page_mask_copy(&block_context->make_resident.page_mask, resident_mask); break_read_duplication_mask = &block_context->make_resident.page_mask; } break_read_duplication_in_region(block, block_context, dst_id, region, break_read_duplication_mask); } // Accumulate the pages that migrated into the output mask uvm_page_mask_or(&block_context->make_resident.pages_changed_residency, &block_context->make_resident.pages_changed_residency, migrated_pages); // Add everything from the local tracker to the block's tracker. // Notably this is also needed for handling block_copy_resident_pages_between() // failures in the first loop. tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker); uvm_tracker_deinit(&local_tracker); return status == NV_OK ? 
tracker_status : status; } NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, uvm_va_block_context_t *va_block_context, uvm_processor_id_t dest_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask, const uvm_page_mask_t *prefetch_page_mask, uvm_make_resident_cause_t cause) { NV_STATUS status; uvm_processor_mask_t unmap_processor_mask; uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask; uvm_page_mask_t *resident_mask; va_block_context->make_resident.dest_id = dest_id; va_block_context->make_resident.cause = cause; if (prefetch_page_mask) { UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); } uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); resident_mask = block_resident_mask_get_alloc(va_block, dest_id); if (!resident_mask) return NV_ERR_NO_MEMORY; // Unmap all mapped processors except for UVM-Lite GPUs as their mappings // are largely persistent. uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); if (page_mask) uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask); else uvm_page_mask_complement(unmap_page_mask, resident_mask); // Unmap all pages not resident on the destination status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); if (status != NV_OK) return status; if (page_mask) uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages); else uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages); // Also unmap read-duplicated pages excluding dest_id uvm_processor_mask_clear(&unmap_processor_mask, dest_id); status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask); if (status != NV_OK) return status; uvm_tools_record_read_duplicate_invalidate(va_block, dest_id, region, unmap_page_mask); // Note that block_populate_pages and block_move_resident_pages also use // va_block_context->make_resident.page_mask. unmap_page_mask = NULL; status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); if (status != NV_OK) return status; status = block_copy_resident_pages(va_block, va_block_context, dest_id, region, page_mask, prefetch_page_mask, UVM_VA_BLOCK_TRANSFER_MODE_MOVE); if (status != NV_OK) return status; // Update eviction heuristics, if needed. Notably this could repeat the call // done in block_set_resident_processor(), but that doesn't do anything bad // and it's simpler to keep it in both places. // // Skip this if we didn't do anything (the input region and/or page mask was // empty). if (uvm_processor_mask_test(&va_block->resident, dest_id)) block_mark_memory_used(va_block, dest_id); return NV_OK; } // Combination function which prepares the input {region, page_mask} for // entering read-duplication. 
It: // - Unmaps all processors but revoke_id // - Revokes write access from revoke_id static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t revoke_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask) { uvm_processor_mask_t unmap_processor_mask; uvm_processor_id_t unmap_id; uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); NV_STATUS status, tracker_status; // Unmap everybody except revoke_id uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block)); uvm_processor_mask_clear(&unmap_processor_mask, revoke_id); for_each_id_in_mask(unmap_id, &unmap_processor_mask) { status = uvm_va_block_unmap(va_block, va_block_context, unmap_id, region, page_mask, &local_tracker); if (status != NV_OK) goto out; } // Revoke WRITE/ATOMIC access permissions from the remaining mapped // processor. status = uvm_va_block_revoke_prot(va_block, va_block_context, revoke_id, region, page_mask, UVM_PROT_READ_WRITE, &local_tracker); if (status != NV_OK) goto out; out: tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); uvm_tracker_deinit(&local_tracker); return status == NV_OK ? tracker_status : status; } NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, uvm_va_block_context_t *va_block_context, uvm_processor_id_t dest_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask, const uvm_page_mask_t *prefetch_page_mask, uvm_make_resident_cause_t cause) { NV_STATUS status = NV_OK; uvm_processor_id_t src_id; va_block_context->make_resident.dest_id = dest_id; va_block_context->make_resident.cause = cause; if (prefetch_page_mask) { // TODO: Bug 1877578: investigate automatic read-duplicate policies UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT || cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT || cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER); } uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT(!uvm_va_block_is_dead(va_block)); // For pages that are entering read-duplication we need to unmap remote // mappings and revoke RW and higher access permissions. // // The current implementation: // - Unmaps pages from all processors but the one with the resident copy // - Revokes write access from the processor with the resident copy for_each_id_in_mask(src_id, &va_block->resident) { // Note that the below calls to block_populate_pages and // block_move_resident_pages also use // va_block_context->make_resident.page_mask. 
uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask; const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id); UVM_ASSERT(!uvm_page_mask_empty(resident_mask)); if (page_mask) uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages); else uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages); // If there are no pages that need to be unmapped/revoked, skip to the // next processor if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask)) continue; status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask); if (status != NV_OK) return status; } status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask); if (status != NV_OK) return status; status = block_copy_resident_pages(va_block, va_block_context, dest_id, region, page_mask, prefetch_page_mask, UVM_VA_BLOCK_TRANSFER_MODE_COPY); if (status != NV_OK) return status; // Update eviction heuristics, if needed. Notably this could repeat the call // done in block_set_resident_processor(), but that doesn't do anything bad // and it's simpler to keep it in both places. // // Skip this if we didn't do anything (the input region and/or page mask was // empty). if (uvm_processor_mask_test(&va_block->resident, dest_id)) block_mark_memory_used(va_block, dest_id); return NV_OK; } // Looks up the current CPU mapping state of page from the // block->cpu.pte_bits bitmaps. If write access is enabled, // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since // write access implies atomic access for CPUs. static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index) { uvm_prot_t prot; UVM_ASSERT(!uvm_va_block_is_dead(block)); if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index)) prot = UVM_PROT_READ_WRITE_ATOMIC; else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index)) prot = UVM_PROT_READ_ONLY; else prot = UVM_PROT_NONE; return prot; } // Looks up the current GPU mapping state of page from the // block->gpus[i]->pte_bits bitmaps. static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_prot_t prot; UVM_ASSERT(!uvm_va_block_is_dead(block)); if (!gpu_state) return UVM_PROT_NONE; if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index)) prot = UVM_PROT_READ_WRITE_ATOMIC; else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index)) prot = UVM_PROT_READ_WRITE; else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index)) prot = UVM_PROT_READ_ONLY; else prot = UVM_PROT_NONE; return prot; } static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index) { if (UVM_ID_IS_CPU(id)) return block_page_prot_cpu(block, page_index); else return block_page_prot_gpu(block, block_get_gpu(block, id), page_index); } // Returns true if the block has any valid CPU PTE mapping in the block region. 
static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region) { size_t valid_page; UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block)); // Early-out: check whether any address in this block has a CPU mapping if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) { UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])); UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); return false; } // All valid mappings have at least read permissions so we only need to // inspect the read bits. valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]); if (valid_page == region.outer) return false; UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE); return true; } static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) { uvm_gpu_t *accessing_gpu; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); if (!uvm_pmm_sysmem_mappings_indirect_supported()) return true; for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); uvm_reverse_map_t reverse_map; size_t num_mappings; num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr, uvm_gpu_chunk_get_size(chunk), &reverse_map, 1); UVM_ASSERT(num_mappings == 1); UVM_ASSERT(reverse_map.va_block == block); UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index); UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk)); uvm_va_block_release_no_destroy(reverse_map.va_block); } return true; } // Sanity check the given GPU's chunks array static bool block_check_chunks(uvm_va_block_t *block, uvm_gpu_id_t id) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); uvm_gpu_t *gpu; size_t i, num_chunks; uvm_page_index_t page_index; uvm_chunk_size_t chunk_size; if (!gpu_state) return true; gpu = block_get_gpu(block, id); num_chunks = block_num_gpu_chunks(block, gpu); for (page_index = 0, i = 0; i < num_chunks; i++) { uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size); if (chunk_index != i) { UVM_ERR_PRINT("chunk index mismatch: calculated %zu, is in %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n", chunk_index, i, block->start, block->end + 1, uvm_id_value(id), page_index); return false; } if (chunk) { if (chunk_size != uvm_gpu_chunk_get_size(chunk)) { UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n", chunk_size, uvm_gpu_chunk_get_size(chunk), block->start, block->end + 1, uvm_id_value(id), page_index, i); return false; } if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) { UVM_ERR_PRINT("Invalid chunk state %s. 
VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n", uvm_pmm_gpu_chunk_state_string(chunk->state), block->start, block->end + 1, uvm_id_value(id), page_index, i, chunk_size); return false; } UVM_ASSERT(chunk->va_block == block); UVM_ASSERT(chunk->va_block_page_index == page_index); UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk)); } page_index += chunk_size / PAGE_SIZE; } return true; } // Sanity checks for page mappings static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index) { uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings; uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings; uvm_processor_mask_t remaining_mappings, temp_mappings; uvm_processor_mask_t resident_processors; const uvm_processor_mask_t *residency_accessible_from = NULL; const uvm_processor_mask_t *residency_has_native_atomics = NULL; uvm_processor_id_t residency, id; uvm_va_range_t *va_range = block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_processor_id_t preferred_location = va_range ? uvm_va_range_get_policy(va_range)->preferred_location : UVM_ID_INVALID; const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block); uvm_va_block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); uvm_va_block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings); uvm_va_block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings); // Each access bit implies all accesses below it UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings)); UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings)); UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped)); uvm_va_block_page_resident_processors(block, page_index, &resident_processors); UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident)); // Sanity check block_get_mapped_processors uvm_processor_mask_copy(&remaining_mappings, &read_mappings); for_each_id_in_mask(residency, &resident_processors) { block_get_mapped_processors(block, residency, page_index, &temp_mappings); UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings)); uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings); } // Any remaining mappings point to non-resident locations, so they must be // UVM-Lite mappings. 
UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus)); residency = uvm_processor_mask_find_first_id(&resident_processors); if (uvm_processor_mask_get_count(&resident_processors) > 0) { residency_accessible_from = &va_space->accessible_from[uvm_id_value(residency)]; residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)]; } // If the page is not resident, there should be no valid mappings UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 || uvm_processor_mask_get_count(&read_mappings) == 0, "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap, *block->read_duplicated_pages.bitmap); // Test read_duplicated_pages mask UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 && !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) || (uvm_processor_mask_get_count(&resident_processors) > 1 && uvm_page_mask_test(&block->read_duplicated_pages, page_index)), "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap, *block->read_duplicated_pages.bitmap); if (!uvm_processor_mask_empty(uvm_lite_gpus)) UVM_ASSERT(UVM_ID_IS_VALID(preferred_location)); // UVM-Lite checks. Since the range group is made non-migratable before the // actual migrations for that range group happen, we can only make those // checks which are valid on both migratable and non-migratable range // groups. uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus); uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus); // Any mapping from a UVM-Lite GPU must be atomic... UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings)); // ... and must have access to preferred_location if (UVM_ID_IS_VALID(preferred_location)) { const uvm_processor_mask_t *preferred_location_accessible_from; preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)]; UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from)); } for_each_id_in_mask(id, &lite_atomic_mappings) UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location)); // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus); uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus); uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus); // Pages set to zero in maybe_mapped_pages must not be mapped on any // non-UVM-Lite GPU if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) { UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0, "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n", *resident_processors.bitmap, *block->mapped.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap); } // atomic mappings from GPUs with disabled system-wide atomics are treated // as write mappings. 
Therefore, we remove them from the atomic mappings mask uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors); if (!uvm_processor_mask_empty(&read_mappings)) { // Read-duplicate: if a page is resident in multiple locations, it // must be resident locally on each mapped processor. if (uvm_processor_mask_get_count(&resident_processors) > 1) { UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors), "Read-duplicate copies from remote processors\n" "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap, *block->read_duplicated_pages.bitmap); } else { // Processors with mappings must have access to the processor that // has the valid copy UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from), "Not all processors have access to %s\n", "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", uvm_va_space_processor_name(va_space, residency), *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *residency_accessible_from->bitmap, *residency_has_native_atomics->bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap); for_each_id_in_mask(id, &read_mappings) { UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency)); if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) { uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency); uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id); uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL); // This function will assert if no mapping exists (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu); } } } } // If any processor has a writable mapping, there must only be one copy of // the page in the system if (!uvm_processor_mask_empty(&write_mappings)) { UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1, "Too many resident copies for pages with write_mappings\n" "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n", *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap, *block->read_duplicated_pages.bitmap); } if (!uvm_processor_mask_empty(&atomic_mappings)) { uvm_processor_mask_t native_atomics; uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics); if (uvm_processor_mask_empty(&native_atomics)) { // No other faultable processor should be able to write uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors); UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1, "Too many write mappings to %s from processors with non-native atomics\n" "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", uvm_va_space_processor_name(va_space, residency), *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *residency_accessible_from->bitmap, *residency_has_native_atomics->bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap); // Only one processor outside of the native group can have 
atomics enabled UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1, "Too many atomics mappings to %s from processors with non-native atomics\n" "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", uvm_va_space_processor_name(va_space, residency), *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *residency_accessible_from->bitmap, *residency_has_native_atomics->bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap); } else { uvm_processor_mask_t non_native_atomics; // One or more processors within the native group have atomics enabled. // All processors outside of that group may have write but not atomic // permissions. uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics); UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics), "atomic mappings to %s from processors native and non-native\n" "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -" "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n", uvm_va_space_processor_name(va_space, residency), *resident_processors.bitmap, *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap, *residency_accessible_from->bitmap, *residency_has_native_atomics->bitmap, *va_space->system_wide_atomics_enabled_processors.bitmap); } } return true; } static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_va_block_gpu_state_t *resident_gpu_state; uvm_pte_bits_gpu_t pte_bit; uvm_processor_id_t resident_id; uvm_prot_t prot; NvU32 big_page_size; size_t num_big_pages, big_page_index; uvm_va_block_region_t big_region, chunk_region; uvm_gpu_chunk_t *chunk; if (!gpu_state->page_table_range_4k.table) UVM_ASSERT(!gpu_state->activated_4k); if (!gpu_state->page_table_range_big.table) { UVM_ASSERT(!gpu_state->initialized_big); UVM_ASSERT(!gpu_state->activated_big); } // It's only safe to check the PTE mappings if we have page tables. See // uvm_va_block_get_gpu_va_space. if (!block_gpu_has_page_tables(block, gpu)) { UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id)); return true; } big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); num_big_pages = uvm_va_block_num_big_pages(block, big_page_size); if (block_gpu_supports_2m(block, gpu)) { if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) { // 2M blocks require the 2M entry to be allocated for the lower // ranges to also be allocated. UVM_ASSERT(gpu_state->page_table_range_2m.table); } else if (gpu_state->page_table_range_2m.table) { // If the 2M entry is present but the lower ones aren't, the PTE // must be 2M. UVM_ASSERT(gpu_state->pte_is_2m); } } else { UVM_ASSERT(!gpu_state->page_table_range_2m.table); if (num_big_pages == 0) UVM_ASSERT(!gpu_state->page_table_range_big.table); } // If we have the big table and it's in use then it must have been // initialized, even if it doesn't currently contain active PTEs. 
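    // (initialized_big tracks whether every entry of the big-PTE table has
    // been written at least once; activated_big/activated_4k track whether
    // block_gpu_write_pde has made the corresponding table reachable from the
    // 2M PDE. See the split/map paths below.)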
if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) || (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big)) UVM_ASSERT(gpu_state->initialized_big); if (gpu_state->pte_is_2m) { UVM_ASSERT(block_gpu_supports_2m(block, gpu)); UVM_ASSERT(gpu_state->page_table_range_2m.table); UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); UVM_ASSERT(!gpu_state->force_4k_ptes); // GPU architectures which support 2M pages only support 64K as the big // page size. All of the 2M code assumes that // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full, // bitmap_complement, etc). BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK); prot = block_page_prot_gpu(block, gpu, 0); // All page permissions match for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit])); else UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit])); } if (prot != UVM_PROT_NONE) { resident_id = block_gpu_get_processor_to_map(block, gpu, 0); // block_check_resident_proximity verifies that no closer processor // has a resident page, so we don't need to check that all pages // have the same resident_id. // block_check_mappings_page verifies that all pages marked resident // are backed by populated memory. // The mapped processor should be fully resident and physically- // contiguous. UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id))); if (UVM_ID_IS_GPU(resident_id)) { resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id); UVM_ASSERT(resident_gpu_state); UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M); } else { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_block(block, NULL); UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated)); UVM_ASSERT(chunk); UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M); } } } else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { UVM_ASSERT(gpu_state->page_table_range_big.table); UVM_ASSERT(!gpu_state->force_4k_ptes); UVM_ASSERT(num_big_pages > 0); UVM_ASSERT(gpu_state->initialized_big); for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) { big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); if (!test_bit(big_page_index, gpu_state->big_ptes)) { // If there are valid mappings but this isn't a big PTE, the // mapping must be using the 4k PTEs. if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region)) UVM_ASSERT(gpu_state->page_table_range_4k.table); continue; } prot = block_page_prot_gpu(block, gpu, big_region.first); // All page permissions match for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot)) UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region)); else UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region)); } if (prot != UVM_PROT_NONE) { resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); // The mapped processor should be fully resident and physically- // contiguous. Exception: UVM-Lite GPUs always map the preferred // location even if the memory is resident elsewhere. Skip the // residency check but still verify contiguity. 
if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id), big_region)); } if (UVM_ID_IS_CPU(resident_id)) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first); UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages); UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region)); } else { // Check GPU chunks chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL); chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first); UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region)); } } } } return true; } static bool block_check_mappings(uvm_va_block_t *block) { uvm_page_index_t page_index; uvm_processor_id_t id; // Verify the master masks, since block_check_mappings_page relies on them for_each_processor_id(id) { const uvm_page_mask_t *resident_mask, *map_mask; if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) { UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id)); continue; } resident_mask = uvm_va_block_resident_mask_get(block, id); UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask)); map_mask = uvm_va_block_map_mask_get(block, id); UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask)); if (UVM_ID_IS_GPU(id)) { const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id); UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask)); // Pages cannot be resident if they are marked as evicted UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask)); // Pages cannot be resident on a GPU with no memory if (!block_processor_has_memory(block, id)) UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); } } // Check that every page has coherent mappings for_each_va_block_page(page_index, block) block_check_mappings_page(block, page_index); for_each_gpu_id(id) { if (uvm_va_block_gpu_state_get(block, id)) { uvm_gpu_t *gpu = block_get_gpu(block, id); // Check big and/or 2M PTE state block_check_mappings_ptes(block, gpu); } } return true; } // See the comments on uvm_va_block_unmap static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_pte_bits_cpu_t pte_bit; bool unmapped_something = false; uvm_va_block_region_t subregion; NvU32 num_mapped_processors; // Early-out if nothing in the region is mapped if (!block_has_valid_mapping_cpu(block, region)) return; num_mapped_processors = uvm_processor_mask_get_count(&block->mapped); // If we are unmapping a page which we are tracking due to CPU faults with // correct permissions, clear the info. 
This will cover both the unmap and // revoke cases (since we implement CPU revocation by unmap + map) if (block->cpu.fault_authorized.first_fault_stamp && uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index)) block->cpu.fault_authorized.first_fault_stamp = 0; for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) { if (!block_has_valid_mapping_cpu(block, subregion)) continue; unmap_mapping_range(&va_space->mapping, uvm_va_block_region_start(block, subregion), uvm_va_block_region_size(subregion), 1); for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion); // If the CPU is the only processor with mappings we can safely mark // the pages as fully unmapped if (num_mapped_processors == 1) uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion); unmapped_something = true; } if (!unmapped_something) return; // Check whether the block has any more mappings if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) { UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); } UVM_ASSERT(block_check_mappings(block)); } // Given a mask of mapped pages, returns true if any of the pages in the mask // are mapped remotely by the given GPU. static bool block_has_remote_mapping_gpu(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_id_t gpu_id, const uvm_page_mask_t *mapped_pages) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id); if (!gpu_state) return false; // The caller must ensure that all pages of the input mask are really mapped UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); // UVM-Lite GPUs map the preferred location if it's accessible, regardless // of the resident location. if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) { if (uvm_page_mask_empty(mapped_pages)) return false; return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id); } // Remote pages are pages which are mapped but not resident locally return uvm_page_mask_andnot(&block_context->scratch_page_mask, mapped_pages, &gpu_state->resident); } // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. // // If tlb_batch is provided, the 4k PTEs written are added to the batch. The // caller is responsible for ending the TLB batch with the appropriate membar. 
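// Illustrative aside (standalone sketch, not driver code): the 4k helpers
// below expand each CPU page into PAGE_SIZE / UVM_PAGE_SIZE_4K consecutive 4k
// PTEs so that CPU page sizes larger than 4K (64K, for example) still map 1:1
// onto GPU 4k entries. A minimal model of that indexing, assuming only that
// the PTEs for one CPU page are consecutive in the 4k table; the
// example_4k_pte_offset name and its parameters are hypothetical.
static inline NvU64 example_4k_pte_offset(NvU64 cpu_page_size,
                                          NvU64 pte_size,
                                          size_t page_index,
                                          size_t sub_pte)
{
    // Number of 4k GPU PTEs backing one CPU page (16 when PAGE_SIZE is 64K)
    NvU64 ptes_per_page = cpu_page_size / UVM_PAGE_SIZE_4K;

    // Byte offset of this PTE from the start of the block's 4k table range
    return (page_index * ptes_per_page + sub_pte) * pte_size;
}
// With 64K CPU pages and 8-byte PTEs, page_index 2 occupies PTEs 32..47, i.e.
// offsets 0x100..0x178 from the start of the 4k range.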
static void block_gpu_pte_clear_4k(uvm_va_block_t *block, uvm_gpu_t *gpu, const uvm_page_mask_t *clear_page_mask, NvU64 pte_clear_val, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_gpu_phys_address_t pte_addr; NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); uvm_va_block_region_t region = uvm_va_block_region_from_block(block); uvm_va_block_region_t subregion; size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) { num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page; pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_4k, subregion.first * ptes_per_page); uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes); if (tlb_batch) { uvm_tlb_batch_invalidate(tlb_batch, uvm_va_block_region_start(block, subregion), uvm_va_block_region_size(subregion), UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE); } } } // Writes the 4k PTEs covered by write_page_mask using memory from resident_id // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use // block_gpu_pte_clear_4k instead. // // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written. // // If tlb_batch is provided, the 4k PTEs written are added to the batch. The // caller is responsible for ending the TLB batch with the appropriate membar. static void block_gpu_pte_write_4k(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_prot_t new_prot, const uvm_page_mask_t *write_page_mask, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K); const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K; uvm_va_block_region_t contig_region = {0}; uvm_gpu_phys_address_t contig_addr = {0}; uvm_gpu_phys_address_t page_addr = {0}; uvm_page_index_t page_index; NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); UVM_ASSERT(new_prot != UVM_PROT_NONE); UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); for_each_va_block_page_in_mask(page_index, write_page_mask, block) { uvm_gpu_phys_address_t pte_addr; size_t i; // Assume that this mapping will be used to write to the page if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id)) block_mark_cpu_page_dirty(block, page_index); if (page_index >= contig_region.outer) { contig_region = block_phys_contig_region(block, page_index, resident_id); contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); page_addr = contig_addr; } page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE; pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_4k, page_index * ptes_per_page); // Handle PAGE_SIZE > GPU PTE size for (i = 0; i < ptes_per_page; i++) { NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); page_addr.address += UVM_PAGE_SIZE_4K; pte_addr.address += pte_size; } if (tlb_batch) { NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index); uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, 
PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE); } } } // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered. // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It // only writes 4k PTEs, not big PTEs. // // For those 4k PTEs, new_pages_mask indicates which ones should inherit the // mapping from the corresponding big page (0) and which ones should be written // using memory from resident_id and new_prot (1). Unlike the other pte_write // functions, new_prot may be UVM_PROT_NONE. // // If resident_id is UVM_ID_INVALID, this function looks up the resident ID // which should inherit the current permissions. new_prot must be UVM_PROT_NONE // in this case. // // new_pages_mask must not be NULL. // // No TLB invalidates are required since we've set up the lower PTEs to never be // cached by the GPU's MMU when covered by larger PTEs. static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_prot_t new_prot, const unsigned long *big_ptes_covered, const uvm_page_mask_t *new_pages_mask, uvm_pte_batch_t *pte_batch) { uvm_va_block_region_t big_region; size_t big_page_index; uvm_processor_id_t curr_resident_id; uvm_prot_t curr_prot; NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); if (UVM_ID_IS_INVALID(resident_id)) UVM_ASSERT(new_prot == UVM_PROT_NONE); for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); curr_prot = block_page_prot_gpu(block, gpu, big_region.first); // The unmap path doesn't know the current residency ahead of time, so // we have to look it up. if (UVM_ID_IS_INVALID(resident_id)) { curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); } else { // Check that we aren't changing the aperture of the existing // mappings. It could be legal in some cases (switching from {RO, A} // to {RO, B} for example) but we'd need to issue TLB membars. if (curr_prot != UVM_PROT_NONE) UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id)); curr_resident_id = resident_id; } // pages in new_pages_mask under this big page get new_prot uvm_page_mask_zero(&block_context->scratch_page_mask); uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { if (new_prot == UVM_PROT_NONE) { block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); } else { block_gpu_pte_write_4k(block, gpu, curr_resident_id, new_prot, &block_context->scratch_page_mask, pte_batch, NULL); } } // All other pages under this big page inherit curr_prot uvm_page_mask_zero(&block_context->scratch_page_mask); uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) { if (curr_prot == UVM_PROT_NONE) { block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); } else { block_gpu_pte_write_4k(block, gpu, curr_resident_id, curr_prot, &block_context->scratch_page_mask, pte_batch, NULL); } } } } // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is // NULL, all big PTEs in the {block, gpu} are cleared. 
// // If tlb_batch is provided, the big PTEs written are added to the batch. The // caller is responsible for ending the TLB batch with the appropriate membar. static void block_gpu_pte_clear_big(uvm_va_block_t *block, uvm_gpu_t *gpu, const unsigned long *big_ptes_mask, NvU64 pte_clear_val, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); NvU32 big_page_size = gpu_va_space->page_tables.big_page_size; uvm_gpu_phys_address_t pte_addr; NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size); size_t big_page_index; DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (big_ptes_mask) bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); else bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size)); for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables, &gpu_state->page_table_range_big, big_page_index); uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1); if (tlb_batch) { uvm_tlb_batch_invalidate(tlb_batch, uvm_va_block_big_page_addr(block, big_page_index, big_page_size), big_page_size, big_page_size, UVM_MEMBAR_NONE); } } } // Writes the big PTEs in big_ptes_mask using memory from resident_id with // new_prot permissions. new_prot must not be UVM_PROT_NONE: use // block_gpu_pte_clear_big instead. // // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL. // // If tlb_batch is provided, the big PTEs written are added to the batch. The // caller is responsible for ending the TLB batch with the appropriate membar. 
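// Illustrative aside (standalone sketch, not driver code): block_gpu_pte_write_4k
// above and block_gpu_pte_write_big below both walk pages grouped by
// physically-contiguous regions, refreshing the base address only when the
// current index passes the end of the cached region. A minimal model of that
// walk; the example_* type, function and values are hypothetical, and in the
// driver the region lookup is block_phys_contig_region().
typedef struct
{
    size_t first;   // First page index of the contiguous region
    size_t outer;   // One past the last page index of the region
    NvU64 base_pa;  // Physical address of the region's first page
} example_contig_region_t;

static inline NvU64 example_page_pa(const example_contig_region_t *regions,
                                    size_t num_regions,
                                    size_t page_index,
                                    NvU64 page_size)
{
    size_t i;

    // Scan a small table in place of the driver's contiguity lookup
    for (i = 0; i < num_regions; i++) {
        if (page_index >= regions[i].first && page_index < regions[i].outer)
            return regions[i].base_pa + (page_index - regions[i].first) * page_size;
    }

    // Not present in this model
    return 0;
}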
static void block_gpu_pte_write_big(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_prot_t new_prot, const unsigned long *big_ptes_mask, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); uvm_page_tree_t *tree = &gpu_va_space->page_tables; NvU32 big_page_size = tree->big_page_size; NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size); size_t big_page_index; uvm_va_block_region_t contig_region = {0}; uvm_gpu_phys_address_t contig_addr = {0}; uvm_gpu_phys_address_t page_addr = {0}; NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); UVM_ASSERT(new_prot != UVM_PROT_NONE); UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); UVM_ASSERT(big_ptes_mask); if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0); if (!gpu->parent->can_map_sysmem_with_large_pages) UVM_ASSERT(UVM_ID_IS_GPU(resident_id)); } for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { NvU64 pte_val; uvm_gpu_phys_address_t pte_addr; uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); // Assume that this mapping will be used to write to the page if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id)) { uvm_page_index_t page_index; for_each_va_block_page_in_region(page_index, big_region) block_mark_cpu_page_dirty(block, page_index); } if (big_region.first >= contig_region.outer) { contig_region = block_phys_contig_region(block, big_region.first, resident_id); contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu); page_addr = contig_addr; } page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE; pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index); pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); if (tlb_batch) { uvm_tlb_batch_invalidate(tlb_batch, uvm_va_block_region_start(block, big_region), big_page_size, big_page_size, UVM_MEMBAR_NONE); } } } // Switches any mix of valid or invalid 4k PTEs under the big PTEs in // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and // tlb_batch in order to poison the now-unused 4k PTEs. // // The 4k PTEs are invalidated with the specified membar. static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, const unsigned long *big_ptes_to_merge, uvm_push_t *push, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch, uvm_membar_t tlb_membar) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; NvU32 big_page_size = tree->big_page_size; NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size); size_t big_page_index; DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); // We can be called with the 4k PTEs in two cases: // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active. 
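// (Note on the PTE encodings used by the merge/split paths: a big PTE cleared
// to zero is invalid, so the GPU MMU continues to the 4k PTEs beneath it,
// while a big PTE written with tree->hal->unmapped_pte() stops the lookup at
// the big level. Transitions therefore always pass through an unmapped or
// invalid intermediate state so the GPU TLBs can never cache the same VA in
// different cache lines.)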
// // 2) 4k PTEs unallocated. In this case the GPU may not have invalid 4k PTEs // active under the big PTE, depending on whether neighboring blocks // caused the page tables to be allocated. // // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has // them cached. // Each big PTE is currently invalid so the 4ks are active (or unallocated). // First make the big PTEs unmapped to disable future lookups of the 4ks // under it. We can't directly transition the entry from valid 4k PTEs to // valid big PTEs, because that could cause the GPU TLBs to cache the same // VA in different cache lines. That could cause memory ordering to not be // maintained. block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch); // Now invalidate the big PTEs we just wrote as well as all 4ks under them. // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only // need to invalidate the 4k PTEs without actually writing them. for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { uvm_tlb_batch_invalidate(tlb_batch, uvm_va_block_big_page_addr(block, big_page_index, big_page_size), big_page_size, big_page_size | UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE); } // End the batches for the caller. We need to do this here in order to // poison the 4ks below. uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); // As a guard against bad PTE writes/TLB invalidates, fill the now-unused // PTEs with a pattern which will trigger fatal faults on access. We have to // do this after the TLB invalidate of the big PTEs, or the GPU might use // the new values. if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) { uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge); uvm_pte_batch_begin(push, pte_batch); block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, tree->hal->poisoned_pte(), pte_batch, NULL); uvm_pte_batch_end(pte_batch); } } // Writes 0 (invalid) to the 2M PTE for this {block, gpu}. // // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is // responsible for ending the TLB batch with the appropriate membar. static void block_gpu_pte_clear_2m(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE, // which would cause a problem when trying to make the entry invalid since // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire // 16 bytes. uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1); if (tlb_batch) uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); } // Writes the 2M PTE for {block, gpu} using memory from resident_id with // new_prot permissions. new_prot must not be UVM_PROT_NONE: use // block_gpu_pte_clear_2m instead. // // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is // responsible for ending the TLB batch with the appropriate membar. 
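// (As with the other PTE writers, callers bracket this helper with
// uvm_pte_batch_begin/uvm_pte_batch_end and, when a TLB batch is passed in,
// uvm_tlb_batch_begin/uvm_tlb_batch_end, choosing the membar for the
// invalidate at uvm_tlb_batch_end time. block_gpu_map_to_2m below shows the
// typical pattern.)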
static void block_gpu_pte_write_2m(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_prot_t new_prot, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0); uvm_gpu_phys_address_t page_addr; NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M); NvU64 pte_val; NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id); UVM_ASSERT(new_prot != UVM_PROT_NONE); UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); if (UVM_ID_IS_CPU(resident_id)) block_mark_cpu_page_dirty(block, 0); page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu); pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags); uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size); if (tlb_batch) uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); } static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); if (!block_gpu_supports_2m(block, gpu)) return false; if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) || (gpu_state->page_table_range_4k.table && !gpu_state->activated_4k)) return true; return false; } // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or // activates a newly-allocated page table (big or 4k) while the other is already // active. The caller must have already written the new PTEs under the table // with the appropriate membar. static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; if (!gpu_state->pte_is_2m) UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu)); UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); // We always need a membar to order PDE/PTE writes with the TLB invalidate. // write_pde will do a MEMBAR_SYS by default. if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID) uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU); uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push); gpu->parent->host_hal->wait_for_idle(push); // Invalidate just the PDE uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE); if (gpu_state->page_table_range_big.table) gpu_state->activated_big = true; if (gpu_state->page_table_range_4k.table) gpu_state->activated_4k = true; } // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should // have written all lower PTEs as appropriate into the given pte_batch already. // This function ends the PTE batch, activates the 2M PDE, and does a TLB // invalidate. // // The caller does not need to do any TLB invalidates since none of the lower // PTEs could be cached. 
static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_pte_batch_t *pte_batch, uvm_tlb_batch_t *tlb_batch, uvm_membar_t tlb_membar) { uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); // Step 1: Make the 2M entry invalid. We can't directly transition from a // valid 2M PTE to valid lower PTEs, because that could cause the // GPU TLBs to cache the same VA in different cache lines. That // could cause memory ordering to not be maintained. // // If the 2M PTE is already invalid, no TLB invalidate is needed. if (curr_prot == UVM_PROT_NONE) { // If we aren't downgrading, then we don't need a membar. UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE); // End the batch, which pushes a membar to ensure that the caller's PTE // writes below 2M are observed before the PDE write we're about to do. uvm_pte_batch_end(pte_batch); } else { // The 64k and 4k PTEs can't possibly be cached since the 2M entry is // not yet a PDE, so we just need to invalidate this single 2M entry. uvm_tlb_batch_begin(tree, tlb_batch); block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); // Make sure the PTE writes are observed before the TLB invalidate uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); } // Step 2: Switch the 2M entry from invalid to a PDE. This activates the // smaller PTEs. uvm_tlb_batch_begin(tree, tlb_batch); block_gpu_write_pde(block, gpu, push, tlb_batch); uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); } // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE. // Any lower PTEs are invalidated with the specified membar. static void block_gpu_pte_merge_2m(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_push_t *push, uvm_membar_t tlb_membar) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; NvU32 tlb_inval_sizes; UVM_ASSERT(!gpu_state->pte_is_2m); UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table); // The 2M entry is currently a PDE, so first make it invalid. We can't // directly transition the entry from a valid PDE to a valid 2M PTE, because // that could cause the GPU TLBs to cache the same VA in different cache // lines. That could cause memory ordering to not be maintained. uvm_pte_batch_begin(push, pte_batch); block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL); uvm_pte_batch_end(pte_batch); // Now invalidate both the 2M entry we just wrote as well as all lower-level // entries which could be cached. Subsequent MMU fills will stop at the now- // invalid 2M entry, so we only need to invalidate the lower PTEs without // actually writing them. tlb_inval_sizes = UVM_PAGE_SIZE_2M; if (gpu_state->page_table_range_big.table) tlb_inval_sizes |= UVM_PAGE_SIZE_64K; // Strictly-speaking we only need to invalidate those 4k ranges which are // not covered by a big pte. However, any such invalidate will require // enough 4k invalidates to force the TLB batching to invalidate everything // anyway, so just do the simpler thing. 
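    // (A 2M block spans 512 4k pages, so a per-page invalidate list would
    // exceed what the TLB batch can usefully track and degrade to a full
    // invalidate anyway.)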
if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) tlb_inval_sizes |= UVM_PAGE_SIZE_4K; uvm_tlb_batch_begin(tree, tlb_batch); uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); // As a guard against bad PTE writes/TLB invalidates, fill the now-unused // PTEs with a pattern which will trigger fatal faults on access. We have to // do this after the TLB invalidate of the 2M entry, or the GPU might use // the new values. if (UVM_IS_DEBUG()) { uvm_pte_batch_begin(push, pte_batch); if (gpu_state->page_table_range_big.table) { block_gpu_pte_clear_big(block, gpu, NULL, tree->hal->poisoned_pte(), pte_batch, NULL); } if (gpu_state->page_table_range_4k.table) { block_gpu_pte_clear_4k(block, gpu, NULL, tree->hal->poisoned_pte(), pte_batch, NULL); } uvm_pte_batch_end(pte_batch); } } static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id) { // Permissions upgrades (MAP) don't need membars if (pte_op == BLOCK_PTE_OP_MAP) return UVM_MEMBAR_NONE; UVM_ASSERT(UVM_ID_IS_VALID(resident_id)); UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE); // Permissions downgrades always need a membar on TLB invalidate. If the // mapped memory was local, we only need a GPU-local membar. if (uvm_id_equal(gpu->id, resident_id)) return UVM_MEMBAR_GPU; // Otherwise, remote memory needs a sysmembar return UVM_MEMBAR_SYS; } // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot // permissions. If the 2M entry is currently a PDE, it is first merged into a // PTE. // // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead. // // pte_op specifies whether this is a MAP or REVOKE operation, which determines // the TLB membar required. static void block_gpu_map_to_2m(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_prot_t new_prot, uvm_push_t *push, block_pte_op_t pte_op) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; uvm_membar_t tlb_membar; UVM_ASSERT(new_prot != UVM_PROT_NONE); // If we have a mix of big and 4k PTEs, we have to first merge them to an // invalid 2M PTE. if (!gpu_state->pte_is_2m) { block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE); gpu_state->pte_is_2m = true; bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Write the new permissions uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch); uvm_pte_batch_end(pte_batch); tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); } // Combination split + map operation, called when only part of a 2M PTE mapping // is being changed. This splits an existing valid or invalid 2M PTE into the // mix of big and 4k PTEs described by block_context->mapping.new_pte_state. // // The PTEs covering the pages in pages_to_write are written to the memory on // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. // // The PTEs covering the pages not set in pages_to_write inherit the mapping of // the current 2M PTE. 
If the current mapping is valid, it must target // resident_id. // // pte_op specifies whether this is a MAP or REVOKE operation, which determines // the TLB membar required. static void block_gpu_map_split_2m(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, const uvm_page_mask_t *pages_to_write, uvm_prot_t new_prot, uvm_push_t *push, block_pte_op_t pte_op) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); uvm_membar_t tlb_membar; DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); UVM_ASSERT(gpu_state->pte_is_2m); if (!gpu_state->page_table_range_4k.table) UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); uvm_pte_batch_begin(push, pte_batch); // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries // from the lower levels. This means we don't need to issue a TLB invalidate // when writing those levels. // Cases to handle: // 1) Big PTEs which inherit curr_prot // 2) Big PTEs which get new_prot // 3) Big PTEs which are split to 4k // a) 4k PTEs which inherit curr_prot under the split big PTEs // b) 4k PTEs which get new_prot under the split big PTEs // Compute the big PTEs which will need to be split to 4k, if any. bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (gpu_state->page_table_range_big.table) { // Case 1: Write the big PTEs which will inherit the 2M permissions, if // any. These are the big PTEs which are unchanged (uncovered) by the // operation. bitmap_andnot(big_ptes_inherit, new_pte_state->big_ptes, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (curr_prot == UVM_PROT_NONE) { block_gpu_pte_clear_big(block, gpu, big_ptes_inherit, tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), pte_batch, NULL); } else { block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); } // Case 2: Write the new big PTEs bitmap_and(big_ptes_new_prot, new_pte_state->big_ptes, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL); // Case 3: Write the big PTEs which cover 4k PTEs block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); // We just wrote all possible big PTEs, so mark them as initialized gpu_state->initialized_big = true; } else { UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); } // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs block_gpu_pte_big_split_write_4k(block, block_context, gpu, resident_id, new_prot, big_ptes_split, pages_to_write, pte_batch); // Activate the 2M PDE. This ends the pte_batch and issues a single TLB // invalidate for the 2M entry. 
tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar); gpu_state->pte_is_2m = false; bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed. // // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should // be 4k. static void block_gpu_split_2m(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, const unsigned long *new_big_ptes, uvm_push_t *push) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); NvU64 unmapped_pte_val; uvm_processor_id_t curr_residency; UVM_ASSERT(gpu_state->pte_is_2m); if (new_big_ptes) bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); else bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) UVM_ASSERT(gpu_state->page_table_range_big.table); // We're splitting from 2M to big only, so we'll be writing all big PTEs if (gpu_state->page_table_range_big.table) gpu_state->initialized_big = true; // Cases to handle: // 1) Big PTEs which inherit curr_prot // 2) Big PTEs which are split to 4k // a) 4k PTEs inherit curr_prot under the split big PTEs // big_ptes_split will cover the 4k regions bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split); uvm_pte_batch_begin(push, pte_batch); // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries // from the lower levels. This means we don't need to issue a TLB invalidate // when writing those levels. if (curr_prot == UVM_PROT_NONE) { unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size); // Case 2a: Clear the 4k PTEs under big_ptes_split block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); // Case 1: Make the remaining big PTEs unmapped block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL); } else { curr_residency = block_gpu_get_processor_to_map(block, gpu, 0); // Case 2a: Write the new 4k PTEs under big_ptes_split block_gpu_pte_write_4k(block, gpu, curr_residency, curr_prot, &block_context->scratch_page_mask, pte_batch, NULL); // Case 1: Write the new big PTEs block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL); } // Case 2: Make big_ptes_split invalid to activate the 4k PTEs if (gpu_state->page_table_range_big.table) block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); // Activate the 2M PDE. This ends the pte_batch and issues a single TLB // invalidate for the 2M entry. No membar is necessary since we aren't // changing permissions. 
block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE); gpu_state->pte_is_2m = false; bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are // changed. // // big_ptes_to_split must not be NULL. static void block_gpu_split_big(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, const unsigned long *big_ptes_to_split, uvm_push_t *push) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; NvU32 big_page_size = tree->big_page_size; uvm_va_block_region_t big_region; uvm_processor_id_t resident_id; size_t big_page_index; uvm_prot_t curr_prot; DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); UVM_ASSERT(!gpu_state->pte_is_2m); UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(tree, tlb_batch); // Write all 4k PTEs under all big PTEs which are being split. We'll make // the big PTEs inactive below after flushing these writes. No TLB // invalidate is needed since the big PTE is active. bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); curr_prot = block_page_prot_gpu(block, gpu, big_region.first); uvm_page_mask_zero(&block_context->scratch_page_mask); uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region); if (curr_prot == UVM_PROT_NONE) { block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); } else { __set_bit(big_page_index, big_ptes_valid); resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first); block_gpu_pte_write_4k(block, gpu, resident_id, curr_prot, &block_context->scratch_page_mask, pte_batch, NULL); } } // Unmap the big PTEs which are valid and are being split to 4k. We can't // directly transition from a valid big PTE to valid lower PTEs, because // that could cause the GPU TLBs to cache the same VA in different cache // lines. That could cause memory ordering to not be maintained. block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); // End the batches. We have to commit the membars and TLB invalidates // before we finish splitting formerly-big PTEs. No membar is necessary // since we aren't changing permissions. uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); // Finish the split by switching the big PTEs from unmapped to invalid. This // causes the GPU MMU to start reading the 4k PTEs instead of stopping at // the unmapped big PTEs. 
uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(tree, tlb_batch); block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch); uvm_pte_batch_end(pte_batch); // Finally, activate the page tables if they're inactive if (block_gpu_needs_to_activate_table(block, gpu)) block_gpu_write_pde(block, gpu, push, tlb_batch); uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Changes permissions on some pre-existing mix of big and 4k PTEs into some // other mix of big and 4k PTEs, as described by // block_context->mapping.new_pte_state. // // The PTEs covering the pages in pages_to_write are written to the memory on // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE. // // pte_op specifies whether this is a MAP or REVOKE operation, which determines // the TLB membar required. static void block_gpu_map_big_and_4k(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, const uvm_page_mask_t *pages_to_write, uvm_prot_t new_prot, uvm_push_t *push, block_pte_op_t pte_op) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); uvm_va_block_region_t big_region; size_t big_page_index; NvU32 big_page_size = tree->big_page_size; uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id); UVM_ASSERT(!gpu_state->pte_is_2m); uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(tree, tlb_batch); // All of these cases might be perfomed in the same call: // 1) Split currently-big PTEs to 4k // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs // b) Write new 4k PTEs which get new_prot under the split big PTEs // 2) Merge currently-4k PTEs to big with new_prot // 3) Write currently-big PTEs which wholly get new_prot // 4) Write currently-4k PTEs which get new_prot // 5) Initialize big PTEs which are not covered by this operation // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are // being split. We'll make the big PTEs inactive below after flushing these // writes. No TLB invalidate is needed since the big PTE is active. // // Mask computation: big_before && !big_after bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); block_gpu_pte_big_split_write_4k(block, block_context, gpu, resident_id, new_prot, big_ptes_split, pages_to_write, pte_batch); // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and // remain uncovered after the operation. 
// // Mask computation: !big_before && !big_after bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after); if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) { block_gpu_pte_write_4k(block, gpu, resident_id, new_prot, &block_context->scratch_page_mask, pte_batch, tlb_batch); } // Case 5: If the big page table is newly-allocated, make sure that all big // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are // all initialized to invalid. // // The similar case of making newly-allocated big PTEs unmapped when no // lower 4k table is present is handled by having // block_gpu_compute_new_pte_state set new_pte_state->big_ptes // appropriately. if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) { // TODO: Bug 1766424: If we have the 4k page table already, we could // attempt to merge all uncovered big PTE regions when first // allocating the big table. That's probably not worth doing. UVM_ASSERT(gpu_state->page_table_range_4k.table); UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); gpu_state->initialized_big = true; } // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are // being split to 4k. We can't directly transition from a valid big PTE to // valid lower PTEs, because that could cause the GPU TLBs to cache the same // VA in different cache lines. That could cause memory ordering to not be // maintained. bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) { big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first)) __set_bit(big_page_index, big_ptes_mask); } block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch); // Case 3: Write the currently-big PTEs which remain big PTEs, and are // wholly changing permissions. // // Mask computation: big_before && big_after && covered bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch); // Case 2 (step 1): Merge the new big PTEs and end the batches, now that // we've done all of the independent PTE writes we can. This also merges // newly-allocated uncovered big PTEs to unmapped (see // block_gpu_compute_new_pte_state). // // Mask computation: !big_before && big_after if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { // This writes the newly-big PTEs to unmapped and ends the PTE and TLB // batches. block_gpu_pte_merge_big_and_end(block, block_context, gpu, big_ptes_merge, push, pte_batch, tlb_batch, tlb_membar); // Remove uncovered big PTEs. We needed to merge them to unmapped above, // but they shouldn't get new_prot below. 
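        // For example (hypothetical indices), if big PTE 3 only became big
        // because its 4k table is going away and none of its pages are in
        // pages_to_write, it was merged to unmapped above and must stay
        // unmapped; filtering big_ptes_merge with big_ptes_covered keeps it
        // out of the new_prot write in step 2 below.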
bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } else { // End the batches. We have to commit the membars and TLB invalidates // before we finish splitting formerly-big PTEs. uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); } if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || block_gpu_needs_to_activate_table(block, gpu)) { uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(tree, tlb_batch); // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by // switching them from unmapped to invalid. This causes the GPU MMU to // start reading the 4k PTEs instead of stopping at the unmapped big // PTEs. block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); // Case 2 (step 2): Finish merging our big PTEs, if we have any, by // switching them from unmapped to new_prot. block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch); uvm_pte_batch_end(pte_batch); // Finally, activate the page tables if they're inactive if (block_gpu_needs_to_activate_table(block, gpu)) block_gpu_write_pde(block, gpu, push, tlb_batch); uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); } // Update gpu_state bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is // merged into a PTE. static void block_gpu_unmap_to_2m(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_push_t *push, uvm_membar_t tlb_membar) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu); uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; if (gpu_state->pte_is_2m) { // If we're already mapped as a valid 2M PTE, just write it to invalid uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch); uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); } else { // Otherwise we have a mix of big and 4K PTEs which need to be merged // into an invalid 2M PTE. block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar); gpu_state->pte_is_2m = true; bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } } // Combination split + unmap operation, called when only part of a valid 2M PTE // mapping is being unmapped. The 2M PTE is split into a mix of valid and // invalid big and/or 4k PTEs, as described by // block_context->mapping.new_pte_state. // // The PTEs covering the pages in pages_to_unmap are cleared (unmapped). // // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of // the current 2M PTE. 
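//
// For example (assuming a 64K big page size), if the 2M PTE is currently
// mapped read-write and pages_to_unmap covers exactly the first 64K region,
// that region ends up unmapped (as a big PTE, or as 4k PTEs if it can't be
// big), while the remaining big PTEs are written read-write so they inherit
// the old 2M mapping.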
static void block_gpu_unmap_split_2m(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, const uvm_page_mask_t *pages_to_unmap, uvm_push_t *push, uvm_membar_t tlb_membar) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables; uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0); uvm_processor_id_t resident_id; DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); UVM_ASSERT(gpu_state->pte_is_2m); resident_id = block_gpu_get_processor_to_map(block, gpu, 0); uvm_pte_batch_begin(push, pte_batch); // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries // from the lower levels. This means we don't need to issue a TLB invalidate // when writing those levels. // Cases to handle: // 1) Big PTEs which inherit curr_prot // 2) Big PTEs which get unmapped // 3) Big PTEs which are split to 4k // a) 4k PTEs which inherit curr_prot under the split big PTEs // b) 4k PTEs which get unmapped under the split big PTEs // Compute the big PTEs which will need to be split to 4k, if any. bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (gpu_state->page_table_range_big.table) { // Case 1: Write the big PTEs which will inherit the 2M permissions, if // any. These are the big PTEs which are unchanged (uncovered) by the // operation. bitmap_andnot(big_ptes_inherit, new_pte_state->big_ptes, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL); // Case 2: Clear the new big PTEs which get unmapped (those not covering // 4ks) bitmap_and(big_ptes_new_prot, new_pte_state->big_ptes, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); block_gpu_pte_clear_big(block, gpu, big_ptes_new_prot, tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K), pte_batch, NULL); // Case 3: Write the big PTEs which cover 4k PTEs block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL); // We just wrote all possible big PTEs, so mark them as initialized gpu_state->initialized_big = true; } else { UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); } // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs block_gpu_pte_big_split_write_4k(block, block_context, gpu, resident_id, UVM_PROT_NONE, big_ptes_split, pages_to_unmap, pte_batch); // And activate the 2M PDE. This ends the pte_batch and issues a single TLB // invalidate for the 2M entry. block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar); gpu_state->pte_is_2m = false; bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big // and 4k PTEs. // // The PTEs covering the pages in pages_to_unmap are cleared (unmapped). 
static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
                                       uvm_va_block_context_t *block_context,
                                       uvm_gpu_t *gpu,
                                       const uvm_page_mask_t *pages_to_unmap,
                                       uvm_push_t *push,
                                       uvm_membar_t tlb_membar)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
    uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
    uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
    uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
    uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
    DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
    DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
    DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
    NvU32 big_page_size = tree->big_page_size;
    NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);

    UVM_ASSERT(!gpu_state->pte_is_2m);

    uvm_pte_batch_begin(push, pte_batch);
    uvm_tlb_batch_begin(tree, tlb_batch);

    // All of these cases might be performed in the same call:
    // 1) Split currently-big PTEs to 4k
    //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
    //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
    // 2) Merge currently-4k PTEs to unmapped big
    // 3) Clear currently-big PTEs which wholly get unmapped
    // 4) Clear currently-4k PTEs which get unmapped
    // 5) Initialize big PTEs which are not covered by this operation

    // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
    // being split. We'll make the big PTEs inactive below after flushing these
    // writes. No TLB invalidate is needed since the big PTE is active.
    //
    // Mask computation: big_before && !big_after
    bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
    block_gpu_pte_big_split_write_4k(block,
                                     block_context,
                                     gpu,
                                     UVM_ID_INVALID,
                                     UVM_PROT_NONE,
                                     big_ptes_split,
                                     pages_to_unmap,
                                     pte_batch);

    // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
    // remain uncovered after the unmap.
    //
    // Mask computation: !big_before && !big_after
    bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
    uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
    if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
        block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);

    // Case 5: If the big page table is newly-allocated, make sure that all big
    // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
    // all initialized to invalid.
    //
    // The similar case of making newly-allocated big PTEs unmapped when no
    // lower 4k table is present is handled by having
    // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
    // appropriately.
    if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
        // TODO: Bug 1766424: If we have the 4k page table already, we could
        //       attempt to merge all uncovered big PTE regions when first
        //       allocating the big table. That's probably not worth doing.
UVM_ASSERT(gpu_state->page_table_range_4k.table); UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size)); block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch); gpu_state->initialized_big = true; } // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are // getting wholly unmapped, and those currently-big PTEs which are being // split to 4k. We can't directly transition from a valid big PTE to valid // lower PTEs, because that could cause the GPU TLBs to cache the same VA in // different cache lines. That could cause memory ordering to not be // maintained. // // Mask computation: (big_before && big_after && covered) || // (big_before && !big_after) bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch); // Case 2: Merge the new big PTEs and end the batches, now that we've done // all of the independent PTE writes we can. // // Mask computation: !big_before && big_after if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { // This writes the newly-big PTEs to unmapped and ends the PTE and TLB // batches. block_gpu_pte_merge_big_and_end(block, block_context, gpu, big_ptes_mask, push, pte_batch, tlb_batch, tlb_membar); } else { // End the batches. We have to commit the membars and TLB invalidates // before we finish splitting formerly-big PTEs. uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, push, tlb_membar); } if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) || block_gpu_needs_to_activate_table(block, gpu)) { uvm_pte_batch_begin(push, pte_batch); uvm_tlb_batch_begin(tree, tlb_batch); // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by // switching them from unmapped to invalid. This causes the GPU MMU to // start reading the 4k PTEs instead of stopping at the unmapped big // PTEs. block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch); uvm_pte_batch_end(pte_batch); // Finally, activate the page tables if they're inactive if (block_gpu_needs_to_activate_table(block, gpu)) block_gpu_write_pde(block, gpu, push, tlb_batch); uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE); } // Update gpu_state bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // When PTE state is about to change (for example due to a map/unmap/revoke // operation), this function decides how to split and merge the PTEs in response // to that operation. // // The operation is described with the two page masks: // // - pages_changing indicates which pages will have their PTE mappings changed // on the GPU in some way as a result of the operation (for example, which // pages will actually have their mapping permissions upgraded). // // - page_mask_after indicates which pages on this GPU will have exactly the // same PTE attributes (permissions, residency) as pages_changing after the // operation is applied. // // PTEs are merged eagerly. 
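//
// Illustrative example (hypothetical, assuming the big and 4k tables already
// exist and the 2M entry is not currently a PTE): with a 64K big page size,
// if big page 0 is currently a big PTE and the operation only touches the
// first 4k page of big page 1, then big_ptes_covered gets bit 1 set,
// new_pte_state->big_ptes does not (that region isn't uniform afterwards, so
// needs_4k becomes true), and bit 0 is carried over unchanged from
// gpu_state->big_ptes at the end of the function.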
static void block_gpu_compute_new_pte_state(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, const uvm_page_mask_t *pages_changing, const uvm_page_mask_t *page_mask_after, uvm_va_block_new_pte_state_t *new_pte_state) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_va_block_region_t big_region_all, big_page_region, region; NvU32 big_page_size; uvm_page_index_t page_index; size_t big_page_index; DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); bool can_make_new_big_ptes, region_full; memset(new_pte_state, 0, sizeof(*new_pte_state)); new_pte_state->needs_4k = true; // TODO: Bug 1676485: Force a specific page size for perf testing if (gpu_state->force_4k_ptes) return; UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after)); if (block_gpu_supports_2m(block, gpu)) { // If all pages in the 2M mask have the same attributes after the // operation is applied, we can use a 2M PTE. if (uvm_page_mask_full(page_mask_after) && (!UVM_ID_IS_CPU(resident_id) || is_block_phys_contig(block, UVM_ID_CPU))) { new_pte_state->pte_is_2m = true; new_pte_state->needs_4k = false; return; } } // Find big PTEs with matching attributes // Can this block fit any big pages? big_page_size = uvm_va_block_gpu_big_page_size(block, gpu); big_region_all = uvm_va_block_big_page_region_all(block, big_page_size); if (big_region_all.first >= big_region_all.outer) return; new_pte_state->needs_4k = false; can_make_new_big_ptes = true; // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+). if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages) can_make_new_big_ptes = false; // We must not fail during teardown: unmap (resident_id == UVM_ID_INVALID) // with no splits required. That means we should avoid allocating PTEs // which are only needed for merges. // // This only matters if we're merging to big PTEs. If we're merging to 2M, // then we must already have the 2M level (since it has to be allocated // before the lower levels). // // If pte_is_2m already and we don't have a big table, we're splitting so we // have to allocate. if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m) can_make_new_big_ptes = false; for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) { uvm_va_block_region_t contig_region = {0}; big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size); big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size); if (!UVM_ID_IS_INVALID(resident_id)) contig_region = block_phys_contig_region(block, page_index, resident_id); __set_bit(big_page_index, new_pte_state->big_ptes_covered); region_full = uvm_page_mask_region_full(page_mask_after, big_page_region); if (region_full && UVM_ID_IS_INVALID(resident_id)) __set_bit(big_page_index, new_pte_state->big_ptes_fully_unmapped); // When mapping sysmem, we can use big pages only if we are mapping all pages // in the big page subregion and the CPU pages backing the subregion are // physically contiguous. 
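        // Concretely, for a CPU-resident region the contiguous run returned by
        // block_phys_contig_region must span the whole big page region:
        //   contig_region.first <= big_page_region.first &&
        //   contig_region.outer >= big_page_region.outer
        // since a single big GPU PTE can only point at one contiguous physical
        // range.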
if (can_make_new_big_ptes && region_full && (!UVM_ID_IS_CPU(resident_id) || (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) { __set_bit(big_page_index, new_pte_state->big_ptes); } if (!test_bit(big_page_index, new_pte_state->big_ptes)) new_pte_state->needs_4k = true; // Skip to the end of the region page_index = big_page_region.outer - 1; } if (!new_pte_state->needs_4k) { // All big page regions in pages_changing will be big PTEs. Now check if // there are any unaligned pages outside of big_region_all which are // changing. region = uvm_va_block_region(0, big_region_all.first); if (!uvm_page_mask_region_empty(pages_changing, region)) { new_pte_state->needs_4k = true; } else { region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block)); if (!uvm_page_mask_region_empty(pages_changing, region)) new_pte_state->needs_4k = true; } } // Now add in the PTEs which should be big but weren't covered by this // operation. // // Note that we can't assume that a given page table range has been // initialized if it's present here, since it could have been allocated by a // thread which had to restart its operation due to allocation retry. if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) { // We're splitting a 2M PTE so all of the uncovered big PTE regions will // become big PTEs which inherit the 2M permissions. If we haven't // allocated the 2M table yet, it will start as a 2M PTE until the lower // levels are allocated, so it's the same split case regardless of // whether this operation will need to retry a later allocation. bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) { // If we don't have 4k PTEs and we won't be allocating them for this // operation, all of our PTEs need to be big. UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size)); } else { // Otherwise, add in all of the currently-big PTEs which are unchanging. // They won't be written, but they need to be carried into the new // gpu_state->big_ptes when it's updated. bitmap_andnot(big_ptes_not_covered, gpu_state->big_ptes, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); } // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that // handles allocation retry. If the block lock has been unlocked and relocked as // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal // to the caller that the operation likely needs to be restarted. If that // happens, the pending tracker is added to the block's tracker. 
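//
// A rough caller-side sketch (illustrative only, not the driver's actual
// retry loop):
//
//     status = block_alloc_pt_range_with_retry(block, gpu, page_size, range, tracker);
//     if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
//         // The block lock was dropped and re-taken, so any block state read
//         // before the call may be stale: recompute the operation and retry.
//     }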
static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU32 page_size, uvm_page_table_range_t *page_table_range, uvm_tracker_t *pending_tracker) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); uvm_page_tree_t *page_tables = &gpu_va_space->page_tables; uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); uvm_page_table_range_t local_range; NV_STATUS status; // Blocks may contain large PTEs without starting on a PTE boundary or // having an aligned size. Cover the PTEs of this size in the block's // interior so we match uvm_va_block_gpu_state_t::big_ptes. NvU64 start = UVM_ALIGN_UP(va_block->start, page_size); NvU64 size = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start; // VA blocks which can use the 2MB level as either a PTE or a PDE need to // account for the PDE specially, so they must use uvm_page_tree_alloc_table // to allocate the lower levels. bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M; uvm_assert_rwsem_locked(&va_space->lock); UVM_ASSERT(page_table_range->table == NULL); if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) { --va_block_test->page_table_allocation_retry_force_count; status = NV_ERR_NO_MEMORY; } else if (use_alloc_table) { // Pascal+: 4k/64k tables under a 2M entry UVM_ASSERT(gpu_state->page_table_range_2m.table); status = uvm_page_tree_alloc_table(page_tables, page_size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_state->page_table_range_2m, page_table_range); } else { // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+ status = uvm_page_tree_get_ptes(page_tables, page_size, start, size, UVM_PMM_ALLOC_FLAGS_NONE, page_table_range); } if (status == NV_OK) goto allocated; if (status != NV_ERR_NO_MEMORY) return status; // Before unlocking the block lock, any pending work on the block has to be // added to the block's tracker. if (pending_tracker) { status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker); if (status != NV_OK) return status; } // Unlock the va block and retry with eviction enabled uvm_mutex_unlock(&va_block->lock); if (use_alloc_table) { // Although we don't hold the block lock here, it's safe to pass // gpu_state->page_table_range_2m to the page tree code because we know // that the 2m range has already been allocated, and that it can't go // away while we have the va_space lock held. status = uvm_page_tree_alloc_table(page_tables, page_size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_state->page_table_range_2m, &local_range); } else { status = uvm_page_tree_get_ptes(page_tables, page_size, start, size, UVM_PMM_ALLOC_FLAGS_EVICT, &local_range); } uvm_mutex_lock(&va_block->lock); if (status != NV_OK) return status; status = NV_ERR_MORE_PROCESSING_REQUIRED; if (page_table_range->table) { // A different caller allocated the page tables in the meantime, release the // local copy. uvm_page_tree_put_ptes(page_tables, &local_range); return status; } *page_table_range = local_range; allocated: // Mark the 2M PTE as active when we first allocate it, since we don't have // any PTEs below it yet. if (page_size == UVM_PAGE_SIZE_2M) { UVM_ASSERT(!gpu_state->pte_is_2m); gpu_state->pte_is_2m = true; } else if (page_size != UVM_PAGE_SIZE_4K) { // uvm_page_tree_get_ptes initializes big PTEs to invalid. 
// uvm_page_tree_alloc_table does not, so we'll have to do it later. if (use_alloc_table) UVM_ASSERT(!gpu_state->initialized_big); else gpu_state->initialized_big = true; } return status; } // Helper which allocates all page table ranges necessary for the given page // sizes. See block_alloc_pt_range_with_retry. static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU32 page_sizes, uvm_tracker_t *pending_tracker) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu); uvm_page_table_range_t *range; NvU32 page_size; NV_STATUS status, final_status = NV_OK; UVM_ASSERT(gpu_state); // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first // in order to allocate the levels below. if (block_gpu_supports_2m(va_block, gpu)) page_sizes |= UVM_PAGE_SIZE_2M; UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes); for_each_chunk_size_rev(page_size, page_sizes) { if (page_size == UVM_PAGE_SIZE_2M) range = &gpu_state->page_table_range_2m; else if (page_size == UVM_PAGE_SIZE_4K) range = &gpu_state->page_table_range_4k; else range = &gpu_state->page_table_range_big; if (range->table) continue; if (page_size == UVM_PAGE_SIZE_2M) { UVM_ASSERT(!gpu_state->pte_is_2m); UVM_ASSERT(!gpu_state->page_table_range_big.table); UVM_ASSERT(!gpu_state->page_table_range_4k.table); } else if (page_size != UVM_PAGE_SIZE_4K) { UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0); UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); } status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker); // Keep going to allocate the remaining levels even if the allocation // requires a retry, since we'll likely still need them when we retry // anyway. if (status == NV_ERR_MORE_PROCESSING_REQUIRED) final_status = NV_ERR_MORE_PROCESSING_REQUIRED; else if (status != NV_OK) return status; } return final_status; } static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block, uvm_gpu_t *gpu, uvm_va_block_new_pte_state_t *new_pte_state, uvm_tracker_t *pending_tracker) { NvU32 page_sizes = 0; if (new_pte_state->pte_is_2m) { page_sizes |= UVM_PAGE_SIZE_2M; } else { if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu); if (new_pte_state->needs_4k) page_sizes |= UVM_PAGE_SIZE_4K; else UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)); } return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker); } // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block. // This is currently used on ATS systems to prevent GPUs from inadvertently // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables, // which is where the NOATS bit resides. // // The current implementation simply pre-allocates the PTEs for the VA Block, // which is wasteful because the GPU may never need them. // // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1 // page table entries without having to request PTEs. 
static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block, uvm_gpu_va_space_t *gpu_va_space, uvm_tracker_t *pending_tracker) { NvU32 page_sizes = 0; uvm_gpu_t *gpu = gpu_va_space->gpu; uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); UVM_ASSERT(gpu_state); UVM_ASSERT(gpu_va_space); UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); UVM_ASSERT(gpu_va_space->ats.enabled); // If the VA Block supports 2M pages, allocate the 2M PTE only, as it // requires less memory if (block_gpu_supports_2m(block, gpu)) { page_sizes = UVM_PAGE_SIZE_2M; } else { // ATS is only enabled on P9 + Volta, therefore, PAGE_SIZE should // be 64K and should match Volta big page size UVM_ASSERT(uvm_va_block_gpu_big_page_size(block, gpu) == PAGE_SIZE); page_sizes = UVM_PAGE_SIZE_64K; } return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker); } static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); NV_STATUS status = NV_OK; // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See // comments in block_pre_populate_pde1_gpu. if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) { uvm_gpu_va_space_t *gpu_va_space; for_each_gpu_va_space(gpu_va_space, va_space) { // We only care about systems where ATS is supported and the application // enabled it. if (!gpu_va_space->ats.enabled) continue; status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker); if (status != NV_OK) break; } } return status; } static NV_STATUS block_unmap_gpu(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, const uvm_page_mask_t *unmap_page_mask, uvm_tracker_t *out_tracker) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_pte_bits_gpu_t pte_bit; uvm_push_t push; uvm_membar_t tlb_membar = UVM_MEMBAR_GPU; uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask; NV_STATUS status; uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; bool mask_empty; // We have to check gpu_state before looking at any VA space state like our // gpu_va_space, because we could be on the eviction path where we don't // have a lock on that state. However, since remove_gpu_va_space walks each // block to unmap the GPU before destroying the gpu_va_space, we're // guaranteed that if this GPU has page tables, the gpu_va_space can't go // away while we're holding the block lock. if (!block_gpu_has_page_tables(block, gpu)) return NV_OK; if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])) return NV_OK; // block_gpu_compute_new_pte_state needs a mask of pages which will have // matching attributes after the operation is performed. In the case of // unmap, those are the pages with unset bits. uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap); uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask); block_gpu_compute_new_pte_state(block, gpu, UVM_ID_INVALID, pages_to_unmap, &block_context->scratch_page_mask, new_pte_state); status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker); if (status != NV_OK) return status; // All PTE downgrades need a membar. If any of the unmapped PTEs pointed to // remote memory, we must use a sysmembar. 
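    // For example, unmapping PTEs which only pointed at this GPU's vidmem can
    // use the default UVM_MEMBAR_GPU, but if any unmapped PTE pointed at
    // sysmem or a peer GPU we need UVM_MEMBAR_SYS so the invalidate is ordered
    // system-wide before that memory can be reused.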
if (block_has_remote_mapping_gpu(block, block_context, gpu->id, pages_to_unmap)) tlb_membar = UVM_MEMBAR_SYS; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &block->tracker, &push, "Unmapping pages in block [0x%llx, 0x%llx)", block->start, block->end + 1); if (status != NV_OK) return status; if (new_pte_state->pte_is_2m) { // We're either unmapping a whole valid 2M PTE, or we're unmapping all // remaining pages in a split 2M PTE. block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar); } else if (gpu_state->pte_is_2m) { // The block is currently mapped as a valid 2M PTE and we're unmapping // some pages within the 2M, so we have to split it into the appropriate // mix of big and 4k PTEs. block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); } else { // We're unmapping some pre-existing mix of big and 4K PTEs into some // other mix of big and 4K PTEs. block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar); } uvm_push_end(&push); if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) { uvm_processor_mask_t non_uvm_lite_gpus; uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block)); UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id)); // If the GPU is the only non-UVM-Lite processor with mappings, we can // safely mark pages as fully unmapped if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1) uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap); } // Clear block PTE state for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_unmap); if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty) uvm_processor_mask_clear(&block->mapped, gpu->id); } UVM_ASSERT(block_check_mappings(block)); return uvm_tracker_add_push_safe(out_tracker, &push); } NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t id, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_page_mask, uvm_tracker_t *out_tracker) { uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask; UVM_ASSERT(!uvm_va_block_is_dead(va_block)); uvm_assert_mutex_locked(&va_block->lock); if (UVM_ID_IS_CPU(id)) { block_unmap_cpu(va_block, region, unmap_page_mask); return NV_OK; } uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask); return block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker); } // This function essentially works as a wrapper around vm_insert_page (hence // the similar function prototype). This is needed since vm_insert_page // doesn't take permissions as input, but uses vma->vm_page_prot instead. // Since we may have multiple VA blocks under one VMA which need to map // with different permissions, we have to manually change vma->vm_page_prot for // each call to vm_insert_page. Multiple faults under one VMA in separate // blocks can be serviced concurrently, so the VMA wrapper lock is used // to protect access to vma->vm_page_prot. 
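//
// For example, inserting a read-only mapping under a VM_READ|VM_WRITE vma
// means computing vm_get_page_prot(vm_flags & ~VM_WRITE), writing that value
// to vma->vm_page_prot under the wrapper lock in write mode if it differs,
// and only then calling vm_insert_page.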
static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma, NvU64 addr, struct page *page, uvm_prot_t new_prot) { uvm_vma_wrapper_t *vma_wrapper; unsigned long target_flags; pgprot_t target_pgprot; int ret; UVM_ASSERT(vma); UVM_ASSERT(vma->vm_private_data); vma_wrapper = vma->vm_private_data; target_flags = vma->vm_flags; if (new_prot == UVM_PROT_READ_ONLY) target_flags &= ~VM_WRITE; target_pgprot = vm_get_page_prot(target_flags); // Take VMA wrapper lock to check vma->vm_page_prot uvm_down_read(&vma_wrapper->lock); // Take a write lock if we need to modify the VMA vm_page_prot // - vma->vm_page_prot creates writable PTEs but new prot is RO // - vma->vm_page_prot creates read-only PTEs but new_prot is RW if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) { uvm_up_read(&vma_wrapper->lock); uvm_down_write(&vma_wrapper->lock); vma->vm_page_prot = target_pgprot; uvm_downgrade_write(&vma_wrapper->lock); } ret = vm_insert_page(vma, addr, page); uvm_up_read(&vma_wrapper->lock); if (ret) { UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret); return errno_to_nv_status(ret); } return NV_OK; } // Creates or upgrades a CPU mapping for the given page, updating the block's // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page // will be mapped with at least new_prot permissions. // // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use // block_unmap_cpu or uvm_va_block_revoke_prot instead. // // If the existing mapping is >= new_prot already, this is a no-op. // // It is the caller's responsibility to: // - Revoke mappings from other processors as appropriate so the CPU can map // with new_prot permissions // - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference // and mmap_lock is held in at least read mode) // - Ensure that the struct page corresponding to the physical memory being // mapped exists // - Manage the block's residency bitmap // - Ensure that the block hasn't been killed (block->va_range is present) // - Update the pte/mapping tracking state on success static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block, uvm_processor_id_t resident_id, uvm_page_index_t page_index, uvm_prot_t new_prot) { uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index); uvm_va_range_t *va_range = block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); struct vm_area_struct *vma; NV_STATUS status; NvU64 addr; struct page *page; UVM_ASSERT(uvm_va_block_is_hmm(block) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED); UVM_ASSERT(new_prot != UVM_PROT_NONE); UVM_ASSERT(new_prot < UVM_PROT_MAX); UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); uvm_assert_mutex_locked(&block->lock); if (UVM_ID_IS_CPU(resident_id)) UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index)); // For the CPU, write implies atomic if (new_prot == UVM_PROT_READ_WRITE) new_prot = UVM_PROT_READ_WRITE_ATOMIC; // Only upgrades are supported in this function UVM_ASSERT(curr_prot <= new_prot); if (new_prot == curr_prot) return NV_OK; // Check for existing VMA permissions. They could have been modified after // the initial mmap by mprotect. if (!uvm_va_block_is_hmm(block) && new_prot > uvm_va_range_logical_prot(va_range)) return NV_ERR_INVALID_ACCESS_TYPE; if (uvm_va_block_is_hmm(block)) { // Do not map CPU pages because they belong to the Linux kernel. 
return NV_OK; } UVM_ASSERT(va_range); if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) { // Add the page's range group range to the range group's migrated list. uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space, uvm_va_block_cpu_page_address(block, page_index)); if (rgr != NULL) { uvm_spin_lock(&rgr->range_group->migrated_ranges_lock); if (list_empty(&rgr->range_group_migrated_list_node)) list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges); uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock); } } // It's possible here that current->mm != vma->vm_mm. That can happen for // example due to access_process_vm (ptrace) or get_user_pages from another // driver. // // In such cases the caller has taken care of ref counting vma->vm_mm for // us, so we can safely operate on the vma but we can't use // uvm_va_range_vma_current. vma = uvm_va_range_vma(va_range); uvm_assert_mmap_lock_locked(vma->vm_mm); UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm); // Add the mapping addr = uvm_va_block_cpu_page_address(block, page_index); // This unmap handles upgrades as vm_insert_page returns -EBUSY when // there's already a mapping present at fault_addr, so we have to unmap // first anyway when upgrading from RO -> RW. if (curr_prot != UVM_PROT_NONE) unmap_mapping_range(&va_space->mapping, addr, PAGE_SIZE, 1); // Don't map the CPU until prior copies and GPU PTE updates finish, // otherwise we might not stay coherent. status = uvm_tracker_wait(&block->tracker); if (status != NV_OK) return status; if (UVM_ID_IS_CPU(resident_id)) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); // TODO: Bug 3283417: This can be removed if vm_insert_pages() is used instead of // vm_insert_page(). page = uvm_cpu_chunk_get_cpu_page(block, chunk, page_index); UVM_ASSERT(page); } else { uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, resident_id); size_t chunk_offset; uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(resident_id, page_index), &chunk_offset); UVM_ASSERT(gpu->parent->numa_info.enabled); page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE; } return uvm_cpu_insert_page(vma, addr, page, new_prot); } // Maps the CPU to the given pages which are resident on resident_id. // map_page_mask is an in/out parameter: the pages which are mapped to // resident_id are removed from the mask before returning. // // Caller must ensure that: // - Pages in map_page_mask must not be set in the corresponding cpu.pte_bits // mask for the requested protection. static NV_STATUS block_map_cpu_to(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_processor_id_t resident_id, uvm_va_block_region_t region, uvm_page_mask_t *map_page_mask, uvm_prot_t new_prot, uvm_tracker_t *out_tracker) { NV_STATUS status = NV_OK; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_page_index_t page_index; uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id); uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot); uvm_pte_bits_cpu_t pte_bit; UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU)); // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls // within block_map_cpu_page_to by doing them once here is helpful. 
UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, map_page_mask, &block->cpu.pte_bits[prot_pte_bit])); // The pages which will actually change are those in the input page mask // which are resident on the target. if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) return NV_OK; status = block_pre_populate_pde1_all_gpus(block, out_tracker); if (status != NV_OK) return status; block->cpu.ever_mapped = true; for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) { status = block_map_cpu_page_to(block, resident_id, page_index, new_prot); if (status != NV_OK) break; uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); } // If there was some error, shrink the region so that we only update the // pte/mapping tracking bits for the pages that succeeded if (status != NV_OK) { region = uvm_va_block_region(region.first, page_index); uvm_page_mask_region_clear_outside(pages_to_map, region); } // If pages are mapped from a remote residency, notify the remote mapping // events to tools. We skip event notification if the cause is Invalid. We // use it to signal that this function is being called from the revocation // path to avoid reporting duplicate events. if (UVM_ID_IS_GPU(resident_id) && va_space->tools.enabled && block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) { uvm_va_block_region_t subregion; for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { uvm_tools_record_map_remote(block, NULL, UVM_ID_CPU, resident_id, uvm_va_block_region_start(block, subregion), uvm_va_block_region_size(subregion), block_context->mapping.cause); } } // Update CPU mapping state for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map); uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map); UVM_ASSERT(block_check_mappings(block)); // Remove all pages that were newly-mapped from the input mask uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); return status; } // Maps the GPU to the given pages which are resident on resident_id. // map_page_mask is an in/out parameter: the pages which are mapped // to resident_id are removed from the mask before returning. // // Caller must ensure that: // - Pages in map_page_mask must not be set in the corresponding pte_bits mask // for the requested protection on the mapping GPU. 
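//
// For example, when upgrading pages from read-only to read-write the caller
// passes only pages whose UVM_PTE_BITS_GPU_WRITE bit is clear; pages in
// map_page_mask which are not resident on resident_id are simply left in the
// mask for a later call against another residency.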
static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_page_mask_t *map_page_mask, uvm_prot_t new_prot, uvm_tracker_t *out_tracker) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_push_t push; NV_STATUS status; uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask; const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id); uvm_pte_bits_gpu_t pte_bit; uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot); uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; block_pte_op_t pte_op; UVM_ASSERT(map_page_mask); UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id)); if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location)); UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask, map_page_mask, &gpu_state->pte_bits[prot_pte_bit])); // The pages which will actually change are those in the input page mask // which are resident on the target. if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask)) return NV_OK; UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map)); // For PTE merge/split computation, compute all resident pages which will // have exactly new_prot after performing the mapping. uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map); if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) { uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit + 1]); } uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask); block_gpu_compute_new_pte_state(va_block, gpu, resident_id, pages_to_map, &block_context->scratch_page_mask, new_pte_state); status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker); if (status != NV_OK) return status; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &va_block->tracker, &push, "Mapping pages in block [0x%llx, 0x%llx) as %s", va_block->start, va_block->end + 1, uvm_prot_string(new_prot)); if (status != NV_OK) return status; pte_op = BLOCK_PTE_OP_MAP; if (new_pte_state->pte_is_2m) { // We're either modifying permissions of a pre-existing 2M PTE, or all // permissions match so we can merge to a new 2M PTE. block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op); } else if (gpu_state->pte_is_2m) { // Permissions on a subset of the existing 2M PTE are being upgraded, so // we have to split it into the appropriate mix of big and 4k PTEs. block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); } else { // We're upgrading permissions on some pre-existing mix of big and 4K // PTEs into some other mix of big and 4K PTEs. 
block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op); } // If we are mapping remotely, record the event if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) { uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block); UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid); for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) { uvm_tools_record_map_remote(va_block, &push, gpu->id, resident_id, uvm_va_block_region_start(va_block, subregion), uvm_va_block_region_size(subregion), block_context->mapping.cause); } } uvm_push_end(&push); // Update GPU mapping state for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++) uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map); uvm_processor_mask_set(&va_block->mapped, gpu->id); // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map); // Remove all pages resident on this processor from the input mask, which // were newly-mapped. uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map); UVM_ASSERT(block_check_mappings(va_block)); return uvm_tracker_add_push_safe(out_tracker, &push); } static void map_get_allowed_destinations(uvm_va_block_t *block, uvm_va_policy_t *policy, uvm_processor_id_t id, uvm_processor_mask_t *allowed_mask) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) { // UVM-Lite can only map resident pages on the preferred location uvm_processor_mask_zero(allowed_mask); uvm_processor_mask_set(allowed_mask, policy->preferred_location); } else if ((uvm_va_policy_is_read_duplicate(policy, va_space) || uvm_id_equal(policy->preferred_location, id)) && uvm_va_space_processor_has_memory(va_space, id)) { // When operating under read-duplication we should only map the local // processor to cause fault-and-duplicate of remote pages. // // The same holds when this processor is the preferred location: only // create local mappings to force remote pages to fault-and-migrate. 
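        // For example, under read-duplication a CPU access to a page resident
        // only on a GPU is not given a remote mapping to vidmem here; with
        // only the CPU left in allowed_mask, no mapping is created and the
        // fault path is expected to duplicate the page into sysmem first.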
uvm_processor_mask_zero(allowed_mask); uvm_processor_mask_set(allowed_mask, id); } else { // Common case: Just map wherever the memory happens to reside uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]); return; } // Clamp to resident and accessible processors uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident); uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]); } NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t id, uvm_va_block_region_t region, const uvm_page_mask_t *map_page_mask, uvm_prot_t new_prot, UvmEventMapRemoteCause cause, uvm_tracker_t *out_tracker) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_gpu_t *gpu = NULL; uvm_processor_mask_t allowed_destinations; uvm_processor_id_t resident_id; const uvm_page_mask_t *pte_mask; uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask; NV_STATUS status; va_block_context->mapping.cause = cause; UVM_ASSERT(new_prot != UVM_PROT_NONE); UVM_ASSERT(new_prot < UVM_PROT_MAX); uvm_assert_mutex_locked(&va_block->lock); // Mapping is not supported on the eviction path that doesn't hold the VA // space lock. uvm_assert_rwsem_locked(&va_space->lock); if (UVM_ID_IS_CPU(id)) { uvm_pte_bits_cpu_t prot_pte_bit; // Check if the current thread is allowed to call vm_insert_page if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm)) return NV_OK; prot_pte_bit = get_cpu_pte_bit_index(new_prot); pte_mask = &va_block->cpu.pte_bits[prot_pte_bit]; } else { uvm_va_block_gpu_state_t *gpu_state; uvm_pte_bits_gpu_t prot_pte_bit; gpu = uvm_va_space_get_gpu(va_space, id); // Although this GPU UUID is registered in the VA space, it might not have a // GPU VA space registered. if (!uvm_gpu_va_space_get(va_space, gpu)) return NV_OK; gpu_state = block_gpu_state_get_alloc(va_block, gpu); if (!gpu_state) return NV_ERR_NO_MEMORY; prot_pte_bit = get_gpu_pte_bit_index(new_prot); pte_mask = &gpu_state->pte_bits[prot_pte_bit]; } uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask); if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask)) return NV_OK; // Map per resident location so we can more easily detect physically- // contiguous mappings. map_get_allowed_destinations(va_block, va_block_context->policy, id, &allowed_destinations); for_each_closest_id(resident_id, &allowed_destinations, id, va_space) { if (UVM_ID_IS_CPU(id)) { status = block_map_cpu_to(va_block, va_block_context, resident_id, region, running_page_mask, new_prot, out_tracker); } else { status = block_map_gpu_to(va_block, va_block_context, gpu, resident_id, running_page_mask, new_prot, out_tracker); } if (status != NV_OK) return status; // If we've mapped all requested pages, we're done if (uvm_page_mask_region_empty(running_page_mask, region)) break; } return NV_OK; } // Revokes the given pages mapped by cpu. This is implemented by unmapping all // pages and mapping them later with the lower permission. This is required // because vm_insert_page can only be used for upgrades from Invalid. // // Caller must ensure that: // - Pages in revoke_page_mask must be set in the // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask. 
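//
// For example, a page currently mapped read-write by the CPU is unmapped and
// then re-mapped with UVM_PROT_READ_ONLY, so the next CPU write takes a fault
// instead of going through a stale writable PTE.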
static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_va_block_region_t region, const uvm_page_mask_t *revoke_page_mask, uvm_tracker_t *out_tracker) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_va_block_region_t subregion; UVM_ASSERT(revoke_page_mask); UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])); block_unmap_cpu(block, region, revoke_page_mask); // Coalesce revocation event notification for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) { uvm_perf_event_notify_revocation(&va_space->perf_events, block, UVM_ID_CPU, uvm_va_block_region_start(block, subregion), uvm_va_block_region_size(subregion), UVM_PROT_READ_WRITE_ATOMIC, UVM_PROT_READ_ONLY); } // uvm_va_block_map will skip this remap if we aren't holding the right mm // lock. return uvm_va_block_map(block, block_context, UVM_ID_CPU, region, revoke_page_mask, UVM_PROT_READ_ONLY, UvmEventMapRemoteCauseInvalid, out_tracker); } static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_prot_t prot_revoked, const uvm_page_mask_t *pages_revoked) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id); uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block); uvm_pte_bits_gpu_t pte_bit; for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) { uvm_prot_t old_prot; if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked)) continue; if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC) old_prot = UVM_PROT_READ_WRITE_ATOMIC; else old_prot = UVM_PROT_READ_WRITE; for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) { uvm_perf_event_notify_revocation(&va_space->perf_events, block, gpu->id, uvm_va_block_region_start(block, subregion), uvm_va_block_region_size(subregion), old_prot, prot_revoked - 1); } } } // Revokes the given pages mapped by gpu which are resident on resident_id. // revoke_page_mask is an in/out parameter: the pages which have the appropriate // permissions and are mapped to resident_id are removed from the mask before // returning. // // Caller must ensure that: // - Pages in map_page_mask must be set in the corresponding pte_bits mask for // the protection to be revoked on the mapping GPU. static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu, uvm_processor_id_t resident_id, uvm_page_mask_t *revoke_page_mask, uvm_prot_t prot_to_revoke, uvm_tracker_t *out_tracker) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_push_t push; NV_STATUS status; uvm_pte_bits_gpu_t pte_bit; uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke); uvm_prot_t new_prot = prot_to_revoke - 1; uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state; block_pte_op_t pte_op; const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id); uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask; UVM_ASSERT(revoke_page_mask); UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit])); // The pages which will actually change are those in the input page mask // which are resident on the target. 
if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask)) return NV_OK; UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke)); // For PTE merge/split computation, compute all resident pages which will // have exactly prot_to_revoke-1 after performing the revocation. uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke); uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit - 1], &block_context->scratch_page_mask); uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask); block_gpu_compute_new_pte_state(va_block, gpu, resident_id, pages_to_revoke, &block_context->scratch_page_mask, new_pte_state); status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker); if (status != NV_OK) return status; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &va_block->tracker, &push, "Revoking %s access privileges in block [0x%llx, 0x%llx) ", uvm_prot_string(prot_to_revoke), va_block->start, va_block->end + 1); if (status != NV_OK) return status; pte_op = BLOCK_PTE_OP_REVOKE; if (new_pte_state->pte_is_2m) { // We're either modifying permissions of a pre-existing 2M PTE, or all // permissions match so we can merge to a new 2M PTE. block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op); } else if (gpu_state->pte_is_2m) { // Permissions on a subset of the existing 2M PTE are being downgraded, // so we have to split it into the appropriate mix of big and 4k PTEs. block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op); } else { // We're downgrading permissions on some pre-existing mix of big and 4K // PTEs into some other mix of big and 4K PTEs. block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op); } uvm_push_end(&push); block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke); // Update GPU mapping state for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--) uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke); // Remove all pages resident on this processor from the input mask, which // includes pages which were revoked and pages which already had the correct // permissions.
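// Illustrative example of the in/out contract (page numbers are arbitrary):
// if revoke_page_mask covers pages 0-7 but only pages 0-3 are resident on
// resident_id, pages_to_revoke ends up as 0-3 and revoke_page_mask is left
// holding 4-7 for the remaining residencies.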
uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke); UVM_ASSERT(block_check_mappings(va_block)); return uvm_tracker_add_push_safe(out_tracker, &push); } NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t id, uvm_va_block_region_t region, const uvm_page_mask_t *revoke_page_mask, uvm_prot_t prot_to_revoke, uvm_tracker_t *out_tracker) { uvm_gpu_t *gpu; uvm_va_block_gpu_state_t *gpu_state; uvm_processor_mask_t resident_procs; uvm_processor_id_t resident_id; uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_pte_bits_gpu_t prot_pte_bit; UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY); UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX); uvm_assert_mutex_locked(&va_block->lock); if (UVM_ID_IS_CPU(id)) { if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC) return NV_OK; uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE])) return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker); return NV_OK; } gpu = uvm_va_space_get_gpu(va_space, id); // UVM-Lite GPUs should never have access revoked UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id), "GPU %s\n", uvm_gpu_name(gpu)); // Return early if there are no mappings for the GPU present in the block if (!uvm_processor_mask_test(&va_block->mapped, gpu->id)) return NV_OK; gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke); uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask); if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit])) return NV_OK; // Revoke per resident location so we can more easily detect physically- // contiguous mappings. uvm_processor_mask_copy(&resident_procs, &va_block->resident); for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) { NV_STATUS status = block_revoke_prot_gpu_to(va_block, va_block_context, gpu, resident_id, running_page_mask, prot_to_revoke, out_tracker); if (status != NV_OK) return status; // If we've revoked all requested pages, we're done if (uvm_page_mask_region_empty(running_page_mask, region)) break; } return NV_OK; } NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, const uvm_processor_mask_t *map_processor_mask, uvm_va_block_region_t region, const uvm_page_mask_t *map_page_mask, uvm_prot_t new_prot, UvmEventMapRemoteCause cause) { uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); NV_STATUS status = NV_OK; NV_STATUS tracker_status; uvm_processor_id_t id; for_each_id_in_mask(id, map_processor_mask) { status = uvm_va_block_map(va_block, va_block_context, id, region, map_page_mask, new_prot, cause, &local_tracker); if (status != NV_OK) break; } // Regardless of error, add the successfully-pushed mapping operations into // the block's tracker. Note that we can't overwrite the tracker because we // aren't guaranteed that the map actually pushed anything (in which case it // would've acquired the block tracker first). tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); uvm_tracker_deinit(&local_tracker); return status == NV_OK ? 
tracker_status : status; } NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, const uvm_processor_mask_t *unmap_processor_mask, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_page_mask) { uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); NV_STATUS status = NV_OK; NV_STATUS tracker_status; uvm_processor_id_t id; // Watch out, unmap_mask could change during iteration since it could be // va_block->mapped. for_each_id_in_mask(id, unmap_processor_mask) { // Errors could either be a system-fatal error (ECC) or an allocation // retry due to PTE splitting. In either case we should stop after // hitting the first one. status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker); if (status != NV_OK) break; } // See the comment in uvm_va_block_map_mask for adding to the tracker. tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); uvm_tracker_deinit(&local_tracker); return status == NV_OK ? tracker_status : status; } NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, const uvm_processor_mask_t *revoke_processor_mask, uvm_va_block_region_t region, const uvm_page_mask_t *revoke_page_mask, uvm_prot_t prot_to_revoke) { uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); NV_STATUS status = NV_OK; NV_STATUS tracker_status; uvm_processor_id_t id; for_each_id_in_mask(id, revoke_processor_mask) { status = uvm_va_block_revoke_prot(va_block, va_block_context, id, region, revoke_page_mask, prot_to_revoke, &local_tracker); if (status != NV_OK) break; } // See the comment in uvm_va_block_map_mask for adding to the tracker. tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); uvm_tracker_deinit(&local_tracker); return status == NV_OK ? tracker_status : status; } // Updates the read_duplicated_pages mask in the block when the state of GPU id // is being destroyed static void update_read_duplicated_pages_mask(uvm_va_block_t *block, uvm_gpu_id_t id, uvm_va_block_gpu_state_t *gpu_state) { uvm_gpu_id_t running_id; bool first = true; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask; uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask; uvm_page_mask_zero(&block->read_duplicated_pages); for_each_id_in_mask(running_id, &block->resident) { const uvm_page_mask_t *running_residency_mask; if (uvm_id_equal(running_id, id)) continue; running_residency_mask = uvm_va_block_resident_mask_get(block, running_id); if (first) { uvm_page_mask_copy(running_page_mask, running_residency_mask); first = false; continue; } if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask)) uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask); uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask); } } // Unmaps all GPU mappings under this block, frees the page tables, and frees // all the GPU chunks. This simply drops the chunks on the floor, so the caller // must take care of copying the data elsewhere if it needs to remain intact. // // This serializes on the block tracker since it must unmap page tables. 
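// Rough teardown order, matching the body below: remove the GPU VA space
// mappings and page tables, unmap indirect-peer chunk mappings, release the
// vidmem chunks back to PMM, wait for the block tracker, then unmap the DMA
// mappings of the CPU pages and free the per-GPU state itself.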
static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_gpu_va_space_t *gpu_va_space; uvm_gpu_t *gpu, *other_gpu; if (!gpu_state) return; uvm_assert_mutex_locked(&block->lock); // Unmap PTEs and free page tables gpu = uvm_va_space_get_gpu(va_space, id); gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); if (gpu_va_space) uvm_va_block_remove_gpu_va_space(block, gpu_va_space, NULL); UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id)); // No processor should have this GPU mapped at this point UVM_ASSERT(block_check_processor_not_mapped(block, id)); // We need to remove the mappings of the indirect peers from the reverse // map when the GPU state is being destroyed (for example, on // unregister_gpu) and when peer access between indirect peers is disabled. // However, we need to avoid double mapping removals. There are two // possible scenarios: // - Disable peer access first. This will remove all mappings between A and // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to // unregister_gpu will not operate on that pair of GPUs. // - Unregister GPU first. This will remove all mappings from all indirect // peers to the GPU being unregistered. It will also destroy its GPU state. // Subsequent calls to disable peers will remove the mappings from the GPU // being unregistered, but never to the GPU being unregistered (since it no // longer has a valid GPU state). for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu); if (gpu_state->chunks) { size_t i, num_chunks; update_read_duplicated_pages_mask(block, id, gpu_state); uvm_page_mask_zero(&gpu_state->resident); block_clear_resident_processor(block, id); num_chunks = block_num_gpu_chunks(block, gpu); for (i = 0; i < num_chunks; i++) { uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; if (!chunk) continue; uvm_mmu_chunk_unmap(chunk, &block->tracker); uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker); } uvm_kvfree(gpu_state->chunks); } else { UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id)); } // Pending operations may still need the DMA memory to be mapped. uvm_tracker_wait(&block->tracker); block_gpu_unmap_phys_all_cpu_pages(block, gpu); uvm_cpu_chunk_gpu_mapping_free(block, gpu->id); uvm_processor_mask_clear(&block->evicted_gpus, id); kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state); block->gpus[uvm_id_gpu_index(id)] = NULL; } static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range) { if (range->table) { uvm_page_tree_put_ptes(tree, range); memset(range, 0, sizeof(*range)); } } NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space) { uvm_assert_mutex_locked(&va_block->lock); if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped) return NV_OK; // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See // comments in pre_populate_pde1_gpu. 
return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL); } void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space, struct mm_struct *mm) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm); uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch; uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch; uvm_gpu_t *gpu = gpu_va_space->gpu; uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); uvm_push_t push; NV_STATUS status; uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); if (!gpu_state) return; uvm_assert_mutex_locked(&va_block->lock); // Unmapping the whole block won't cause a page table split, so this should // only fail if we have a system-fatal error. status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker); if (status != NV_OK) { UVM_ASSERT(status == uvm_global_get_status()); return; // Just leak } UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); // Reset the page tables if other allocations could reuse them if (!block_gpu_supports_2m(va_block, gpu) && !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) { status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &local_tracker, &push, "Resetting PTEs for block [0x%llx, 0x%llx)", va_block->start, va_block->end + 1); if (status != NV_OK) { UVM_ASSERT(status == uvm_global_get_status()); return; // Just leak } uvm_pte_batch_begin(&push, pte_batch); uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch); // When the big PTEs is active, the 4k PTEs under it are garbage. Make // them invalid so the page tree code can reuse them for other // allocations on this VA. These don't need TLB invalidates since the // big PTEs above them are active. if (gpu_state->page_table_range_4k.table) { uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes); block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL); } // We unmapped all big PTEs above, which means they have the unmapped // pattern so the GPU MMU won't read 4k PTEs under them. Set them to // invalid to activate the 4ks below so new allocations using just those // 4k PTEs will work. block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch); uvm_pte_batch_end(pte_batch); uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE); uvm_push_end(&push); uvm_tracker_overwrite_with_push(&local_tracker, &push); } // The unmap must finish before we free the page tables status = uvm_tracker_wait_deinit(&local_tracker); if (status != NV_OK) return; // System-fatal error, just leak // Note that if the PTE is currently 2M with lower tables allocated but not // in use, calling put_ptes on those lower ranges will re-write the 2M entry // to be a PDE. 
block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k); block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big); block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m); gpu_state->pte_is_2m = false; gpu_state->initialized_big = false; gpu_state->activated_big = false; gpu_state->activated_4k = false; bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); UVM_ASSERT(block_check_mappings(va_block)); } NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) { NV_STATUS status; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID); uvm_assert_rwsem_locked_write(&va_space->lock); uvm_assert_mutex_locked(&va_block->lock); if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1); if (status != NV_OK) return status; status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0); if (status != NV_OK) { block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); return status; } } // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we // call it here. return NV_OK; } void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); NV_STATUS status; uvm_tracker_t tracker = UVM_TRACKER_INIT(); uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask; const uvm_page_mask_t *resident0; const uvm_page_mask_t *resident1; uvm_assert_mutex_locked(&va_block->lock); // See comment in block_destroy_gpu_state if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) { block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1); block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0); } // If either of the GPUs doesn't have GPU state then nothing could be mapped // between them. if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id)) return; resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id); resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id); // Unmap all pages resident on gpu1, but not on gpu0, from gpu0 if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) { status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker); if (status != NV_OK) { // Since all PTEs unmapped by this call have the same aperture, page // splits should never be required so any failure should be the // result of a system-fatal error. 
UVM_ASSERT_MSG(status == uvm_global_get_status(), "Unmapping failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu0)); } } // Unmap all pages resident on gpu0, but not on gpu1, from gpu1 if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) { status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker); if (status != NV_OK) { UVM_ASSERT_MSG(status == uvm_global_get_status(), "Unmapping failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu0)); } } status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker); if (status != NV_OK) UVM_ASSERT(status == uvm_global_get_status()); status = uvm_tracker_wait_deinit(&tracker); if (status != NV_OK) UVM_ASSERT(status == uvm_global_get_status()); } void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu) { NV_STATUS status; uvm_va_range_t *va_range = va_block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id)); // If the GPU doesn't have GPU state then nothing could be mapped. if (!uvm_va_block_gpu_state_get(va_block, gpu->id)) return; // In UVM-Lite mode, mappings to the preferred location are not tracked // directly, so just unmap the whole block. status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker); if (status != NV_OK) { // Unmapping the whole block should not cause page splits so any failure // should be the result of a system-fatal error. UVM_ASSERT_MSG(status == uvm_global_get_status(), "Unmapping failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); } status = uvm_tracker_wait(&va_block->tracker); if (status != NV_OK) { UVM_ASSERT_MSG(status == uvm_global_get_status(), "Unmapping failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); } } // Evict pages from the GPU by moving each resident region to the CPU // // Notably the caller needs to support allocation-retry as // uvm_va_block_migrate_locked() requires that. static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) { NV_STATUS status = NV_OK; const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id); uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block); uvm_va_block_region_t subregion; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm); if (!uvm_va_block_is_hmm(va_block)) block_context->policy = uvm_va_range_get_policy(va_block->va_range); // Move all subregions resident on the GPU to the CPU for_each_va_block_subregion_in_mask(subregion, resident, region) { // Need to set block_context->policy for HMM. if (uvm_va_block_is_hmm(va_block)) { uvm_va_policy_node_t *node; node = uvm_va_policy_node_find(va_block, uvm_va_block_region_start(va_block, subregion)); if (node) { uvm_page_index_t outer = uvm_va_block_cpu_page_index(va_block, node->node.end) + 1; // If the policy doesn't cover the subregion, truncate the // subregion. 
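// The pages cut off by the truncation are not skipped: they remain in the
// resident mask, so the enclosing subregion loop should pick them up in a
// later iteration under their own policy lookup.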
if (subregion.outer > outer) subregion.outer = outer; block_context->policy = &node->policy; } else block_context->policy = &uvm_va_policy_default; } status = uvm_va_block_migrate_locked(va_block, NULL, block_context, subregion, UVM_ID_CPU, UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, NULL); if (status != NV_OK) return status; } UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id)); return NV_OK; } // This handles allocation-retry internally and hence might unlock and relock // block's lock. static void block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) { NV_STATUS status; uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); uvm_assert_mutex_locked(&va_block->lock); if (!gpu_state) return; // The mappings should've already been torn down by GPU VA space unregister UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id)); UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ])); UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu)); // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and // we don't rely on any state of the block across the call. status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm)); if (status != NV_OK) { UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); uvm_global_set_fatal_error(status); } // This function will copy the block's tracker into each chunk then free the // chunk to PMM. If we do this before waiting for the block tracker below // we'll populate PMM's free chunks with tracker entries, which gives us // better testing coverage of chunk synchronization on GPU unregister. block_destroy_gpu_state(va_block, gpu->id); // Any time a GPU is unregistered we need to make sure that there are no // pending (direct or indirect) tracker entries for that GPU left in the // block's tracker. The only way to ensure that is to wait for the whole // tracker. status = uvm_tracker_wait(&va_block->tracker); if (status != NV_OK) UVM_ASSERT(status == uvm_global_get_status()); } void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm) { // Take the lock internally to not expose the caller to allocation-retry. uvm_mutex_lock(&va_block->lock); block_unregister_gpu_locked(va_block, gpu, mm); uvm_mutex_unlock(&va_block->lock); } static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region) { uvm_page_index_t page_index; uvm_assert_mutex_locked(&va_block->lock); for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region) block_mark_cpu_page_dirty(va_block, page_index); } // Tears down everything within the block, but doesn't free the block itself. // Note that when uvm_va_block_kill is called, this is called twice: once for // the initial kill itself, then again when the block's ref count is eventually // destroyed. block->va_range is used to track whether the block has already // been killed. 
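// The second call is expected to be a no-op: the first pass NULLs out
// block->va_range (and hmm.va_space for HMM blocks) at the end of block_kill,
// which is what makes uvm_va_block_is_dead take the early return below.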
static void block_kill(uvm_va_block_t *block) { uvm_va_space_t *va_space; uvm_perf_event_data_t event_data; uvm_cpu_chunk_t *chunk; uvm_gpu_id_t id; NV_STATUS status; uvm_va_block_region_t region = uvm_va_block_region_from_block(block); uvm_page_index_t page_index; if (uvm_va_block_is_dead(block)) return; va_space = uvm_va_block_get_va_space(block); event_data.block_destroy.block = block; uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data); // Unmap all processors in parallel first. Unmapping the whole block won't // cause a page table split, so this should only fail if we have a system- // fatal error. if (!uvm_processor_mask_empty(&block->mapped)) { uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); // We could only be killed with mapped GPU state by VA range free or VA // space teardown, so it's safe to use the va_space's block_context // because both of those have the VA space lock held in write mode. status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL); UVM_ASSERT(status == uvm_global_get_status()); } UVM_ASSERT(uvm_processor_mask_empty(&block->mapped)); // Free the GPU page tables and chunks for_each_gpu_id(id) block_destroy_gpu_state(block, id); // Wait for the GPU PTE unmaps before freeing CPU memory uvm_tracker_wait_deinit(&block->tracker); // No processor should have the CPU mapped at this point UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU)); // Free CPU pages for_each_va_block_page(page_index, block) { chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); if (!chunk) continue; // To be conservative, tell the OS we wrote to the page: we sometimes // clear the dirty bit after writing to it. uvm_cpu_chunk_mark_dirty(chunk, page_index); uvm_cpu_chunk_remove_from_block(block, chunk, page_index); uvm_cpu_chunk_put(chunk); } uvm_kvfree((void *)block->cpu.chunks); block->cpu.chunks = 0; // Clearing the resident bit isn't strictly necessary since this block // is getting destroyed, but it keeps state consistent for assertions.
uvm_page_mask_zero(&block->cpu.resident); block_clear_resident_processor(block, UVM_ID_CPU); if (uvm_va_block_is_hmm(block)) uvm_va_policy_clear(block, block->start, block->end); block->va_range = NULL; #if UVM_IS_CONFIG_HMM() block->hmm.va_space = NULL; #endif } // Called when the block's ref count drops to 0 void uvm_va_block_destroy(nv_kref_t *nv_kref) { uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref); // Nobody else should have a reference when freeing uvm_assert_mutex_unlocked(&block->lock); uvm_mutex_lock(&block->lock); block_kill(block); uvm_mutex_unlock(&block->lock); if (uvm_enable_builtin_tests) { uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block); kmem_cache_free(g_uvm_va_block_cache, block_wrapper); } else { kmem_cache_free(g_uvm_va_block_cache, block); } } void uvm_va_block_kill(uvm_va_block_t *va_block) { uvm_mutex_lock(&va_block->lock); block_kill(va_block); uvm_mutex_unlock(&va_block->lock); // May call block_kill again uvm_va_block_release(va_block); } static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); NvU32 alloc_sizes; DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start); size_t big_page_index; uvm_push_t push; NV_STATUS status; // We only have to split to big PTEs if we're currently a 2M PTE if (existing_gpu_state->pte_is_2m) { // We can skip the split if the 2M PTE is invalid and we have no lower // PTEs. if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE && !existing_gpu_state->page_table_range_big.table && !existing_gpu_state->page_table_range_4k.table) return NV_OK; alloc_sizes = big_page_size; bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); if (!IS_ALIGNED(new->start, big_page_size)) { alloc_sizes |= UVM_PAGE_SIZE_4K; big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); __clear_bit(big_page_index, new_big_ptes); } status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL); if (status != NV_OK) return status; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &existing->tracker, &push, "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", existing->start, existing->end + 1, new->start, new->end + 1); if (status != NV_OK) return status; block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push); } else { big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size); // If the split point is on a big page boundary, or if the split point // is not currently covered by a big PTE, we don't have to split // anything. 
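// Illustrative example (assuming a 64K big page size): a split at a
// 64K-aligned address never lands inside a big PTE, so no presplit is needed
// here. Only an unaligned split point that is currently covered by an active
// big PTE forces that single big PTE to be split into 4K PTEs below.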
if (IS_ALIGNED(new->start, big_page_size) || big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK || !test_bit(big_page_index, existing_gpu_state->big_ptes)) return NV_OK; status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL); if (status != NV_OK) return status; bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); __set_bit(big_page_index, new_big_ptes); status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &existing->tracker, &push, "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)", existing->start, existing->end + 1, new->start, new->end + 1); if (status != NV_OK) return status; block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push); } uvm_push_end(&push); // Adding this push to existing block tracker will cause all GPU PTE splits // to serialize on each other, but it's simpler than maintaining a separate // tracker and this path isn't performance-critical. return uvm_tracker_add_push_safe(&existing->tracker, &push); } static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new) { uvm_gpu_t *gpu; uvm_gpu_id_t id; NV_STATUS status; for_each_gpu_id(id) { if (!uvm_va_block_gpu_state_get(existing, id)) continue; gpu = block_get_gpu(existing, id); if (block_gpu_has_page_tables(existing, gpu)) { status = block_split_presplit_ptes_gpu(existing, new, gpu); if (status != NV_OK) return status; } } return NV_OK; } typedef struct { // Number of chunks contained by this VA block size_t num_chunks; // Index of the "interesting" chunk, either adjacent to or spanning the // split point depending on which block this is. size_t chunk_index; // Size of the chunk referenced by chunk_index uvm_chunk_size_t chunk_size; } block_gpu_chunk_split_state_t; static void block_gpu_chunk_get_split_state(block_gpu_chunk_split_state_t *state, NvU64 start, NvU64 end, uvm_page_index_t page_index, uvm_gpu_t *gpu) { NvU64 size = end - start + 1; state->num_chunks = block_num_gpu_chunks_range(start, size, gpu); state->chunk_index = uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, &state->chunk_size); } static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk) { uvm_gpu_t *accessing_gpu; uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk); for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr, uvm_gpu_chunk_get_size(chunk)); } } // Perform any chunk splitting and array growing required for this block split, // but don't actually move chunk pointers anywhere. 
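// Illustrative sketch of the size-mask computation below (the actual chunk
// sizes come from gpu->parent->mmu_user_chunk_sizes; {4K, 64K, 2M} is just an
// example):
//
//     split_sizes  = {4K, 64K, 2M};
//     split_sizes &= 2M - 1;      // drop sizes >= the current chunk size -> {4K, 64K}
//     split_sizes &= ~(64K - 1);  // drop sizes <  the target chunk size  -> {64K}
//
// so a 2M chunk with a 64K target is split exactly once into 64K subchunks.
// A 4K target would leave {64K, 4K} and split twice, descending into the 64K
// subchunk immediately preceding the split point.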
static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); uvm_gpu_t *accessing_gpu; uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); uvm_gpu_chunk_t **temp_chunks; uvm_gpu_chunk_t *original_chunk, *curr_chunk; uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); uvm_chunk_sizes_mask_t split_sizes; uvm_chunk_size_t subchunk_size; NV_STATUS status; block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; block_gpu_chunk_get_split_state(&existing_before_state, existing->start, existing->end, split_page_index, gpu); block_gpu_chunk_get_split_state(&existing_after_state, existing->start, new->start - 1, split_page_index - 1, gpu); block_gpu_chunk_get_split_state(&new_state, new->start, new->end, 0, gpu); // Even though we're splitting existing, we could wind up requiring a larger // chunks array if we split a large chunk into many smaller ones. if (existing_after_state.num_chunks > existing_before_state.num_chunks) { temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); if (!temp_chunks) return NV_ERR_NO_MEMORY; existing_gpu_state->chunks = temp_chunks; } original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; // If the chunk covering the split point is not populated, we're done. We've // already grown the array to cover any new chunks which may be populated // later. if (!original_chunk) return NV_OK; // Figure out the splits we need to perform. Remove all sizes >= the current // size, and all sizes < the target size. Note that the resulting mask will // be 0 if the sizes match (we're already splitting at a chunk boundary). UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size); UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size); split_sizes = gpu->parent->mmu_user_chunk_sizes; split_sizes &= existing_before_state.chunk_size - 1; split_sizes &= ~(new_state.chunk_size - 1); // Keep splitting the chunk covering the split point until we hit the target // size. curr_chunk = original_chunk; for_each_chunk_size_rev(subchunk_size, split_sizes) { size_t last_index, num_subchunks; status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL); if (status != NV_OK) goto error; // Split physical GPU mappings for indirect peers for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu); status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr, subchunk_size); if (status != NV_OK) goto error; } if (subchunk_size == new_state.chunk_size) break; // Compute the last subchunk index prior to the split point. Divide the // entire address space into units of subchunk_size, then mod by the // number of subchunks within the parent. 
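// Worked example (illustrative): with a 2M curr_chunk and a 64K
// subchunk_size there are 32 subchunks, and the computation below reduces to
//
//     last_index = ((new->start - 1) >> 16) & 31;
//
// i.e. the subchunk holding the last byte before the split point, which is
// the one that has to be split further on the next pass.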
last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size); num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size); UVM_ASSERT(num_subchunks > 1); last_index &= num_subchunks - 1; uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk); UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size); } // Note that existing's chunks array still has a pointer to original_chunk, // not to any newly-split subchunks. If a subsequent split failure occurs on // a later GPU we'll have to merge it back. Once we're past the preallocate // stage we'll remove it from the chunks array and move the new split chunks // in. return NV_OK; error: // On error we need to leave the chunk in its initial state block_merge_chunk(existing, gpu, original_chunk); return status; } // Perform any CPU chunk splitting that may be required for this block split. // Just like block_presplit_gpu_chunks, no chunks are moved to the new block. static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) { uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); uvm_cpu_chunk_t *splitting_chunk; uvm_chunk_size_t split_sizes = uvm_cpu_chunk_get_allocation_sizes(); uvm_chunk_size_t subchunk_size; NV_STATUS status = NV_OK; UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE)); // If the page covering the split point has not been populated, there is no // need to split. if (!uvm_page_mask_test(&existing->cpu.allocated, page_index)) return NV_OK; splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); // If the chunk spanning the split point is already at the correct size, // there is nothing to do. if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk))) return NV_OK; // Remove all sizes above the chunk's current size. split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1; // Remove all sizes below the alignment of the new block's start. split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0); for_each_chunk_size_rev(subchunk_size, split_sizes) { uvm_gpu_id_t id; UVM_ASSERT(IS_ALIGNED(uvm_cpu_chunk_get_size(splitting_chunk), subchunk_size)); for_each_gpu_id(id) { uvm_gpu_t *gpu; if (!uvm_va_block_gpu_state_get(existing, id)) continue; // If the parent chunk has not been mapped, there is nothing to split. if (uvm_cpu_chunk_get_gpu_mapping_addr(existing, page_index, splitting_chunk, id) == 0) continue; gpu = block_get_gpu(existing, id); status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, uvm_cpu_chunk_get_gpu_mapping_addr(existing, page_index, splitting_chunk, id), subchunk_size); if (status != NV_OK) return status; } status = uvm_cpu_chunk_split(existing, splitting_chunk, subchunk_size); if (status != NV_OK) return status; splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); } return NV_OK; } static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new) { uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start); uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); uvm_va_space_t *va_space = existing->va_range->va_space; uvm_gpu_id_t id; if (!chunk) return; // Merge the CPU chunk. If a merge was not done, nothing else needs to be done. 
chunk = uvm_cpu_chunk_merge(existing, chunk); if (!chunk) return; for_each_gpu_id(id) { NvU64 gpu_mapping_addr; uvm_gpu_t *gpu; if (!uvm_va_block_gpu_state_get(existing, id)) continue; gpu_mapping_addr = uvm_cpu_chunk_get_gpu_mapping_addr(existing, page_index, chunk, id); if (gpu_mapping_addr == 0) continue; gpu = uvm_va_space_get_gpu(va_space, id); uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr, uvm_cpu_chunk_get_size(chunk)); } } // Pre-allocate everything which doesn't require retry on both existing and new // which will be needed to handle a split. If this fails, existing must remain // functionally unmodified. static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new) { NV_STATUS status; uvm_gpu_t *gpu; uvm_gpu_id_t id; uvm_page_index_t split_page_index; uvm_va_range_t *existing_va_range = existing->va_range; status = block_presplit_cpu_chunks(existing, new); if (status != NV_OK) goto error; for_each_gpu_id(id) { if (!uvm_va_block_gpu_state_get(existing, id)) continue; gpu = block_get_gpu(existing, id); status = block_presplit_gpu_chunks(existing, new, gpu); if (status != NV_OK) goto error; if (!block_gpu_state_get_alloc(new, gpu)) { status = NV_ERR_NO_MEMORY; goto error; } } if (existing_va_range && existing_va_range->inject_split_error) { existing_va_range->inject_split_error = false; status = NV_ERR_NO_MEMORY; goto error; } if (uvm_va_block_is_hmm(existing)) { uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start); if (node && node->node.start != new->start) { status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL); if (status != NV_OK) goto error; } } return NV_OK; error: // Merge back the chunks we split split_page_index = uvm_va_block_cpu_page_index(existing, new->start); for_each_gpu_id(id) { uvm_gpu_chunk_t *chunk; size_t chunk_index; uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id); if (!existing_gpu_state) continue; // If the chunk spanning the split point was split, merge it back gpu = block_get_gpu(existing, id); chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL); chunk = existing_gpu_state->chunks[chunk_index]; if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) continue; block_merge_chunk(existing, gpu, chunk); // We could attempt to shrink the chunks array back down, but it doesn't // hurt much to have it larger than necessary, and we'd have to handle // the shrink call failing anyway on this error path. } block_merge_cpu_chunks(existing, new); return status; } // Re-calculate the block's top-level processor masks: // - block->mapped // - block->resident // // This is called on block split. 
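// It also re-derives block->evicted_gpus from each GPU's evicted page mask.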
static void block_set_processor_masks(uvm_va_block_t *block) { size_t num_pages = uvm_va_block_num_cpu_pages(block); uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages); uvm_gpu_id_t id; if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) { UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region)); uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU); } else { uvm_processor_mask_set(&block->mapped, UVM_ID_CPU); } if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(block); if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0) UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)); block_clear_resident_processor(block, UVM_ID_CPU); } else { block_set_resident_processor(block, UVM_ID_CPU); } for_each_gpu_id(id) { uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id); if (!gpu_state) continue; if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) { UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region)); UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region)); uvm_processor_mask_clear(&block->mapped, id); } else { uvm_processor_mask_set(&block->mapped, id); } if (uvm_page_mask_region_empty(&gpu_state->resident, block_region)) block_clear_resident_processor(block, id); else block_set_resident_processor(block, id); if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region)) uvm_processor_mask_clear(&block->evicted_gpus, id); else uvm_processor_mask_set(&block->evicted_gpus, id); } } // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts // corresponding to a block split. static void block_split_page_mask(uvm_page_mask_t *existing_mask, size_t existing_pages, uvm_page_mask_t *new_mask, size_t new_pages) { UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n", existing_pages, new_pages); // The new block is always in the upper region of existing, so shift the bit // vectors down. // // Note that bitmap_shift_right requires both dst and src to be the same // size. That's ok since we don't scale them by block size. uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages); uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages)); } // Split the CPU state within the existing block. existing's start is correct // but its end has not yet been adjusted. 
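// Worked example (assuming a 4K PAGE_SIZE, so a full 2M block has 512 pages):
// a split at page 384 gives existing_pages == 384; block_split_page_mask then
// leaves existing with bits [0, 384) and hands new bits [384, 512), shifted
// down to occupy bits [0, 128) of new's masks.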
static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new) { size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new); uvm_pte_bits_cpu_t pte_bit; uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing); uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); uvm_page_index_t page_index; uvm_cpu_chunk_t *chunk; uvm_va_range_t *existing_va_range = existing->va_range; if (existing_va_range) { UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); UVM_ASSERT(existing->va_range->type == new->va_range->type); } UVM_ASSERT(existing->start < new->start); UVM_ASSERT(existing->end == new->end); UVM_ASSERT(PAGE_ALIGNED(new->start)); UVM_ASSERT(PAGE_ALIGNED(existing->start)); existing_pages = (new->start - existing->start) / PAGE_SIZE; // We don't have to unmap the CPU since its virtual -> physical mappings // don't change. page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1); while (page_index < block_region.outer) { uvm_page_index_t new_chunk_page_index; NV_STATUS status; chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index); UVM_ASSERT(chunk); uvm_cpu_chunk_remove_from_block(existing, chunk, page_index); // The chunk has to be adjusted for the new block before inserting it. new_chunk_page_index = page_index - split_page_index; status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index); UVM_ASSERT(status == NV_OK); page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, page_index + uvm_cpu_chunk_num_pages(chunk) - 1); } new->cpu.ever_mapped = existing->cpu.ever_mapped; block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages); for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++) block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages); } // Fill out the blocks' chunks arrays with the chunks split by // block_presplit_gpu_chunks. static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id); uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id); uvm_gpu_chunk_t **temp_chunks; uvm_gpu_chunk_t *original_chunk; block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state; size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new; uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start); size_t i; block_gpu_chunk_get_split_state(&existing_before_state, existing->start, existing->end, split_page_index, gpu); block_gpu_chunk_get_split_state(&existing_after_state, existing->start, new->start - 1, split_page_index - 1, gpu); block_gpu_chunk_get_split_state(&new_state, new->start, new->end, 0, gpu); // General case (B is original_chunk): // split // v // existing (before) [------ A -----][------ B -----][------ C -----] // existing (after) [------ A -----][- B0 -] // new [- B1 -][------ C -----] // // Note that the logic below also handles the case of the split happening at // a chunk boundary. That case behaves as though there is no B0 chunk. // Number of chunks to the left and right of original_chunk (A and C above). // Either or both of these may be 0. 
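// Illustrative numbers for the diagram above: with chunks [A][B][C] and the
// split point inside B (chunk_index == 1), num_pre_chunks == 1 and
// num_post_chunks == 1. B's subchunks are then distributed so that the first
// num_split_chunks_existing stay in existing's array (B0) and the remaining
// num_split_chunks_new open new's array (B1), followed by the copied C.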
num_pre_chunks = existing_before_state.chunk_index; num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1; // Number of subchunks under existing's portion of original_chunk (B0 above) num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks; // Number of subchunks under new's portion of original_chunk (B1 above) num_split_chunks_new = new_state.num_chunks - num_post_chunks; UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0); UVM_ASSERT(num_split_chunks_new > 0); // Copy post chunks from the end of existing into new (C above) memcpy(&new_gpu_state->chunks[num_split_chunks_new], &existing_gpu_state->chunks[existing_before_state.chunk_index + 1], num_post_chunks * sizeof(new_gpu_state->chunks[0])); // Save off the original split chunk since we may overwrite the array original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index]; // Fill out the new pointers if (original_chunk) { // Note that if the split happened at a chunk boundary, original_chunk // will not be split. In that case, num_split_chunks_existing will be 0 // and num_split_chunks_new will be 1, so the left copy will be skipped // and the right copy will pick up the chunk. // Copy left newly-split chunks into existing (B0 above). The array was // re-sized in block_presplit_gpu_chunks as necessary. size_t num_subchunks; num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, original_chunk, 0, // start_index num_split_chunks_existing, &existing_gpu_state->chunks[existing_before_state.chunk_index]); UVM_ASSERT(num_subchunks == num_split_chunks_existing); // Copy right newly-split chunks into new (B1 above), overwriting the // pointer to the original chunk. num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm, original_chunk, num_split_chunks_existing, // start_index num_split_chunks_new, &new_gpu_state->chunks[0]); UVM_ASSERT(num_subchunks == num_split_chunks_new); } else { // If the chunk wasn't already populated we don't need to copy pointers // anywhere, but we need to clear out stale pointers from existing's // array covering the new elements. new's chunks array was already zero- // initialized. memset(&existing_gpu_state->chunks[existing_before_state.chunk_index], 0, num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0])); } // Since we update the reverse map information, protect it against a // concurrent lookup uvm_spin_lock(&gpu->pmm.list_lock); // Update the reverse map of all the chunks that are now under the new block for (i = 0; i < new_state.num_chunks; ++i) { if (new_gpu_state->chunks[i]) { UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing); new_gpu_state->chunks[i]->va_block = new; // Adjust the page_index within the VA block for the new subchunks in // the new VA block UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index); new_gpu_state->chunks[i]->va_block_page_index -= split_page_index; } } uvm_spin_unlock(&gpu->pmm.list_lock); // Attempt to shrink existing's chunk allocation. If the realloc fails, just // keep on using the old larger one. 
if (existing_after_state.num_chunks < existing_before_state.num_chunks) { temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks, existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0])); if (temp_chunks) existing_gpu_state->chunks = temp_chunks; } } static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id) { uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id); uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id); uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing); uvm_gpu_va_space_t *gpu_va_space; uvm_gpu_t *gpu; uvm_gpu_t *accessing_gpu; size_t new_pages = uvm_va_block_num_cpu_pages(new); size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big; uvm_pte_bits_gpu_t pte_bit; size_t num_chunks, i; uvm_cpu_chunk_t *cpu_chunk; uvm_page_index_t page_index; if (!existing_gpu_state) return; gpu = uvm_va_space_get_gpu(va_space, gpu_id); UVM_ASSERT(new_gpu_state); new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes; UVM_ASSERT(PAGE_ALIGNED(new->start)); UVM_ASSERT(PAGE_ALIGNED(existing->start)); existing_pages = (new->start - existing->start) / PAGE_SIZE; uvm_cpu_chunk_gpu_mapping_split(existing, new, gpu_id); for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) { uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, uvm_cpu_chunk_get_gpu_mapping_addr(new, page_index, cpu_chunk, gpu_id), new); } block_copy_split_gpu_chunks(existing, new, gpu); num_chunks = block_num_gpu_chunks(new, gpu); // Reparent GPU mappings for indirect peers for (i = 0; i < num_chunks; ++i) { uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i]; if (!chunk) continue; for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr, new); } } block_split_page_mask(&existing_gpu_state->resident, existing_pages, &new_gpu_state->resident, new_pages); for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) { block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages, &new_gpu_state->pte_bits[pte_bit], new_pages); } // Adjust page table ranges. gpu_va_space = uvm_gpu_va_space_get(va_space, gpu); if (gpu_va_space) { if (existing_gpu_state->page_table_range_big.table) { NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu); // existing's end has not been adjusted yet existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size); // Take references on all big pages covered by new new_pages_big = uvm_va_block_num_big_pages(new, big_page_size); if (new_pages_big) { uvm_page_table_range_get_upper(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_big, &new_gpu_state->page_table_range_big, new_pages_big); // If the split point is within a big page region, we might have // a gap since neither existing nor new can use it anymore. // Get the top N bits from existing's mask to handle that. bitmap_shift_right(new_gpu_state->big_ptes, existing_gpu_state->big_ptes, uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big, MAX_BIG_PAGES_PER_UVM_VA_BLOCK); new_gpu_state->initialized_big = existing_gpu_state->initialized_big; } // Drop existing's references on the big PTEs it no longer covers // now that new has references on them. 
Note that neither existing // nor new might have big PTEs after the split. In that case, this // shrink will free the entire old range. uvm_page_table_range_shrink(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_big, existing_pages_big); if (existing_pages_big == 0) { memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big)); existing_gpu_state->initialized_big = false; } bitmap_clear(existing_gpu_state->big_ptes, existing_pages_big, MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big); } if (existing_gpu_state->page_table_range_4k.table) { // Since existing and new share the same PDE we just need to bump // the ref-count on new's sub-range. uvm_page_table_range_get_upper(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_4k, &new_gpu_state->page_table_range_4k, uvm_va_block_size(new) / UVM_PAGE_SIZE_4K); // Drop existing's references on the PTEs it no longer covers now // that new has references on them. existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K); uvm_page_table_range_shrink(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_4k, existing_pages_4k); } // We have to set this explicitly to handle the case of splitting an // invalid, active 2M PTE with no lower page tables allocated. if (existing_gpu_state->pte_is_2m) { UVM_ASSERT(!existing_gpu_state->page_table_range_big.table); UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table); existing_gpu_state->pte_is_2m = false; } // existing can't possibly cover 2MB after a split, so drop any 2M PTE // references it has. We've taken the necessary references on the lower // tables above. block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m); existing_gpu_state->activated_big = false; existing_gpu_state->activated_4k = false; } block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages); } NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block, NvU64 new_end, uvm_va_block_t **new_va_block, uvm_va_range_t *new_va_range) { uvm_va_space_t *va_space; uvm_va_block_t *new_block = NULL; NV_STATUS status; va_space = new_va_range->va_space; UVM_ASSERT(existing_va_block->va_range); UVM_ASSERT(existing_va_block->va_range->va_space == va_space); UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block)); // External range types can't be split UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED); UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED); uvm_assert_rwsem_locked_write(&va_space->lock); UVM_ASSERT(new_end > existing_va_block->start); UVM_ASSERT(new_end < existing_va_block->end); UVM_ASSERT(PAGE_ALIGNED(new_end + 1)); status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block); if (status != NV_OK) return status; // We're protected from other splits and faults by the va_space lock being // held in write mode, but that doesn't stop the reverse mapping (eviction // path) from inspecting the existing block. Stop those threads by taking // the block lock. When a reverse mapping thread takes this lock after the // split has been performed, it will have to re-inspect state and may see // that it should use the newly-split block instead. 
uvm_mutex_lock(&existing_va_block->lock); status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range); uvm_mutex_unlock(&existing_va_block->lock); if (status != NV_OK) uvm_va_block_release(new_block); else if (new_va_block) *new_va_block = new_block; return status; } NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block, NvU64 new_end, uvm_va_block_t *new_block, uvm_va_range_t *new_va_range) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block); uvm_gpu_id_t id; NV_STATUS status; uvm_perf_event_data_t event_data; for_each_gpu_id(id) UVM_ASSERT(block_check_chunks(existing_va_block, id)); // As soon as we update existing's reverse mappings to point to the newly- // split block, the eviction path could try to operate on the new block. // Lock that out too until new is ready. // // Note that we usually shouldn't nest block locks, but it's ok here because // we just created new_block so no other thread could possibly take it out // of order with existing's lock. uvm_mutex_lock_no_tracking(&new_block->lock); // The split has to be transactional, meaning that if we fail, the existing // block must not be modified. Handle that by pre-allocating everything we // might need under both existing and new at the start so we only have a // single point of failure. // Since pre-allocation might require allocating new PTEs, we have to handle // allocation retry which might drop existing's block lock. The // preallocation is split into two steps for that: the first part which // allocates and splits PTEs can handle having the block lock dropped then // re-taken. It won't modify existing_va_block other than adding new PTE // allocations and splitting existing PTEs, which is always safe. status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block, NULL, block_split_presplit_ptes(existing_va_block, new_block)); if (status != NV_OK) goto out; // Pre-allocate, stage two. This modifies existing_va_block in ways which // violate many assumptions (such as changing chunk size), but it will put // things back into place on a failure without dropping the block lock. status = block_split_preallocate_no_retry(existing_va_block, new_block); if (status != NV_OK) goto out; // We'll potentially be freeing page tables, so we need to wait for any // outstanding work before we start status = uvm_tracker_wait(&existing_va_block->tracker); if (status != NV_OK) goto out; // Update existing's state only once we're past all failure points event_data.block_shrink.block = existing_va_block; uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data); block_split_cpu(existing_va_block, new_block); for_each_gpu_id(id) block_split_gpu(existing_va_block, new_block, id); // Update the size of the existing block first so that // block_set_processor_masks can use block_{set,clear}_resident_processor // that relies on the size to be correct. 
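    // A minimal, compiled-out sketch of the per-page mask split performed by
    // block_split_gpu() above and by the block_split_page_mask() calls just
    // below: the bits below the split point stay with the existing block, and
    // the remaining bits move to (and are renumbered for) the new block.
    // PAGES_PER_BLOCK_SKETCH is a placeholder constant for illustration, not a
    // driver value.
#if 0
#include <linux/bitmap.h>

#define PAGES_PER_BLOCK_SKETCH 512

    static void split_page_bitmap_sketch(unsigned long *existing_mask,
                                         unsigned long *new_mask,
                                         unsigned int existing_pages)
    {
        // The new block gets the upper bits, shifted down so they start at its
        // page index 0
        bitmap_shift_right(new_mask, existing_mask, existing_pages, PAGES_PER_BLOCK_SKETCH);

        // The existing block keeps only the bits below the split point
        bitmap_clear(existing_mask, existing_pages, PAGES_PER_BLOCK_SKETCH - existing_pages);
    }
#endif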
existing_va_block->end = new_end; block_split_page_mask(&existing_va_block->read_duplicated_pages, uvm_va_block_num_cpu_pages(existing_va_block), &new_block->read_duplicated_pages, uvm_va_block_num_cpu_pages(new_block)); block_split_page_mask(&existing_va_block->maybe_mapped_pages, uvm_va_block_num_cpu_pages(existing_va_block), &new_block->maybe_mapped_pages, uvm_va_block_num_cpu_pages(new_block)); block_set_processor_masks(existing_va_block); block_set_processor_masks(new_block); if (uvm_va_block_is_hmm(existing_va_block)) uvm_va_policy_node_split_move(existing_va_block, new_block); out: // Run checks on existing_va_block even on failure, since an error must // leave the block in a consistent state. for_each_gpu_id(id) { UVM_ASSERT(block_check_chunks(existing_va_block, id)); if (status == NV_OK) UVM_ASSERT(block_check_chunks(new_block, id)); } UVM_ASSERT(block_check_mappings(existing_va_block)); UVM_ASSERT(block_verify_cpu_chunks(existing_va_block)); if (status == NV_OK) { UVM_ASSERT(block_check_mappings(new_block)); UVM_ASSERT(block_verify_cpu_chunks(new_block)); } uvm_mutex_unlock_no_tracking(&new_block->lock); return status; } static bool block_region_might_read_duplicate(uvm_va_block_t *va_block, uvm_va_block_region_t region) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_va_range_t *va_range = va_block->va_range; if (!uvm_va_space_can_read_duplicate(va_space, NULL)) return false; // TODO: Bug 2046423: need to implement read duplication support in Linux. if (uvm_va_block_is_hmm(va_block) || uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED) return false; if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0) return false; return true; } // Returns the new access permission for the processor that faulted or // triggered access counter notifications on the given page // // TODO: Bug 1766424: this function works on a single page at a time. This // could be changed in the future to optimize multiple faults/counters on // contiguous pages. static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_id_t fault_processor_id, uvm_processor_id_t new_residency, uvm_fault_access_type_t access_type) { uvm_va_range_t *va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_prot_t logical_prot, new_prot; // TODO: Bug 1766432: Refactor into policies. Current policy is // query_promote: upgrade access privileges to avoid future faults IF // they don't trigger further revocations. 
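    // A condensed, compiled-out model of the "query_promote" policy described in
    // the comment above and implemented below: start from the protection implied
    // by the fault itself and upgrade only when the upgrade cannot trigger
    // revocations on other processors. The prot_t enum and the boolean inputs
    // are simplified stand-ins for the UVM mask tests, not driver types.
#if 0
    typedef enum { PROT_RO, PROT_RW, PROT_RWA } prot_t;

    static prot_t query_promote_sketch(prot_t logical_prot,
                                       prot_t fault_prot,
                                       int may_read_duplicate,
                                       int faultable_atomic_mapping_would_need_revoke,
                                       int faulter_has_native_atomics_to_residency)
    {
        prot_t new_prot = fault_prot;

        // Read fault: upgrade to RW if the page won't be read-duplicated and no
        // faultable processor holds an atomic mapping that would need revoking
        if (logical_prot > PROT_RO && new_prot == PROT_RO &&
            !may_read_duplicate && !faultable_atomic_mapping_would_need_revoke)
            new_prot = PROT_RW;

        // Write access: grant atomics when the faulter has native atomics to the
        // new residency, since that cannot force downgrades elsewhere
        if (logical_prot == PROT_RWA && new_prot == PROT_RW &&
            faulter_has_native_atomics_to_residency)
            new_prot = PROT_RWA;

        return new_prot;
    }
#endif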
va_range = va_block->va_range; new_prot = uvm_fault_access_type_to_prot(access_type); logical_prot = uvm_va_range_logical_prot(va_range); UVM_ASSERT(logical_prot >= new_prot); if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY && !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) { uvm_processor_mask_t processors_with_atomic_mapping; uvm_processor_mask_t revoke_processors; uvm_va_block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &processors_with_atomic_mapping); uvm_processor_mask_andnot(&revoke_processors, &processors_with_atomic_mapping, &va_space->has_native_atomics[uvm_id_value(new_residency)]); // Only check if there are no faultable processors in the revoke processors mask uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors); if (uvm_processor_mask_empty(&revoke_processors)) new_prot = UVM_PROT_READ_WRITE; } if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) { if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id)) new_prot = UVM_PROT_READ_WRITE_ATOMIC; } return new_prot; } static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t new_residency, uvm_processor_id_t processor_id, const uvm_processor_mask_t *map_processors, uvm_va_block_region_t region, const uvm_page_mask_t *map_page_mask, uvm_prot_t max_prot, const uvm_processor_mask_t *thrashing_processors, uvm_tracker_t *tracker) { NV_STATUS status; uvm_processor_id_t map_processor_id; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_prot_t new_map_prot = max_prot; uvm_processor_mask_t map_processors_local; uvm_processor_mask_copy(&map_processors_local, map_processors); // Handle atomic mappings separately if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) { bool this_processor_has_native_atomics; this_processor_has_native_atomics = uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id); if (this_processor_has_native_atomics) { uvm_processor_mask_t map_atomic_processors; // Compute processors with native atomics to the residency uvm_processor_mask_and(&map_atomic_processors, &map_processors_local, &va_space->has_native_atomics[uvm_id_value(new_residency)]); // Filter out these mapped processors for the next steps uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors); for_each_id_in_mask(map_processor_id, &map_atomic_processors) { UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) cause = UvmEventMapRemoteCauseThrashing; status = uvm_va_block_map(va_block, va_block_context, map_processor_id, region, map_page_mask, UVM_PROT_READ_WRITE_ATOMIC, cause, tracker); if (status != NV_OK) return status; } new_map_prot = UVM_PROT_READ_WRITE; } else { if (UVM_ID_IS_CPU(processor_id)) new_map_prot = UVM_PROT_READ_WRITE; else new_map_prot = UVM_PROT_READ_ONLY; } } // Map the rest of processors for_each_id_in_mask(map_processor_id, &map_processors_local) { UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy; uvm_prot_t final_map_prot; bool map_processor_has_enabled_system_wide_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id); // Write mappings from processors with disabled system-wide atomics 
are treated like atomics if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics) final_map_prot = UVM_PROT_READ_WRITE_ATOMIC; else final_map_prot = new_map_prot; if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id)) cause = UvmEventMapRemoteCauseThrashing; status = uvm_va_block_map(va_block, va_block_context, map_processor_id, region, map_page_mask, final_map_prot, cause, tracker); if (status != NV_OK) return status; } return NV_OK; } NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t new_residency, uvm_processor_id_t processor_id, uvm_va_block_region_t region, const uvm_page_mask_t *map_page_mask, uvm_prot_t max_prot, const uvm_processor_mask_t *thrashing_processors) { NV_STATUS tracker_status, status = NV_OK; uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus; uvm_processor_id_t map_processor_id; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); const uvm_page_mask_t *final_page_mask = map_page_mask; uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); uvm_va_policy_t *policy = va_block_context->policy; uvm_processor_id_t preferred_location; // Read duplication takes precedence over SetAccesedBy. // // Exclude ranges with read duplication set... if (uvm_va_policy_is_read_duplicate(policy, va_space)) { status = NV_OK; goto out; } // ... and pages read-duplicated by performance heuristics if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) { if (map_page_mask) { uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask, map_page_mask, &va_block->read_duplicated_pages); } else { uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages); } final_page_mask = &va_block_context->mapping.filtered_page_mask; } // Add mappings for accessed_by processors and the given processor mask if (thrashing_processors) uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors); else uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by); // Only processors that can access the new location must be considered uvm_processor_mask_and(&map_other_processors, &map_other_processors, &va_space->accessible_from[uvm_id_value(new_residency)]); // Exclude caller processor as it must have already been mapped uvm_processor_mask_clear(&map_other_processors, processor_id); // Exclude preferred location so it won't get remote mappings preferred_location = policy->preferred_location; if (UVM_ID_IS_VALID(preferred_location) && !uvm_id_equal(new_residency, preferred_location) && uvm_va_space_processor_has_memory(va_space, preferred_location)) { uvm_processor_mask_clear(&map_other_processors, preferred_location); } // Map the UVM-Lite GPUs if the new location is the preferred location. This // will only create mappings on first touch. After that they're persistent // so uvm_va_block_map will be a no-op. 
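    // A minimal, compiled-out model of the mask algebra used in this function:
    // which processors receive remote mappings after a migration. Plain 64-bit
    // masks (one bit per processor) stand in for uvm_processor_mask_t; the
    // inputs are simplifications of the VA space state, not driver types.
#if 0
#include <stdint.h>

    static uint64_t map_other_processors_sketch(uint64_t accessed_by,
                                                uint64_t thrashing,
                                                uint64_t accessible_from_residency,
                                                uint64_t uvm_lite_gpus,
                                                unsigned caller_bit,
                                                int preferred_is_remote_with_memory,
                                                unsigned preferred_bit)
    {
        uint64_t mask = accessed_by | thrashing;

        // Only processors that can access the new residency are candidates
        mask &= accessible_from_residency;

        // The caller has already been mapped by fault/counter servicing itself
        mask &= ~(1ull << caller_bit);

        // The preferred location never gets a remote mapping
        if (preferred_is_remote_with_memory)
            mask &= ~(1ull << preferred_bit);

        // UVM-Lite GPUs are handled separately below: they are only mapped when
        // the new residency is the preferred location
        return mask & ~uvm_lite_gpus;
    }
#endif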
uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block)); if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) && uvm_id_equal(new_residency, preferred_location)) { for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) { status = uvm_va_block_map(va_block, va_block_context, map_processor_id, region, final_page_mask, UVM_PROT_READ_WRITE_ATOMIC, UvmEventMapRemoteCauseCoherence, &local_tracker); if (status != NV_OK) goto out; } } uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block)); // We can't map non-migratable pages to the CPU. If we have any, build a // new mask of migratable pages and map the CPU separately. if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) && !uvm_range_group_all_migratable(va_space, uvm_va_block_region_start(va_block, region), uvm_va_block_region_end(va_block, region))) { uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask; uvm_range_group_migratable_page_mask(va_block, region, migratable_mask); if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) { uvm_processor_mask_t cpu_mask; uvm_processor_mask_zero(&cpu_mask); uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU); status = do_block_add_mappings_after_migration(va_block, va_block_context, new_residency, processor_id, &cpu_mask, region, migratable_mask, max_prot, thrashing_processors, &local_tracker); if (status != NV_OK) goto out; } uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU); } status = do_block_add_mappings_after_migration(va_block, va_block_context, new_residency, processor_id, &map_other_processors, region, final_page_mask, max_prot, thrashing_processors, &local_tracker); if (status != NV_OK) goto out; out: tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); uvm_tracker_deinit(&local_tracker); return status == NV_OK ? tracker_status : status; } // TODO: Bug 1750144: check logical permissions from HMM to know what's the // maximum allowed. uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block, uvm_processor_id_t processor_id, uvm_page_index_t page_index) { uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_processor_mask_t resident_processors; NvU32 resident_processors_count; if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id)) return UVM_PROT_READ_WRITE_ATOMIC; uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors); resident_processors_count = uvm_processor_mask_get_count(&resident_processors); if (resident_processors_count == 0) { return UVM_PROT_NONE; } else if (resident_processors_count > 1) { // If there are many copies, we can only map READ ONLY // // The block state doesn't track the mapping target (aperture) of each // individual PTE, just the permissions and where the data is resident. // If the data is resident in multiple places, then we have a problem // since we can't know where the PTE points. This means we won't know // what needs to be unmapped for cases like UvmUnregisterGpu and // UvmDisablePeerAccess. // // The simple way to solve this is to enforce that a read-duplication // mapping always points to local memory. 
if (uvm_processor_mask_test(&resident_processors, processor_id)) return UVM_PROT_READ_ONLY; return UVM_PROT_NONE; } else { uvm_processor_id_t atomic_id; uvm_processor_id_t residency; uvm_processor_mask_t atomic_mappings; uvm_processor_mask_t write_mappings; // Search the id of the processor with the only resident copy residency = uvm_processor_mask_find_first_id(&resident_processors); UVM_ASSERT(UVM_ID_IS_VALID(residency)); // If we cannot map the processor with the resident copy, exit if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id)) return UVM_PROT_NONE; // Fast path: if the page is not mapped anywhere else, it can be safely // mapped with RWA permission if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index)) return UVM_PROT_READ_WRITE_ATOMIC; uvm_va_block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings); // Exclude processors with system-wide atomics disabled from atomic_mappings uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors); // Exclude the processor for which the mapping protections are being computed uvm_processor_mask_clear(&atomic_mappings, processor_id); // If there is any processor with atomic mapping, check if it has native atomics to the processor // with the resident copy. If it does not, we can only map READ ONLY atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings); if (UVM_ID_IS_VALID(atomic_id) && !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) { return UVM_PROT_READ_ONLY; } uvm_va_block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings); // Exclude the processor for which the mapping protections are being computed uvm_processor_mask_clear(&write_mappings, processor_id); // At this point, any processor with atomic mappings either has native atomics support to the // processor with the resident copy or has disabled system-wide atomics. If the requesting // processor has disabled system-wide atomics or has native atomics to that processor, we can // map with ATOMIC privileges. Likewise, if there are no other processors with WRITE or ATOMIC // mappings, we can map with ATOMIC privileges. 
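    // A compiled-out summary of the decision ladder implemented by
    // uvm_va_block_page_compute_highest_permission(), including the final step
    // coded just below. The boolean inputs are simplified stand-ins for the
    // processor-mask tests in the real code.
#if 0
    typedef enum { PROT_NONE, PROT_RO, PROT_RW, PROT_RWA } prot_t;

    static prot_t highest_permission_sketch(int requester_is_uvm_lite,
                                            unsigned resident_copies,
                                            int requester_is_resident,
                                            int residency_accessible_from_requester,
                                            int page_maybe_mapped_elsewhere,
                                            int atomic_mapper_lacks_native_atomics,
                                            int requester_swa_disabled,
                                            int requester_has_native_atomics,
                                            int other_rw_mappings_exist)
    {
        if (requester_is_uvm_lite)
            return PROT_RWA;                       // UVM-Lite always maps RWA

        if (resident_copies == 0)
            return PROT_NONE;

        if (resident_copies > 1)                   // read-duplicated: local RO only
            return requester_is_resident ? PROT_RO : PROT_NONE;

        if (!residency_accessible_from_requester)
            return PROT_NONE;

        if (!page_maybe_mapped_elsewhere)          // fast path: nothing to downgrade
            return PROT_RWA;

        if (atomic_mapper_lacks_native_atomics)    // foreign atomics would break
            return PROT_RO;

        if (requester_swa_disabled || requester_has_native_atomics ||
            !other_rw_mappings_exist)
            return PROT_RWA;

        return PROT_RW;
    }
#endif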
if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) || uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) || uvm_processor_mask_empty(&write_mappings)) { return UVM_PROT_READ_WRITE_ATOMIC; } return UVM_PROT_READ_WRITE; } } NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block, uvm_va_block_context_t *va_block_context, uvm_processor_id_t processor_id, uvm_va_block_region_t region, const uvm_page_mask_t *page_mask, UvmEventMapRemoteCause cause) { uvm_va_range_t *va_range = va_block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); NV_STATUS status = NV_OK; uvm_page_index_t page_index; uvm_range_group_range_iter_t iter; uvm_prot_t prot_to_map; if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { if (!uvm_va_range_vma_check(va_range, va_block_context->mm)) return NV_OK; uvm_range_group_range_migratability_iter_first(va_space, uvm_va_block_region_start(va_block, region), uvm_va_block_region_end(va_block, region), &iter); } for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) va_block_context->mask_by_prot[prot_to_map - 1].count = 0; for_each_va_block_page_in_region_mask(page_index, page_mask, region) { // Read duplication takes precedence over SetAccesedBy. Exclude pages // read-duplicated by performance heuristics if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index)) continue; prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index); if (prot_to_map == UVM_PROT_NONE) continue; if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) { while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) { uvm_range_group_range_migratability_iter_next(va_space, &iter, uvm_va_block_region_end(va_block, region)); } if (!iter.migratable) continue; } if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0) uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask); uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index); } for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) { if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0) continue; status = uvm_va_block_map(va_block, va_block_context, processor_id, region, &va_block_context->mask_by_prot[prot_to_map - 1].page_mask, prot_to_map, cause, &va_block->tracker); if (status != NV_OK) break; } return status; } static bool can_read_duplicate(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_va_policy_t *policy, const uvm_perf_thrashing_hint_t *thrashing_hint) { if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block))) return true; if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) && thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN) return true; return false; } // TODO: Bug 1827400: If the faulting processor has support for native // atomics to the current location and the faults on the page were // triggered by atomic accesses only, we keep the current residency. // This is a short-term solution to exercise remote atomics over // NVLINK when possible (not only when preferred location is set to // the remote GPU) as they are much faster than relying on page // faults and permission downgrades, which cause thrashing. 
In the // future, the thrashing detection/prevention heuristics should // detect and handle this case. static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space, NvU32 access_type_mask, uvm_processor_id_t processor_id, uvm_processor_id_t residency) { // This policy can be enabled/disabled using a module parameter if (!uvm_perf_map_remote_on_native_atomics_fault) return false; // Only consider atomics faults if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK) return false; // We cannot differentiate CPU writes from atomics. We exclude CPU faults // from the logic explained above in order to avoid mapping CPU to vidmem // memory due to a write. if (UVM_ID_IS_CPU(processor_id)) return false; // On P9 systems (which have native HW support for system-wide atomics), we // have determined experimentally that placing memory on a GPU yields the // best performance on most cases (since CPU can cache vidmem but not vice // versa). Therefore, don't map remotely if the current residency is // sysmem. if (UVM_ID_IS_CPU(residency)) return false; return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id); } // TODO: Bug 1766424: this function works on a single page at a time. This // could be changed in the future to optimize multiple faults or access // counter notifications on contiguous pages. static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_id_t processor_id, NvU32 access_type_mask, uvm_va_policy_t *policy, const uvm_perf_thrashing_hint_t *thrashing_hint, uvm_service_operation_t operation, bool *read_duplicate) { uvm_processor_id_t closest_resident_processor; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); bool may_read_duplicate; uvm_processor_id_t preferred_location; if (is_uvm_fault_force_sysmem_set()) { *read_duplicate = false; return UVM_ID_CPU; } may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint); // Read/prefetch faults on a VA range with read duplication enabled // always create a copy of the page on the faulting processor's memory. // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH, // which will lead to read duplication if it is enabled. 
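    // A compiled-out restatement of the remote-atomics policy implemented by
    // map_remote_on_atomic_fault() above, as a plain predicate. The inputs are
    // simplified stand-ins; "all accesses are atomic" corresponds to the
    // uvm_fault_access_type_mask_lowest() >= ATOMIC_WEAK check in the real code.
#if 0
    static int should_map_remote_on_atomic_fault_sketch(int policy_enabled,
                                                        int all_accesses_are_atomic,
                                                        int faulter_is_cpu,
                                                        int residency_is_cpu,
                                                        int faulter_has_native_atomics_to_residency)
    {
        if (!policy_enabled)
            return 0;
        if (!all_accesses_are_atomic)  // mixed reads/writes: migrate as usual
            return 0;
        if (faulter_is_cpu)            // CPU writes and atomics are indistinguishable
            return 0;
        if (residency_is_cpu)          // keep the GPU-placement preference
            return 0;
        return faulter_has_native_atomics_to_residency;
    }
#endif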
*read_duplicate = may_read_duplicate && (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ); if (*read_duplicate) return processor_id; *read_duplicate = false; // If read-duplication is active in the page but we are not // read-duplicating because the access type is not a read or a prefetch, // the faulting processor should get a local copy if (may_read_duplicate) return processor_id; // If the faulting processor is the preferred location always migrate preferred_location = policy->preferred_location; if (uvm_id_equal(processor_id, preferred_location)) { if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) { UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN); if (uvm_va_space_processor_has_memory(va_space, processor_id)) UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id)); } return processor_id; } if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)], processor_id)); return thrashing_hint->pin.residency; } closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id); // If the page is not resident anywhere, select the preferred location as // long as the preferred location is accessible from the faulting processor. // Otherwise select the faulting processor. if (UVM_ID_IS_INVALID(closest_resident_processor)) { if (UVM_ID_IS_VALID(preferred_location) && uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id)) { return preferred_location; } return processor_id; } // AccessedBy mappings might have not been created for the CPU if the thread // which made the memory resident did not have the proper references on the // mm_struct (for example, the GPU fault handling path when // uvm_va_space_mm_enabled() is false). // // Also, in uvm_migrate_*, we implement a two-pass scheme in which // AccessedBy mappings may be delayed to the second pass. This can produce // faults even if the faulting processor is in the accessed_by mask. // // Here, we keep it on the current residency and we just add the missing // mapping. if (uvm_processor_mask_test(&policy->accessed_by, processor_id) && uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) && operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { return closest_resident_processor; } // Check if we should map the closest resident processor remotely on atomic // fault if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor)) return closest_resident_processor; // If the processor has access to the preferred location, and the page is // not resident on the accessing processor, move it to the preferred // location. if (!uvm_id_equal(closest_resident_processor, processor_id) && UVM_ID_IS_VALID(preferred_location) && uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id)) return preferred_location; // If the page is resident on a processor other than the preferred location, // or the faulting processor can't access the preferred location, we select // the faulting processor as the new residency. 
return processor_id; } uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_processor_id_t processor_id, NvU32 access_type_mask, uvm_va_policy_t *policy, const uvm_perf_thrashing_hint_t *thrashing_hint, uvm_service_operation_t operation, bool *read_duplicate) { uvm_processor_id_t id = block_select_residency(va_block, page_index, processor_id, access_type_mask, policy, thrashing_hint, operation, read_duplicate); // If the intended residency doesn't have memory, fall back to the CPU. if (!block_processor_has_memory(va_block, id)) { *read_duplicate = false; return UVM_ID_CPU; } return id; } static bool check_access_counters_dont_revoke(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_va_block_region_t region, const uvm_processor_mask_t *revoke_processors, const uvm_page_mask_t *revoke_page_mask, uvm_prot_t revoke_prot) { uvm_processor_id_t id; for_each_id_in_mask(id, revoke_processors) { const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot); uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot); UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0); } return true; } NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id, uvm_va_block_t *va_block, uvm_va_block_retry_t *block_retry, uvm_service_block_context_t *service_context) { NV_STATUS status = NV_OK; uvm_processor_id_t new_residency; uvm_prot_t new_prot; uvm_va_range_t *va_range = va_block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); uvm_perf_prefetch_hint_t prefetch_hint = UVM_PERF_PREFETCH_HINT_NONE(); uvm_processor_mask_t processors_involved_in_cpu_migration; uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); // GPU fault servicing must be done under the VA space read lock. GPU fault // servicing is required for RM to make forward progress, and we allow other // threads to call into RM while holding the VA space lock in read mode. If // we took the VA space lock in write mode on the GPU fault service path, // we could deadlock because the thread in RM which holds the VA space lock // for read wouldn't be able to complete until fault servicing completes. if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id)) uvm_assert_rwsem_locked(&va_space->lock); else uvm_assert_rwsem_locked_read(&va_space->lock); // Performance heuristics policy: we only consider prefetching when there // are migrations to a single processor, only. 
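    // A compiled-out summary of the residency-selection priority order used by
    // block_select_residency()/uvm_va_block_select_residency() above (the wrapper
    // additionally falls back to the CPU when the chosen processor has no
    // memory). The struct fields are simplified stand-ins that fold together the
    // corresponding mask tests in the real code.
#if 0
    struct residency_inputs {
        int force_sysmem_test;               // test-only module parameter
        int may_read_duplicate;
        int access_is_read_or_prefetch;
        int faulter_is_preferred;
        int thrashing_pin;                   // thrashing hint of type PIN
        int page_resident_somewhere;
        int preferred_valid_and_accessible;  // preferred location reachable by faulter
        int faulter_in_accessed_by;          // and residency accessible, not access counters
        int remote_atomics_ok;               // map_remote_on_atomic_fault()
        int resident_on_faulter;             // closest residency is the faulter itself
    };

    enum residency_choice { CHOOSE_CPU, CHOOSE_FAULTER, CHOOSE_PIN_RESIDENCY,
                            CHOOSE_PREFERRED, CHOOSE_CURRENT_RESIDENCY };

    static enum residency_choice select_residency_sketch(const struct residency_inputs *in,
                                                         int *read_duplicate)
    {
        *read_duplicate = 0;

        if (in->force_sysmem_test)
            return CHOOSE_CPU;

        if (in->may_read_duplicate && in->access_is_read_or_prefetch) {
            *read_duplicate = 1;
            return CHOOSE_FAULTER;                 // make a new local copy
        }

        if (in->may_read_duplicate)                // non-read access breaks duplication
            return CHOOSE_FAULTER;

        if (in->faulter_is_preferred)
            return CHOOSE_FAULTER;

        if (in->thrashing_pin)
            return CHOOSE_PIN_RESIDENCY;

        if (!in->page_resident_somewhere)
            return in->preferred_valid_and_accessible ? CHOOSE_PREFERRED : CHOOSE_FAULTER;

        if (in->faulter_in_accessed_by)            // keep residency, just add the mapping
            return CHOOSE_CURRENT_RESIDENCY;

        if (in->remote_atomics_ok)
            return CHOOSE_CURRENT_RESIDENCY;

        if (!in->resident_on_faulter && in->preferred_valid_and_accessible)
            return CHOOSE_PREFERRED;

        return CHOOSE_FAULTER;
    }
#endif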
if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) { uvm_page_index_t page_index; uvm_page_mask_t *new_residency_mask; uvm_va_policy_t *policy = service_context->block_context.policy; new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors); new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; // Update prefetch tracking structure with the pages that will migrate // due to faults uvm_perf_prefetch_prenotify_fault_migrations(va_block, &service_context->block_context, new_residency, new_residency_mask, service_context->region); prefetch_hint = uvm_perf_prefetch_get_hint(va_block, new_residency_mask); // Obtain the prefetch hint and give a fake fault access type to the // prefetched pages if (UVM_ID_IS_VALID(prefetch_hint.residency)) { UVM_ASSERT(prefetch_hint.prefetch_pages_mask != NULL); for_each_va_block_page_in_mask(page_index, prefetch_hint.prefetch_pages_mask, va_block) { UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index)); service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH; if (uvm_va_policy_is_read_duplicate(policy, va_space) || (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED && uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) { if (service_context->read_duplicate_count++ == 0) uvm_page_mask_zero(&service_context->read_duplicate_mask); uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); } } service_context->region = uvm_va_block_region_from_block(va_block); } } for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot) service_context->mappings_by_prot[new_prot-1].count = 0; uvm_processor_mask_zero(&processors_involved_in_cpu_migration); // 1- Migrate pages and compute mapping protections for_each_id_in_mask(new_residency, &service_context->resident_processors) { uvm_processor_mask_t *all_involved_processors = &service_context->block_context.make_resident.all_involved_processors; uvm_page_mask_t *new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency; uvm_page_index_t page_index; uvm_make_resident_cause_t cause; UVM_ASSERT_MSG(service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS || service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS, "Invalid operation value %u\n", service_context->operation); if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS) cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS) cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; else cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; // 1.1- Migrate pages // Reset masks before all of the make_resident calls uvm_page_mask_zero(did_migrate_mask); uvm_processor_mask_zero(all_involved_processors); if (UVM_ID_IS_VALID(prefetch_hint.residency)) { UVM_ASSERT(uvm_id_equal(prefetch_hint.residency, new_residency)); UVM_ASSERT(prefetch_hint.prefetch_pages_mask != NULL); uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_hint.prefetch_pages_mask); } if (service_context->read_duplicate_count == 0 || uvm_page_mask_andnot(&service_context->block_context.caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) { status = 
uvm_va_block_make_resident(va_block, block_retry, &service_context->block_context, new_residency, service_context->region, service_context->read_duplicate_count == 0? new_residency_mask: &service_context->block_context.caller_page_mask, prefetch_hint.prefetch_pages_mask, cause); if (status != NV_OK) return status; } if (service_context->read_duplicate_count != 0 && uvm_page_mask_and(&service_context->block_context.caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) { status = uvm_va_block_make_resident_read_duplicate(va_block, block_retry, &service_context->block_context, new_residency, service_context->region, &service_context->block_context.caller_page_mask, prefetch_hint.prefetch_pages_mask, cause); if (status != NV_OK) return status; } if (UVM_ID_IS_CPU(new_residency)) { // Save all the processors involved in migrations to the CPU for // an ECC check before establishing the CPU mappings. uvm_processor_mask_copy(&processors_involved_in_cpu_migration, all_involved_processors); } if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors)) service_context->cpu_fault.did_migrate = true; uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask); // 1.2 - Compute mapping protections for the requesting processor on // the new residency for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) { new_prot = compute_new_permission(va_block, page_index, processor_id, new_residency, service_context->access_type[page_index]); if (service_context->mappings_by_prot[new_prot-1].count++ == 0) uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot-1].page_mask); uvm_page_mask_set(&service_context->mappings_by_prot[new_prot-1].page_mask, page_index); } // 1.3- Revoke permissions // // NOTE: uvm_va_block_make_resident destroys mappings to old locations. // Thus, we need to revoke only if residency did not change and we // are mapping higher than READ ONLY. for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { bool pages_need_revocation; uvm_processor_mask_t revoke_processors; uvm_prot_t revoke_prot; bool this_processor_has_enabled_atomics; if (service_context->mappings_by_prot[new_prot-1].count == 0) continue; pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask, &service_context->did_not_migrate_mask, &service_context->mappings_by_prot[new_prot-1].page_mask); if (!pages_need_revocation) continue; uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors); // Do not revoke the processor that took the fault uvm_processor_mask_clear(&revoke_processors, processor_id); this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id); // Atomic operations on processors with system-wide atomics // disabled or with native atomics access to new_residency // behave like writes. 
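    // A compiled-out model of the revocation-set computation performed in the
    // loop below. Plain 64-bit masks (one bit per processor) stand in for
    // uvm_processor_mask_t; the logic mirrors the mask operations in the code,
    // not a separate algorithm.
#if 0
#include <stdint.h>

    typedef enum { SK_PROT_RO, SK_PROT_RW, SK_PROT_RWA } sk_prot_t;

    // Returns the processors whose mappings must be downgraded when the faulter
    // is granted new_prot on pages that did not change residency.
    static uint64_t revoke_set_sketch(uint64_t mapped,
                                      uint64_t faultable,
                                      uint64_t native_atomics_to_residency,
                                      uint64_t swa_enabled,   // system-wide atomics enabled
                                      uint64_t uvm_lite_gpus,
                                      unsigned faulter_bit,
                                      int faulter_is_cpu,
                                      sk_prot_t new_prot,
                                      sk_prot_t *revoke_prot)
    {
        uint64_t revoke = mapped & faultable & ~(1ull << faulter_bit);
        int faulter_swa_enabled = !!(swa_enabled & (1ull << faulter_bit));
        int faulter_native = !!(native_atomics_to_residency & (1ull << faulter_bit));

        // Writes, and atomics that behave like writes, don't force downgrades on
        // processors that have native atomics to the residency or that have
        // system-wide atomics disabled
        if (new_prot == SK_PROT_RW || !faulter_swa_enabled || faulter_native)
            revoke &= ~native_atomics_to_residency & swa_enabled;

        // CPU faults always revoke atomic mappings; for GPU faults, granting RWA
        // revokes RW and above elsewhere, while granting RW revokes only RWA
        if (faulter_is_cpu)
            *revoke_prot = SK_PROT_RWA;
        else
            *revoke_prot = (new_prot == SK_PROT_RWA) ? SK_PROT_RW : SK_PROT_RWA;

        // UVM-Lite GPUs always keep RWA mappings
        return revoke & ~uvm_lite_gpus;
    }
#endif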
if (new_prot == UVM_PROT_READ_WRITE || !this_processor_has_enabled_atomics || uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) { // Exclude processors with native atomics on the resident copy uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, &va_space->has_native_atomics[uvm_id_value(new_residency)]); // Exclude processors with disabled system-wide atomics uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->system_wide_atomics_enabled_processors); } if (UVM_ID_IS_CPU(processor_id)) { revoke_prot = UVM_PROT_READ_WRITE_ATOMIC; } else { revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE: UVM_PROT_READ_WRITE_ATOMIC; } // UVM-Lite processors must always have RWA mappings if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) { // Access counters should never trigger revocations apart from // read-duplication, which are performed in the calls to // uvm_va_block_make_resident_read_duplicate, above. if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) { UVM_ASSERT(check_access_counters_dont_revoke(va_block, &service_context->block_context, service_context->region, &revoke_processors, &service_context->revocation_mask, revoke_prot)); } // Downgrade other processors' mappings status = uvm_va_block_revoke_prot_mask(va_block, &service_context->block_context, &revoke_processors, service_context->region, &service_context->revocation_mask, revoke_prot); if (status != NV_OK) return status; } } } // 2- Check for ECC errors on all GPUs involved in the migration if CPU is // the destination. Migrations in response to CPU faults are special // because they're on the only path (apart from tools) where CUDA is not // involved and wouldn't have a chance to do its own ECC checking. if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS && UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(&processors_involved_in_cpu_migration)) { uvm_gpu_t *gpu; // Before checking for ECC errors, make sure all of the GPU work // is finished. Creating mappings on the CPU would have to wait // for the tracker anyway so this shouldn't hurt performance. status = uvm_tracker_wait(&va_block->tracker); if (status != NV_OK) return status; for_each_va_space_gpu_in_mask(gpu, va_space, &processors_involved_in_cpu_migration) { // We cannot call into RM here so use the no RM ECC check. status = uvm_gpu_check_ecc_error_no_rm(gpu); if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { // In case we need to call into RM to be sure whether // there is an ECC error or not, signal that to the // caller by adding the GPU to the mask. // // In that case the ECC error might be noticed only after // the CPU mappings have been already created below, // exposing different CPU threads to the possibly corrupt // data, but this thread will fault eventually and that's // considered to be an acceptable trade-off between // performance and ECC error containment. 
uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id); status = NV_OK; } if (status != NV_OK) return status; } } // 3- Map requesting processor with the necessary privileges for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot-1].page_mask; if (service_context->mappings_by_prot[new_prot-1].count == 0) continue; // 3.1 - Unmap CPU pages if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS && UVM_ID_IS_CPU(processor_id)) { // The kernel can downgrade our CPU mappings at any time without // notifying us, which means our PTE state could be stale. We // handle this by unmapping the CPU PTE and re-mapping it again. // // A CPU fault is unexpected if: // curr_prot == RW || (!is_write && curr_prot == RO) status = uvm_va_block_unmap(va_block, &service_context->block_context, UVM_ID_CPU, service_context->region, map_prot_mask, NULL); if (status != NV_OK) return status; } // 3.2 - Add new mappings // The faulting processor can be mapped remotely due to user policy or // the thrashing mitigation heuristics. Therefore, we set the cause // accordingly in each case. // Map pages that are thrashing first if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) { uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask; bool pages_need_mapping = uvm_page_mask_and(helper_page_mask, map_prot_mask, &service_context->thrashing_pin_mask); if (pages_need_mapping) { status = uvm_va_block_map(va_block, &service_context->block_context, processor_id, service_context->region, helper_page_mask, new_prot, UvmEventMapRemoteCauseThrashing, &va_block->tracker); if (status != NV_OK) return status; // Remove thrashing pages from the map mask pages_need_mapping = uvm_page_mask_andnot(helper_page_mask, map_prot_mask, &service_context->thrashing_pin_mask); if (!pages_need_mapping) continue; map_prot_mask = helper_page_mask; } } status = uvm_va_block_map(va_block, &service_context->block_context, processor_id, service_context->region, map_prot_mask, new_prot, UvmEventMapRemoteCausePolicy, &va_block->tracker); if (status != NV_OK) return status; } // 4- If pages did migrate, map SetAccessedBy processors, except for UVM-Lite for_each_id_in_mask(new_residency, &service_context->resident_processors) { const uvm_page_mask_t *new_residency_mask; new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency; for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) { uvm_page_mask_t *map_prot_mask = &service_context->block_context.caller_page_mask; bool pages_need_mapping; if (service_context->mappings_by_prot[new_prot-1].count == 0) continue; pages_need_mapping = uvm_page_mask_and(map_prot_mask, new_residency_mask, &service_context->mappings_by_prot[new_prot-1].page_mask); if (!pages_need_mapping) continue; // Map pages that are thrashing if (service_context->thrashing_pin_count > 0) { uvm_page_index_t page_index; for_each_va_block_page_in_region_mask(page_index, &service_context->thrashing_pin_mask, service_context->region) { uvm_processor_mask_t *map_thrashing_processors = NULL; NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index); // Check protection type if (!uvm_page_mask_test(map_prot_mask, page_index)) continue; map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr); status = 
uvm_va_block_add_mappings_after_migration(va_block, &service_context->block_context, new_residency, processor_id, uvm_va_block_region_for_page(page_index), map_prot_mask, new_prot, map_thrashing_processors); if (status != NV_OK) return status; } pages_need_mapping = uvm_page_mask_andnot(map_prot_mask, map_prot_mask, &service_context->thrashing_pin_mask); if (!pages_need_mapping) continue; } // Map the the rest of pages in a single shot status = uvm_va_block_add_mappings_after_migration(va_block, &service_context->block_context, new_residency, processor_id, service_context->region, map_prot_mask, new_prot, NULL); if (status != NV_OK) return status; } } return NV_OK; } // Check if we are faulting on a page with valid permissions to check if we can // skip fault handling. See uvm_va_block_t::cpu::fault_authorized for more // details static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_fault_access_type_t fault_access_type) { if (uvm_va_block_page_is_processor_authorized(va_block, page_index, UVM_ID_CPU, uvm_fault_access_type_to_prot(fault_access_type))) { NvU64 now = NV_GETTIME(); pid_t pid = current->pid; // Latch the pid/timestamp/page_index values for the first time if (!va_block->cpu.fault_authorized.first_fault_stamp) { va_block->cpu.fault_authorized.first_fault_stamp = now; va_block->cpu.fault_authorized.first_pid = pid; va_block->cpu.fault_authorized.page_index = page_index; return true; } // If the same thread shows up again, this means that the kernel // downgraded the page's PTEs. Service the fault to force a remap of // the page. if (va_block->cpu.fault_authorized.first_pid == pid && va_block->cpu.fault_authorized.page_index == page_index) { va_block->cpu.fault_authorized.first_fault_stamp = 0; } else { // If the window has expired, clear the information and service the // fault. Otherwise, just return if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns) va_block->cpu.fault_authorized.first_fault_stamp = 0; else return true; } } return false; } static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block, uvm_va_block_retry_t *va_block_retry, NvU64 fault_addr, uvm_fault_access_type_t fault_access_type, uvm_service_block_context_t *service_context) { uvm_va_range_t *va_range = va_block->va_range; uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); NV_STATUS status = NV_OK; uvm_page_index_t page_index; uvm_perf_thrashing_hint_t thrashing_hint; uvm_processor_id_t new_residency; bool read_duplicate; uvm_assert_rwsem_locked(&va_space->lock); UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); UVM_ASSERT(fault_addr >= va_block->start); UVM_ASSERT(fault_addr <= va_block->end); // There are up to three mm_structs to worry about, and they might all be // different: // // 1) vma->vm_mm // 2) current->mm // 3) va_space->va_space_mm.mm (though note that if this is valid, then it // must match vma->vm_mm). // // The kernel guarantees that vma->vm_mm has a reference taken with // mmap_lock held on the CPU fault path, so tell the fault handler to use // that one. current->mm might differ if we're on the access_process_vm // (ptrace) path or if another driver is calling get_user_pages. 
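    // A minimal, compiled-out restatement of the rule described above: the CPU
    // fault path always uses the mm that owns the faulting vma, since that is
    // the mm whose mmap_lock the kernel holds here; current->mm may belong to a
    // different process on ptrace or get_user_pages paths.
#if 0
#include <linux/mm_types.h>

    static struct mm_struct *fault_mm_sketch(struct vm_area_struct *vma)
    {
        return vma->vm_mm;
    }
#endif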
service_context->block_context.mm = uvm_va_range_vma(va_range)->vm_mm; uvm_assert_mmap_lock_locked(service_context->block_context.mm); service_context->block_context.policy = uvm_va_policy_get(va_block, fault_addr); if (service_context->num_retries == 0) { // notify event to tools/performance heuristics uvm_perf_event_notify_cpu_fault(&va_space->perf_events, va_block, service_context->block_context.policy->preferred_location, fault_addr, fault_access_type > UVM_FAULT_ACCESS_TYPE_READ, KSTK_EIP(current)); } // Check logical permissions status = uvm_va_range_check_logical_permissions(va_block->va_range, UVM_ID_CPU, fault_access_type, uvm_range_group_address_migratable(va_space, fault_addr)); if (status != NV_OK) return status; uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc); page_index = uvm_va_block_cpu_page_index(va_block, fault_addr); if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type)) return NV_OK; thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU); // Throttling is implemented by sleeping in the fault handler on the CPU if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) { service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp; return NV_WARN_MORE_PROCESSING_REQUIRED; } service_context->read_duplicate_count = 0; service_context->thrashing_pin_count = 0; service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS; if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) { uvm_page_mask_zero(&service_context->thrashing_pin_mask); uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index); service_context->thrashing_pin_count = 1; } // Compute new residency and update the masks new_residency = uvm_va_block_select_residency(va_block, page_index, UVM_ID_CPU, uvm_fault_access_type_mask_bit(fault_access_type), service_context->block_context.policy, &thrashing_hint, UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS, &read_duplicate); // Initialize the minimum necessary state in the fault service context uvm_processor_mask_zero(&service_context->resident_processors); // Set new residency and update the masks uvm_processor_mask_set(&service_context->resident_processors, new_residency); // The masks need to be fully zeroed as the fault region may grow due to prefetching uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index); if (read_duplicate) { uvm_page_mask_zero(&service_context->read_duplicate_mask); uvm_page_mask_set(&service_context->read_duplicate_mask, page_index); service_context->read_duplicate_count = 1; } service_context->access_type[page_index] = fault_access_type; service_context->region = uvm_va_block_region_for_page(page_index); status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context); ++service_context->num_retries; return status; } NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block, NvU64 fault_addr, bool is_write, uvm_service_block_context_t *service_context) { NV_STATUS status; uvm_va_block_retry_t va_block_retry; uvm_fault_access_type_t fault_access_type; if (is_write) fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG; else fault_access_type = UVM_FAULT_ACCESS_TYPE_READ; service_context->num_retries = 0; service_context->cpu_fault.did_migrate = false; // We have to use vm_insert_page instead of handing the page to the kernel // and 
letting it insert the mapping, and we must do that while holding the // lock on this VA block. Otherwise there will be a window in which we think // we've mapped the page but the CPU mapping hasn't actually been created // yet. During that window a GPU fault event could arrive and claim // ownership of that VA, "unmapping" it. Then later the kernel would // eventually establish the mapping, and we'd end up with both CPU and GPU // thinking they each owned the page. // // This function must only be called when it's safe to call vm_insert_page. // That is, there must be a reference held on the vma's vm_mm, and // vm_mm->mmap_lock is held in at least read mode. Note that current->mm // might not be vma->vm_mm. status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry, block_cpu_fault_locked(va_block, &va_block_retry, fault_addr, fault_access_type, service_context)); return status; } NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block) { uvm_va_range_t *va_range; uvm_va_block_t *block; size_t index; va_range = uvm_va_range_find(va_space, addr); if (!va_range) return uvm_hmm_va_block_find(va_space, addr, out_block); UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) return NV_ERR_INVALID_ADDRESS; index = uvm_va_range_block_index(va_range, addr); block = uvm_va_range_block(va_range, index); if (!block) return NV_ERR_OBJECT_NOT_FOUND; *out_block = block; return NV_OK; } NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space, struct mm_struct *mm, NvU64 addr, uvm_va_block_context_t *va_block_context, uvm_va_block_t **out_block) { uvm_va_range_t *va_range; size_t index; va_range = uvm_va_range_find(va_space, addr); if (!va_range) { if (!mm) return NV_ERR_INVALID_ADDRESS; return uvm_hmm_va_block_find_create(va_space, addr, va_block_context, out_block); } UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS || uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND); if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) return NV_ERR_INVALID_ADDRESS; index = uvm_va_range_block_index(va_range, addr); return uvm_va_range_block_create(va_range, index, out_block); } NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, NvU64 dst, uvm_mem_t *src_mem, size_t size) { NV_STATUS status; uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst); NvU64 page_offset = dst & (PAGE_SIZE - 1); uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index); void *src = uvm_mem_get_cpu_addr_kernel(src_mem); uvm_gpu_t *gpu; uvm_gpu_address_t src_gpu_address; uvm_gpu_address_t dst_gpu_address; uvm_push_t push; uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT_MSG(UVM_ALIGN_DOWN(dst, PAGE_SIZE) == UVM_ALIGN_DOWN(dst + size - 1, PAGE_SIZE), "dst 0x%llx size 0x%zx\n", dst, size); if (UVM_ID_IS_INVALID(proc)) proc = UVM_ID_CPU; // Use make_resident() in all cases to break read-duplication, but // block_retry can be NULL as if the page is not resident yet we will make // it resident on the CPU. // Notably we don't care about coherence with respect to atomics from other // processors. 
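    // A minimal, compiled-out sketch of the CPU-resident path used further below
    // in uvm_va_block_write_from_cpu(): once the destination page is resident on
    // the CPU and the block tracker has been waited on, the write is a plain
    // kmap + memcpy. Assumes the copy does not cross a page boundary, which the
    // caller asserts.
#if 0
#include <linux/highmem.h>
#include <linux/string.h>

    static void write_resident_page_sketch(struct page *page,
                                           size_t page_offset,
                                           const void *src,
                                           size_t size)
    {
        char *va = kmap(page);

        memcpy(va + page_offset, src, size);
        kunmap(page);
    }
#endif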
status = uvm_va_block_make_resident(va_block, NULL, block_context, proc, region, NULL, NULL, UVM_MAKE_RESIDENT_CAUSE_API_TOOLS); if (status != NV_OK) return status; if (UVM_ID_IS_CPU(proc)) { char *mapped_page; uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, chunk, page_index); status = uvm_tracker_wait(&va_block->tracker); if (status != NV_OK) return status; mapped_page = (char *)kmap(page); memcpy(mapped_page + page_offset, src, size); kunmap(page); return NV_OK; } gpu = block_get_gpu(va_block, proc); dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu); dst_gpu_address.address += page_offset; src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu); status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, &va_block->tracker, &push, "Direct write to [0x%llx, 0x%llx)", dst, dst + size); if (status != NV_OK) return status; gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); return uvm_push_end_and_wait(&push); } NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size) { NV_STATUS status; uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src); NvU64 page_offset = src & (PAGE_SIZE - 1); uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU); void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem); uvm_gpu_t *gpu; uvm_gpu_address_t src_gpu_address; uvm_gpu_address_t dst_gpu_address; uvm_push_t push; uvm_assert_mutex_locked(&va_block->lock); UVM_ASSERT_MSG(UVM_ALIGN_DOWN(src, PAGE_SIZE) == UVM_ALIGN_DOWN(src + size - 1, PAGE_SIZE), "src 0x%llx size 0x%zx\n", src, size); if (UVM_ID_IS_INVALID(proc)) { memset(dst, 0, size); return NV_OK; } if (UVM_ID_IS_CPU(proc)) { char *mapped_page; uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, chunk, page_index); status = uvm_tracker_wait(&va_block->tracker); if (status != NV_OK) return status; mapped_page = (char *)kmap(page); memcpy(dst, mapped_page + page_offset, size); kunmap(page); return NV_OK; } gpu = block_get_gpu(va_block, proc); dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu); src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu); src_gpu_address.address += page_offset; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, &va_block->tracker, &push, "Direct read from [0x%llx, 0x%llx)", src, src + size); if (status != NV_OK) return status; gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size); return uvm_push_end_and_wait(&push); } // Deferred work item reestablishing accessed by mappings after eviction. On // GPUs with access counters enabled, the evicted GPU will also get remote // mappings. 
static void block_deferred_eviction_mappings(void *args) { uvm_va_block_t *va_block = (uvm_va_block_t*)args; uvm_va_space_t *va_space; uvm_processor_id_t id; uvm_va_block_context_t *block_context = NULL; struct mm_struct *mm = NULL; uvm_mutex_lock(&va_block->lock); va_space = uvm_va_block_get_va_space_maybe_dead(va_block); uvm_mutex_unlock(&va_block->lock); if (!va_space) { // Block has been killed in the meantime goto done; } mm = uvm_va_space_mm_retain_lock(va_space); block_context = uvm_va_block_context_alloc(mm); if (!block_context) goto done; // The block wasn't dead when we checked above and that's enough to // guarantee that the VA space is still around, because // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that // flush waits for this function call to finish. uvm_va_space_down_read(va_space); // Now that we have the VA space lock held, we can check whether the block // is still alive since the VA space write lock is needed to kill blocks. if (uvm_va_block_is_dead(va_block)) goto unlock; if (!uvm_va_block_is_hmm(va_block)) { uvm_va_range_t *va_range = va_block->va_range; NV_STATUS status = NV_OK; block_context->policy = uvm_va_range_get_policy(va_range); for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) { status = uvm_va_block_set_accessed_by(va_block, block_context, id); if (status != NV_OK) break; } // On Volta+ GPUs, we can map evicted memory since we can pull it back // thanks to the access counters notifications if (status == NV_OK && va_space_map_remote_on_eviction(va_space)) { uvm_processor_mask_t map_processors; // Exclude the processors that have been already mapped due to // AccessedBy uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &uvm_va_range_get_policy(va_range)->accessed_by); for_each_gpu_id_in_mask(id, &map_processors) { uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); uvm_va_block_gpu_state_t *gpu_state; if (!gpu->parent->access_counters_supported) continue; gpu_state = uvm_va_block_gpu_state_get(va_block, id); UVM_ASSERT(gpu_state); // TODO: Bug 2096389: uvm_va_block_add_mappings does not add // remote mappings to read-duplicated pages. Add support for it // or create a new function. 
status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL, uvm_va_block_add_mappings(va_block, block_context, id, uvm_va_block_region_from_block(va_block), &gpu_state->evicted, UvmEventMapRemoteCauseEviction)); if (status != NV_OK) break; } } if (status != NV_OK) { UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n", va_block->start, va_block->end, nvstatusToString(status), uvm_va_space_processor_name(va_space, id)); } } unlock: uvm_va_space_up_read(va_space); uvm_va_block_context_free(block_context); done: uvm_va_space_mm_release_unlock(va_space, mm); uvm_va_block_release(va_block); } static void block_deferred_eviction_mappings_entry(void *args) { UVM_ENTRY_VOID(block_deferred_eviction_mappings(args)); } NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *root_chunk, uvm_tracker_t *tracker) { NV_STATUS status = NV_OK; NvU32 i; uvm_va_block_gpu_state_t *gpu_state; uvm_va_block_region_t chunk_region; size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu); size_t chunks_to_evict = 0; uvm_va_block_context_t *block_context; uvm_page_mask_t *pages_to_evict; uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block); uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block); struct mm_struct *mm; uvm_assert_mutex_locked(&va_block->lock); // The block might have been killed in the meantime if (!va_space) return NV_OK; gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); if (!gpu_state) return NV_OK; if (va_block_test && va_block_test->inject_eviction_error) { va_block_test->inject_eviction_error = false; return NV_ERR_NO_MEMORY; } // We cannot take this block's VA space or mmap_lock locks on the eviction // path, however, we retain mm in order to support accounting of CPU memory // allocations. If mappings need to be created, // block_deferred_eviction_mappings() will be scheduled below. mm = uvm_va_space_mm_retain(va_space); block_context = uvm_va_block_context_alloc(mm); if (!block_context) { if (mm) uvm_va_space_mm_release(va_space); return NV_ERR_NO_MEMORY; } pages_to_evict = &block_context->caller_page_mask; uvm_page_mask_zero(pages_to_evict); chunk_region.outer = 0; // Find all chunks that are subchunks of the root chunk for (i = 0; i < num_gpu_chunks; ++i) { uvm_chunk_size_t chunk_size; size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size); UVM_ASSERT(chunk_index == i); chunk_region.first = chunk_region.outer; chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE; if (!gpu_state->chunks[i]) continue; if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk)) continue; uvm_page_mask_region_fill(pages_to_evict, chunk_region); ++chunks_to_evict; } if (chunks_to_evict == 0) goto out; // Only move pages resident on the GPU uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id)); block_context->policy = uvm_va_range_get_policy(va_block->va_range); // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's not // necessary to do so for eviction. Add a version that unmaps only the // processors that have mappings to the pages being evicted. status = uvm_va_block_make_resident(va_block, NULL, block_context, UVM_ID_CPU, uvm_va_block_region_from_block(va_block), pages_to_evict, NULL, UVM_MAKE_RESIDENT_CAUSE_EVICTION); if (status != NV_OK) goto out; // VA space lock may not be held and hence we cannot reestablish any // mappings here and need to defer it to a work queue. 
// // Reading the accessed_by mask without the VA space lock is safe because // adding a new processor to the mask triggers going over all the VA blocks // in the range and locking them. And we hold one of the VA block's locks. // // If uvm_va_range_set_accessed_by() hasn't called // uvm_va_block_set_accessed_by() for this block yet then it will take care // of adding the mapping after we are done. If it already did then we are // guaranteed to see the new processor in the accessed_by mask because we // locked the block's lock that the thread calling // uvm_va_range_set_accessed_by() unlocked after updating the mask. // // If a processor gets removed from the mask then we might not notice and // schedule the work item anyway, but that's benign as // block_deferred_eviction_mappings() re-examines the mask. // // Checking if access counters migrations are enabled on a VA space is racy // without holding the VA space lock. However, this is fine as // block_deferred_eviction_mappings() reexamines the value with the VA space // lock being held. if (uvm_processor_mask_get_count(&block_context->policy->accessed_by) > 0 || (gpu->parent->access_counters_supported && va_space_map_remote_on_eviction(va_space) && !uvm_va_block_is_hmm(va_block))) { // Always retain the VA block first so that it's safe for the deferred // callback to release it immediately after it runs. uvm_va_block_retain(va_block); if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q, &va_block->eviction_mappings_q_item)) { // And release it if no new callback was scheduled uvm_va_block_release_no_destroy(va_block); } } status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker); if (status != NV_OK) goto out; for (i = 0; i < num_gpu_chunks; ++i) { uvm_gpu_id_t accessing_gpu_id; uvm_gpu_chunk_t *chunk = gpu_state->chunks[i]; if (!chunk) continue; if (!uvm_gpu_chunk_same_root(chunk, root_chunk)) continue; // Remove the mappings of indirect peers from the reverse map. We // access the indirect peer mask from the VA space without holding the // VA space lock. Therefore, we can race with enable_peer/disable_peer // operations. However this is fine: // // The enable_peer sequence is as follows: // // set_bit in va_space->indirect_peers // uvm_va_block_enable_peer; // // - If we read the mask BEFORE it is set or AFTER the mapping has // been added to the map there is no race. // - If we read the mask AFTER it is set but BEFORE adding the mapping // to the reverse map, we will try to remove it although it is not // there yet. Therefore, we use // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does // not check if the mapping is present in the reverse map. // // The disable_peer sequence is as follows: // // uvm_va_block_disable_peer; // clear_bit in va_space->indirect_peers // // - If we read the mask BEFORE the mapping has been added to the map // or AFTER the bit has been cleared, there is no race. // - If we read the mask AFTER the mapping has been removed and BEFORE // the bit is cleared, we will try to remove the mapping, too. // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works // in this scenario. // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect // peers are not supported when SMC is enabled. 
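//
// In outline, the per-chunk cleanup below is:
//  1) remove this chunk's entries from each indirect peer's reverse sysmem
//     map,
//  2) unmap the chunk from the GPU's kernel address space
//     (uvm_mmu_chunk_unmap), and
//  3) mark the chunk as evicted in PMM (uvm_pmm_gpu_mark_chunk_evicted).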
for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) { uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id); NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu); uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr); } uvm_mmu_chunk_unmap(chunk, tracker); uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]); gpu_state->chunks[i] = NULL; } out: uvm_va_block_context_free(block_context); if (mm) uvm_va_space_mm_release(va_space); return status; } static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) { uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu); uvm_push_t push; NV_STATUS status; // See comment in uvm_va_block_set_cancel UVM_ASSERT(!gpu->parent->fault_cancel_va_supported); if (!gpu_state) return NV_ERR_NO_MEMORY; // Force all pages to be 4K and prevent future upgrades during cancel gpu_state->force_4k_ptes = true; // If we have no page tables we're done. For fault cancel we need to make // sure that fatal faults are on different 4k PTEs than non-fatal faults, // and we need to service all non-fatal faults before issuing the cancel. So // either all faults are fatal and we have no PTEs (we're PROT_NONE), or // we'll allocate PTEs later when we service the non-fatal faults. Those // PTEs will be 4k since force_4k_ptes is set. if (!block_gpu_has_page_tables(block, gpu)) return NV_OK; // Are we 4k already? if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) return NV_OK; status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL); if (status != NV_OK) return status; status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_MEMOPS, &block->tracker, &push, "Forcing 4k PTEs on block [0x%llx, 0x%llx)", block->start, block->end + 1); if (status != NV_OK) return status; if (gpu_state->pte_is_2m) block_gpu_split_2m(block, block_context, gpu, NULL, &push); else block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push); uvm_push_end(&push); UVM_ASSERT(block_check_mappings(block)); return uvm_tracker_add_push_safe(&block->tracker, &push); } NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu) { uvm_assert_mutex_locked(&va_block->lock); // Volta+ devices support a global VA cancel method that does not require // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it // could result in 4k PTEs on P9 systems which otherwise would never need // them. 
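// GPUs without that support instead take the block_gpu_force_4k_ptes() path
// below, which splits the PTEs so that fatal and non-fatal faults can sit on
// distinct 4k PTEs for a targeted cancel.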
if (gpu->parent->fault_cancel_va_supported) return NV_OK; return block_gpu_force_4k_ptes(va_block, block_context, gpu); } NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp) { uvm_va_space_t *va_space = uvm_va_space_get(filp); struct mm_struct *mm; uvm_va_block_t *va_block; uvm_va_block_test_t *va_block_test; NV_STATUS status = NV_OK; mm = uvm_va_space_mm_retain_lock(va_space); uvm_va_space_down_read(va_space); status = uvm_va_block_find_create(va_space, mm, params->lookup_address, NULL, &va_block); if (status != NV_OK) goto out; va_block_test = uvm_va_block_get_test(va_block); UVM_ASSERT(va_block_test); uvm_mutex_lock(&va_block->lock); if (params->page_table_allocation_retry_force_count) va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count; if (params->user_pages_allocation_retry_force_count) va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count; if (params->cpu_chunk_allocation_size_mask) va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask; if (params->eviction_error) va_block_test->inject_eviction_error = params->eviction_error; if (params->cpu_pages_allocation_error) va_block_test->inject_cpu_pages_allocation_error = params->cpu_pages_allocation_error; if (params->populate_error) va_block_test->inject_populate_error = params->populate_error; uvm_mutex_unlock(&va_block->lock); out: uvm_va_space_up_read(va_space); uvm_va_space_mm_release_unlock(va_space, mm); return status; } static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] = { [UVM_TEST_PTE_MAPPING_INVALID] = UVM_PROT_NONE, [UVM_TEST_PTE_MAPPING_READ_ONLY] = UVM_PROT_READ_ONLY, [UVM_TEST_PTE_MAPPING_READ_WRITE] = UVM_PROT_READ_WRITE, [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC, }; static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] = { [UVM_PROT_NONE] = UVM_TEST_PTE_MAPPING_INVALID, [UVM_PROT_READ_ONLY] = UVM_TEST_PTE_MAPPING_READ_ONLY, [UVM_PROT_READ_WRITE] = UVM_TEST_PTE_MAPPING_READ_WRITE, [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC, }; NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp) { uvm_va_space_t *va_space = uvm_va_space_get(filp); uvm_va_block_t *block; struct mm_struct *mm; NV_STATUS status = NV_OK; uvm_prot_t curr_prot, new_prot; uvm_gpu_t *gpu = NULL; uvm_processor_id_t id; uvm_tracker_t local_tracker; uvm_va_block_region_t region; uvm_va_block_context_t *block_context = NULL; if (!PAGE_ALIGNED(params->va)) return NV_ERR_INVALID_ADDRESS; if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX) return NV_ERR_INVALID_ARGUMENT; new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping]; // mmap_lock isn't needed for invalidating CPU mappings, but it will be // needed for inserting them. 
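// (Revoking CPU permissions below performs an unmap + map, and the map half
// needs a valid vma, which is why mm is retained and mmap_lock taken here
// rather than only the VA space lock.)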
mm = uvm_va_space_mm_or_current_retain_lock(va_space); uvm_va_space_down_read(va_space); if (uvm_uuid_is_cpu(&params->uuid)) { id = UVM_ID_CPU; } else { gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid); if (!gpu) { status = NV_ERR_INVALID_DEVICE; goto out; } // Check if the GPU can access the VA if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) { status = NV_ERR_OUT_OF_RANGE; goto out; } id = gpu->id; } block_context = uvm_va_block_context_alloc(mm); if (!block_context) { status = NV_ERR_NO_MEMORY; goto out; } status = uvm_va_block_find_create(va_space, mm, params->va, block_context, &block); if (status != NV_OK) goto out; uvm_mutex_lock(&block->lock); region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE); curr_prot = block_page_prot(block, id, region.first); if (new_prot == curr_prot) { status = NV_OK; goto out_block; } // TODO: Bug 1766124: Upgrades might require revoking other processors' // access privileges. We just fail for now. Only downgrades are // supported. If we allowed upgrades, we would need to check the mm // like we do for revocation below. if (new_prot > curr_prot) { status = NV_ERR_INVALID_OPERATION; goto out_block; } block_context->policy = uvm_va_policy_get(block, params->va); if (new_prot == UVM_PROT_NONE) { status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker); } else { UVM_ASSERT(block_is_page_resident_anywhere(block, region.first)); // Revoking CPU mappings performs a combination of unmap + map. The map // portion requires a valid mm. if (UVM_ID_IS_CPU(id) && !uvm_va_block_is_hmm(block) && !uvm_va_range_vma_check(block->va_range, mm)) { status = NV_ERR_INVALID_STATE; } else { status = uvm_va_block_revoke_prot(block, block_context, id, region, NULL, new_prot + 1, &block->tracker); } } out_block: if (status == NV_OK) status = uvm_tracker_init_from(&local_tracker, &block->tracker); uvm_mutex_unlock(&block->lock); if (status == NV_OK) status = uvm_tracker_wait_deinit(&local_tracker); out: uvm_va_space_up_read(va_space); uvm_va_space_mm_or_current_release_unlock(va_space, mm); uvm_va_block_context_free(block_context); return status; } NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp) { uvm_va_space_t *va_space = uvm_va_space_get(filp); uvm_va_block_t *va_block; NV_STATUS status = NV_OK; BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE); uvm_va_space_down_read(va_space); status = uvm_va_block_find(va_space, params->lookup_address, &va_block); if (status != NV_OK) goto out; params->va_block_start = va_block->start; params->va_block_end = va_block->end; out: uvm_va_space_up_read(va_space); return status; } NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp) { NV_STATUS status = NV_OK; uvm_va_space_t *va_space = uvm_va_space_get(filp); uvm_va_range_t *va_range = NULL; uvm_va_block_t *block = NULL; NvU32 count = 0; uvm_processor_mask_t resident_on_mask; uvm_processor_id_t id; uvm_page_index_t page_index; unsigned release_block_count = 0; NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE); uvm_va_space_down_read(va_space); va_range = uvm_va_range_find(va_space, addr); if (!va_range || va_range->type != UVM_VA_RANGE_TYPE_MANAGED) { status = NV_ERR_INVALID_ADDRESS; goto out; } status = uvm_va_block_find(va_space, addr, &block); if (status != NV_OK) { UVM_ASSERT(status == NV_ERR_OBJECT_NOT_FOUND); params->resident_on_count = 0; params->populated_on_count = 0; params->mapped_on_count = 0; status =
NV_OK; goto out; } uvm_mutex_lock(&block->lock); page_index = uvm_va_block_cpu_page_index(block, addr); uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask); for_each_id_in_mask(id, &resident_on_mask) { block_phys_page_t block_page = block_phys_page(id, page_index); uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id); params->resident_physical_size[count] = block_phys_page_size(block, block_page); if (UVM_ID_IS_CPU(id)) { uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, chunk, page_index)); } else { params->resident_physical_address[count] = block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address; } ++count; } params->resident_on_count = count; count = 0; for_each_id_in_mask(id, &block->mapped) { NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index); if (page_size == 0) continue; uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id); params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)]; UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID); params->page_size[count] = page_size; ++count; } if (params->resident_on_count == 1) { if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) { if (uvm_pmm_sysmem_mappings_indirect_supported()) { for_each_gpu_id(id) { NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index); uvm_reverse_map_t sysmem_page; uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index); size_t num_pages; uvm_gpu_t *gpu; if (!uvm_va_block_gpu_state_get(block, id)) continue; gpu = uvm_va_space_get_gpu(va_space, id); if (!gpu->parent->access_counters_supported) continue; num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings, uvm_cpu_chunk_get_gpu_mapping_addr(block, page_index, chunk, id), uvm_cpu_chunk_get_size(chunk), &sysmem_page, 1); if (page_size > 0) UVM_ASSERT(num_pages == 1); else UVM_ASSERT(num_pages <= 1); if (num_pages == 1) { UVM_ASSERT(sysmem_page.va_block == block); UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr); UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr); ++release_block_count; } } } } else { uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask); uvm_reverse_map_t gpu_mapping; size_t num_pages; uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); uvm_gpu_phys_address_t phys_addr; phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu); num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping); // Chunk may be in TEMP_PINNED state so it may not have a VA block // assigned. In that case, we don't get a valid translation.
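// uvm_pmm_gpu_phys_to_virt() then reports zero translations and the
// release_block_count bump below is skipped.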
if (num_pages > 0) { UVM_ASSERT(num_pages == 1); UVM_ASSERT(gpu_mapping.va_block == block); UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr); ++release_block_count; } } } params->mapped_on_count = count; count = 0; for_each_processor_id(id) { if (!block_processor_page_is_populated(block, id, page_index)) continue; uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id); ++count; } params->populated_on_count = count; out: if (block) { if (!params->is_async && status == NV_OK) status = uvm_tracker_wait(&block->tracker); uvm_mutex_unlock(&block->lock); while (release_block_count--) uvm_va_block_release(block); } uvm_va_space_up_read(va_space); return status; } void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block) { block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block)); }