1
0
mirror of https://github.com/doitsujin/dxvk.git synced 2025-02-20 19:54:19 +01:00

[d3d9] Optimize late buffer uploads

... similar to what we're doing for textures.
This commit is contained in:
Robin Kertels 2024-09-19 01:15:10 +02:00 committed by Philip Rebohle
parent 5a08b3c451
commit ef8bad33a5
5 changed files with 89 additions and 46 deletions

View File

@ -46,7 +46,7 @@ namespace dxvk {
}
inline bool IsDegenerate() { return min == max; }
inline bool IsDegenerate() const { return min == max; }
inline void Conjoin(D3D9Range range) {
if (IsDegenerate())
@ -180,7 +180,7 @@ namespace dxvk {
/**
* \brief Whether or not the staging buffer needs to be copied to the actual buffer
*/
inline bool NeedsUpload() { return m_desc.Pool != D3DPOOL_DEFAULT && !m_dirtyRange.IsDegenerate(); }
inline bool NeedsUpload() const { return m_desc.Pool != D3DPOOL_DEFAULT && !m_dirtyRange.IsDegenerate(); }
void PreLoad();

View File

@ -1668,8 +1668,8 @@ namespace dxvk {
ctx->bindIndexBuffer(DxvkBufferSlice(), VK_INDEX_TYPE_UINT32);
});
}
for (uint32_t i = 0; i < caps::MaxStreams; i++) {
for (uint32_t i : bit::BitMask(~m_activeVertexBuffers & ((1 << 16) - 1))) {
if (m_state.vertexBuffers[i].vertexBuffer == nullptr) {
EmitCs([cIndex = i](DxvkContext* ctx) {
ctx->bindVertexBuffer(cIndex, DxvkBufferSlice(), 0);
@ -3342,12 +3342,25 @@ namespace dxvk {
if (needsUpdate)
vbo.vertexBuffer = buffer;
const uint32_t bit = 1u << StreamNumber;
m_activeVertexBuffers &= ~bit;
m_activeVertexBuffersToUploadPerDraw &= ~bit;
m_activeVertexBuffersToUpload &= ~bit;
if (buffer != nullptr) {
needsUpdate |= vbo.offset != OffsetInBytes
|| vbo.stride != Stride;
vbo.offset = OffsetInBytes;
vbo.stride = Stride;
const D3D9CommonBuffer* commonBuffer = GetCommonBuffer(buffer);
m_activeVertexBuffers |= bit;
if (commonBuffer->DoPerDrawUpload() || CanOnlySWVP())
m_activeVertexBuffersToUploadPerDraw |= bit;
if (commonBuffer->NeedsUpload()) {
m_activeVertexBuffersToUpload |= bit;
}
} else {
// D3D9 doesn't actually unbind any vertex buffer when passing null.
// Operation Flashpoint: Red River relies on this behavior.
@ -5100,9 +5113,18 @@ namespace dxvk {
uint32_t size = respectUserBounds ? std::min(SizeToLock, desc.Size - offset) : desc.Size;
D3D9Range lockRange = D3D9Range(offset, offset + size);
if ((desc.Pool == D3DPOOL_DEFAULT || !(Flags & D3DLOCK_NO_DIRTY_UPDATE)) && !(Flags & D3DLOCK_READONLY))
bool updateDirtyRange = (desc.Pool == D3DPOOL_DEFAULT || !(Flags & D3DLOCK_NO_DIRTY_UPDATE)) && !(Flags & D3DLOCK_READONLY);
if (updateDirtyRange) {
pResource->DirtyRange().Conjoin(lockRange);
for (uint32_t i : bit::BitMask(m_activeVertexBuffers)) {
auto commonBuffer = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer);
if (commonBuffer == pResource) {
m_activeVertexBuffersToUpload |= 1 << i;
}
}
}
const bool directMapping = pResource->GetMapMode() == D3D9_COMMON_BUFFER_MAP_MODE_DIRECT;
const bool needsReadback = pResource->NeedsReadback();
@ -5237,11 +5259,9 @@ namespace dxvk {
bool* pDynamicVBOs,
bool* pDynamicIBO
) {
bool dynamicSysmemVBOs = true;
for (uint32_t i = 0; i < caps::MaxStreams && dynamicSysmemVBOs; i++) {
auto* vbo = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer);
dynamicSysmemVBOs &= vbo == nullptr || (vbo->DoPerDrawUpload() || CanOnlySWVP());
}
const uint32_t usedBuffersMask = (m_state.vertexDecl != nullptr ? m_state.vertexDecl->GetStreamMask() : ~0u) & m_activeVertexBuffers;
bool dynamicSysmemVBOs = usedBuffersMask == m_activeVertexBuffersToUploadPerDraw;
D3D9CommonBuffer* ibo = GetCommonBuffer(m_state.indices);
bool dynamicSysmemIBO = NumIndices != 0 && ibo != nullptr && (ibo->DoPerDrawUpload() || CanOnlySWVP());
@ -5253,6 +5273,12 @@ namespace dxvk {
if (likely(!dynamicSysmemVBOs && !dynamicSysmemIBO))
return;
uint32_t vertexBuffersToUpload;
if (likely(dynamicSysmemVBOs))
vertexBuffersToUpload = m_activeVertexBuffersToUploadPerDraw & usedBuffersMask;
else
vertexBuffersToUpload = 0;
// The UP buffer allocator will invalidate,
// so we can only use 1 UP buffer slice per draw.
// First we calculate the size of that UP buffer slice
@ -5269,7 +5295,7 @@ namespace dxvk {
uint32_t totalUpBufferSize = 0;
std::array<VBOCopy, caps::MaxStreams> vboCopies = {};
for (uint32_t i = 0; i < caps::MaxStreams && dynamicSysmemVBOs; i++) {
for (uint32_t i : bit::BitMask(vertexBuffersToUpload)) {
auto* vbo = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer);
if (likely(vbo == nullptr)) {
continue;
@ -5354,42 +5380,42 @@ namespace dxvk {
auto upSlice = AllocUPBuffer(totalUpBufferSize);
// Now copy the actual data and bind it.
for (uint32_t i = 0; i < caps::MaxStreams && dynamicSysmemVBOs; i++) {
const VBOCopy& copy = vboCopies[i];
if (dynamicSysmemVBOs) {
for (uint32_t i : bit::BitMask(vertexBuffersToUpload)) {
const VBOCopy& copy = vboCopies[i];
if (likely(copy.copyBufferLength != 0)) {
const auto* vbo = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer);
uint8_t* data = reinterpret_cast<uint8_t*>(upSlice.mapPtr) + copy.dstOffset;
const uint8_t* src = reinterpret_cast<uint8_t*>(vbo->GetMappedSlice().mapPtr) + copy.srcOffset;
if (likely(copy.copyBufferLength != 0)) {
const auto* vbo = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer);
uint8_t* data = reinterpret_cast<uint8_t*>(upSlice.mapPtr) + copy.dstOffset;
const uint8_t* src = reinterpret_cast<uint8_t*>(vbo->GetMappedSlice().mapPtr) + copy.srcOffset;
if (likely(copy.copyElementStride == copy.copyElementSize)) {
std::memcpy(data, src, copy.copyBufferLength);
} else {
for (uint32_t j = 0; j < copy.copyElementCount; j++) {
std::memcpy(data + j * copy.copyElementSize, src + j * copy.copyElementStride, copy.copyElementSize);
}
if (unlikely(copy.copyBufferLength > copy.copyElementCount * copy.copyElementSize)) {
// Partial vertex at the end
std::memcpy(
data + copy.copyElementCount * copy.copyElementSize,
src + copy.copyElementCount * copy.copyElementStride,
copy.copyBufferLength - copy.copyElementCount * copy.copyElementSize);
if (likely(copy.copyElementStride == copy.copyElementSize)) {
std::memcpy(data, src, copy.copyBufferLength);
} else {
for (uint32_t j = 0; j < copy.copyElementCount; j++) {
std::memcpy(data + j * copy.copyElementSize, src + j * copy.copyElementStride, copy.copyElementSize);
}
if (unlikely(copy.copyBufferLength > copy.copyElementCount * copy.copyElementSize)) {
// Partial vertex at the end
std::memcpy(
data + copy.copyElementCount * copy.copyElementSize,
src + copy.copyElementCount * copy.copyElementStride,
copy.copyBufferLength - copy.copyElementCount * copy.copyElementSize);
}
}
}
auto vboSlice = upSlice.slice.subSlice(copy.dstOffset, copy.copyBufferLength);
EmitCs([
cStream = i,
cBufferSlice = std::move(vboSlice),
cStride = copy.copyElementSize
](DxvkContext* ctx) mutable {
ctx->bindVertexBuffer(cStream, std::move(cBufferSlice), cStride);
});
m_flags.set(D3D9DeviceFlag::DirtyVertexBuffers);
}
auto vboSlice = upSlice.slice.subSlice(copy.dstOffset, copy.copyBufferLength);
EmitCs([
cStream = i,
cBufferSlice = std::move(vboSlice),
cStride = copy.copyElementSize
](DxvkContext* ctx) mutable {
ctx->bindVertexBuffer(cStream, std::move(cBufferSlice), cStride);
});
m_flags.set(D3D9DeviceFlag::DirtyVertexBuffers);
}
if (dynamicSysmemVBOs) {
// Change the draw call parameters to reflect the changed vertex buffers
if (NumIndices != 0) {
BaseVertexIndex = -FirstVertexIndex;
@ -6821,10 +6847,15 @@ namespace dxvk {
m_lastHazardsRT = m_activeHazardsRT;
}
for (uint32_t i = 0; i < caps::MaxStreams; i++) {
auto* vbo = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer);
if (vbo != nullptr && vbo->NeedsUpload() && UploadVBOs)
FlushBuffer(vbo);
if (likely(UploadVBOs)) {
const uint32_t usedBuffersMask = m_state.vertexDecl != nullptr ? m_state.vertexDecl->GetStreamMask() : ~0u;
const uint32_t buffersToUpload = m_activeVertexBuffersToUpload & usedBuffersMask;
for (uint32_t bufferIdx : bit::BitMask(buffersToUpload)) {
auto* vbo = GetCommonBuffer(m_state.vertexBuffers[bufferIdx].vertexBuffer);
if (vbo != nullptr && vbo->NeedsUpload())
FlushBuffer(vbo);
}
m_activeVertexBuffersToUpload &= ~buffersToUpload;
}
const uint32_t usedSamplerMask = m_psShaderMasks.samplerMask | m_vsShaderMasks.samplerMask;
@ -6839,7 +6870,7 @@ namespace dxvk {
GenerateTextureMips(texturesToGen);
auto* ibo = GetCommonBuffer(m_state.indices);
if (ibo != nullptr && ibo->NeedsUpload() && UploadIBO)
if (UploadIBO && ibo != nullptr && ibo->NeedsUpload())
FlushBuffer(ibo);
UpdateFog();

View File

@ -1385,6 +1385,10 @@ namespace dxvk {
uint32_t m_activeTexturesToUpload = 0;
uint32_t m_activeTexturesToGen = 0;
uint32_t m_activeVertexBuffers = 0;
uint32_t m_activeVertexBuffersToUpload = 0;
uint32_t m_activeVertexBuffersToUploadPerDraw = 0;
// m_fetch4Enabled is whether fetch4 is currently enabled
// from the application.
//

View File

@ -374,6 +374,8 @@ namespace dxvk {
if (element.Usage == D3DDECLUSAGE_TEXCOORD)
m_texcoordMask |= GetDecltypeCount(D3DDECLTYPE(element.Type)) << (element.UsageIndex * 3);
m_streamMask |= 1 << element.Stream;
}
}

View File

@ -66,6 +66,10 @@ namespace dxvk {
return m_texcoordMask;
}
uint32_t GetStreamMask() const {
return m_streamMask;
}
private:
bool MapD3DDeclToFvf(
@ -94,6 +98,8 @@ namespace dxvk {
uint32_t m_texcoordMask = 0;
uint32_t m_streamMask = 0;
std::array<uint32_t, caps::MaxStreams> m_sizes = {};
};