mirror of https://github.com/doitsujin/dxvk.git

[d3d9] Optimize NV12 conversion to use a macropixel of [2, 1]

Joshua Ashton 2020-08-07 10:39:40 +01:00 committed by Joshie
parent 9fe1b9d03f
commit e2a26f2bc5
3 changed files with 29 additions and 17 deletions


@@ -37,7 +37,7 @@ namespace dxvk {
       }
 
       case D3D9ConversionFormat_NV12:
-        ConvertGenericFormat(conversionFormat, dstImage, dstSubresource, srcBuffer, VK_FORMAT_R8_UINT, 0, { 1u, 1u });
+        ConvertGenericFormat(conversionFormat, dstImage, dstSubresource, srcBuffer, VK_FORMAT_R16_UINT, 0, { 2u, 1u });
         break;
 
       case D3D9ConversionFormat_L6V5U5:
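
The dispatcher change above is what introduces the [2, 1] macropixel: the source buffer is now viewed as VK_FORMAT_R16_UINT, so one buffer texel carries two packed luma bytes. A minimal sketch of the intended effect on the dispatch size, assuming ConvertGenericFormat divides the image extent by the macropixel size before launching the compute shader (the helper below is illustrative, not dxvk's actual API):

#include <vulkan/vulkan.h>

// Illustrative only: a [2, 1] macropixel halves the number of invocations
// per row, since each invocation now converts two horizontal pixels.
VkExtent3D macroPixelExtent(VkExtent3D imageExtent, VkExtent2D macroPixel) {
  return VkExtent3D {
    imageExtent.width  / macroPixel.width,   // e.g. 1280 -> 640 invocations per row
    imageExtent.height / macroPixel.height,  // unchanged for a [2, 1] run
    imageExtent.depth
  };
}

This divided extent would also be what the shader below sees as u_info.extent, which explains why its pitch is measured in R16 texels rather than pixels.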


@@ -16,6 +16,11 @@ float unpackUnorm(uint p) {
   return float(p) / 255.0;
 }
 
+vec2 unpackUnorm2x8(uint p) {
+  uvec2 value = uvec2(p & 0xFF, p >> 8);
+  return vec2(unpackUnorm(value.x), unpackUnorm(value.y));
+}
+
 mat3x4 g_yuv_to_rgb = {
   { 298 / 256,    0,        409 / 256,  0.5 },
   { 298 / 256, -100 / 256, -208 / 256,  0.5 },
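
For reference, a CPU-side equivalent of the new unpackUnorm2x8 helper (a sketch, not part of the patch): it splits a 16-bit texel into its low and high byte and normalizes each to [0, 1].

#include <cstdint>
#include <utility>

// Mirrors the shader's unpackUnorm2x8: low byte first, then high byte,
// each mapped from [0, 255] to [0.0, 1.0].
std::pair<float, float> unpackUnorm2x8(uint16_t p) {
  float lo = float(p & 0xFFu) / 255.0f;
  float hi = float(p >> 8)    / 255.0f;
  return { lo, hi };
}

On the little-endian devices dxvk targets, the low byte of an R16 texel is the first (left) sample of the pair, which is why the NV12 shader below writes .x to writePos.x and .y to writePos.x + 1.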


@@ -18,39 +18,46 @@ uniform u_info_t {
   uvec2 extent;
 } u_info;
 
-float fetchUnorm(usamplerBuffer source, uint offset) {
-  return unpackUnorm(texelFetch(src, int(offset)).r);
+vec2 fetchUnorm2x8(usamplerBuffer source, uint offset) {
+  return unpackUnorm2x8(texelFetch(src, int(offset)).r);
 }
 
+// Format is:
+// YYYYYYYYYYYYYYY...
+// YYYYYYYYYYYYYYY...
+// UVUVUVUVUVUVUVU...
+
 void main() {
   ivec3 thread_id = ivec3(gl_GlobalInvocationID);
 
   if (all(lessThan(thread_id.xy, u_info.extent))) {
     uvec2 pitch = uvec2(u_info.extent.x, u_info.extent.y);
 
-    // Format is:
-    // YYYYYYYYYYYYYYY...
-    // UVUVUVUVUVUVUVU...
     uint offset = thread_id.x
                 + thread_id.y * pitch.x;
 
-    float c0 = fetchUnorm(src, offset) - (16 / 255.0);
+    // Fetch 2 luminance samples.
+    vec2 y = fetchUnorm2x8(src, offset) - (16 / 255.0);
 
-    // Floor .x to the nearest 2, because
-    // UV data is in WORDs, and we want to get the color
-    // for this pixel.
-    // Then divide thread_id.y by 2 because the macropixel
-    // layout for chroma data is [2, 2].
-    offset = (thread_id.x / 2) * 2
+    // Go into the second plane to get the chroma data.
+    // UV data is subsampled as [2, 2]
+    // So we need to divide thread_id.y by 2.
+    // thread_id.x is already accounted for as we read uint16
+    offset = thread_id.x
            + thread_id.y / 2 * pitch.x
            + pitch.x * pitch.y;
 
-    float u = fetchUnorm(src, offset) - (128 / 255.0);
-    float v = fetchUnorm(src, offset + 1) - (128 / 255.0);
+    vec2 uv = fetchUnorm2x8(src, offset) - (128 / 255.0);
 
     // The NV12 format seems to use the BT.703 color space.
-    vec4 color0 = convertBT_703(vec3(c0, u, v));
+    vec4 color0 = convertBT_703(vec3(y.x, uv.x, uv.y));
+    vec4 color1 = convertBT_703(vec3(y.y, uv.x, uv.y));
 
-    imageStore(dst, thread_id.xy, color0);
+    // We write as a macropixel of [2, 1]
+    // So write out 2 pixels in this run.
+    ivec2 writePos = thread_id.xy * ivec2(2, 1);
+
+    imageStore(dst, ivec2(writePos.x,     writePos.y), color0);
+    imageStore(dst, ivec2(writePos.x + 1, writePos.y), color1);
   }
 }
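
Taken together, each invocation now converts one [2, 1] macropixel: a single R16 fetch from the luma plane yields two Y samples, a single R16 fetch from the interleaved UV plane (which starts after pitch.x * pitch.y texels and is subsampled 2x2) yields the shared chroma pair, and two RGBA texels are written. Below is a CPU-side sketch of the same arithmetic, working in byte offsets over a raw NV12 buffer rather than R16 texels; yuvToRgb is a simplified stand-in for convertBT_703 using standard BT.601 limited-range coefficients (the shader's exact g_yuv_to_rgb matrix, including the blue row not shown in the hunk, is not reproduced):

#include <algorithm>
#include <cstddef>
#include <cstdint>

struct RGBA { float r, g, b, a; };

static float unorm(uint8_t v) { return float(v) / 255.0f; }

// Simplified stand-in for convertBT_703 (assumed BT.601 limited-range
// coefficients; the shader's matrix also carries a 0.5 offset column
// that is omitted here).
static RGBA yuvToRgb(float y, float u, float v) {
  float r = 1.164f * y                + 1.596f * v;
  float g = 1.164f * y - 0.391f * u   - 0.813f * v;
  float b = 1.164f * y + 2.018f * u;
  return { std::clamp(r, 0.0f, 1.0f),
           std::clamp(g, 0.0f, 1.0f),
           std::clamp(b, 0.0f, 1.0f), 1.0f };
}

// Converts the [2, 1] macropixel whose left pixel sits at (2 * tx, ty).
// src is a raw NV12 buffer: width * height Y bytes, then one interleaved
// UV pair per 2x2 block of pixels.
void convertMacroPixel(const uint8_t* src, uint32_t width, uint32_t height,
                       uint32_t tx, uint32_t ty, RGBA out[2]) {
  // Two luma samples, bias-corrected like the shader's (16 / 255.0).
  size_t yOff = size_t(ty) * width + 2u * tx;
  float y0 = unorm(src[yOff + 0]) - 16.0f / 255.0f;
  float y1 = unorm(src[yOff + 1]) - 16.0f / 255.0f;

  // One shared UV pair from the second plane; chroma rows cover two image rows.
  size_t uvOff = size_t(width) * height + size_t(ty / 2) * width + 2u * tx;
  float u = unorm(src[uvOff + 0]) - 128.0f / 255.0f;
  float v = unorm(src[uvOff + 1]) - 128.0f / 255.0f;

  out[0] = yuvToRgb(y0, u, v);   // goes to (2 * tx,     ty)
  out[1] = yuvToRgb(y1, u, v);   // goes to (2 * tx + 1, ty)
}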