[util] Implement utility functions for string conversion

2025-03-23 00:19:40 +01:00 · 2022-07-28 18:40:46 +02:00 · 2022-07-28 18:40:46 +02:00 · 200df73ba7
commit 200df73ba7
parent bad7d4690b
2 changed files with 354 additions and 1 deletions
--- a/src/util/util_string.cpp
+++ b/src/util/util_string.cpp
@ -1,6 +1,207 @@
 #include "util_string.h"

 namespace dxvk::str {
+
+  /**
+   * \brief Decodes one code point from a UTF-8 byte sequence
+   *
+   * The first byte is read unconditionally, so the input range must
+   * not be empty. Malformed or truncated input yields \c '?'.
+   * \param [in] begin Pointer to current position within the input string
+   * \param [in] end Pointer to the end of the input string
+   * \param [out] ch Receives the decoded code point
+   * \returns Pointer to the position after the consumed bytes
+   */
+  const uint8_t* decodeTypedChar(
+    const uint8_t*  begin,
+    const uint8_t*  end,
+          uint32_t& ch) {
+    const uint32_t lead = begin[0];
+
+    // Plain ASCII, one byte per code point
+    if (likely(lead < 0x80)) {
+      ch = lead;
+      return begin + 1;
+    }
+
+    // Input starts in the middle of a sequence: swallow the whole
+    // run of continuation bytes and report a single placeholder
+    if (unlikely(lead < 0xC0)) {
+      const uint8_t* next = begin;
+
+      while ((next < end) && ((*next & 0xC0) == 0x80))
+        next += 1;
+
+      ch = uint32_t('?');
+      return next;
+    }
+
+    // Sequence length equals the number of leading 1 bits of the
+    // lead byte, which is moved to the top of the word and inverted
+    const size_t unitCount = bit::lzcnt((~lead) << 24);
+
+    // Sequence would run past the end of the input
+    if (unlikely(begin + unitCount > end)) {
+      ch = uint32_t('?');
+      return end;
+    }
+
+    // Payload bits of the n-th continuation byte
+    auto payload = [begin] (size_t n) {
+      return uint32_t(begin[n]) & 0x3F;
+    };
+
+    if (lead < 0xE0)
+      ch = ((lead & 0x1F) << 6) | payload(1);
+    else if (lead < 0xF0)
+      ch = ((lead & 0x0F) << 12) | (payload(1) << 6) | payload(2);
+    else if (lead < 0xF8)
+      ch = ((lead & 0x07) << 18) | (payload(1) << 12) | (payload(2) << 6) | payload(3);
+    else
+      ch = uint32_t('?'); // Lead byte announces more than four bytes
+
+    return begin + unitCount;
+  }
+
+  /**
+   * \brief Decodes one code point from a UTF-16 sequence
+   *
+   * The first unit is read unconditionally, so the input range must
+   * not be empty. A stray low surrogate or a high surrogate that is
+   * cut off by the end of the input yields \c '?'.
+   * \param [in] begin Pointer to current position within the input string
+   * \param [in] end Pointer to the end of the input string
+   * \param [out] ch Receives the decoded code point
+   * \returns Pointer to the position after the consumed units
+   */
+  const uint16_t* decodeTypedChar(
+    const uint16_t* begin,
+    const uint16_t* end,
+          uint32_t& ch) {
+    const uint32_t unit = begin[0];
+
+    // Below the surrogate range: the unit is the code point
+    if (likely(unit < 0xD800)) {
+      ch = unit;
+      return begin + 1;
+    }
+
+    // Above the surrogate range: the unit is the code point
+    if (unit >= 0xE000) {
+      ch = unit;
+      return begin + 1;
+    }
+
+    // Low surrogate without a preceding high surrogate
+    if (unit >= 0xDC00) {
+      ch = uint32_t('?');
+      return begin + 1;
+    }
+
+    // High surrogate, a second unit is required
+    if (unlikely(begin + 2 > end)) {
+      ch = uint32_t('?');
+      return end;
+    }
+
+    const uint32_t upperBits = (unit & 0x3FF) << 10;
+    const uint32_t lowerBits = uint32_t(begin[1]) & 0x3FF;
+
+    ch = 0x10000 + upperBits + lowerBits;
+    return begin + 2;
+  }
+
+
+  /**
+   * \brief Reads one 32-bit unit as a code point
+   *
+   * The unit is copied without any validation, and
+   * \c end is not inspected.
+   * \param [in] begin Pointer to current position within the input string
+   * \param [in] end Pointer to the end of the input string
+   * \param [out] ch Receives the value of the unit
+   * \returns Pointer to the next unit
+   */
+  const uint32_t* decodeTypedChar(
+    const uint32_t* begin,
+    const uint32_t* end,
+          uint32_t& ch) {
+    const uint32_t* next = begin + 1;
+    ch = *begin;
+    return next;
+  }
+
+
+  /**
+   * \brief Encodes a code point as a UTF-8 byte sequence
+   *
+   * If \c begin is \c nullptr, nothing is written and only the
+   * number of bytes required for the code point is returned.
+   * \param [in] begin Pointer to current position within the output string
+   * \param [in] end Pointer to the end of the output string
+   * \param [in] ch Code point to encode
+   * \returns Number of bytes required or written. \c 0 if the
+   *    sequence does not fit between \c begin and \c end, or
+   *    if \c ch is \c 0x200000 or greater.
+   */
+  size_t encodeTypedChar(
+          uint8_t*  begin,
+          uint8_t*  end,
+          uint32_t  ch) {
+    if (likely(ch < 0x80)) {
+      // 7 bits: 0xxxxxxx
+      if (begin) {
+        if (unlikely(begin + 1 > end))
+          return 0;
+
+        begin[0] = uint8_t(ch);
+      }
+
+      return 1;
+    } else if (ch < 0x800) {
+      // 11 bits: 110xxxxx 10xxxxxx
+      if (begin) {
+        if (unlikely(begin + 2 > end))
+          return 0;
+
+        begin[0] = uint8_t(0xC0 | (ch >> 6));
+        begin[1] = uint8_t(0x80 | (ch & 0x3F));
+      }
+
+      return 2;
+    } else if (ch < 0x10000) {
+      // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
+      if (begin) {
+        if (unlikely(begin + 3 > end))
+          return 0;
+
+        begin[0] = uint8_t(0xE0 | ((ch >> 12)));
+        begin[1] = uint8_t(0x80 | ((ch >> 6) & 0x3F));
+        begin[2] = uint8_t(0x80 | ((ch >> 0) & 0x3F));
+      }
+
+      return 3;
+    } else if (ch < 0x200000) {
+      // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      if (begin) {
+        // Reject the write if four bytes would run past the end
+        // of the output, same as for the shorter sequences above
+        if (unlikely(begin + 4 > end))
+          return 0;
+
+        begin[0] = uint8_t(0xF0 | ((ch >> 18)));
+        begin[1] = uint8_t(0x80 | ((ch >> 12) & 0x3F));
+        begin[2] = uint8_t(0x80 | ((ch >> 6) & 0x3F));
+        begin[3] = uint8_t(0x80 | ((ch >> 0) & 0x3F));
+      }
+
+      return 4;
+    } else {
+      // Invalid code point for UTF-8
+      return 0;
+    }
+  }
+
+
+  /**
+   * \brief Encodes a code point as a UTF-16 sequence
+   *
+   * If \c begin is \c nullptr, nothing is written and only the
+   * number of units required for the code point is returned.
+   * \param [in] begin Pointer to current position within the output string
+   * \param [in] end Pointer to the end of the output string
+   * \param [in] ch Code point to encode
+   * \returns Number of units required or written. \c 0 if the
+   *    units do not fit between \c begin and \c end, or if
+   *    \c ch cannot be represented.
+   */
+  size_t encodeTypedChar(
+          uint16_t* begin,
+          uint16_t* end,
+          uint32_t  ch) {
+    // Values in the surrogate range are reserved for
+    // encoding pairs and cannot be stored on their own
+    const bool inSurrogateRange = (ch >= 0xD800) && (ch < 0xE000);
+
+    // Neither surrogates nor values past the
+    // last valid code point can be encoded
+    if (inSurrogateRange || (ch >= 0x110000))
+      return 0;
+
+    // Everything below 0x10000 fits into a single
+    // unit, the remainder needs a surrogate pair
+    const size_t unitCount = (ch < 0x10000) ? 1 : 2;
+
+    if (begin) {
+      if (unlikely(begin + unitCount > end))
+        return 0;
+
+      if (unitCount == 1) {
+        begin[0] = uint16_t(ch);
+      } else {
+        const uint32_t offset = ch - 0x10000;
+        begin[0] = uint16_t(0xD800 + (offset >> 10));
+        begin[1] = uint16_t(0xDC00 + (offset & 0x3FF));
+      }
+    }
+
+    return unitCount;
+  }
+
+
+  /**
+   * \brief Stores a code point as a single 32-bit unit
+   *
+   * The value is stored without any validation. If \c begin
+   * is \c nullptr, nothing is written and \c 1 is returned.
+   * \param [in] begin Pointer to current position within the output string
+   * \param [in] end Pointer to the end of the output string
+   * \param [in] ch Code point to store
+   * \returns \c 1, or \c 0 if there is no room for one
+   *    unit between \c begin and \c end.
+   */
+  size_t encodeTypedChar(
+          uint32_t* begin,
+          uint32_t* end,
+          uint32_t  ch) {
+    // Size query only
+    if (!begin)
+      return 1;
+
+    // Output range is exhausted
+    if (unlikely(begin + 1 > end))
+      return 0;
+
+    *begin = ch;
+    return 1;
+  }
+
+
  std::string fromws(const WCHAR *ws) {
    size_t len = ::WideCharToMultiByte(CP_UTF8,
      0, ws, -1, nullptr, 0, nullptr, nullptr);
--- a/src/util/util_string.h
+++ b/src/util/util_string.h
@ -7,8 +7,160 @@

 #include "./com/com_include.h"

+#include "util_bit.h"
+#include "util_likely.h"
+
 namespace dxvk::str {
-  
+
+  /**
+   * \brief Maps a character size to an unsigned integer type
+   *
+   * Only the specializations for sizes 1, 2 and 4 provide a
+   * \c type member; the primary template is intentionally empty.
+   * \tparam S Size of the character type, in bytes
+   */
+  template<size_t S> struct UnicodeChar { };
+  template<> struct UnicodeChar<1> { using type = uint8_t;  };
+  template<> struct UnicodeChar<2> { using type = uint16_t; };
+  template<> struct UnicodeChar<4> { using type = uint32_t; };
+
+  /**
+   * \brief Unsigned integer type of the same size as \c T
+   *
+   * Selected through \c UnicodeChar based on \c sizeof(T).
+   * \tparam T Character type
+   */
+  template<typename T>
+  using UnicodeCharType = typename UnicodeChar<sizeof(T)>::type;
+
+  // Decoders for each supported unit size, invoked by decodeChar.
+  // Each one reads the unit at \c begin unconditionally, stores the
+  // decoded code point in \c ch and returns the position after the
+  // units it consumed. The 8-bit and 16-bit variants store '?' for
+  // malformed or truncated input.
+  const uint8_t* decodeTypedChar(
+    const uint8_t*  begin,
+    const uint8_t*  end,
+          uint32_t& ch);
+
+  const uint16_t* decodeTypedChar(
+    const uint16_t* begin,
+    const uint16_t* end,
+          uint32_t& ch);
+
+  const uint32_t* decodeTypedChar(
+    const uint32_t* begin,
+    const uint32_t* end,
+          uint32_t& ch);
+
+  // Encoders for each supported unit size, invoked by encodeChar.
+  // If \c begin is \c nullptr, nothing is written and the number
+  // of units required for \c ch is returned. Otherwise the number
+  // of units written is returned, or 0 on failure.
+  size_t encodeTypedChar(
+          uint8_t*  begin,
+          uint8_t*  end,
+          uint32_t  ch);
+
+  size_t encodeTypedChar(
+          uint16_t* begin,
+          uint16_t* end,
+          uint32_t  ch);
+
+  size_t encodeTypedChar(
+          uint32_t* begin,
+          uint32_t* end,
+          uint32_t  ch);
+
+  /**
+   * \brief Decodes a single character
+   *
+   * The input is reinterpreted as a sequence of unsigned integers
+   * of the same size as \c T before decoding. The first unit is
+   * always read, so \c begin and \c end must not be equal.
+   * \tparam T Input character type
+   * \param [in] begin Pointer to current position within the input string
+   * \param [in] end Pointer to the end of the input string
+   * \param [out] ch Reference that receives the decoded character code
+   * \returns Pointer to next character in the input string
+   */
+  template<typename T>
+  const T* decodeChar(
+    const T*        begin,
+    const T*        end,
+          uint32_t& ch) {
+    using Unit = UnicodeCharType<T>;
+
+    auto unitBegin = reinterpret_cast<const Unit*>(begin);
+    auto unitEnd   = reinterpret_cast<const Unit*>(end);
+
+    return reinterpret_cast<const T*>(
+      decodeTypedChar(unitBegin, unitEnd, ch));
+  }
+
+  /**
+   * \brief Encodes a character
+   *
+   * The output is reinterpreted as a sequence of unsigned integers
+   * of the same size as \c T before encoding. Pass \c nullptr for
+   * \c begin in order to only query the length of the encoded
+   * character without writing anything.
+   * \tparam T Output character type
+   * \param [in] begin Pointer to current position within the output string
+   * \param [in] end Pointer to the end of the output string
+   * \param [in] ch Character to encode
+   * \returns If begin is \c nullptr , the number of units required to encode
+   *    the character. Otherwise, the number of units written to the output.
+   *    This may return \c 0 for characters that cannot be written or encoded.
+   */
+  template<typename T>
+  size_t encodeChar(
+          T*        begin,
+          T*        end,
+          uint32_t  ch) {
+    using Unit = UnicodeCharType<T>;
+
+    auto unitBegin = reinterpret_cast<Unit*>(begin);
+    auto unitEnd   = reinterpret_cast<Unit*>(end);
+
+    return encodeTypedChar(unitBegin, unitEnd, ch);
+  }
+
+  /**
+   * \brief Computes length of a null-terminated string
+   *
+   * \tparam S Character type
+   * \param [in] string Start of input string
+   * \returns Number of units in the input string,
+   *    excluding the terminating null character
+   */
+  template<typename S>
+  size_t length(const S* string) {
+    const S* cursor = string;
+
+    // Advance until the terminator is found
+    while (*cursor)
+      cursor += 1;
+
+    return size_t(cursor - string);
+  }
+
+  /**
+   * \brief Converts string from one encoding to another
+   *
+   * Characters are decoded from the input one at a time and
+   * re-encoded using the output character type. Conversion stops
+   * at the end of the input, or after a null character has been
+   * processed, so the output is only null-terminated if the input
+   * contains a null character. If \c dstBegin is \c nullptr,
+   * nothing is written and only the output length is computed.
+   * \tparam D Output character type
+   * \tparam S Input character type
+   * \param [in] dstBegin Start of output string
+   * \param [in] dstLength Length of output string
+   * \param [in] srcBegin Start of input string
+   * \param [in] srcLength Length of input string
+   * \returns If \c dstBegin is \c nullptr , the total number of output
+   *    characters required to store the output string. Otherwise, the
+   *    total number of characters written to the output string.
+   */
+  template<typename D, typename S>
+  size_t transcodeString(
+          D*      dstBegin,
+          size_t  dstLength,
+    const S*      srcBegin,
+          size_t  srcLength) {
+    D* const       dstEnd = dstBegin + dstLength;
+    const S* const srcEnd = srcBegin + srcLength;
+    const S*       srcPos = srcBegin;
+
+    size_t unitCount = 0;
+
+    while (srcPos < srcEnd) {
+      uint32_t codePoint;
+      srcPos = decodeChar<S>(srcPos, srcEnd, codePoint);
+
+      // Without an output buffer, only query the encoded size
+      D* outPos = dstBegin ? dstBegin + unitCount : nullptr;
+      D* outEnd = dstBegin ? dstEnd : nullptr;
+
+      unitCount += encodeChar<D>(outPos, outEnd, codePoint);
+
+      // The terminator has been processed, stop here
+      if (codePoint == 0)
+        break;
+    }
+
+    return unitCount;
+  }
+
  std::string fromws(const WCHAR *ws);

  void tows(const char* mbs, WCHAR* wcs, size_t wcsLen);