1
0
mirror of https://github.com/doitsujin/dxvk.git synced 2025-02-20 19:54:19 +01:00

[util] Implement utility functions for string conversion

This commit is contained in:
Philip Rebohle 2022-07-28 18:40:46 +02:00 committed by Philip Rebohle
parent bad7d4690b
commit 200df73ba7
2 changed files with 354 additions and 1 deletions

View File

@ -1,6 +1,207 @@
#include "util_string.h"
namespace dxvk::str {
/**
 * \brief Decodes a single UTF-8 encoded character
 *
 * Malformed input is reported as \c '?' . This covers a stray
 * continuation byte, a sequence that would run past \c end ,
 * and a lead byte with more than four leading 1 bits. The
 * continuation bytes themselves are not validated.
 * \param [in] begin Current position, read unconditionally
 * \param [in] end End of the input string
 * \param [out] ch Decoded character code
 * \returns Pointer to the next character in the input
 */
const uint8_t* decodeTypedChar(
  const uint8_t*  begin,
  const uint8_t*  end,
        uint32_t& ch) {
  const uint32_t lead = begin[0];

  // Plain ASCII occupies exactly one byte
  if (likely(lead < 0x80)) {
    ch = lead;
    return begin + 1;
  }

  // A continuation byte cannot start a character. Consume the
  // whole run of continuation bytes and emit one placeholder.
  if (unlikely(lead < 0xC0)) {
    const uint8_t* next = begin;

    while ((next < end) && ((*next & 0xC0) == 0x80))
      next += 1;

    ch = uint32_t('?');
    return next;
  }

  // The sequence length equals the number of
  // leading 1 bits found in the lead byte
  const size_t count = bit::lzcnt((~lead) << 24);

  if (unlikely(begin + count > end)) {
    // Truncated sequence, consume the rest of the input
    ch = uint32_t('?');
    return end;
  }

  // Payload bits of the continuation byte at the given index
  auto tail = [begin] (size_t index) -> uint32_t {
    return uint32_t(begin[index]) & 0x3F;
  };

  if (lead < 0xE0) {
    // 110xxxxx 10xxxxxx
    ch = ((lead & 0x1F) << 6)
       | tail(1);
  } else if (lead < 0xF0) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    ch = ((lead & 0x0F) << 12)
       | (tail(1) << 6)
       | tail(2);
  } else if (lead < 0xF8) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    ch = ((lead & 0x07) << 18)
       | (tail(1) << 12)
       | (tail(2) << 6)
       | tail(3);
  } else {
    // Lead bytes with five or more leading 1 bits are invalid
    ch = uint32_t('?');
  }

  return begin + count;
}
/**
 * \brief Decodes a single UTF-16 encoded character
 *
 * A low surrogate without a preceding high surrogate, and a
 * high surrogate that is the last unit of the input, are both
 * reported as \c '?' . The unit following a high surrogate is
 * not checked for actually being a low surrogate.
 * \param [in] begin Current position, read unconditionally
 * \param [in] end End of the input string
 * \param [out] ch Decoded character code
 * \returns Pointer to the next character in the input
 */
const uint16_t* decodeTypedChar(
  const uint16_t* begin,
  const uint16_t* end,
        uint32_t& ch) {
  const uint32_t unit = begin[0];

  // Everything outside the surrogate range
  // maps directly to a code point
  if (likely(unit < 0xD800) || unit >= 0xE000) {
    ch = unit;
    return begin + 1;
  }

  // Low surrogate that does not follow a high surrogate
  if (unlikely(unit >= 0xDC00)) {
    ch = uint32_t('?');
    return begin + 1;
  }

  // High surrogate, needs a second unit to form a pair
  if (unlikely(begin + 2 > end)) {
    ch = uint32_t('?');
    return end;
  }

  const uint32_t upperBits = (unit & 0x3FF) << 10;
  const uint32_t lowerBits = uint32_t(begin[1]) & 0x3FF;

  ch = 0x10000 + upperBits + lowerBits;
  return begin + 2;
}
/**
 * \brief Decodes a single UTF-32 encoded character
 *
 * Each unit already is a complete character code, so the
 * value is passed through without any validation.
 * \param [in] begin Current position, read unconditionally
 * \param [in] end End of the input string, not used
 * \param [out] ch Decoded character code
 * \returns Pointer to the next character in the input
 */
const uint32_t* decodeTypedChar(
  const uint32_t* begin,
  const uint32_t* end,
        uint32_t& ch) {
  const uint32_t* next = begin + 1;
  ch = *begin;
  return next;
}
/**
 * \brief Encodes a character as UTF-8
 *
 * If \c begin is \c nullptr , nothing is written and only the
 * number of bytes needed for the character is returned.
 * \param [in] begin Current position within the output string
 * \param [in] end End of the output string
 * \param [in] ch Character code to encode
 * \returns Number of bytes required or written. Returns \c 0 if
 *    the bytes do not fit between \c begin and \c end , or if
 *    the character code is \c 0x200000 or greater.
 */
size_t encodeTypedChar(
        uint8_t*  begin,
        uint8_t*  end,
        uint32_t  ch) {
  if (likely(ch < 0x80)) {
    // 0xxxxxxx
    if (begin) {
      if (unlikely(begin + 1 > end))
        return 0;

      begin[0] = uint8_t(ch);
    }

    return 1;
  } else if (ch < 0x800) {
    // 110xxxxx 10xxxxxx
    if (begin) {
      if (unlikely(begin + 2 > end))
        return 0;

      begin[0] = uint8_t(0xC0 | (ch >> 6));
      begin[1] = uint8_t(0x80 | (ch & 0x3F));
    }

    return 2;
  } else if (ch < 0x10000) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    if (begin) {
      if (unlikely(begin + 3 > end))
        return 0;

      begin[0] = uint8_t(0xE0 | ((ch >> 12)));
      begin[1] = uint8_t(0x80 | ((ch >> 6) & 0x3F));
      begin[2] = uint8_t(0x80 | ((ch >> 0) & 0x3F));
    }

    return 3;
  } else if (ch < 0x200000) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (begin) {
      // Reject the write if fewer than four bytes remain. The
      // comparison must match the shorter encodings above: with
      // the operands the other way round, the character would be
      // dropped whenever there is spare room, and written past
      // the end of the buffer whenever there is not.
      if (unlikely(begin + 4 > end))
        return 0;

      begin[0] = uint8_t(0xF0 | ((ch >> 18)));
      begin[1] = uint8_t(0x80 | ((ch >> 12) & 0x3F));
      begin[2] = uint8_t(0x80 | ((ch >> 6) & 0x3F));
      begin[3] = uint8_t(0x80 | ((ch >> 0) & 0x3F));
    }

    return 4;
  } else {
    // Does not fit into a four-byte UTF-8 sequence
    return 0;
  }
}
/**
 * \brief Encodes a character as UTF-16
 *
 * If \c begin is \c nullptr , nothing is written and only the
 * number of units needed for the character is returned.
 * \param [in] begin Current position within the output string
 * \param [in] end End of the output string
 * \param [in] ch Character code to encode
 * \returns Number of 16-bit units required or written. Returns
 *    \c 0 if the units do not fit between \c begin and \c end ,
 *    or if the character code lies in the surrogate range or
 *    is \c 0x110000 or greater.
 */
size_t encodeTypedChar(
        uint16_t* begin,
        uint16_t* end,
        uint32_t  ch) {
  // Code points below 0x10000 that are not
  // surrogates are stored in a single unit
  if (likely(ch < 0xD800) || (ch >= 0xE000 && ch < 0x10000)) {
    if (begin) {
      if (unlikely(begin + 1 > end))
        return 0;

      begin[0] = uint16_t(ch);
    }

    return 1;
  }

  // What remains is either a code point from the surrogate
  // range 0xD800..0xDFFF, which cannot be encoded on its own,
  // or a value of 0x10000 and above. Reject the former as
  // well as anything too large for a surrogate pair.
  if (ch < 0xE000 || ch >= 0x110000)
    return 0;

  // Split the 20-bit offset into a surrogate pair
  if (begin) {
    if (unlikely(begin + 2 > end))
      return 0;

    const uint32_t offset = ch - 0x10000;
    begin[0] = uint16_t(0xD800 + (offset >> 10));
    begin[1] = uint16_t(0xDC00 + (offset & 0x3FF));
  }

  return 2;
}
/**
 * \brief Encodes a character as UTF-32
 *
 * The character code is stored as-is without validation. If
 * \c begin is \c nullptr , nothing is written.
 * \param [in] begin Current position within the output string
 * \param [in] end End of the output string
 * \param [in] ch Character code to encode
 * \returns \c 1 if \c begin is \c nullptr or the unit has been
 *    written, \c 0 if there is no room left before \c end .
 */
size_t encodeTypedChar(
        uint32_t* begin,
        uint32_t* end,
        uint32_t  ch) {
  // Length query only
  if (!begin)
    return 1;

  if (unlikely(begin + 1 > end))
    return 0;

  *begin = ch;
  return 1;
}
std::string fromws(const WCHAR *ws) {
size_t len = ::WideCharToMultiByte(CP_UTF8,
0, ws, -1, nullptr, 0, nullptr, nullptr);

View File

@ -7,8 +7,160 @@
#include "./com/com_include.h"
#include "util_bit.h"
#include "util_likely.h"
namespace dxvk::str {
/**
 * \brief Maps a character size to a code unit type
 *
 * Only sizes of 1, 2 and 4 bytes provide a \c type member.
 * \tparam S Size of the character type, in bytes
 */
template<size_t S>
struct UnicodeChar { };

/** \brief 1-byte characters use 8-bit code units */
template<>
struct UnicodeChar<1> {
  using type = uint8_t;
};

/** \brief 2-byte characters use 16-bit code units */
template<>
struct UnicodeChar<2> {
  using type = uint16_t;
};

/** \brief 4-byte characters use 32-bit code units */
template<>
struct UnicodeChar<4> {
  using type = uint32_t;
};

/**
 * \brief Unsigned code unit type of the same size as \c T
 * \tparam T Character type
 */
template<typename T>
using UnicodeCharType = typename UnicodeChar<sizeof(T)>::type;
/**
 * \brief Decodes a single UTF-8 character
 *
 * Reads \c begin[0] unconditionally, so the input must not be
 * empty. Malformed sequences are decoded as \c '?' .
 * \param [in] begin Current position within the input string
 * \param [in] end End of the input string
 * \param [out] ch Decoded character code
 * \returns Pointer to the next character in the input string
 */
const uint8_t* decodeTypedChar(
const uint8_t* begin,
const uint8_t* end,
uint32_t& ch);
/**
 * \brief Decodes a single UTF-16 character
 *
 * Reads \c begin[0] unconditionally, so the input must not be
 * empty. A stray low surrogate and a high surrogate at the
 * very end of the input are decoded as \c '?' .
 * \param [in] begin Current position within the input string
 * \param [in] end End of the input string
 * \param [out] ch Decoded character code
 * \returns Pointer to the next character in the input string
 */
const uint16_t* decodeTypedChar(
const uint16_t* begin,
const uint16_t* end,
uint32_t& ch);
/**
 * \brief Decodes a single UTF-32 character
 *
 * Passes the value at \c begin through without validation.
 * \param [in] begin Current position within the input string
 * \param [in] end End of the input string
 * \param [out] ch Decoded character code
 * \returns Pointer to the next character in the input string
 */
const uint32_t* decodeTypedChar(
const uint32_t* begin,
const uint32_t* end,
uint32_t& ch);
/**
 * \brief Encodes a character as UTF-8
 *
 * If \c begin is \c nullptr , only the required number of
 * bytes is computed and nothing is written.
 * \param [in] begin Current position within the output string
 * \param [in] end End of the output string
 * \param [in] ch Character code to encode
 * \returns Number of bytes required or written. May be \c 0
 *    for characters that cannot be written or encoded.
 */
size_t encodeTypedChar(
uint8_t* begin,
uint8_t* end,
uint32_t ch);
/**
 * \brief Encodes a character as UTF-16
 *
 * If \c begin is \c nullptr , only the required number of
 * units is computed and nothing is written. Code points in
 * the surrogate range are rejected.
 * \param [in] begin Current position within the output string
 * \param [in] end End of the output string
 * \param [in] ch Character code to encode
 * \returns Number of units required or written. May be \c 0
 *    for characters that cannot be written or encoded.
 */
size_t encodeTypedChar(
uint16_t* begin,
uint16_t* end,
uint32_t ch);
/**
 * \brief Encodes a character as UTF-32
 *
 * If \c begin is \c nullptr , nothing is written. The
 * character code is stored without validation.
 * \param [in] begin Current position within the output string
 * \param [in] end End of the output string
 * \param [in] ch Character code to encode
 * \returns \c 1 on success or if \c begin is \c nullptr ,
 *    \c 0 if the character cannot be written.
 */
size_t encodeTypedChar(
uint32_t* begin,
uint32_t* end,
uint32_t ch);
/**
* \brief Decodes a single character
*
* Note that \c begin and \c end must not be equal.
* \param [in] begin Pointer to current position within the input string
* \param [in] end Pointer to the end of the input string
* \param [out] ch Pointer to the decoded character code
* \returns Pointer to next character in the input string
*/
template<typename T>
const T* decodeChar(
const T* begin,
const T* end,
uint32_t& ch) {
using CharType = UnicodeCharType<T>;
const CharType* result = decodeTypedChar(
reinterpret_cast<const CharType*>(begin),
reinterpret_cast<const CharType*>(end),
ch);
return reinterpret_cast<const T*>(result);
}
/**
* \brief Encodes a character
*
* Note that \c begin and \c end may be both be \c nullptr or equal, in
* which case only the length of the encoded character will be returned.
* \param [in] begin Pointer to current position within the output string
* \param [in] end Pointer to the end of the output string
* \param [in] ch Character to encode
* \returns If begin is \c nullptr , the number of units required to encode
* the character. Otherwise, the number of units written to the output.
* This may return \c 0 for characters that cannot be written or encoded.
*/
template<typename T>
size_t encodeChar(
T* begin,
T* end,
uint32_t ch) {
using CharType = UnicodeCharType<T>;
return encodeTypedChar(
reinterpret_cast<CharType*>(begin),
reinterpret_cast<CharType*>(end),
ch);
}
/**
 * \brief Computes length of a null-terminated string
 *
 * Counts code units rather than decoded characters.
 * \tparam S Character type
 * \param [in] string Start of input string
 * \returns Number of units in the input string,
 *    excluding the terminating null character
 */
template<typename S>
size_t length(const S* string) {
  const S* cursor = string;

  // Walk forward until the terminator is found
  while (*cursor)
    ++cursor;

  return size_t(cursor - string);
}
/**
 * \brief Converts string from one encoding to another
 *
 * Decodes the input one character at a time and encodes each
 * character for the output type. Conversion stops after a null
 * character or at the end of the input, whichever comes first,
 * so the output is only null-terminated if the input contains
 * a null character within \c srcLength units.
 *
 * If \c dstBegin is \c nullptr , nothing is written and the
 * total length of the transcoded string is returned, in units
 * of the output character type.
 * \tparam D Output character type
 * \tparam S Input character type
 * \param [in] dstBegin Start of output string
 * \param [in] dstLength Length of output string
 * \param [in] srcBegin Start of input string
 * \param [in] srcLength Length of input string
 * \returns If \c dstBegin is \c nullptr , the total number of output
 *    characters required to store the output string. Otherwise, the
 *    total number of characters written to the output string.
 */
template<typename D, typename S>
size_t transcodeString(
        D*        dstBegin,
        size_t    dstLength,
  const S*        srcBegin,
        size_t    srcLength) {
  D* const       dstEnd = dstBegin + dstLength;
  const S* const srcEnd = srcBegin + srcLength;

  const S* srcCursor = srcBegin;
  size_t   written   = 0;

  while (srcCursor < srcEnd) {
    uint32_t codePoint = 0;
    srcCursor = decodeChar<S>(srcCursor, srcEnd, codePoint);

    // Without an output buffer only the length is accumulated
    D* out    = dstBegin ? dstBegin + written : nullptr;
    D* outEnd = dstBegin ? dstEnd : nullptr;

    written += encodeChar<D>(out, outEnd, codePoint);

    // The null terminator is processed, but nothing after it
    if (!codePoint)
      break;
  }

  return written;
}
// Converts a wide character string to a std::string.
// NOTE(review): the visible part of the definition queries the
// required size via WideCharToMultiByte with CP_UTF8 and a length
// of -1, so the input is presumably null-terminated and the result
// presumably UTF-8 encoded; confirm against the full definition.
std::string fromws(const WCHAR *ws);
// NOTE(review): definition not visible here. Presumably converts the
// multi-byte string mbs to wide characters and stores them in wcs,
// with wcsLen being the capacity of wcs in WCHAR units; confirm.
void tows(const char* mbs, WCHAR* wcs, size_t wcsLen);