Discussion:
UTF16 <-> UTF32
(too old to reply)
Bonita Montero
2024-08-31 20:01:43 UTC
Permalink
Today I needed conversion functions from UTF32 to UTF16 and in the
opposite direction. I wanted to allow result-string re-usage and
decided to give the result string as a reference-parameter. This
would help the result string to keep its capacity.
I think there's no way to implement that code faster.


// Converts the UTF-16 text in str to UTF-32 and stores it in u32Str.
// The input is walked twice: a checking pass that counts the resulting
// code points, then an unchecked pass that writes them.
// Returns false, without modifying u32Str, when a surrogate is unpaired
// or out of order; returns true otherwise.
bool u16ToU32( u16string_view str, u32string &u32Str )
{
    // Feeds every decoded code point to sink. With Checked set, malformed
    // surrogate usage ends the walk with false.
    auto decode = [&]<bool Checked>( bool_constant<Checked>, auto sink ) -> bool
    {
        constexpr char16_t
            ANY_SURR_BITS = 0xF800,   // bits shared by both surrogate halves
            ANY_SURR_TAG = 0xD800,
            LEAD_TAG = 0xD800,
            TRAIL_TAG = 0xDC00,
            HALF_BITS = 0xFC00;       // bits telling the two halves apart
        auto cur = str.begin();
        auto const last = str.end();
        while( cur != last )
        {
            if( (*cur & ANY_SURR_BITS) != ANY_SURR_TAG ) [[likely]]
            {
                sink( (char32_t)*cur++ );
                continue;
            }
            if constexpr( Checked )
            {
                if( (*cur & HALF_BITS) != LEAD_TAG ) [[unlikely]]
                    return false;
                if( cur + 1 == last ) [[unlikely]]
                    return false;
                if( (cur[1] & HALF_BITS) != TRAIL_TAG ) [[unlikely]]
                    return false;
            }
            char32_t const upper = (char32_t)(cur[0] & ~HALF_BITS);
            char32_t const lower = (char32_t)(cur[1] & ~HALF_BITS);
            sink( 0x10000 + (upper << 10 | lower) );
            cur += 2;
        }
        return true;
    };
    size_t count = 0;
    if( !decode( true_type(), [&]( char32_t ) { ++count; } ) )
        return false;
    u32Str.resize_and_overwrite( count, [&]( char32_t *buf, size_t len )
        {
            // The first pass already vetted the input, so skip the checks.
            char32_t *out = buf;
            decode( false_type(), [&]( char32_t cp ) { *out++ = cp; } );
            return len;
        } );
    return true;
}

// Convenience overload: converts str into a fresh u32string.
// Yields { true, converted text } on success and { false, empty string }
// when the conversion reports failure.
pair<bool, u32string> u16ToU32( u16string_view str )
{
    u32string converted;
    bool const ok = u16ToU32( str, converted );
    if( ok ) [[likely]]
        return { true, move( converted ) };
    return { false, {} };
}

// Converts the UTF-32 text in str to UTF-16 and stores it in u16Str.
// The input is walked twice: a validating pass that counts the UTF-16
// code units needed, then an unchecked pass that writes them.
// Returns false, without modifying u16Str, when any element is above
// U+10FFFF or lies anywhere in the surrogate range U+D800..U+DFFF;
// returns true otherwise.
bool u32ToU16( u32string_view str, u16string &u16Str )
{
    // Feeds every produced UTF-16 code unit to fn. With Err set, invalid
    // elements end the walk with false.
    auto iterate = [&]<bool Err>( bool_constant<Err>, auto fn ) -> bool
    {
        constexpr char32_t
            UNICODE_MAX = 0x10FFFF,
            FIRST_SURR = 0xD800,
            LAST_SURR = 0xDFFF;
        constexpr char16_t
            HIGH_SURR = 0xD800,
            LOW_SURR = 0xDC00;
        for( auto it = str.begin(), end = str.end(); it != end; )
        {
            if constexpr( Err )
            {
                // Both surrogate halves are invalid as UTF-32 elements,
                // not only the low half.
                if( *it > UNICODE_MAX || (*it >= FIRST_SURR && *it <= LAST_SURR) ) [[unlikely]]
                    return false;
            }
            if( *it <= 0xFFFF ) [[likely]]
                fn( (char16_t)*it++ );
            else
            {
                char32_t c = *it++ - 0x10000;
                fn( (char16_t)(HIGH_SURR | (c >> 10)) );
                fn( (char16_t)(LOW_SURR | (c & 0x3FF)) );
            }
        }
        return true;
    };
    size_t n = 0;
    if( !iterate( true_type(), [&]( char16_t ) { ++n; } ) ) [[unlikely]]
        return false;
    u16Str.resize_and_overwrite( n, [&]( char16_t *p, size_t n )
        {
            // The first pass already validated the input.
            auto it = span( p, n ).begin();
            iterate( false_type(), [&]( char16_t c ) { *it++ = c; } );
            return n;
        } );
    return true;
}

// Convenience overload: converts str into a fresh u16string.
// Yields { true, converted text } on success and { false, empty string }
// when the conversion reports failure.
pair<bool, u16string> u32ToU16( u32string_view str )
{
    u16string converted;
    bool const ok = u32ToU16( str, converted );
    if( ok ) [[likely]]
        return { true, move( converted ) };
    return { false, {} };
}
Chris Ahlstrom
2024-09-01 14:07:35 UTC
Permalink
Post by Bonita Montero
Today I needed conversion functions from UTF32 to UTF16 and in the
opposite direction. I wanted to allow result-string re-usage and
decided to give the result string as a reference-parameter. This
would help the result string to keep its capacity.
I think there's no way to implement that code faster.
<snip>
Here's a method adopted from the Fluxbox window manager project:

https://github.com/ahlstromcj/potext/blob/master/library/src/po/iconvert.cpp

It uses iconv(3). No warranty implied.
--
It seemed the world was divided into good and bad people. The good ones slept
better... while the bad ones seemed to enjoy the waking hours much more.
-- Woody Allen, "Side Effects"
Bonita Montero
2024-09-06 16:22:40 UTC
Permalink
This is my final unicode.h. Have a look at u8_feeder::next<bool>
and u8_gen<bool>. As with all the routines, the bool template parameter
switches off error detection when you already know your string is valid Unicode.

#include <span>
#include <random>
#include <chrono>
#include <bit>
#include <concepts>
#include <cassert>
#include "inline.h"

#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 4554)
#endif
#if defined(__llvm__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-op-parentheses"
#pragma clang diagnostic ignored "-Wlogical-op-parentheses"
#pragma clang diagnostic ignored "-Wbitwise-op-parentheses"
#pragma clang diagnostic ignored "-Wunqualified-std-cast-call"
#endif

#pragma push_macro("FI")
#define FI FORCEINLINE
#pragma push_macro("LFI")
#define LFI L_FORCEINLINE

// Forward-only UTF-8 decoder over a std::u8string_view.
// next() yields one code point per call, or -1 at the end of the input
// and on malformed input. A failed call leaves the read position on the
// first byte of the offending sequence, so the iterator conversion tells
// the caller exactly where decoding stopped.
struct u8_feeder
{
    u8_feeder( std::u8string_view sv );
    u8_feeder &operator =( std::u8string_view sv );
    // Err = true validates the input. Err = false skips validation and is
    // meant for input already known to be well-formed UTF-8.
    template<bool Err = true>
    int32_t next();
    // Current read position.
    operator std::u8string_view::iterator() const;
private:
    std::u8string_view::iterator m_cur, m_end;
};

// Starts decoding at the beginning of sv.
inline u8_feeder::u8_feeder( std::u8string_view sv ) :
    m_cur( sv.begin() ),
    m_end( sv.end() )
{
}

// Restarts decoding at the beginning of sv.
inline u8_feeder &u8_feeder::operator =( std::u8string_view sv )
{
    m_cur = sv.begin();
    m_end = sv.end();
    return *this;
}

// Decodes the next code point.
// Returns the code point, or -1 when the input is exhausted, when the
// sequence is cut off by the end of the input, or (Err only) when the
// sequence is malformed: stray continuation byte, lead byte announcing
// more than four bytes, bad continuation byte, overlong encoding,
// surrogate, or value above U+10FFFF.
template<bool Err>
FI int32_t u8_feeder::next()
{
    ptrdiff_t rem = m_end - m_cur;
    if( !rem ) [[unlikely]]
        return -1;
    // Number of leading one bits of the first byte = length of the sequence.
    unsigned ones = std::countl_zero( (unsigned char)~*m_cur );
    if( !ones ) [[likely]]
        return *m_cur++;
    if constexpr( Err )
    {
        if( ones == 1 || ones > 4 ) [[unlikely]]
            return -1;
    }
    // Never read continuation bytes that are not there.
    if( rem < (ptrdiff_t)ones ) [[unlikely]]
        return -1;
    // Decode through a local cursor; m_cur is committed only on success.
    auto it = m_cur;
    char32_t c = *it++ & ((char8_t)0x7F >> ones);
    for( unsigned left = ones - 1; left; --left )
    {
        char8_t c8 = *it++;
        if constexpr( Err )
        {
            if( (c8 & 0xC0) != 0x80 ) [[unlikely]]
                return -1;
        }
        c = c << 6 | (c8 & 0x3F);
    }
    if constexpr( Err )
    {
        // Smallest code point that legitimately needs a sequence of the
        // given length; anything below is an overlong encoding.
        static constexpr char32_t MIN_FOR_LEN[5] = { 0, 0, 0x80, 0x800, 0x10000 };
        if( c < MIN_FOR_LEN[ones] ) [[unlikely]]
            return -1;
        if( c > 0x10FFFF ) [[unlikely]]
            return -1;
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return -1;
    }
    // Guaranteed by the checks above when Err is set; with Err = false it
    // relies on the caller's promise that the input is well-formed.
    [[assume(c <= 0x10FFFF)]];
    m_cur = it;
    return c;
}

// Exposes the current read position.
inline u8_feeder::operator std::u8string_view::iterator() const
{
    return m_cur;
}

// Forward-only UTF-16 decoder over a std::u16string_view.
// next() yields one code point per call, or -1 at the end of the input
// and on malformed input. A failed call does not move the read position,
// so the iterator conversion tells the caller where decoding stopped.
struct u16_feeder
{
    u16_feeder( std::u16string_view sv );
    u16_feeder &operator =( std::u16string_view sv );
    // Err = true validates surrogate pairing. Err = false skips that and is
    // meant for input already known to be well-formed UTF-16.
    template<bool Err = true>
    int32_t next();
    // Current read position.
    operator std::u16string_view::iterator() const;
private:
    std::u16string_view::iterator m_cur, m_end;
};

// Starts decoding at the beginning of sv.
inline u16_feeder::u16_feeder( std::u16string_view sv ) :
    m_cur( sv.begin() ),
    m_end( sv.end() )
{
}

// Restarts decoding at the beginning of sv.
inline u16_feeder &u16_feeder::operator =( std::u16string_view sv )
{
    m_cur = sv.begin();
    m_end = sv.end();
    return *this;
}

// Decodes the next code point.
// Returns the code point, or -1 when the input is exhausted, when a
// surrogate is the last code unit, or (Err only) when a surrogate pair is
// not a high surrogate followed by a low surrogate.
template<bool Err>
FI int32_t u16_feeder::next()
{
    constexpr char16_t
        HIGH_SURR = 0xD800,
        LOW_SURR = 0xDC00,
        SURR_MASK = 0xFC00,      // distinguishes high from low surrogates
        SURR_HDR = HIGH_SURR;
    constexpr uint32_t
        SURR_HDR_MSK = 0xFFFFF800;   // matches any surrogate
    ptrdiff_t rem = m_end - m_cur;
    if( !rem ) [[unlikely]]
        return -1;
    if( (*m_cur & SURR_HDR_MSK) != SURR_HDR ) [[likely]]
        return *m_cur++;
    if( rem < 2 ) [[unlikely]]
        return -1;
    if constexpr( Err )
    {
        if( (*m_cur & SURR_MASK) != HIGH_SURR ) [[unlikely]]
            return -1;
        if( (m_cur[1] & SURR_MASK) != LOW_SURR ) [[unlikely]]
            return -1;
    }
    // A combined pair is always in 0x10000..0x10FFFF, so no further range
    // or surrogate check is needed on the result.
    char32_t c = 0x10000 + ((char32_t)(*m_cur & ~SURR_MASK) << 10 |
        (char32_t)(m_cur[1] & ~SURR_MASK));
    m_cur += 2;
    return c;
}

// Exposes the current read position.
inline u16_feeder::operator std::u16string_view::iterator() const
{
    return m_cur;
}

// Forward-only UTF-32 reader over a std::u32string_view.
// next() yields one code point per call, or -1 at the end of the input
// and on invalid input. A failed call does not move the read position,
// so the iterator conversion tells the caller where reading stopped.
struct u32_feeder
{
    u32_feeder( std::u32string_view sv );
    u32_feeder &operator =( std::u32string_view sv );
    // Err = true rejects surrogates and values above U+10FFFF. Err = false
    // passes every element through unchecked.
    template<bool Err = true>
    int32_t next();
    // Current read position.
    operator std::u32string_view::iterator() const;
private:
    std::u32string_view::iterator m_cur, m_end;
};

// Starts reading at the beginning of sv.
// Marked inline like the other feeders' constructors, so the definition
// can live in a header included from several translation units.
inline u32_feeder::u32_feeder( std::u32string_view sv ) :
    m_cur( sv.begin() ),
    m_end( sv.end() )
{
}

// Restarts reading at the beginning of sv.
inline u32_feeder &u32_feeder::operator =( std::u32string_view sv )
{
    m_cur = sv.begin();
    m_end = sv.end();
    return *this;
}

// Reads the next code point.
// Returns the code point, or -1 when the input is exhausted or (Err only)
// when the element is a surrogate or above U+10FFFF.
template<bool Err>
FI int32_t u32_feeder::next()
{
    if( m_cur == m_end ) [[unlikely]]
        return -1;
    char32_t c = *m_cur;
    if constexpr( Err )
    {
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return -1;
        if( c > 0x10FFFF ) [[unlikely]]
            return -1;
    }
    // Advance only once the element is accepted; otherwise an invalid last
    // element would leave the position at the end and look like success.
    ++m_cur;
    return c;
}

// Exposes the current read position.
inline u32_feeder::operator std::u32string_view::iterator() const
{
    return m_cur;
}

#define YYY

// Encodes the code point c as UTF-8 and passes each byte to consumer.
// Returns the number of bytes produced (1 to 4). With Err set, returns 0
// and produces nothing when c is above U+10FFFF or is a surrogate
// (U+D800..U+DFFF). With Err = false, c must not exceed 21 significant
// bits, otherwise the table lookup below is out of range.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char8_t c ) { { consumer( c ) }; }
FI size_t u8_gen( char32_t c, Consumer consumer )
{
    using namespace std;
    // Indexed by (leading zero bits of c) - 11, i.e. entry i describes code
    // points with 21 - i significant bits: the lead-byte marker and the
    // position of the lowest bit stored in the lead byte.
    static struct Map
    {
        uint8_t head, firstBit;
    } const rawMap[22] =
    {
        { 0xF0, 18 }, // 21
        { 0xF0, 18 }, // 20
        { 0xF0, 18 }, // 19
        { 0xF0, 18 }, // 18
        { 0xF0, 18 }, // 17
        { 0xE0, 12 }, // 16
        { 0xE0, 12 }, // 15
        { 0xE0, 12 }, // 14
        { 0xE0, 12 }, // 13
        { 0xE0, 12 }, // 12
        { 0xC0, 6 }, // 11
        { 0xC0, 6 }, // 10
        { 0xC0, 6 }, // 9
        { 0xC0, 6 }, // 8
        { 0, 0 }, // 7
        { 0, 0 }, // 6
        { 0, 0 }, // 5
        { 0, 0 }, // 4
        { 0, 0 }, // 3
        { 0, 0 }, // 2
        { 0, 0 }, // 1
        { 0, 0 } // 0
    };
    span<Map const> map( rawMap );
    if constexpr( Err )
    {
        // The range check also covers every value with more than 21 bits.
        if( c > 0x10FFFF ) [[unlikely]]
            return 0;
        // Same surrogate rejection as u16_gen.
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return 0;
    }
    unsigned lzCnt = countl_zero( (uint32_t)c );
    Map const &mapped = map[lzCnt - 11];
    int bit = mapped.firstBit;
    consumer( (char8_t)(mapped.head | (c >> bit)) );
    size_t n = 1;
    // Emit the continuation bytes, six payload bits each.
    for( ; (bit -= 6) >= 0; ++n )
        consumer( (char8_t)(0x80 | ((c >> bit) & 0x3F)) );
    assert(bit == -6);
    return n;
}

// Encodes the code point c as UTF-16 and passes each code unit to
// consumer. Returns the number of code units produced (1 or 2). With Err
// set, returns 0 and produces nothing when c is a surrogate or above
// U+10FFFF.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char16_t c ) { { consumer( c ) }; }
FI size_t u16_gen( char32_t c, Consumer consumer )
{
    if( c > 0xFFFF ) [[unlikely]]
    {
        if constexpr( Err )
        {
            if( c > 0x10FFFF ) [[unlikely]]
                return 0;
        }
        // Split the 20-bit offset into a surrogate pair.
        char32_t const offset = c - 0x10000;
        consumer( (char16_t)(0xD800 | (offset >> 10)) );
        consumer( (char16_t)(0xDC00 | (offset & 0x3FF)) );
        return 2;
    }
    if constexpr( Err )
    {
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return 0;
    }
    consumer( (char16_t)c );
    return 1;
}

// Converts UTF-8 to UTF-32 in two passes over u8Str.
// Pass 1 decodes with next<Err>() and counts the code points; if the
// feeder stops before the end, the distance from the stop position to the
// end is returned and reserve is never called. Otherwise reserve( count )
// supplies the destination span, pass 2 fills it, and 0 is returned.
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char32_t>>; }
size_t u8_to_u32( std::u8string_view u8Str, Reserve reserve )
{
    u8_feeder feeder( u8Str );
    size_t count = 0;
    while( feeder.next<Err>() >= 0 )
        ++count;
    std::u8string_view::iterator const stopped = feeder;
    if( stopped != u8Str.end() ) [[unlikely]]
        return u8Str.end() - stopped;
    std::span<char32_t> dest = reserve( count );
    auto out = dest.begin();
    feeder = u8Str;
    // The input was just checked, so decode without error detection.
    for( int32_t cp = feeder.next<false>(); cp >= 0; cp = feeder.next<false>() )
        *out++ = cp;
    return 0;
}

// Converts UTF-8 to UTF-32 into the caller's string, which is resized to
// the exact number of code points. Returns what the generic u8_to_u32
// returns: 0 on success, otherwise the count of unconsumed input units
// (u32Str is then not resized).
template<bool Err>
size_t u8_to_u32( std::u8string_view u8Str, std::u32string &u32Str )
{
    // NOTE(review): the resize_and_overwrite callback writes nothing; the
    // characters are filled in afterwards through the returned span. The
    // standard requires [p, p + r) to hold no indeterminate values when the
    // callback returns - confirm this is acceptable on the target library.
    auto grow = [&]( size_t n ) LFI -> std::span<char32_t>
        {
            u32Str.resize_and_overwrite( n, [&]( char32_t *, size_t len ) LFI { return len; } );
            return u32Str;
        };
    return u8_to_u32<Err>( u8Str, grow );
}

// Converts UTF-8 to a freshly created UTF-32 string.
// The first member is the status of the conversion (0 on success,
// otherwise the count of unconsumed input units); the second member is
// the converted string, left empty when the conversion fails.
template<bool Err>
std::pair<size_t, std::u32string> u8_to_u32( std::u8string_view u8Str )
{
    std::pair<size_t, std::u32string> result;
    // Same delegation the u16_to_u32 pair overload uses.
    result.first = u8_to_u32<Err>( u8Str, result.second );
    return result;
}

// Converts UTF-8 to UTF-16 in two passes over u8Str.
// Pass 1 decodes with next<Err>() and adds up the UTF-16 code units each
// code point needs; if the feeder stops before the end, the distance from
// the stop position to the end is returned and reserve is never called.
// Otherwise reserve( units ) supplies the destination span, pass 2 fills
// it, and 0 is returned.
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char16_t>>; }
size_t u8_to_u16( std::u8string_view u8Str, Reserve reserve )
{
    u8_feeder feeder( u8Str );
    size_t units = 0;
    for( int32_t cp = feeder.next<Err>(); cp >= 0; cp = feeder.next<Err>() )
        units += u16_gen<false>( cp, [&]( char16_t ) LFI {} );
    std::u8string_view::iterator const stopped = feeder;
    if( stopped != u8Str.end() ) [[unlikely]]
        return u8Str.end() - stopped;
    std::span<char16_t> dest( reserve( units ) );
    auto out = dest.begin();
    feeder = u8Str;
    // The input was just checked, so decode without error detection.
    for( int32_t cp = feeder.next<false>(); cp >= 0; cp = feeder.next<false>() )
        u16_gen<false>( cp, [&]( char16_t unit ) LFI { *out++ = unit; } );
    return 0;
}

// Converts UTF-8 to UTF-16 into the caller's string, which is resized to
// the exact number of code units. Returns what the generic u8_to_u16
// returns: 0 on success, otherwise the count of unconsumed input units
// (u16Str is then not resized).
template<bool Err>
size_t u8_to_u16( std::u8string_view u8Str, std::u16string &u16Str )
{
    // NOTE(review): the resize_and_overwrite callback writes nothing; the
    // characters are filled in afterwards through the returned span. The
    // standard requires [p, p + r) to hold no indeterminate values when the
    // callback returns - confirm this is acceptable on the target library.
    auto grow = [&]( size_t n ) LFI -> std::span<char16_t>
        {
            u16Str.resize_and_overwrite( n, [&]( char16_t *, size_t len ) LFI { return len; } );
            return u16Str;
        };
    return u8_to_u16<Err>( u8Str, grow );
}

// Converts UTF-8 to a freshly created UTF-16 string.
// The first member is the status of the conversion (0 on success,
// otherwise the count of unconsumed input units); the second member is
// the converted string, left empty when the conversion fails.
template<bool Err>
std::pair<size_t, std::u16string> u8_to_u16( std::u8string_view u8Str )
{
    std::pair<size_t, std::u16string> result;
    // Same delegation the u32_to_u16 pair overload uses.
    result.first = u8_to_u16<Err>( u8Str, result.second );
    return result;
}

template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } ->
std::convertible_to<std::span<char32_t>>; }
size_t u16_to_u32( std::u16string_view u16Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u16_feeder u16f( u16Str );
for( ; u16f.next<Err>() >= 0; ++n );
if( u16f != u16Str.end() )
return u16Str.end() - u16f;
span<char32_t> sp = reserve( n );
auto it = sp.begin();
u16f = u16Str;
for( int32_t c; (c = u16f.next<false>()) >= 0; *it++ = c );
return 0;
}

template<bool Err>
size_t u16_to_u32( std::u16string_view u16Str, std::u32string &u32Str )
{
using namespace std;
return u16_to_u32<Err>( u16Str,
[&]( size_t n ) LFI -> span<char32_t>
{
u32Str.resize_and_overwrite( n, [&]( char32_t *, size_t n ) LFI {
return n; } );
return u32Str;
} );
}

// Converts UTF-16 to a freshly created UTF-32 string.
// The first member is the status of the conversion (0 on success,
// otherwise the count of unconsumed input units); the second member is
// the converted string, left empty when the conversion fails.
template<bool Err>
std::pair<size_t, std::u32string> u16_to_u32( std::u16string_view u16Str )
{
    std::pair<size_t, std::u32string> result;
    result.first = u16_to_u32<Err>( u16Str, result.second );
    return result;
}

template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } ->
std::convertible_to<std::span<char8_t>>; }
size_t u32_to_u8( std::u32string_view u32Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u32_feeder u32f( u32Str );
for( int32_t c; (c = u32f.next<Err>()) >= 0; n += u8_gen<false>( c,
[&]( char8_t ) LFI {} ) );
if( u32f != u32Str.end() ) [[unlikely]]
return u32Str.end() - u32f;
span<char8_t> sp = reserve( n );
auto it = sp.begin();
u32f = u32Str;
for( int32_t c; (c = u32f.next<false>()) >= 0; u8_gen<false>( c, [&](
char8_t c8 ) LFI { *it++ = c8; } ) );
return 0;
}

template<bool Err>
size_t u32_to_u8( std::u32string_view u32Str, std::u8string &u8Str )
{
using namespace std;
return u32_to_u8<Err>( u32Str,
[&]( size_t n ) LFI -> span<char8_t>
{
u8Str.resize_and_overwrite( n, [&]( char8_t *, size_t n ) LFI {
return n; } );
return u8Str;
} );
}

template<bool Err>
std::pair<size_t, std::u8string> u32_to_u8( std::u32string_view u32Str )
{
using namespace std;
u8string u8Str;
size_t rem = u32_to_u8<Err>( u32Str, u8Str );
return { rem, move( u8Str ) };
}

template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } ->
std::convertible_to<std::span<char16_t>>; }
size_t u32_to_u16( std::u32string_view u32Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u32_feeder u32f( u32Str );
for( int32_t c; (c = u32f.next<Err>()) >= 0; u16_gen<false>( c, [&](
char16_t ) LFI { ++n; } ) );
if( u32f != u32Str.end() ) [[unlikely]]
return u32Str.end() - u32f;
span<char16_t> sp = reserve( n );
auto it = sp.begin();
u32f = u32Str;
for( int32_t c; (c = u32f.next<false>()) >= 0; u16_gen<false>( c, [&](
char16_t c16 ) LFI { *it++ = c16; } ) );
return 0;
}

template<bool Err>
size_t u32_to_u16( std::u32string_view u32Str, std::u16string &u16Str )
{
using namespace std;
return u32_to_u16<Err>( u32Str,
[&]( size_t n ) LFI -> span<char16_t>
{
u16Str.resize_and_overwrite( n, [&]( char16_t *, size_t n ) LFI {
return n; } );
return u16Str;
} );
}

// Converts UTF-32 to a freshly created UTF-16 string.
// The first member is the status of the conversion (0 on success,
// otherwise the count of unconsumed input units); the second member is
// the converted string, left empty when the conversion fails.
template<bool Err>
std::pair<size_t, std::u16string> u32_to_u16( std::u32string_view u32Str )
{
    std::pair<size_t, std::u16string> result;
    result.first = u32_to_u16<Err>( u32Str, result.second );
    return result;
}

template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } ->
std::convertible_to<std::span<char8_t>>; }
size_t u16_to_u8( std::u16string_view u16Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u16_feeder u16f( u16Str );
for( int32_t c; (c = u16f.next<Err>()) >= 0; n += u8_gen<false>( c,
[&]( char8_t ) LFI {} ) );
if( u16f != u16Str.end() )
return u16Str.end() - u16f;
span<char8_t> sp = reserve( n );
auto it = sp.begin();
u16f = u16Str;
for( int32_t c; (c = u16f.next<false>()) >= 0; u8_gen<false>( c, [&](
char8_t c8 ) LFI { *it++ = c8; } ) );
return 0;
}

template<bool Err>
size_t u16_to_u8( std::u16string_view u16Str, std::u8string &u8Str )
{
using namespace std;
return u16_to_u8<Err>( u16Str,
[&]( size_t n ) LFI -> span<char8_t>
{
u8Str.resize_and_overwrite( n, [&]( char8_t *, size_t n ) LFI {
return n; } );
return u8Str;
} );
}

template<bool Err>
std::pair<size_t, std::u8string> u16_to_u8( std::u16string_view u16Str )
{
using namespace std;
u8string u8Str;
size_t rem = u16_to_u8<Err>( u16Str,
[&]( size_t n ) -> span<char8_t>
{
u8Str.resize_and_overwrite( n, [&]( char8_t *, size_t n ) LFI {
return n; } );
return u8Str;
} );
return { rem, move( u8Str ) };
}

// Decodes u8Str and calls consumer once per code point.
// If the consumer's result is convertible to bool, a false result stops
// the iteration after that code point; otherwise every code point is
// delivered until the feeder stops. Returns the distance from the
// feeder's final position to the end of u8Str.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char32_t c ) { { consumer( c ) }; }
FI size_t u8_iterate( std::u8string_view u8Str, Consumer consumer )
{
    constexpr bool canStop =
        requires( Consumer fn, char32_t cp ) { { fn( cp ) } -> std::convertible_to<bool>; };
    u8_feeder feeder( u8Str );
    for( int32_t cp = feeder.next<Err>(); cp >= 0; cp = feeder.next<Err>() )
    {
        if constexpr( canStop )
        {
            if( !consumer( cp ) ) [[unlikely]]
                break;
        }
        else
            consumer( cp );
    }
    std::u8string_view::iterator const stopped = feeder;
    return u8Str.end() - stopped;
}

// Decodes u16Str and calls consumer once per code point.
// If the consumer's result is convertible to bool, a false result stops
// the iteration after that code point; otherwise every code point is
// delivered until the feeder stops. Returns the distance from the
// feeder's final position to the end of u16Str.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char32_t c ) { { consumer( c ) }; }
FI size_t u16_iterate( std::u16string_view u16Str, Consumer consumer )
{
    constexpr bool canStop =
        requires( Consumer fn, char32_t cp ) { { fn( cp ) } -> std::convertible_to<bool>; };
    u16_feeder feeder( u16Str );
    for( int32_t cp = feeder.next<Err>(); cp >= 0; cp = feeder.next<Err>() )
    {
        if constexpr( canStop )
        {
            if( !consumer( cp ) ) [[unlikely]]
                break;
        }
        else
            consumer( cp );
    }
    std::u16string_view::iterator const stopped = feeder;
    return u16Str.end() - stopped;
}

// Reads u32Str and calls consumer once per code point.
// If the consumer's result is convertible to bool, a false result stops
// the iteration after that code point; otherwise every code point is
// delivered until the feeder stops. Returns the distance from the
// feeder's final position to the end of u32Str.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char32_t c ) { { consumer( c ) }; }
FI size_t u32_iterate( std::u32string_view u32Str, Consumer consumer )
{
    constexpr bool canStop =
        requires( Consumer fn, char32_t cp ) { { fn( cp ) } -> std::convertible_to<bool>; };
    u32_feeder feeder( u32Str );
    for( int32_t cp = feeder.next<Err>(); cp >= 0; cp = feeder.next<Err>() )
    {
        if constexpr( canStop )
        {
            if( !consumer( cp ) ) [[unlikely]]
                break;
        }
        else
            consumer( cp );
    }
    std::u32string_view::iterator const stopped = feeder;
    return u32Str.end() - stopped;
}

#pragma pop_macro("FI")
#pragma pop_macro("LFI")

#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#if defined(__llvm__)
#pragma clang diagnostic pop
#endif

Loading...