Discussion:
lock-free alternative to atomic<shared_ptr<>>
Bonita Montero
2024-08-19 14:22:50 UTC
Reply
Permalink
I've developed a lock-free alternative to atomic<shared_ptr<>>. The idea
is that the pointer inside this atomic is accompanied by two counters packed
into a second size_t-sized word. The first counter is 24 bits wide and counts
the number of control-block increments which are in progress, so I'm
also using double reference counting. The remaining 40 bits of the second
word contain a sequence counter which is incremented every time the pointer
is overwritten. This prevents the lower 24 bits from being decremented
if the pointer has already been overwritten.
The code is somewhat faster than the futex'd solution of MSVC and
magnitudes faster than clang's libc++ atomic<shared_ptr<>>.

Here's the code:

#pragma once
#include <cstdint>
#include <atomic>
#include <utility>
#include <cassert>
#include <stdexcept>
#include <type_traits>
#include "dcas_atomic.h"

template<typename T>
struct shared_obj;

template<typename T, typename ... Args>
shared_obj<T> make_shared_obj( Args &&... args );
template<typename T>
struct tshared_obj;

struct dca_update_exception : std::invalid_argument
{
dca_update_exception() : std::invalid_argument( "" ) {};
};

template<typename T>
struct shared_obj
{
shared_obj() noexcept;
shared_obj( shared_obj && ) noexcept;
shared_obj( shared_obj const & ) noexcept;
shared_obj &operator =( shared_obj && ) noexcept;
shared_obj &operator =( shared_obj const & ) noexcept;
~shared_obj();
T *get() const noexcept;
operator bool() const noexcept;
size_t use_count() const noexcept;
T &operator *() const noexcept;
T *operator ->() const noexcept
requires std::is_class_v<T>;
private:
template<typename T2, typename ... Args>
friend shared_obj<T2> make_shared_obj( Args &&... args );
template<typename T2>
friend struct tshared_obj;
struct ctrl_t
{
template<typename ... Args>
ctrl_t( Args &&... args );
T m_obj;
std::atomic<size_t> m_refCount;
};
shared_obj( ctrl_t *obj );
ctrl_t *m_ctrl;
};

template<typename T>
inline shared_obj<T>::shared_obj() noexcept :
m_ctrl( nullptr )
{
}

template<typename T>
inline shared_obj<T>::shared_obj( shared_obj &&other ) noexcept :
m_ctrl( other.m_ctrl )
{
other.m_ctrl = nullptr;
}

template<typename T>
inline shared_obj<T>::shared_obj( shared_obj const &other ) noexcept :
m_ctrl( other.m_ctrl )
{
size_t before = m_ctrl->m_refCount.fetch_add( 1,
std::memory_order_relaxed );
assert(before != (size_t)-1);
}

template<typename T>
template<typename ... Args>
inline shared_obj<T>::ctrl_t::ctrl_t( Args &&... args ) :
m_obj( std::forward<Args>( args ) ... ),
m_refCount( 1 )
{
}

template<typename T>
shared_obj<T> &shared_obj<T>::operator =( shared_obj &&other ) noexcept
{
if( m_ctrl && m_ctrl->m_refCount.fetch_sub( 1,
std::memory_order_release ) == 1 )
delete m_ctrl;
m_ctrl = other.m_ctrl;
other.m_ctrl = nullptr;
return *this;
}

template<typename T>
shared_obj<T> &shared_obj<T>::operator =( shared_obj const &other ) noexcept
{
this->~shared_obj();
m_ctrl = other.m_ctrl;
m_ctrl->m_refCount.fetch_add( 1, std::memory_order_acquire );
return *this;
}

template<typename T>
shared_obj<T>::~shared_obj()
{
if( m_ctrl && m_ctrl->m_refCount.fetch_sub( 1,
std::memory_order_release ) == 1 )
delete m_ctrl;
}

template<typename T>
inline T *shared_obj<T>::get() const noexcept
{
return &m_ctrl->m_obj;
}

template<typename T>
inline shared_obj<T>::operator bool() const noexcept
{
return (bool)m_ctrl;
}

template<typename T>
inline size_t shared_obj<T>::use_count() const noexcept
{
if( m_ctrl ) [[likely]]
return m_ctrl->m_refCount.load( std::memory_order_relaxed );
return 0;
}

template<typename T>
inline shared_obj<T>::shared_obj( ctrl_t *obj ) :
m_ctrl( obj )
{
}

template<typename T>
inline T &shared_obj<T>::operator *() const noexcept
{
assert(m_ctrl);
return m_ctrl->m_obj;
}

template<typename T>
inline T *shared_obj<T>::operator ->() const noexcept
requires std::is_class_v<T>
{
assert(m_ctrl);
return &m_ctrl->m_obj;
}

template<typename T, typename ... Args>
shared_obj<T> make_shared_obj( Args &&... args )
{
return shared_obj<T>( new shared_obj<T>::ctrl_t( std::forward<Args>(
args ) ... ) );
}

template<typename T>
struct tshared_obj
{
tshared_obj();
tshared_obj( tshared_obj && ) noexcept;
tshared_obj( tshared_obj const & ) noexcept;
tshared_obj( shared_obj<T> && ) noexcept;
tshared_obj( shared_obj<T> const & ) noexcept;
~tshared_obj();
tshared_obj &operator =( tshared_obj && ) noexcept;
tshared_obj &operator =( tshared_obj & );
tshared_obj &operator =( shared_obj<T> && ) noexcept;
tshared_obj &operator =( shared_obj<T> const & ) noexcept;
operator shared_obj<T>();
private:
using ctrl_t = shared_obj<T>::ctrl_t;
using pair_t = dcas_atomic::pair_t;
static_assert(sizeof(size_t) == 8, "");
static constexpr unsigned
UPDATER_BITS = 24,
SEQUENCE_BITS = 64 - UPDATER_BITS;
static constexpr uint64_t
UPDATER_VALUE = 1,
SEQUENCE_VALUE = 1ull << UPDATER_BITS,
UPDATER_MASK = SEQUENCE_VALUE - 1,
SEQUENCE_MASK = ~UPDATER_MASK;
mutable dcas_atomic m_dca;
ctrl_t *copy() const;
ctrl_t *remove() noexcept;
void store( ctrl_t *obj ) noexcept;
};

template<typename T>
inline tshared_obj<T>::tshared_obj() :
m_dca( pair_t( 0, 0 ) )
{
}

template<typename T>
tshared_obj<T>::~tshared_obj()
{
ctrl_t *obj = (ctrl_t *)m_dca.first();
if( !obj )
return;
assert(!(m_dca.second() & UPDATER_MASK));
size_t before = obj->m_refCount.fetch_sub( 1, std::memory_order_release );
assert(before > 0);
if( before == 1 )
delete obj;
}

template<typename T>
inline tshared_obj<T>::tshared_obj( shared_obj<T> &&other ) noexcept :
m_dca( pair_t( (size_t)other.m_ctrl, 0 ) )
{
other.m_ctrl = nullptr;
}

template<typename T>
inline tshared_obj<T>::tshared_obj( shared_obj<T> const &other ) noexcept :
m_dca( pair_t( (size_t)other.m_ctrl, 0 ) )
{
other.m_ctrl->m_refCount.fetch_add( 1, std::memory_order_release );
}

template<typename T>
tshared_obj<T>::tshared_obj( tshared_obj &&other ) noexcept :
m_dca( pair_t( (size_t)other.remove(), 0 ) )
{
}

template<typename T>
tshared_obj<T>::tshared_obj( tshared_obj const &other ) noexcept :
m_dca( pair_t( (size_t)other.copy(), 0 ) )
{
}

template<typename T>
tshared_obj<T> &tshared_obj<T>::operator =( shared_obj<T> &&shared )
noexcept
{
ctrl_t *obj = shared.m_ctrl;
shared.m_ctrl = nullptr;
store( obj );
return *this;
}

template<typename T>
tshared_obj<T> &tshared_obj<T>::operator =( shared_obj<T> const &shared
) noexcept
{
ctrl_t *obj = shared.m_ctrl;
size_t before = obj->m_refCount.fetch_add( 1, std::memory_order_relaxed );
assert(before != (size_t)-1);
store( obj );
return *this;
}

template<typename T>
tshared_obj<T> &tshared_obj<T>::operator =( tshared_obj &&other ) noexcept
{
store( other.remove() );
return *this;
}

template<typename T>
tshared_obj<T> &tshared_obj<T>::operator =( tshared_obj &other )
{
store( other.copy() );
return *this;
}

template<typename T>
tshared_obj<T>::operator shared_obj<T>()
{
return shared_obj<T>( copy() );
}

// Copies the content of an atomic shared object to a pointer.
// Proceeds in three steps:
// 1. Increment the update counter.
// 2. Increment the reference counter
// 3. Decrement the update counter if the pointer or sequence counter
//    hasn't changed.

template<typename T>
inline typename tshared_obj<T>::ctrl_t *tshared_obj<T>::copy() const
{
pair_t expected( m_dca.first(), m_dca.second() ), desired;
do
if( !expected.first ) [[unlikely]]
return nullptr;
else if( (expected.second & UPDATER_MASK) != UPDATER_MASK ) [[likely]]
desired = { expected.first, expected.second + UPDATER_VALUE };
else
throw dca_update_exception();
while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
ctrl_t *obj = (ctrl_t *)desired.first;
size_t rcBefore = obj->m_refCount.fetch_add( 1,
std::memory_order_relaxed );
assert(rcBefore > 0 && rcBefore != (size_t)-1);
expected = desired;
uint64_t seq = expected.second >> UPDATER_BITS;
do
if( expected.first == desired.first && expected.second >> UPDATER_BITS
== seq ) [[likely]]
desired.second = expected.second - UPDATER_VALUE;
else
break;
while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
return obj;
}

// Removes the atomic pointer, preparing it to be moved.

template<typename T>
inline typename tshared_obj<T>::ctrl_t *tshared_obj<T>::remove() noexcept
{
pair_t expected( m_dca.first(), m_dca.second() ), desired;
desired.first = 0;
do
{
if( !expected.first )
return nullptr;
desired.second = expected.second + SEQUENCE_VALUE & SEQUENCE_MASK;
} while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
return (ctrl_t *)expected.first;
}

// Replaces the atomic pointer and deletes the old one if there are no
// pending loads.

template<typename T>
inline void tshared_obj<T>::store( ctrl_t *obj ) noexcept
{
pair_t expected( m_dca.first(), m_dca.second() ), desired;
desired.first = (size_t)obj;
do
if( expected.first != desired.first )
desired.second = (expected.second + SEQUENCE_VALUE) & SEQUENCE_MASK;
else
return;
while( !m_dca.compare_exchange( expected, desired, dcas_release() ) );
ctrl_t *oldObj = (ctrl_t *)expected.first;
if( !oldObj ) [[unlikely]]
return;
size_t before = oldObj->m_refCount.fetch_sub( 1,
std::memory_order_relaxed );
assert(before);
if( before == 1 && !(expected.second & UPDATER_MASK) ) [[unlikely]]
delete oldObj;
}
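
Here's a minimal usage sketch of the two classes above (illustration
only; the "shared_obj.h" file name is an assumption):

#include <thread>
#include "shared_obj.h"

int main()
{
    // tshared_obj<> is the shared, lock-free atomic holder;
    // shared_obj<> is the per-thread handle obtained from it.
    tshared_obj<int> atomicObj( make_shared_obj<int>( 42 ) );
    std::thread writer( [&]
    {
        for( int i = 0; i != 1'000; ++i )
            atomicObj = make_shared_obj<int>( i ); // lock-free store
    } );
    std::thread reader( [&]
    {
        for( int i = 0; i != 1'000; ++i )
        {
            // lock-free copy; throws dca_update_exception only if the
            // 24 bit updater counter is saturated
            shared_obj<int> local( atomicObj );
            if( local )
                (void)*local;
        }
    } );
    writer.join();
    reader.join();
}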

Here's the DW-CAS class used by this code:

#pragma once
#include <cstdint>
#include <utility>
#include <atomic>
#include <type_traits>
#include <cstddef>
#include <bit>
#include <memory>

#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 26495)
#endif
#if defined(__llvm__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Watomic-alignment"
#endif

constexpr int DCAS_SEQ = 0;
constexpr int DCAS_ACQUIRE = 1;
constexpr int DCAS_RELAXED = 2;
constexpr int DCAS_RELEASE = 3;
template<int Type>
using dcas_type = std::integral_constant<int, Type>;
using dcas_seq = dcas_type<DCAS_SEQ>;
using dcas_acquire = dcas_type<DCAS_ACQUIRE>;
using dcas_relaxed = dcas_type<DCAS_RELAXED>;
using dcas_release = dcas_type<DCAS_RELEASE>;

struct dcas_atomic
{
static_assert(sizeof(size_t) == 8 || sizeof(size_t) == 4,
"must be 64 or 32 bit architecture");
struct pair_t { size_t first, second; };
dcas_atomic() = default;
dcas_atomic( pair_t const &desired );
size_t first() const noexcept;
size_t second() const noexcept;
size_t first_na() const noexcept;
size_t second_na() const noexcept;
operator pair_t() const noexcept;
template<int Type>
bool compare_exchange( pair_t &expected, pair_t const &desired,
dcas_type<Type> = dcas_relaxed() ) noexcept;
template<int Type>
pair_t load( dcas_type<Type> = dcas_relaxed() ) const noexcept;
template<int Type>
void store( pair_t const &niu, dcas_type<Type> = dcas_relaxed() );
private:
#if defined(__GNUC__) || defined(__clang__)
using dword_t = std::conditional_t<sizeof(size_t) == 8, unsigned
__int128, unsigned long long>;
#endif
static constexpr bool SUBSTITITABLE =
std::atomic<pair_t>::is_always_lock_free || sizeof(std::atomic<pair_t>)
== sizeof(pair_t);
union alignas(2 * sizeof(size_t)) U
{
U() {}
static_assert(sizeof(std::atomic<size_t>) == sizeof(size_t),
"sizeof(atomic<size_t>) must be == sizeof(size_t)");
std::atomic<size_t> m_atomics[2];
std::conditional_t<SUBSTITITABLE, std::atomic<pair_t>, char> m_subsitute;
#if defined(_MSC_VER)
#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
__int64 volatile m_firstAndSecond[2];
#elif defined(_M_IX86)
__int64 volatile m_firstAndSecond;
#else
#error unknown architecture
#endif
#elif defined(__GNUC__) || defined(__clang__)
dword_t volatile m_firstAndSecond;
#endif
} u;
};

inline dcas_atomic::dcas_atomic( pair_t const &desired )
{
u.m_atomics[0].store( desired.first, std::memory_order_relaxed );
u.m_atomics[1].store( desired.second, std::memory_order_relaxed );
}

inline size_t dcas_atomic::first() const noexcept
{
return u.m_atomics[0].load( std::memory_order_relaxed );
}

inline size_t dcas_atomic::second() const noexcept
{
return u.m_atomics[1].load( std::memory_order_relaxed );
}

inline dcas_atomic::operator pair_t() const noexcept
{
return pair_t( first(), second() );
}

template<int Type>
inline bool dcas_atomic::compare_exchange( pair_t &expected, pair_t
const &desired, dcas_type<Type> ) noexcept
{
using namespace std;
static_assert(Type == DCAS_SEQ || Type == DCAS_ACQUIRE || Type ==
DCAS_RELAXED || Type == DCAS_RELEASE);
if constexpr( !SUBSTITITABLE )
{
#if defined(_MSC_VER)
#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC)
__int64 expectedA[2];
expectedA[0] = (__int64)expected.first;
expectedA[1] = (__int64)expected.second;
char fRet;
#if defined(_M_X64)
fRet = _InterlockedCompareExchange128( u.m_firstAndSecond,
desired.second, desired.first, expectedA );
#else
if constexpr( Type == DCAS_SEQ )
fRet = _InterlockedCompareExchange128( u.m_firstAndSecond,
desired.second, desired.first, expectedA );
else if constexpr( Type == DCAS_ACQUIRE )
fRet = _InterlockedCompareExchange128_acq( u.m_firstAndSecond,
desired.second, desired.first, expectedA );
else if constexpr( Type == DCAS_RELAXED )
fRet = _InterlockedCompareExchange128_nf( u.m_firstAndSecond,
desired.second, desired.first, expectedA );
else
fRet = _InterlockedCompareExchange128_rel( u.m_firstAndSecond,
desired.second, desired.first, expectedA );
#endif
if( fRet )
return true;
expected.first = expectedA[0];
expected.second = expectedA[1];
return false;
#elif defined(_M_IX86)
auto compose = []( pair_t const &p ) -> uint64_t { return p.first |
(uint64_t)p.second << 32; };
uint64_t
cDesired = compose( desired ),
cExpected = compose( expected ),
cResult = _InterlockedCompareExchange64( &u.m_firstAndSecond,
cDesired, cExpected );
if( cResult == cExpected ) [[likely]]
return true;
expected.first = (uint32_t)cResult;
expected.second = (uint32_t)(cResult >> 32);
return false;
#else
#error unsupported Windows platform
#endif
#elif defined(__GNUC__) || defined(__clang__)
constexpr auto
pair_t::*FIRST = std::endian::native == std::endian::little ?
&pair_t::first : &pair_t::second,
pair_t::*SECOND = std::endian::native == std::endian::little ?
&pair_t::second : &pair_t::first;
constexpr unsigned SZT_BITS = sizeof(size_t) * 8;
auto compose = []( pair_t const &p ) -> dword_t { return
(dword_t)(p.*FIRST) | (dword_t)(p.*SECOND) << SZT_BITS; };
dword_t
dwExpected = compose( expected ),
dwDesired = compose( desired );
bool ret;
if constexpr( Type == DCAS_SEQ )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
dwDesired, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED );
else if constexpr( Type == DCAS_ACQUIRE )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
dwDesired, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED );
else if constexpr( Type == DCAS_RELAXED )
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
dwDesired, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED );
else
ret = __atomic_compare_exchange_n( &u.m_firstAndSecond, &dwExpected,
dwDesired, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED );
if( ret ) [[likely]]
return true;
expected.*FIRST = (size_t)dwExpected;
expected.*SECOND = (size_t)(dwExpected >> SZT_BITS);
return false;
#else
#error unsupported platform
#endif
}
else
if constexpr( Type == DCAS_SEQ )
return u.m_subsitute.compare_exchange_weak( expected, desired,
memory_order_seq_cst, memory_order_relaxed );
else if constexpr( Type == DCAS_ACQUIRE )
return u.m_subsitute.compare_exchange_weak( expected, desired,
memory_order_acquire, memory_order_relaxed );
else if constexpr( Type == DCAS_RELAXED )
return u.m_subsitute.compare_exchange_weak( expected, desired,
memory_order_relaxed, memory_order_relaxed );
else
return u.m_subsitute.compare_exchange_weak( expected, desired,
memory_order_release, memory_order_relaxed );
}

template<int Type>
inline typename dcas_atomic::pair_t dcas_atomic::load( dcas_type<Type> )
const noexcept
{
using namespace std;
static_assert(Type == DCAS_SEQ || Type == DCAS_ACQUIRE || Type ==
DCAS_RELAXED || Type == DCAS_RELEASE);
if constexpr( !SUBSTITITABLE )
{
pair_t cmp( *this );
while( !const_cast<dcas_atomic *>(this)->compare_exchange( cmp, cmp,
dcas_type<Type>() ) );
return cmp;
}
else
if constexpr( Type == DCAS_SEQ )
return u.m_subsitute.load( memory_order_seq_cst );
else if constexpr( Type == DCAS_ACQUIRE )
return u.m_subsitute.load( memory_order_acquire );
else if constexpr( Type == DCAS_RELAXED )
return u.m_subsitute.load( memory_order_relaxed );
else
return u.m_subsitute.load( memory_order_release );
}

template<int Type>
inline void dcas_atomic::store( pair_t const &niu, dcas_type<Type> )
{
using namespace std;
static_assert(Type == DCAS_SEQ || Type == DCAS_ACQUIRE || Type ==
DCAS_RELAXED || Type == DCAS_RELEASE);
if constexpr( !SUBSTITITABLE )
{
pair_t cmp( *this );
while( !this->compare_exchange( cmp, niu, dcas_type<Type>() ) );
}
else
if constexpr( Type == DCAS_SEQ )
u.m_subsitute.store( niu, memory_order_seq_cst );
else if constexpr( Type == DCAS_ACQUIRE )
u.m_subsitute.store( niu, memory_order_acquire );
else if constexpr( Type == DCAS_RELAXED )
u.m_subsitute.store( niu, memory_order_relaxed );
else
u.m_subsitute.store( niu, memory_order_release );
}

#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#if defined(__llvm__)
#pragma clang diagnostic pop
#endif
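
And a tiny sketch of using dcas_atomic on its own: a CAS loop that keeps
the first word unchanged and bumps the second (illustration only):

void bump_second( dcas_atomic &dca )
{
    dcas_atomic::pair_t expected( dca.first(), dca.second() ), desired;
    do
    {
        desired.first = expected.first;       // keep the pointer-sized word
        desired.second = expected.second + 1; // bump the counter word
    } while( !dca.compare_exchange( expected, desired, dcas_relaxed() ) );
}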
Chris M. Thomasson
2024-08-19 20:32:41 UTC
Reply
Permalink
Post by Bonita Montero
I've developed a lock-free alternative to atomic<shared_ptr<>>. The idea
is that the pointer inside this atomic has two counters which come along
in a size_t'd word. The first counter has a width of 24 bit and counts
the number of control block increments which are in progress, so I'm
also using double reference counting. The remaing 40 bits of the second
word contain a counter which constantly increments when the pointer is
being overwritten. This prevents the lowerr 24 bit to be decremented
if the pointer is already overwritten.
The code is somewhat faster than the futex'd solution of MSVC and
mangitudes faster than the clang's libc++ atomic<shared_ptr<>>.
[...]

Need to read it, but are you familiar with Joe Seigh's atomic_ptr? It's
been out for a long time.
jseigh
2024-08-20 13:11:18 UTC
Reply
Permalink
I've developed a lock-free alternative to atomic<shared_ptr<>>. The idea is that the pointer inside this atomic has two counters which come along
in a size_t'd word. The first counter has a width of 24 bit and counts
the number of control block increments which are in progress, so I'm
also using double reference counting. The remaing 40 bits of the second
word contain a counter which constantly increments when the pointer is
being overwritten. This prevents the lowerr 24 bit to be decremented
if the pointer is already overwritten.
The code is somewhat faster than the futex'd solution of MSVC and
mangitudes faster than the clang's libc++ atomic<shared_ptr<>>.
[...]
Need to read it, but are you familiar with Joe Seighs atomic_ptr? Its been out for a long time.
I need to update it with actual C++ atomics. I'll have to use the folly lib trick of packing a 48 bit ptr into 64 bits
with 16 bits left over for a reference count since it's not likely C++ will support DWCAS. There's no actual technical
reason for that. They need to ask the linux kernel folk and everyone implementing lock-free code with DWCAS how they
manage to do that without atomic 128 bit loads. :)

Anyway, that isn't very high on my list since reference counting isn't very performant. I'm working on more interesting
stuff for now.

As far as split reference counting goes, I think they are doing something like atomic_ptr without local_ptr. So yes,
atomic_ptr to local atomic_ptr copies are expensive and they're doing some kind of convoluted batching to ameliorate
that. Maybe.

Joe Seigh
Bonita Montero
2024-08-20 13:21:53 UTC
Reply
Permalink
I need to update it with actual C++ atomics.  I'll have to use the folly
lib trick of packing a 48 bit ptr into 64 bits with 16 bits left over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bits for the userspace.
jseigh
2024-08-20 19:08:49 UTC
Reply
Permalink
Post by Bonita Montero
I need to update it with actual C++ atomics.  I'll have to use the folly lib trick of packing a 48 bit ptr into 64 bits with 16 bits left over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bit for the userspace.
That's seriously big. There's not enough memory to require something like that. Memory mapped files or i/o space.
You'd have to be careful with your memory access patterns or paging overhead would kill your performance. I don't
even want to know what the page table layout looks like. Anyway 9 bits would limit you to just 512 concurrent
references, probably not enough.

I have a cmpxchg16b based macro. I used to have powerpc and sparc but no longer. Hopefully arm based pc's that
can run linux show up some time. Arm has some interesting atomics.

It would be nice if there was an atomic library that just had the primitives and not some half-baked implementation of
stdatomic with stuff we don't want or need.

Joe Seigh
Chris M. Thomasson
2024-08-20 19:30:26 UTC
Reply
Permalink
On 8/20/2024 12:08 PM, jseigh wrote:
[...]

Remember your older 63-bit atomic counter? That was fun. ;^D
Scott Lurndal
2024-08-20 20:51:58 UTC
Reply
Permalink
Post by jseigh
Post by Bonita Montero
I need to update it with actual C++ atomics.  I'll have to use the folly lib trick of packing a 48 bit ptr into 64 bits with 16 bits left over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bit for the userspace.
That's seriously big. There's not enough memory to require something like that.
There sure is. More than enough. DAGS "CXL-memory".
Chris M. Thomasson
2024-08-20 21:14:56 UTC
Reply
Permalink
Post by Bonita Montero
I need to update it with actual C++ atomics.  I'll have to use the
folly lib trick of packing a 48 bit ptr into 64 bits with 16 bits
left over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bit for the userspace.
That's seriously big.  There's not enough memory to require something
like that.  Memory mapped files or i/o space.
You'd have to be careful with your memory access patterns or paging
overhead would kill your performance.  I don't
even want to know what the page table layout looks like.  Anyway 9 bits
would limit you to just 512 concurrent
references, probably not enough.
I have an cmpxchg16b based macro.  I used to have powerpc and sparc but
no longer.   Hopefully arm based pc's that
can run linux show up some time.  Arm has some interesting atomics.
;^D

Here is some of my old asm code:

https://web.archive.org/web/20060214112345/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_gcc_asm.html

https://web.archive.org/web/20060214112539/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_masm_asm.html

wow, that was a while ago....
It would be nice if there was an atomic library that just had the
primitives and not some half baked implemenation of
stdatomic with we don't want or need.
Chris M. Thomasson
2024-08-21 00:09:04 UTC
Reply
Permalink
Post by Bonita Montero
I need to update it with actual C++ atomics.  I'll have to use the
folly lib trick of packing a 48 bit ptr into 64 bits with 16 bits
left over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bit for the userspace.
That's seriously big.  There's not enough memory to require something
like that.  Memory mapped files or i/o space.
You'd have to be careful with your memory access patterns or paging
overhead would kill your performance.  I don't
even want to know what the page table layout looks like.  Anyway 9 bits
would limit you to just 512 concurrent
references, probably not enough.
I have an cmpxchg16b based macro.  I used to have powerpc and sparc but
no longer.   Hopefully arm based pc's that
can run linux show up some time.  Arm has some interesting atomics.
I used to have that SunFire T2000 that sun gave me in the CoolThreads
contest. Well, I sold that damn thing! Shit. I wish I did not do that
now. It sounded like 5 or 6 vacuum cleaners going off at the same time
in my house... ;^o
It would be nice if there was an atomic library that just had the
primitives and not some half baked implemenation of
stdatomic with we don't want or need.
Joe Seigh
Chris M. Thomasson
2024-08-21 03:33:57 UTC
Reply
Permalink
Post by Bonita Montero
I need to update it with actual C++ atomics.  I'll have to use the
folly lib trick of packing a 48 bit ptr into 64 bits with 16 bits
left over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bit for the userspace.
That's seriously big.  There's not enough memory to require something
like that.  Memory mapped files or i/o space.
You'd have to be careful with your memory access patterns or paging
overhead would kill your performance.  I don't
even want to know what the page table layout looks like.  Anyway 9 bits
would limit you to just 512 concurrent
references, probably not enough.
I have an cmpxchg16b based macro.  I used to have powerpc and sparc but
no longer.   Hopefully arm based pc's that
can run linux show up some time.  Arm has some interesting atomics.
It would be nice if there was an atomic library that just had the
primitives and not some half baked implemenation of
stdatomic with we don't want or need.
I remember stealing bits from a pointer to get a specialized strong
reference count that was a single word. Well, basically it was
differential in nature and had to use a back link aka atomic_ptr_plus.
Chris M. Thomasson
2024-08-21 03:36:07 UTC
Reply
Permalink
On 8/20/2024 12:08 PM, jseigh wrote:
[...]
I have an cmpxchg16b based macro.  I used to have powerpc and sparc but
no longer.   Hopefully arm based pc's that
can run linux show up some time.  Arm has some interesting atomics.
[...]

Remember that really strange one, something like cmp8xchg16 or
something? The itanic?
Chris M. Thomasson
2024-08-20 19:21:12 UTC
Reply
Permalink
Post by Bonita Montero
I need to update it with actual C++ atomics.  I'll have to use the
folly lib trick of packing a 48 bit ptr into 64 bits with 16 bits left
over for
a reference count since it's not likely C++ will support DWCAS.
Only having 16 bits is dangerous and doesn't work with newer x86-CPUs
which use 55 bit for the userspace.
Wrt DWCAS, think of the following struct:

struct anchor
{
void* pointer;
word count;
};

atomic_ptr likes this... ;^)

on a 32 bit system this should be 64 bits in size. On a 64 bit system
this should be 128 bits in size. Hence the term DWCAS as in Double Width
Compare-and-Swap.

So, on a 32 bit system a word should be an unsigned 32 bit integer.
So, on a 64 bit system a word should be an unsigned 64 bit integer.

Wrt to standard C++11 it should be lock-free on many arch's, aka, they
have a real DWCAS. It can be checked with is_lock_free() wrt:

https://en.cppreference.com/w/cpp/atomic/atomic/is_lock_free

Right?
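
Something like this quick check should tell you on a given compiler
(just a sketch):

#include <atomic>
#include <cstddef>
#include <cstdio>

struct anchor
{
    void* pointer;
    std::size_t count; // pointer-sized word
};

int main()
{
    std::atomic<anchor> a{};
    std::printf( "size=%zu always_lock_free=%d lock_free=%d\n",
        sizeof(a),
        (int)std::atomic<anchor>::is_always_lock_free,
        (int)a.is_lock_free() );
}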
Bonita Montero
2024-08-21 03:29:11 UTC
Reply
Permalink
Post by Chris M. Thomasson
struct anchor
{
    void* pointer;
    word count;
};
You need two counters inside count, one for the doubled reference
count and a counter which is updated for each new pointer store.
Chris M. Thomasson
2024-08-21 03:32:35 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
struct anchor
{
     void* pointer;
     word count;
};
You need two counters inside count, one for the doubled reference
count and a counter which is update for each new pointer store.
Not with atomic_ptr aka differential reference counting. Well, I might
be misunderstanding you here. atomic_ptr has a counterpart called
local_ptr. The "strong" reference counting relies on DWCAS wrt (struct
anchor) type of a setup. Joe has a nice write up on it. Actually the
patent for the type of refcounting going on here is pretty good as well:

https://patents.google.com/patent/US5295262A
Bonita Montero
2024-08-21 03:44:52 UTC
Reply
Permalink
Post by Chris M. Thomasson
Not with atomic_ptr aka differential reference counting.
For split reference counting you need both.
Chris M. Thomasson
2024-08-21 03:54:17 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
Not with atomic_ptr aka differential reference counting.
For split reference counting you need both.
Ahhh... How do the split counters relate to each other? How do they provide
strong thread safety? I assume there is no spinlock-like odd logic
between them... Thanks. But I don't have a lot of time to study up.
Working on another animation. Can you help me out here?
Bonita Montero
2024-08-21 03:59:52 UTC
Reply
Permalink
Post by Chris M. Thomasson
Post by Bonita Montero
For split reference counting you need both.
Ahhh... How the split counters relate to each other? How do they provide
strong thready safety? I assume there is no spinlock like odd logic
between them... Thanks. But I don't have a lot of time to study up.
Working on another animation. Can you help me out here?
The problem with reference counting alone is that the pointer along
with the reference counter may be overwritten multiple times until,
at last, the initial pointer value is stored again; then the thread
which incremented the initial reference counter comes back, tries to
decrement it, and the counter may become negative. So you also have
to have a sequence counter inside the second word which is updated
every time you store a new pointer, even if the new pointer value is
the same.
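
Concretely, the second word in the code I posted above is split like
this (small sketch of the same constants):

#include <cstdint>

constexpr unsigned UPDATER_BITS   = 24;                   // in-progress updates
constexpr uint64_t SEQUENCE_VALUE = 1ull << UPDATER_BITS; // +1 per pointer store
constexpr uint64_t UPDATER_MASK   = SEQUENCE_VALUE - 1;
constexpr uint64_t SEQUENCE_MASK  = ~UPDATER_MASK;

// On every store the sequence part is bumped and the updater part is
// cleared, so a thread that incremented the reference count for an
// older pointer can see that the slot has been reused in the meantime.
constexpr uint64_t next_second_word( uint64_t second )
{
    return (second + SEQUENCE_VALUE) & SEQUENCE_MASK;
}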
Chris M. Thomasson
2024-08-21 06:26:14 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
Post by Bonita Montero
For split reference counting you need both.
Ahhh... How the split counters relate to each other? How do they
provide strong thready safety? I assume there is no spinlock like odd
logic between them... Thanks. But I don't have a lot of time to study
up. Working on another animation. Can you help me out here?
The problem with reference counting only is that the pointer along
with the reference counter may be overwritten multiple times and
atlast the initial pointer value was set and then the thread which
updated the initial reference counter comes back and tries to
reduce the reference counter which may become negative then. So
you have to have also a sequence couner inside the second word
which is updated every time you store a new pointer, even if the
new pointer value is the same.
This is not an issue in atomic_ptr. It handles the pointer and a word
for a reference count all in one DWCAS. It provides strong thread
safety. Not just basic.
Chris M. Thomasson
2024-08-21 06:43:38 UTC
Reply
Permalink
Post by Chris M. Thomasson
Post by Bonita Montero
For split reference counting you need both.
[...]
Post by Chris M. Thomasson
This is not an issue in atomic_ptr. It handles the pointer and a word
for a reference count all in one DWCAS. It provides strong thread
safety. Not just basic.
atomic_ptr uses differential reference counting instead of split counters.
Bonita Montero
2024-08-21 08:10:48 UTC
Reply
Permalink
Post by Chris M. Thomasson
This is not an issue in atomic_ptr. It handles the pointer and a
word for a reference count all in one DWCAS. It provides strong
thread safety. Not just basic.
This doesn't make a difference from the performance standpoint since
most of the cost is fetching a cacheline from a remote cache. DWCAS
vs. CAS are only six cycles different on my CPU.
Chris M. Thomasson
2024-08-21 19:39:18 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
This is not an issue in atomic_ptr. It handles the pointer and a
word for a reference count all in one DWCAS. It provides strong
thread safety. Not just basic.
This doesn't make a difference from the performance standpoint since
most of the cost is fetching a cacheline from a remote cache. DWCAS
vs. CAS are only six cycles different on my CPU.
Well, it does wrt the number of atomic RMW's you need to use! atomic_ptr
is a single DWCAS loop. Humm... Actually, for some reason the split
counters kind of, sorta reminds me of this experimental algorithm I did
that uses a single word for strong thread safety. I did two of them there:

It's in the following thread:

https://groups.google.com/g/lock-free/c/X3fuuXknQF0/m/Ho0H1iJgmrQJ
(read all...)
Chris M. Thomasson
2024-08-21 20:16:39 UTC
Reply
Permalink
Post by Chris M. Thomasson
Post by Bonita Montero
Post by Chris M. Thomasson
This is not an issue in atomic_ptr. It handles the pointer and a
word for a reference count all in one DWCAS. It provides strong
thread safety. Not just basic.
This doesn't make a difference from the performance standpoint since
most of the cost is fetching a cacheline from a remote cache. DWCAS
vs. CAS are only six cycles different on my CPU.
Well, it does wrt the number of atomic RMW's you need to use! atomic_ptr
is a single DWCAS loop. Humm... Actually, for some reason the split
counters kind of, sorta reminds me of this experimental algorithm I did
https://groups.google.com/g/lock-free/c/X3fuuXknQF0/m/Ho0H1iJgmrQJ
(read all...)
The one I am referencing is:


_________________________________
FWIW, here is some weird seqlock proxy collector hybrid thing. Needs
mutex on collect phase, but hay now... It certainly keeps the collector
in line (e.g., atomic) with the _current_ reference count or else the
whole thing bites the dust!

;^o

https://pastebin.com/raw/TgTcfYtR



Anyway, wrt stpc, the fact that you load a pointer to collector along
with the CAS loop to a seq number is VERY interesting indeed!

:^)
_________________________________



/*
Word-Based Portable Proxy Garbage Collector
Copyright (C) 2013 Christopher Michael Thomasson

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/





//#define RL_DEBUGBREAK_ON_ASSERT
//#define RL_GC
//#define RL_MSVC_OUTPUT
//#define RL_FORCE_SEQ_CST
#include <relacy/relacy_std.hpp>
#include <cstdio>
#include <cstddef>




#if ! defined (NDEBUG)
# define DBG_PRINTF(e) std::printf e
#else
# define DBG_PRINTF(e) ((void)0)
#endif



#define mb_relaxed std::memory_order_relaxed
#define mb_acquire std::memory_order_acquire
#define mb_release std::memory_order_release
#define mb_acq_rel std::memory_order_acq_rel
#define mb_seq_cst std::memory_order_seq_cst


#define MB_FENCE(mb_mp) std::atomic_thread_fence(mb_mp)
#define mb_fence(mb_mp) MB_FENCE(mb_mp)



typedef unsigned int ver_uint_type;


struct node
{
std::atomic<node*> m_next;
VAR_T(node*) m_defer_next;


node()
: m_next(NULL),
m_defer_next(NULL)
{

}
};




class proxy_gc
{
static void prv_destroy(node* n)
{
while (n)
{
node* next = VAR(n->m_defer_next);
delete n;
n = next;
}
}



public:
struct collector
{
std::atomic<ver_uint_type> m_count;
std::atomic<collector*> m_next;
VAR_T(node*) m_defer;

collector()
: m_count(0x00000000U),
m_next(NULL),
m_defer(NULL)
{

}


collector(ver_uint_type count)
: m_count(count),
m_next(NULL),
m_defer(NULL)
{

}


~collector()
{
node* defer = VAR(m_defer);
prv_destroy(defer);
}
};



private:
std::atomic<ver_uint_type> m_count;
std::atomic<collector*> m_collector;
std::mutex m_mutex;



public:
proxy_gc()
: m_count(0x00000000U),
m_collector(new collector())
{

}

~proxy_gc()
{
prv_collect_collector_ref(NULL, NULL);
}



private:
collector*
prv_acquire_collector_ref()
{
ver_uint_type count = m_count.load(mb_relaxed);
collector* c = NULL;
rl::linear_backoff backoff;

for (;;)
{
if (count & 0x00000001U)
{
backoff.yield($);
count = m_count.load(mb_relaxed);
continue;
}

mb_fence(mb_acquire);

c = m_collector.load(mb_relaxed);

if (m_count.compare_exchange_weak(
count,
count + 0x00020000U,
mb_relaxed))
{
break;
}
}

mb_fence(mb_acquire);

return c;
}


void
prv_release_collector_release(collector* c)
{
mb_fence(mb_release);

ver_uint_type count = c->m_count.fetch_sub(0x00000002U,
mb_relaxed);

if (count == 0x00000003U)
{
prv_dtor_collector(c);
}
}


void
prv_collect_collector_ref(collector* nc, node* defer)
{
if (nc)
{
VAR(nc->m_defer) = defer;
}

m_mutex.lock($);

ver_uint_type count1 = m_count.fetch_add(0x00000001U, mb_relaxed);

collector* oc = m_collector.load(mb_relaxed);
m_collector.store(nc, mb_relaxed);

oc->m_next.store(nc, mb_relaxed);

ver_uint_type count2 = count1 + 0x00000001U;
ver_uint_type xchg = (count2 + 0x00000001U) & 0x0000FFFFU;

mb_fence(mb_release);

if (! m_count.compare_exchange_strong(
count2,
xchg,
mb_relaxed))
{
RL_ASSERT(false);
}

m_mutex.unlock($);

ver_uint_type count = count1 & 0xFFFF0000U;
count >>= 0x00000010U;

ver_uint_type ocount =
oc->m_count.fetch_add(count + 0x00000001U, mb_relaxed);

if (ocount == -count)
{
prv_dtor_collector(oc);
}
}


void
prv_dtor_collector(collector* c)
{
mb_fence(mb_acquire);

collector* head = c;
collector* tail = c;
collector* next = c->m_next.load(mb_relaxed);
c->m_next.store(NULL, mb_relaxed);

while (next)
{
ver_uint_type next_count =
next->m_count.fetch_sub(0x00000002U, mb_relaxed);

if (next_count != 0x00000003U)
{
break;
}

mb_fence(mb_acquire);

collector* next2 = next->m_next.load(mb_relaxed);

DBG_PRINTF(("prv_dtor_collector: collected! - %u - %p -
%u\n",
next_count,
c,
next_count));

next->m_next.store(NULL, mb_relaxed);
tail->m_next.store(next, mb_relaxed);

tail = next;
next = next2;
}

while (head)
{
collector* next = head->m_next.load(mb_relaxed);

delete head;

head = next;
}
}



public:
collector*
acquire()
{
collector* c = prv_acquire_collector_ref();

return c;
}


void
release(collector* c)
{
prv_release_collector_release(c);
}


void
collect(node* defer = NULL)
{
collector* nc = new collector(0x00000002U);

prv_collect_collector_ref(nc, defer);
}
};




// your basic lock-free stack...
// well, minus ABA counter and DWCAS of course! ;^)
class stack
{
std::atomic<node*> m_head;



public:
stack()
: m_head(NULL)
{

}



public:
void push(node* n)
{
node* head = m_head.load(std::memory_order_relaxed);

do
{
n->m_next.store(head, std::memory_order_relaxed);

mb_fence(mb_release);
}

while (! m_head.compare_exchange_weak(
head,
n,
mb_relaxed));
}


node* flush()
{
node* n = m_head.exchange(NULL, mb_relaxed);

if (n) mb_fence(mb_acquire);

return n;
}


node* get_head()
{
node* n = m_head.load(mb_relaxed);

if (n) mb_fence(mb_acquire);

return n;
}


node* pop()
{
node* head = m_head.load(mb_relaxed);
node* xchg;

do
{
if (! head) return NULL;

mb_fence(mb_acquire);

xchg = head->m_next.load(std::memory_order_relaxed);
}

while (! m_head.compare_exchange_weak(
head,
xchg,
mb_relaxed));

return head;
}
};




#define ITERS 7
#define DEFER 6
#define WRITERS 2
#define READERS 4
#define REAPERS 1
#define THREADS (WRITERS + READERS + REAPERS)



typedef proxy_gc proxy_gc_type;


struct proxy_test
: rl::test_suite<proxy_test, THREADS>
{
proxy_gc_type g_proxy_gc;
stack g_stack;
unsigned int g_writers;


void before()
{
g_writers = WRITERS;
}


void thread(unsigned int tidx)
{
rl::backoff backoff;

if (tidx < READERS)
{
DBG_PRINTF(("reader thread entry(%d)\n", tidx));

while (g_writers)
{
backoff.yield($);

proxy_gc_type::collector* c = g_proxy_gc.acquire();

node* n = g_stack.get_head();

while (n)
{
node* next = n->m_next.load(mb_relaxed);

backoff.yield($);

DBG_PRINTF(("reader thread(%d) - READ (%p:%p)\n",
tidx, c, n));

n = next;
}

g_proxy_gc.release(c);
}

DBG_PRINTF(("reader thread exit(%d)\n", tidx));
}

else if (tidx < WRITERS + READERS)
{
DBG_PRINTF(("writer thread entry(%d)\n", tidx));

node* n;

for (unsigned int i = 0; i < ITERS; ++i)
{
n = new node();

g_stack.push(n);

DBG_PRINTF(("writer thread(%d) - WRITE PUSH (%p)\n",
tidx, n));

if (! (i % (rl::rand(4) + 1)))
{
proxy_gc_type::collector* c = g_proxy_gc.acquire();
backoff.yield($);
n = g_stack.pop();
g_proxy_gc.release(c);

DBG_PRINTF(("writer thread(%d) - WRITE POP
(%p:%p)\n", tidx, n, c));
g_proxy_gc.collect(n);
}
}

for (unsigned int i = 0; i < ITERS; ++i)
{
proxy_gc_type::collector* c = g_proxy_gc.acquire();
n = g_stack.pop();
backoff.yield($);
g_proxy_gc.release(c);
g_proxy_gc.collect(n);
}

--g_writers;

DBG_PRINTF(("writer thread exit(%d)\n", tidx));
}

else
{
DBG_PRINTF(("reaper thread entry(%d)\n", tidx));

while (g_writers)
{
g_proxy_gc.collect();
backoff.yield($);
}

DBG_PRINTF(("reaper thread exit(%d)\n", tidx));
}
}
};




int main()
{
{
rl::test_params p;

p.iteration_count = 30000000;
p.execution_depth_limit = 10000;
// p.context_bound = 3;
// p.search_type = rl::sched_bound;
// p.search_type = rl::fair_full_search_scheduler_type;
//p.search_type = rl::fair_context_bound_scheduler_type;
rl::simulate<proxy_test>(p);
}

std::puts("\n\n____________________________\nTesting Complete!");
std::getchar();

return 0;
}
Bonita Montero
2024-08-22 10:55:58 UTC
Reply
Permalink
Well, it does wrt the number of atomic RMW's you need to use! ..
The time to get the cachelines into L1 from remote is much higher
than a series of atomic operations. Take this code:

#include <iostream>
#include <atomic>
#include <thread>
#include <chrono>
#include <vector>

using namespace std;
using namespace chrono;

int main()
{
atomic_char aTomic;
atomic_int64_t tSum;
constexpr size_t N = 1'000'000;
auto thr = [&]
{
auto start = high_resolution_clock::now();
char cmp = aTomic.load( memory_order_relaxed );
for( size_t r = N; r; --r )
while( !aTomic.compare_exchange_weak( cmp, cmp + 1,
memory_order_seq_cst, memory_order_relaxed ) );
tSum.fetch_add( duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count(), memory_order_relaxed );
};
unsigned hc = thread::hardware_concurrency();
for( unsigned nThreads = 1; nThreads <= hc; ++nThreads )
{
vector<jthread> threads;
tSum.store( 0, memory_order_relaxed );
for( unsigned t = 0; t != nThreads; ++t )
threads.emplace_back( thr );
threads.resize( 0 );
cout << nThreads << ": " << tSum.load( memory_order_relaxed ) /
((double)nThreads * N) << endl;
}
}

1: 3.5034
2: 26.236
3: 30.6016
4: 39.2783
5: 45.3143
6: 58.3598
7: 78.3758
8: 127.335
9: 136.524
10: 136.577
11: 149.088
12: 198.628
13: 238.068
14: 258.649
15: 282.617
16: 308.73
17: 327.076
18: 339.655
19: 362.38
20: 411.776
21: 433.617
22: 468.451
23: 504.567
24: 615.026
25: 688.946
26: 877.79
27: 949.739
28: 1168.26
29: 1452.5
30: 1808.28
31: 2404.82
32: 2804.56

With one thread, each increment through a CMPXCHG (I'm intentionally
not taking LOCK XADD) is processed in below 4 nanoseconds on my Zen4
computer. When 32 threads are actively contending on the cacheline,
each successful operation takes about 2.8 _microseconds_, that's
about 12,600 clock cycles (2.8us * 4.5GHz base clock with all cores).
So it's not the cost of the operation itself once the line is in L1,
it's the cost of fetching the cacheline itself! That's why my
"atomic<shared_ptr<>>" variant is only somewhat faster than the futex'd
solution of MSVC (but still magnitudes faster than libstdc++'s and
libc++'s code).
Chris M. Thomasson
2024-08-21 19:40:27 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
This is not an issue in atomic_ptr. It handles the pointer and a
word for a reference count all in one DWCAS. It provides strong
thread safety. Not just basic.
This doesn't make a difference from the performance standpoint since
most of the cost is fetching a cacheline from a remote cache. DWCAS
vs. CAS are only six cycles different on my CPU.
It makes a big difference. You're using multiple atomic RMW's to do the
strong thread safety pointer copy. Joe is using a single DWCAS.
Chris M. Thomasson
2024-08-20 19:27:01 UTC
Reply
Permalink
Post by Chris M. Thomasson
Post by Bonita Montero
I've developed a lock-free alternative to atomic<shared_ptr<>>. The
idea is that the pointer inside this atomic has two counters which
come along
in a size_t'd word. The first counter has a width of 24 bit and counts
the number of control block increments which are in progress, so I'm
also using double reference counting. The remaing 40 bits of the second
word contain a counter which constantly increments when the pointer is
being overwritten. This prevents the lowerr 24 bit to be decremented
if the pointer is already overwritten.
The code is somewhat faster than the futex'd solution of MSVC and
mangitudes faster than the clang's libc++ atomic<shared_ptr<>>.
[...]
Need to read it, but are you familiar with Joe Seighs atomic_ptr? Its
been out for a long time.
I need to update it with actual C++ atomics.  I'll have to use the folly
lib trick of packing a 48 bit ptr into 64 bits
with 16 bits left over for a reference count since it's not likely C++
will support DWCAS.   There's no actual technical
reason for that.  They need to ask the linux kernel folk and everyone
implementing lock-free code with DWCAS how they
manage to do that without atomic 128 bit loads. :)
Anyway, that isn't very high on my list since reference counting isn't
very performant.  I'm working on more interesting
stuff for now.
As far as split reference counting goes, I think they are doing
something like atomic_ptr without local_ptr.  So yes,
atomic_ptr to local atomic_ptr copies are expensive and they're doing
some kind of convoluted batching to ameliorate
that.  Maybe.
Wrt your atomic_ptr:

struct anchor
{
void* pointer;
word count;
};

Wrt to standard C++11 it should be lock-free on many arch's by now, aka,
they have a real DWCAS. It can be checked with is_lock_free() wrt:

https://en.cppreference.com/w/cpp/atomic/atomic/is_lock_free

Reminded of this old question of yours:

https://groups.google.com/g/lock-free/c/X3fuuXknQF0/m/Ho0H1iJgmrQJ

I need to get back into that group! Have been really busy lately with a
lot of math and fractals.

Thanks Joe. :^)
Bonita Montero
2024-08-21 03:28:06 UTC
Reply
Permalink
Post by Chris M. Thomasson
struct anchor
{
    void* pointer;
    word count;
};
Wrt to standard C++11 it should be lock-free on many arch's by now, aka,
g++ doesn't support 16 byte atomics which are lock-free, and neither does
MSVC (but it has 8 byte lock-free atomics in 32 bit mode). But clang++ does
support it. I use it with an "if constexpr( SUBSTITUTABLE )" in my dcas_atomic.h.
Post by Chris M. Thomasson
https://en.cppreference.com/w/cpp/atomic/atomic/is_lock_free
is_always_lock_free is better
Chris M. Thomasson
2024-08-21 03:29:57 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
struct anchor
{
     void* pointer;
     word count;
};
Wrt to standard C++11 it should be lock-free on many arch's by now,
g++ doesn't support 16 byte atomics which are lock-free, also not MSVC
(but 8 byte lock-free atomics in 32 bit mode). But clang++ does support
it. I use it with a "if constexpr( SUBSTITUTABLE )" in my dcas_atomic.h.
It's been a while since I checked them, but damn it... C++11 should be
using native DWCAS if the underlying arch supports it wrt (struct
anchor). Ahh shit. Well, I need to check again.
Post by Bonita Montero
Post by Chris M. Thomasson
https://en.cppreference.com/w/cpp/atomic/atomic/is_lock_free
is_always_lock_free is better
Touche. :^)
jseigh
2024-08-21 12:24:58 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
struct anchor
{
     void* pointer;
     word count;
};
g++ doesn't support 16 byte atomics which are lock-free, also not MSVC
(but 8 byte lock-free atomics in 32 bit mode). But clang++ does support
it. I use it with a "if constexpr( SUBSTITUTABLE )" in my dcas_atomic.h.
It's been a while since I checked them, but damn it... C++11 should be using native DWCAS if the underlying arch supports it wrt (struct anchor). Ahh shit. Well, I need to check again.
No, C++ won't. Their reasoning is they can only do it if a 128 bit atomic can be defined
with all the operations that other atomic types have. But some architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view. And lock-free programming isn't important to them.
Post by Bonita Montero
Post by Chris M. Thomasson
https://en.cppreference.com/w/cpp/atomic/atomic/is_lock_free
is_always_lock_free is better
Touche. :^)
Bonita Montero
2024-08-21 12:44:39 UTC
Reply
Permalink
No, C++ won't.  Their reasoning is they can only do it if a 128 bit
atomic can be defined
with all the operations that other atomic types have.  But some
architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x86 and ARM should
support.
Chris M. Thomasson
2024-08-21 19:48:02 UTC
Reply
Permalink
Post by Bonita Montero
No, C++ won't.  Their reasoning is they can only do it if a 128 bit
atomic can be defined
with all the operations that other atomic types have.  But some
architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x84 and ARM should
support.
True. But some of them say it's not lock-free even though the arch
supports DWCAS directly. Well, the last time I checked, around a year or
two ago, iirc. Then there are other compilers that do not use the optimal
atomic RMW's for the job. For instance, check this out:

https://forum.pellesc.de/index.php?topic=7167.msg27217#msg27217
(read all...)

They used CMPXCHG instead of XADD! I wrote about it for sure. Got some
attention... :^)
jseigh
2024-08-22 15:17:03 UTC
Reply
Permalink
Post by Bonita Montero
No, C++ won't.  Their reasoning is they can only do it if a 128 bit atomic can be defined
with all the operations that other atomic types have.  But some architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x84 and ARM should
support.
Not sure what you are talking about. Clang documentation is only clear
if you already know the answer. I tried using _Atomic __uint128_t and
it didn't like it.

Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.

Joe Seigh
Bonita Montero
2024-08-22 15:46:43 UTC
Reply
Permalink
Post by Bonita Montero
No, C++ won't.  Their reasoning is they can only do it if a 128 bit
atomic can be defined
with all the operations that other atomic types have.  But some
architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x84 and ARM should
support.
Not sure what you are talking about.  Clang documentation is only clear
if you already know the answer.  I tried using _Atomic __uint128_t and
it didn't like it.
Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.
I tested this with clang++-18:

#include <iostream>
#include <atomic>

using namespace std;

int main()
{
atomic<__int128> a;
cout << sizeof(a) << endl;
cout << a.is_lock_free() << endl;
}

This prints:

16
0

So the atomic<__int128> is reported not to be lock-free, but
it calls libatomic, which internally uses CMPXCHG16B. Seems
the compiler thinks "I don't know what libatomic does, but I
guess it's not lock-free", although it actually is.
jseigh
2024-08-22 16:34:11 UTC
Reply
Permalink
Post by Bonita Montero
Post by Bonita Montero
No, C++ won't.  Their reasoning is they can only do it if a 128 bit atomic can be defined
with all the operations that other atomic types have.  But some architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x84 and ARM should
support.
Not sure what you are talking about.  Clang documentation is only clear
if you already know the answer.  I tried using _Atomic __uint128_t and
it didn't like it.
Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.
#include <iostream>
#include <atomic>
using namespace std;
int main()
{
        atomic<__int128> a;
        cout << sizeof(a) << endl;
        cout << a.is_lock_free() << endl;
}
    16
    0
So the atomic<__int128> is reported not to be lock-free, but
it calls libatomic, which internally uses CMPXCHG16B. Seems
the compiler thiks "I dont know what libatomic does, but I
guess it's not lockfree", although it actually.
Well, moot for now. It seems installing clang on my dev box
killed it.

Joe Seigh
Chris M. Thomasson
2024-08-22 19:29:44 UTC
Reply
Permalink
Post by Bonita Montero
Post by Bonita Montero
No, C++ won't.  Their reasoning is they can only do it if a 128 bit
atomic can be defined
with all the operations that other atomic types have.  But some
architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so
no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x84 and ARM should
support.
Not sure what you are talking about.  Clang documentation is only clear
if you already know the answer.  I tried using _Atomic __uint128_t and
it didn't like it.
Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.
#include <iostream>
#include <atomic>
using namespace std;
int main()
{
        atomic<__int128> a;
        cout << sizeof(a) << endl;
        cout << a.is_lock_free() << endl;
}
    16
    0
So the atomic<__int128> is reported not to be lock-free, but
it calls libatomic, which internally uses CMPXCHG16B. Seems
the compiler thiks "I dont know what libatomic does, but I
guess it's not lockfree", although it actually.
CMPXCHG16B is the thing to use on a 64-bit Intel system wrt DWCAS. The
damn compiler should use it directly and it should report that it _is_
always lock-free... Agreed?
Bonita Montero
2024-08-23 03:58:15 UTC
Reply
Permalink
Post by Chris M. Thomasson
CMPXCHG16B is the thing to use on a 64-bit Intel system wrt DWCAS. The
damn compiler should use it directly and it should report that it _is_
always lock-free... Agreed?
With Linux, libatomic is used for that, which uses CMPXCHG16B.
That's not much slower.
Chris M. Thomasson
2024-08-23 06:40:23 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
CMPXCHG16B is the thing to use on a 64-bit Intel system wrt DWCAS. The
damn compiler should use it directly and it should report that it _is_
always lock-free... Agreed?
With linux libatomic is used for that which uses CMPXCHG16B.
That's not much slower.
The compiler should say it's always lock-free in this case. If it does
not, then there is a problem. Agreed?
Bonita Montero
2024-08-23 10:03:23 UTC
Reply
Permalink
Post by Chris M. Thomasson
Post by Bonita Montero
Post by Chris M. Thomasson
CMPXCHG16B is the thing to use on a 64-bit Intel system wrt DWCAS.
The damn compiler should use it directly and it should report that it
_is_ always lock-free... Agreed?
With linux libatomic is used for that which uses CMPXCHG16B.
That's not much slower.
The compiler should say it's always lock-free in this case. If it does
not, then there is a problem. Agreed?
You could rely on "sizeof(atomic<XXX>) == sizeof(XXX)" as an indicator
that it's lock-free. If there's no extra space, there couldn't be a futex
or a mutex. I did it the same way for my dcas_atomic as I noticed lately
that ::is_always_lock_free doesn't work.
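As a rough illustration of that heuristic (my own sketch, not the dcas_atomic
code; pair16 below is just a made-up 16-byte payload):

#include <atomic>
#include <cstdio>
#include <cstddef>

// Heuristic from the post above: if the atomic adds no storage beyond the
// payload, there is no room for an embedded mutex or futex.  This is only an
// indicator, not a guarantee from the standard.
template<typename T>
constexpr bool probably_lock_free = sizeof( std::atomic<T> ) == sizeof( T );

struct pair16 { void *ptr; std::size_t ctr; };  // hypothetical 16-byte pair

int main()
{
    std::printf( "size heuristic:      %d\n", (int)probably_lock_free<pair16> );
    std::printf( "is_always_lock_free: %d\n",
                 (int)std::atomic<pair16>::is_always_lock_free );
}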
Chris M. Thomasson
2024-08-23 06:42:46 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
CMPXCHG16B is the thing to use on a 64-bit Intel system wrt DWCAS. The
damn compiler should use it directly and it should report that it _is_
always lock-free... Agreed?
With linux libatomic is used for that which uses CMPXCHG16B.
That's not much slower.
Remember that strange instruction on the Itanium? CMP8XCHG16?
Chris M. Thomasson
2024-08-23 06:47:44 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
CMPXCHG16B is the thing to use on a 64-bit Intel system wrt DWCAS. The
damn compiler should use it directly and it should report that it _is_
always lock-free... Agreed?
With linux libatomic is used for that which uses CMPXCHG16B.
That's not much slower.
For Windows, it has:

https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent

PF_COMPARE_EXCHANGE128 and PF_COMPARE64_EXCHANGE128 are in there.

I just need to make a new project in MSVC to test it out.
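A minimal sketch of such a check, assuming only the documented
IsProcessorFeaturePresent call and the two constants from that page:

#include <windows.h>
#include <cstdio>

// Sketch: query at runtime whether the CPU reports CMPXCHG16B support
// before relying on a DWCAS-based algorithm.
int main()
{
    BOOL cx16    = IsProcessorFeaturePresent( PF_COMPARE_EXCHANGE128 );
    BOOL cmp8x16 = IsProcessorFeaturePresent( PF_COMPARE64_EXCHANGE128 );
    std::printf( "PF_COMPARE_EXCHANGE128:   %d\n", (int)cx16 );
    std::printf( "PF_COMPARE64_EXCHANGE128: %d\n", (int)cmp8x16 );
}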
Chris M. Thomasson
2024-08-22 19:27:34 UTC
Reply
Permalink
Post by Bonita Montero
No, C++ won't.  Their reasoning is they can only do it if a 128 bit
atomic can be defined
with all the operations that other atomic types have.  But some
architectures don't have
128 bit atomic loads and/or stores (not performant ones anyway) so no go in their view.
The idea that you could provide DWCAS some other way seems not to fit into their
world view.  And lock-free programming isn't important to them.
It's not necessary to define an explicit atomic for that in the
standard. clang++ supports lock-free atomics up to any 128 bit
trivial type. That's what all compilers for x86 and ARM should
support.
Not sure what you are talking about.  Clang documentation is only clear
if you already know the answer.  I tried using _Atomic __uint128_t and
it didn't like it.
Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.
Does this logic ring a bell? I saw your DWCAS asm impl _way_ back on
c.p.t, 20+ years now?

____________________
.align 16
.globl np_ac_i686_atomic_dwcas_fence
np_ac_i686_atomic_dwcas_fence:
pushl %esi
pushl %ebx
movl 16(%esp), %esi
movl (%esi), %eax
movl 4(%esi), %edx
movl 20(%esp), %esi
movl (%esi), %ebx
movl 4(%esi), %ecx
movl 12(%esp), %esi
lock cmpxchg8b (%esi)
jne np_ac_i686_atomic_dwcas_fence_fail
xorl %eax, %eax
popl %ebx
popl %esi
ret

np_ac_i686_atomic_dwcas_fence_fail:
movl 16(%esp), %esi
movl %eax, (%esi)
movl %edx, 4(%esi)
movl $1, %eax
popl %ebx
popl %esi
ret
____________________

;^)
jseigh
2024-08-22 20:03:34 UTC
Reply
Permalink
Post by jseigh
Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.
Does this logic ring a bell? I saw your DWCAS asm impl _way_ back on c.p.t, 20+ years now?
____________________
.align 16
.globl np_ac_i686_atomic_dwcas_fence
  pushl %esi
  pushl %ebx
  movl 16(%esp), %esi
  movl (%esi), %eax
  movl 4(%esi), %edx
  movl 20(%esp), %esi
  movl (%esi), %ebx
  movl 4(%esi), %ecx
  movl 12(%esp), %esi
  lock cmpxchg8b (%esi)
  jne np_ac_i686_atomic_dwcas_fence_fail
  xorl %eax, %eax
  popl %ebx
  popl %esi
  ret
  movl 16(%esp), %esi
  movl %eax, (%esi)
  movl %edx, 4(%esi)
  movl $1, %eax
  popl %ebx
  popl %esi
ret
____________________
;^)
That would be for 32-bit CPUs. I use gcc inline assembly. Also, I update the expected value if the CAS fails.

Joe Seigh
Chris M. Thomasson
2024-08-22 20:33:31 UTC
Reply
Permalink
Post by Chris M. Thomasson
Post by jseigh
Anyway I have a gcc inline DWCAS so as far as I am concerned C/C++
support is still where it was in 2003, meaning I still have to use
the same techniques I used back then.
Does this logic ring a bell? I saw your DWCAS asm impl _way_ back on
c.p.t, 20+ years now?
____________________
.align 16
.globl np_ac_i686_atomic_dwcas_fence
   pushl %esi
   pushl %ebx
   movl 16(%esp), %esi
   movl (%esi), %eax
   movl 4(%esi), %edx
   movl 20(%esp), %esi
   movl (%esi), %ebx
   movl 4(%esi), %ecx
   movl 12(%esp), %esi
   lock cmpxchg8b (%esi)
   jne np_ac_i686_atomic_dwcas_fence_fail
   xorl %eax, %eax
   popl %ebx
   popl %esi
   ret
   movl 16(%esp), %esi
   movl %eax, (%esi)
   movl %edx, 4(%esi)
   movl $1, %eax
   popl %ebx
   popl %esi
ret
____________________
;^)
that would be for 32 bit cpus.  I use gcc inline assembly.  Also I
update the expected value if the cas fails.
Post by Chris M. Thomasson
movl 16(%esp), %esi
movl %eax, (%esi)
movl %edx, 4(%esi)
movl $1, %eax
popl %ebx
popl %esi
ret
I remember you teaching me about atomic_ptr _way_ back over on c.p.t 20+
years ago. I saw your original inline asm wrt DWCAS for GCC.
jseigh
2024-08-23 01:26:07 UTC
Reply
Permalink
that would be for 32 bit cpus.  I use gcc inline assembly.  Also I update the expected value if the cas fails.
    movl 16(%esp), %esi
    movl %eax, (%esi)
    movl %edx, 4(%esi)
    movl $1, %eax
    popl %ebx
    popl %esi
ret
I remember you teaching me about atomic_ptr _way_ back over on c.p.t 20+ years ago. I saw your original inline asm wrt DWCAS for GCC.
From the atomic-ptr-plus project

//------------------------------------------------------------------------------
// __atomic_compare_exchange_16 --
//
// returns:
// rc = 0, fail - xcmp updated
// rc = 1, success - dest updated
//------------------------------------------------------------------------------
int __atomic_compare_exchange_16(__int128 * dest, __int128 * xcmp, __int128 xxchg, int m, int ms, int mf) {
int rc;

__asm__ __volatile__ (
"lea %3, %%rsi ;\n" // address of exchange
"mov 0(%%rsi), %%rbx ;\n" // exchange low
"mov 8(%%rsi), %%rcx ;\n" // exchange high

"mov %2, %%rsi ;\n" // comparand
"mov 0(%%rsi), %%rax ;\n" // comparand low
"mov 8(%%rsi), %%rdx ;\n" // comparand high

"mov %1, %%rsi ;\n" // destination
"lock cmpxchg16b (%%rsi) ;\n"
"jz 1f ;\n"
"mov %2, %%rsi ;\n" // comparand
"mov %%rax, 0(%%rsi) ;\n" // comparand low
"mov %%rdx, 8(%%rsi) ;\n" // comparand high
"1: mov $0, %0 ;\n"
"setz %b0 ;\n" // rc =
: "=&a" (rc)
: "m" (dest), "m" (xcmp), "m" (xxchg)
: "cc", "memory", "rdx", "rbx", "rcx", "rsi"
);

return rc;
}

This is cut and pasted several times with some versions having bad offsets of 4 which
won't work very well. I know since I grabbed one of the bad versions for something
else I am working on. The last 3 parameters are ignored. They're just to document
memory fencing.

Joe Seigh
Chris M. Thomasson
2024-08-23 06:38:28 UTC
Reply
Permalink
Post by jseigh
that would be for 32 bit cpus.  I use gcc inline assembly.  Also I
update the expected value if the cas fails.
 >>    movl 16(%esp), %esi
 >>    movl %eax, (%esi)
 >>    movl %edx, 4(%esi)
 >>    movl $1, %eax
 >>    popl %ebx
 >>    popl %esi
 >> ret
I remember you teaching me about atomic_ptr _way_ back over on c.p.t
20+ years ago. I saw your original inline asm wrt DWCAS for GCC.
From the atomic-ptr-plus project
//------------------------------------------------------------------------------
// __atomic_compare_exchange_16 --
//
//   rc = 0, fail - xcmp updated
//   rc = 1, success - dest updated
//------------------------------------------------------------------------------
int __atomic_compare_exchange_16(__int128 * dest, __int128 * xcmp,
__int128 xxchg, int m, int ms, int mf) {
    int rc;
    __asm__ __volatile__ (
        "lea    %3, %%rsi            ;\n"    // address of exchange
        "mov    0(%%rsi), %%rbx      ;\n"    // exchange low
        "mov    8(%%rsi), %%rcx      ;\n"    // exchange high
        "mov    %2, %%rsi            ;\n"    // comparand
        "mov    0(%%rsi), %%rax      ;\n"    // comparand low
        "mov    8(%%rsi), %%rdx      ;\n"    // comparand high
        "mov    %1, %%rsi            ;\n"    // destination
        "lock cmpxchg16b (%%rsi)     ;\n"
        "jz      1f                   ;\n"
        "mov    %2, %%rsi            ;\n"    // comparand
        "mov    %%rax, 0(%%rsi)      ;\n"    // comparand low
        "mov    %%rdx, 8(%%rsi)      ;\n"    // comparand high
"1:         mov    $0, %0               ;\n"
        "setz    %b0                  ;\n"    // rc =
        : "=&a" (rc)
        : "m" (dest), "m" (xcmp), "m" (xxchg)
        : "cc", "memory", "rdx", "rbx", "rcx", "rsi"
        );
    return rc;
}
This is cut and pasted several times with some versions having bad offsets of 4 which
won't work very well.  I know since I grabbed one of the bad versions
for something
else I am working on.  The last 3 parameters are ignored.  They're just
to document
memory fencing.
Thanks. Btw, it's always fun to read inline asm... ;^)

Fwiw, my DWCAS version for x86 in:

https://web.archive.org/web/20060214112345/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_gcc_asm.html

is using the POSIX return values. 0 for success, non-zero for failure.
Yikes! Had PThreads on the mind when I was coding AppCore. ;^D
Chris M. Thomasson
2024-08-19 20:42:48 UTC
Reply
Permalink
Post by Bonita Montero
template<typename T>
inline typename tshared_obj<T>::ctrl_t *tshared_obj<T>::copy()
{
    pair_t expected( m_dca.first(), m_dca.second() ), desired;
    do
        if( !expected.first ) [[unlikely]]
            return nullptr;
        else if( (expected.second & UPDATER_MASK) != UPDATER_MASK )
[[likely]]
            desired = { expected.first, expected.second + UPDATER_VALUE };
        else
            throw dca_update_exception();
    while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
    ctrl_t *obj = (ctrl_t *)desired.first;
    size_t rcBefore = obj->m_refCount.fetch_add( 1,
std::memory_order_relaxed );
    assert(rcBefore > 0 && rcBefore != (size_t)-1);
    expected = desired;
    uint64_t seq = expected.second >> UPDATER_BITS;
    do
        if( expected.first == desired.first && expected.second >>
UPDATER_BITS == seq ) [[likely]]
            desired.second = expected.second - UPDATER_VALUE;
        else
            break;
    while( !m_dca.compare_exchange( expected, desired, dcas_relaxed() ) );
    return obj;
}
Two CAS loops? Have you modeled it in a race detector yet? There is a
lot going on here. Also, you should take a look at atomic_ptr.
Bonita Montero
2024-08-20 07:31:55 UTC
Reply
Permalink
Post by Bonita Montero
template<typename T>
inline typename tshared_obj<T>::ctrl_t *tshared_obj<T>::copy()
{
     pair_t expected( m_dca.first(), m_dca.second() ), desired;
     do
         if( !expected.first ) [[unlikely]]
             return nullptr;
         else if( (expected.second & UPDATER_MASK) != UPDATER_MASK )
[[likely]]
             desired = { expected.first, expected.second +
UPDATER_VALUE };
         else
             throw dca_update_exception();
     while( !m_dca.compare_exchange( expected, desired,
dcas_relaxed() ) );
     ctrl_t *obj = (ctrl_t *)desired.first;
     size_t rcBefore = obj->m_refCount.fetch_add( 1,
std::memory_order_relaxed );
     assert(rcBefore > 0 && rcBefore != (size_t)-1);
     expected = desired;
     uint64_t seq = expected.second >> UPDATER_BITS;
     do
         if( expected.first == desired.first && expected.second >>
UPDATER_BITS == seq ) [[likely]]
             desired.second = expected.second - UPDATER_VALUE;
         else
             break;
     while( !m_dca.compare_exchange( expected, desired,
dcas_relaxed() ) );
     return obj;
}
Two CAS loops? ...
With split reference counting this is the usual way since you have to
revert the updater counter after you adjusted the reference count.
It simply works.
Bonita Montero
2024-08-20 07:56:20 UTC
Reply
Permalink
Post by Bonita Montero
Post by Bonita Montero
template<typename T>
inline typename tshared_obj<T>::ctrl_t *tshared_obj<T>::copy()
{
     pair_t expected( m_dca.first(), m_dca.second() ), desired;
     do
         if( !expected.first ) [[unlikely]]
             return nullptr;
         else if( (expected.second & UPDATER_MASK) != UPDATER_MASK )
[[likely]]
             desired = { expected.first, expected.second +
UPDATER_VALUE };
         else
             throw dca_update_exception();
     while( !m_dca.compare_exchange( expected, desired,
dcas_relaxed() ) );
     ctrl_t *obj = (ctrl_t *)desired.first;
     size_t rcBefore = obj->m_refCount.fetch_add( 1,
std::memory_order_relaxed );
     assert(rcBefore > 0 && rcBefore != (size_t)-1);
     expected = desired;
     uint64_t seq = expected.second >> UPDATER_BITS;
     do
         if( expected.first == desired.first && expected.second >>
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Always true and can be removed. Relying on the sequence
counter (2nd condition) is sufficient.
Post by Bonita Montero
Post by Bonita Montero
UPDATER_BITS == seq ) [[likely]]
             desired.second = expected.second - UPDATER_VALUE;
         else
             break;
     while( !m_dca.compare_exchange( expected, desired,
dcas_relaxed() ) );
     return obj;
}
Two CAS loops? ...
With split reference counting this is the usual way since you have to
revert the updater counter after you adjusted the reference count.
It simply works.
Chris M. Thomasson
2024-08-20 19:31:35 UTC
Reply
Permalink
[...]
Post by Bonita Montero
With split reference counting this is the usual way since you have to
revert the updater counter after you adjusted the reference count.
It simply works.
I need to study it some more.
Bonita Montero
2024-08-23 14:34:14 UTC
Reply
Permalink
I've got a tshared_obj<T>, which is a substitute for an
atomic<shared_ptr<T>>, and a shared_obj<T>, which is used locally on a thread.
The shared_obj<T> may be a thread_local variable and may be frequently
updated from the central tshared_obj<T> object.
I made a simple optimization which compares the current value of the
shared_obj<T>'s control-block pointer to the current value of the
tshared_obj<T>'s control-block pointer before copying. If they are the same,
the assignment does nothing. So the most likely case, in which the central
tshared_obj<T> isn't updated that frequently, happens without traffic
between the caches, because there's usually no write to shared cachelines.
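A simplified, self-contained model of that fast path follows (assumed names,
and a plain fetch_add/fetch_sub instead of the split-counted DWCAS loops of
the real copy(), so it only shows where the pointer comparison sits):

#include <atomic>
#include <cstdio>
#include <cstddef>

// Stand-ins for the control block, tshared_obj<T> and shared_obj<T>.  The
// bare load+fetch_add in acquire_copy() is not safe against concurrent
// reclamation; the real code closes that window with split reference
// counting.
struct ctrl_block { std::atomic<std::size_t> ref_count{ 1 }; };

struct central_holder                       // stand-in for tshared_obj<T>
{
    std::atomic<ctrl_block *> ctrl{ nullptr };
    ctrl_block *acquire_copy()
    {
        ctrl_block *c = ctrl.load( std::memory_order_acquire );
        if( c )
            c->ref_count.fetch_add( 1, std::memory_order_relaxed );
        return c;
    }
};

struct local_holder                         // stand-in for shared_obj<T>
{
    ctrl_block *ctrl = nullptr;
    void release()
    {
        if( ctrl && ctrl->ref_count.fetch_sub( 1, std::memory_order_release ) == 1 )
            delete ctrl;
        ctrl = nullptr;
    }
};

// The optimization: compare control-block pointers first, so the common
// "central object not updated" case performs no RMW and dirties no shared
// cache line.
void refresh( local_holder &local, central_holder &central )
{
    if( central.ctrl.load( std::memory_order_acquire ) == local.ctrl )
        return;                             // fast path: read only
    local.release();
    local.ctrl = central.acquire_copy();    // slow path
}

int main()
{
    central_holder central;
    central.ctrl.store( new ctrl_block );
    local_holder local;
    refresh( local, central );              // slow path: takes a reference
    refresh( local, central );              // fast path: pointer unchanged
    std::printf( "refs: %zu\n", central.ctrl.load()->ref_count.load() );
    local.release();
    delete central.ctrl.exchange( nullptr );  // drop the central reference
}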

These are the numbers of repeatedly copying a tshared_obj<T> to a
shared_obj<T> in one to 32 threads:


1: 2.817
2: 2.37
3: 2.42133
4: 2.403
5: 2.56
6: 2.40233
7: 2.58986
8: 3.344
9: 4.08256
10: 4.689
11: 5.06118
12: 5.75808
13: 6.89769
14: 8.12543
15: 7.9756
16: 10.2607
17: 10.6755
18: 11.1416
19: 10.4275
20: 12.3337
21: 12.2635
22: 13.6835
23: 12.5886
24: 13.7635
25: 13.3899
26: 15.128
27: 12.6504
28: 15.0184
29: 14.5865
30: 14.7942
31: 15.6673
32: 14.8539

These are the numbers of repeatedly copying an atomic<shared_ptr<T>> to
a shared_ptr<T> with the futex'd solution of MSVC (libstdc++ and libc++
are a magnitude slower with contention) in one to 32 threads:

1: 7.017
2: 82.352
3: 236.794
4: 440.081
5: 652.365
6: 808.366
7: 929.841
8: 1003.45
9: 1093.29
10: 1081.51
11: 1020
12: 1101.01
13: 1234.35
14: 1362.97
15: 1588
16: 1764.23
17: 2011.28
18: 2207.23
19: 2336.67
20: 2707.68
21: 3048.06
22: 3067.7
23: 3479.35
24: 3584.29
25: 3832.91
26: 3899.98
27: 4177.75
28: 4333.31
29: 4344.12
30: 4604.6
31: 4760.91
32: 4875.24

So URCU is really obsolete with this simple optimization, which would
also be possible with atomic<shared_ptr<T>> if they'd implement it.
There would only be more memory consumption through the thread_local
shared_ptr<T>, and you'd sometimes need to check whether its contents
could be flushed.
Chris M. Thomasson
2024-08-23 19:19:42 UTC
Reply
Permalink
On 8/23/2024 7:34 AM, Bonita Montero wrote:
[...]
Post by Bonita Montero
These are the numbers of repeatedly copying an atomic<shared_ptr<T>> to
a shared_ptr<T>> with the futex'd solution of MSVC (libstdc++ and libc++
Why would anybody need a futex for a reference count that provides
strong thread safety? This means they must have to block somewhere, or
else why use a futex at all? Again, take a look at atomic_ptr. DWCAS is
all it needs. No blocking. Futex here baffles me!
Bonita Montero
2024-08-24 02:47:06 UTC
Reply
Permalink
Post by Chris M. Thomasson
Why would anybody need a futex for a reference count that provides
strong thread safety? ..
The MSVC, libstdc++ and libc++ atomic<shared_ptr<>> are futex'd.
Chris M. Thomasson
2024-08-24 18:19:27 UTC
Reply
Permalink
Post by Bonita Montero
Post by Chris M. Thomasson
Why would anybody need a futex for a reference count that provides
strong thread safety? ..
The MSVC, libstdc++ and libc++ atomic<shared_ptr<>> are futex'd.
Still don't know why they would use a futex at all. Humm...

Bonita Montero
2024-08-24 14:52:16 UTC
Reply
Permalink
Post by Chris M. Thomasson
Why would anybody need a futex for a reference count that provides
strong thread safety? This means they must have to block somewhere,
or else why use a futex at all? Again, take a look at atomic_ptr.
DWCAS is all it needs. No blocking. Futex here baffles me!
The DWCAS solution I wrote was only somewhat faster than the futex'd
solution from MSVC. The simple change I made not to update equal
pointers, thereby skipping the three atomic updates if the central
tshared_obj<> hasn't changed, made the whole thing 900 times faster,
as there are no cache invalidations in this case. With clang-cl the
new code is even 3,800 times faster when there are no updates of the
central tshared_obj<>.