Discussion:
Proxy test code...
Chris M. Thomasson
2024-12-01 22:12:42 UTC
This is not using any thread locals. It's a proxy collector that does
not use any CAS. Also, it's interesting to test against other asymmetric
proxy algorithms under heavy load. Can you get it to compile and run on
your end? Thanks.

https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)

____________________________________

// Chris M. Thomassons Poor Mans RCU... Example 456...


#include <iostream>
#include <atomic>
#include <thread>
#include <cstdlib>
#include <cstdint>
#include <climits>
#include <functional>
#include <cstddef> // std::size_t


// Masks
static constexpr std::uint32_t ct_ref_mask = 0xFFFFFFF0U;
static constexpr std::uint32_t ct_ref_complete = 0x30U;
static constexpr std::uint32_t ct_ref_inc = 0x20U;
static constexpr std::uint32_t ct_proxy_mask = 0xFU;
static constexpr std::uint32_t ct_proxy_quiescent = 0x10U;
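
// Note (annotation): how these masks pack into a 32-bit word:
//   bits [3:0]  - index of the current collector (in m_current)
//   bit  4      - "quiescent" flag (ct_proxy_quiescent)
//   bits [31:5] - reference count, counted in units of ct_ref_inc (0x20)
// ct_ref_complete (0x30) == quiescent flag + exactly one outstanding reference.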


// Iteration settings
static constexpr unsigned long ct_reader_iters_n = 2000000;
static constexpr unsigned long ct_writer_iters_n = 200000;


// Thread counts
static constexpr unsigned long ct_reader_threads_n = 53;
static constexpr unsigned long ct_writer_threads_n = 11;


// Some debug/sanity check things...
// Need to make this conditional in compilation with some macros...
static std::atomic<std::uint32_t> g_debug_node_allocations(0);
static std::atomic<std::uint32_t> g_debug_node_deallocations(0);
static std::atomic<std::uint32_t> g_debug_dtor_collect(0);
static std::atomic<std::uint32_t> g_debug_release_collect(0);
static std::atomic<std::uint32_t> g_debug_quiesce_begin(0);
static std::atomic<std::uint32_t> g_debug_quiesce_complete(0);
static std::atomic<std::uint32_t> g_debug_quiesce_complete_nodes(0);
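
// (Just a sketch of that: e.g. wrap the g_debug_* counters and their
//  fetch_add()s in something like #if defined (CT_DEBUG) ... #endif,
//  where CT_DEBUG is a made-up macro name, so release builds pay nothing.)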

// Need to align and pad data structures! To do...
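// (e.g. alignas(64), or alignas(std::hardware_destructive_interference_size)
//  from <new>, on collector and the hot atomics so they sit on separate
//  cache lines...)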

struct ct_node
{
    std::atomic<ct_node*> m_next;
    ct_node* m_defer_next;

    ct_node() : m_next(nullptr), m_defer_next(nullptr)
    {
        g_debug_node_allocations.fetch_add(1, std::memory_order_relaxed);
    }

    ~ct_node()
    {
        g_debug_node_deallocations.fetch_add(1, std::memory_order_relaxed);
    }
};


// The proxy collector itself... :^)
template<std::size_t T_defer_limit>
class ct_proxy
{
    static std::uint32_t prv_destroy(ct_node* n)
    {
        std::uint32_t count = 0;

        while (n)
        {
            ct_node* next = n->m_defer_next;
            delete n;
            count++;
            n = next;
        }

        return count;
    }


public:
    class collector
    {
        friend class ct_proxy;

    private:
        std::atomic<ct_node*> m_defer;
        std::atomic<std::uint32_t> m_defer_count;
        std::atomic<std::uint32_t> m_count;

    public:
        collector()
        :   m_defer(nullptr),
            m_defer_count(0),
            m_count(0)
        {

        }

        ~collector()
        {
            prv_destroy(m_defer.load(std::memory_order_relaxed));
        }
    };


private:
    std::atomic<std::uint32_t> m_current;
    std::atomic<bool> m_quiesce;
    ct_node* m_defer;
    collector m_collectors[2];


public:
    ct_proxy()
    :   m_current(0),
        m_quiesce(false),
        m_defer(nullptr)
    {

    }

    ~ct_proxy()
    {
        prv_destroy(m_defer);
    }


private:
    void prv_quiesce_begin()
    {
        // Try to begin the quiescence process.
        if (! m_quiesce.exchange(true, std::memory_order_acquire))
        {
            g_debug_quiesce_begin.fetch_add(1, std::memory_order_relaxed);

            // advance the current collector and grab the old one.
            std::uint32_t old =
                m_current.load(std::memory_order_relaxed) & ct_proxy_mask;

            old = m_current.exchange((old + 1) & 1,
                std::memory_order_acq_rel);

            collector& c = m_collectors[old & ct_proxy_mask];

            // decode reference count.
            std::uint32_t refs = old & ct_ref_mask;

            // increment and generate an odd reference count.
            std::uint32_t old_refs = c.m_count.fetch_add(refs +
                ct_proxy_quiescent, std::memory_order_release);

            if (old_refs == 0 - refs)
            {
                g_debug_dtor_collect.fetch_add(1, std::memory_order_relaxed);

                // odd reference count and drop-to-zero condition detected!
                prv_quiesce_complete(c);
            }
        }
    }


    void prv_quiesce_complete(collector& c)
    {
        g_debug_quiesce_complete.fetch_add(1, std::memory_order_relaxed);

        // the collector `c' is now in a quiescent state! :^)
        std::atomic_thread_fence(std::memory_order_acquire);

        // maintain the back link and obtain "fresh" objects from
        // this collection.
        ct_node* n = m_defer;
        m_defer = c.m_defer.load(std::memory_order_relaxed);
        c.m_defer.store(nullptr, std::memory_order_relaxed);

        // reset the reference count.
        c.m_count.store(0, std::memory_order_relaxed);
        c.m_defer_count.store(0, std::memory_order_relaxed);

        // release the quiesce lock.
        m_quiesce.store(false, std::memory_order_release);

        // destroy nodes.
        std::uint32_t count = prv_destroy(n);

        g_debug_quiesce_complete_nodes.fetch_add(count,
            std::memory_order_relaxed);
    }


public:
    collector& acquire()
    {
        // increment the master count _and_ obtain current collector.
        std::uint32_t current =
            m_current.fetch_add(ct_ref_inc, std::memory_order_acquire);

        // decode the collector index.
        return m_collectors[current & ct_proxy_mask];
    }

    void release(collector& c)
    {
        // decrement the collector.
        std::uint32_t count =
            c.m_count.fetch_sub(ct_ref_inc, std::memory_order_release);

        // check for the completion of the quiescence process.
        if ((count & ct_ref_mask) == ct_ref_complete)
        {
            // odd reference count and drop-to-zero condition detected!
            g_debug_release_collect.fetch_add(1, std::memory_order_relaxed);

            prv_quiesce_complete(c);
        }
    }


    collector& sync(collector& c)
    {
        // check if the `c' is in the middle of a quiescence process.
        if (c.m_count.load(std::memory_order_relaxed) & ct_proxy_quiescent)
        {
            // drop `c' and get the next collector.
            release(c);

            return acquire();
        }

        return c;
    }


    void collect()
    {
        prv_quiesce_begin();
    }


    void collect(collector& c, ct_node* n)
    {
        if (! n) return;

        // link node into the defer list.
        ct_node* prev = c.m_defer.exchange(n, std::memory_order_relaxed);
        n->m_defer_next = prev;

        // bump the defer count and begin quiescence process if over
        // the limit.
        std::uint32_t count =
            c.m_defer_count.fetch_add(1, std::memory_order_relaxed) + 1;

        if (count >= (T_defer_limit / 2))
        {
            prv_quiesce_begin();
        }
    }
};



typedef ct_proxy<10> ct_proxy_collector;
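
// Note: sync() is not exercised in the test below. Roughly, a long-running
// reader could use it to hop off a quiescing collector (sketch only, where
// `proxy' is some ct_proxy_collector instance and big_n is made up):
//
//   ct_proxy_collector::collector* c = &proxy.acquire();
//   for (unsigned long i = 0; i < big_n; ++i)
//   {
//       /* read the shared structure... */
//       c = &proxy.sync(*c); // hop to the fresh collector if `c' is quiescing
//   }
//   proxy.release(*c);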


// your basic lock-free stack...
// well, minus ABA counter and DWCAS of course! ;^)
class ct_stack
{
    std::atomic<ct_node*> m_head;


public:
    ct_stack() : m_head(nullptr)
    {

    }


public:
    void push(ct_node* n)
    {
        ct_node* head = m_head.load(std::memory_order_relaxed);

        do
        {
            n->m_next.store(head, std::memory_order_relaxed);
        }
        while (! m_head.compare_exchange_weak(
            head,
            n,
            std::memory_order_release));
    }


    ct_node* flush()
    {
        return m_head.exchange(nullptr, std::memory_order_acquire);
    }


    ct_node* get_head()
    {
        return m_head.load(std::memory_order_acquire);
    }


    ct_node* pop()
    {
        ct_node* head = m_head.load(std::memory_order_acquire);
        ct_node* xchg;

        do
        {
            if (! head) return nullptr;

            xchg = head->m_next.load(std::memory_order_relaxed);
        }
        while (! m_head.compare_exchange_weak(
            head,
            xchg,
            std::memory_order_acquire));

        return head;
    }
};


// The shared state
struct ct_shared
{
    ct_proxy<10> m_proxy_gc;
    ct_stack m_stack;
};



// Reader threads
// Iterates through the lock free stack
void ct_thread_reader(ct_shared& shared)
{
    // iterate the lockfree stack
    for (unsigned long i = 0; i < ct_reader_iters_n; ++i)
    {
        ct_proxy_collector::collector& c = shared.m_proxy_gc.acquire();

        ct_node* n = shared.m_stack.get_head();

        while (n)
        {
            // need to add in some processing...
            // std::this_thread::yield();

            n = n->m_next.load(std::memory_order_relaxed);
        }

        shared.m_proxy_gc.release(c);
    }
}



// Writer threads
// Mutates the lock free stack
void ct_thread_writer(ct_shared& shared)
{
    for (unsigned long wloop = 0; wloop < 42; ++wloop)
    {
        shared.m_proxy_gc.collect();

        for (unsigned long i = 0; i < ct_writer_iters_n; ++i)
        {
            shared.m_stack.push(new ct_node());
        }

        //std::this_thread::yield();

        ct_proxy_collector::collector& c = shared.m_proxy_gc.acquire();

        for (unsigned long i = 0; i < ct_writer_iters_n; ++i)
        {
            shared.m_proxy_gc.collect(c, shared.m_stack.pop());
        }

        shared.m_proxy_gc.release(c);

        for (unsigned long i = 0; i < ct_writer_iters_n / 2; ++i)
        {
            shared.m_proxy_gc.collect();
        }

        {
            ct_proxy_collector::collector& c = shared.m_proxy_gc.acquire();

            for (unsigned long i = 0; i < ct_writer_iters_n; ++i)
            {
                ct_node* n = shared.m_stack.pop();
                if (! n) break;

                shared.m_proxy_gc.collect(c, n);
            }

            shared.m_proxy_gc.release(c);
        }

        if ((wloop % 3) == 0)
        {
            shared.m_proxy_gc.collect();
        }
    }
}



int main()
{
    std::cout << "Chris M. Thomassons Proxy Collector Port ver .0.0.2...\n";
    std::cout << "_______________________________________\n\n";

    {
        ct_shared shared;

        std::thread readers[ct_reader_threads_n];
        std::thread writers[ct_writer_threads_n];

        std::cout << "Booting threads...\n";

        for (unsigned long i = 0; i < ct_writer_threads_n; ++i)
        {
            writers[i] = std::thread(ct_thread_writer, std::ref(shared));
        }

        for (unsigned long i = 0; i < ct_reader_threads_n; ++i)
        {
            readers[i] = std::thread(ct_thread_reader, std::ref(shared));
        }

        std::cout << "Threads running...\n";

        for (unsigned long i = 0; i < ct_reader_threads_n; ++i)
        {
            readers[i].join();
        }

        for (unsigned long i = 0; i < ct_writer_threads_n; ++i)
        {
            writers[i].join();
        }
    }

    std::cout << "Threads completed!\n\n";


    // Sanity check!
    {
        std::uint32_t node_allocations =
            g_debug_node_allocations.load(std::memory_order_relaxed);
        std::uint32_t node_deallocations =
            g_debug_node_deallocations.load(std::memory_order_relaxed);
        std::uint32_t dtor_collect =
            g_debug_dtor_collect.load(std::memory_order_relaxed);
        std::uint32_t release_collect =
            g_debug_release_collect.load(std::memory_order_relaxed);
        std::uint32_t quiesce_complete =
            g_debug_quiesce_complete.load(std::memory_order_relaxed);
        std::uint32_t quiesce_begin =
            g_debug_quiesce_begin.load(std::memory_order_relaxed);
        std::uint32_t quiesce_complete_nodes =
            g_debug_quiesce_complete_nodes.load(std::memory_order_relaxed);

        std::cout << "node_allocations = " << node_allocations << "\n";
        std::cout << "node_deallocations = " << node_deallocations << "\n\n";
        std::cout << "dtor_collect = " << dtor_collect << "\n";
        std::cout << "release_collect = " << release_collect << "\n";
        std::cout << "quiesce_complete = " << quiesce_complete << "\n";
        std::cout << "quiesce_begin = " << quiesce_begin << "\n";
        std::cout << "quiesce_complete_nodes = " << quiesce_complete_nodes << "\n";

        if (node_allocations != node_deallocations)
        {
            std::cout << "OH SHIT! NODE LEAK!!! SHIT! = " <<
                node_allocations - node_deallocations << "\n\n";
        }
    }

    std::cout << "\n\nTest Completed!\n\n";

    return 0;
}
____________________________________
Louis Krupp
2024-12-02 19:44:32 UTC
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that does
not use any CAS. Also, it's interesting to test against other
asymmetric proxy algorithms under heavy load. Can you get it to
compile and run on your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
____________________________________
// Chris M. Thomassons Poor Mans RCU... Example 456...
<snip>
Post by Chris M. Thomasson
____________________________________
Using g++ (GCC) 14.2.1 20240912 (Red Hat 14.2.1-3):

===
Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________

Booting threads...
Threads running...
Threads completed!

node_allocations = 92400000
node_deallocations = 92400000

dtor_collect = 7
release_collect = 140
quiesce_complete = 147
quiesce_begin = 147
quiesce_complete_nodes = 92200000


Test Completed!

===

Louis
Chris M. Thomasson
2024-12-02 23:32:49 UTC
Post by Louis Krupp
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that does
not use any CAS. Also, it's interesting to test against other
asymmetric proxy algorithms under heavy load. Can you get it to
compile and run on your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
____________________________________
// Chris M. Thomassons Poor Mans RCU... Example 456...
<snip>
Post by Chris M. Thomasson
____________________________________
===
Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________
Booting threads...
Threads running...
Threads completed!
node_allocations = 92400000
node_deallocations = 92400000
dtor_collect = 7
release_collect = 140
quiesce_complete = 147
quiesce_begin = 147
quiesce_complete_nodes = 92200000
Test Completed!
Thanks for taking the time to give it a go. These numbers are
interesting to me because quiesce_complete_nodes does not equal
node_deallocations. Nothing leaked, but it means that some nodes were
finally destroyed in the dtors of the proxy collector itself and/or its
collector objects. I forgot to count those in my test. In other words,
the difference comes from these dtors:
__________________
~collector()
{
    prv_destroy(m_defer.load(std::memory_order_relaxed));
}
__________________

and/or:
__________________
~ct_proxy()
{
    prv_destroy(m_defer);
}
__________________


Fwiw, prv_destroy returns the number of nodes it destroyed:
__________________
static std::uint32_t prv_destroy(ct_node* n)
{
    std::uint32_t count = 0;

    while (n)
    {
        ct_node* next = n->m_defer_next;
        delete n;
        count++;
        n = next;
    }

    return count;
}
__________________

I count these in prv_destroy for testing and experimental purposes
only. So, since your run shows quiesce_complete_nodes !=
node_deallocations, the difference should be accounted for by these
dtors. I need to sum the results from prv_destroy in them into my debug
variables, just for accounting purposes in the "sanity" check... ;^)
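
Something like this is what I have in mind, just a sketch, where
g_debug_dtor_destroy_nodes is a made-up name for the new counter:
__________________
// extra debug counter, mirroring the other g_debug_* atomics...
static std::atomic<std::uint32_t> g_debug_dtor_destroy_nodes(0);

~collector()
{
    g_debug_dtor_destroy_nodes.fetch_add(
        prv_destroy(m_defer.load(std::memory_order_relaxed)),
        std::memory_order_relaxed);
}

~ct_proxy()
{
    g_debug_dtor_destroy_nodes.fetch_add(
        prv_destroy(m_defer),
        std::memory_order_relaxed);
}
__________________

Then the sanity check can verify that quiesce_complete_nodes +
dtor_destroy_nodes == node_deallocations.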

Thanks!
Paavo Helde
2024-12-02 20:26:01 UTC
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that does
not use any CAS. Also, it's interesting to test against other asymmetric
proxy algorithms under heavy load. Can you get it to compile and run on
your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
On Windows x86_64 with VS2022:

Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________

Booting threads...
Threads running...
Threads completed!

node_allocations = 92400000
node_deallocations = 92400000

dtor_collect = 3
release_collect = 121
quiesce_complete = 124
quiesce_begin = 124
quiesce_complete_nodes = 92400000


Test Completed!
Chris M. Thomasson
2024-12-02 23:36:30 UTC
Post by Louis Krupp
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that does
not use any CAS. Also, it's interesting to test against other
asymmetric proxy algorithms under heavy load. Can you get it to
compile and run on your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________
Booting threads...
Threads running...
Threads completed!
node_allocations = 92400000
node_deallocations = 92400000
dtor_collect = 3
release_collect = 121
quiesce_complete = 124
quiesce_begin = 124
quiesce_complete_nodes = 92400000
Test Completed!
Thanks! These numbers are more in line with what I usually get (refer to
my response to Louis Krupp), where quiesce_complete_nodes ==
node_deallocations. Only some of my test runs end up with
quiesce_complete_nodes != node_deallocations. Nothing is wrong, but I
forgot to account for nodes that are still sitting in the defer lists
when the dtors of the proxy and its collector objects run.

Your run helped me for sure. Thank you Paavo and Louis.

:^)
Chris M. Thomasson
2024-12-02 23:41:26 UTC
Post by Chris M. Thomasson
Post by Louis Krupp
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that does
not use any CAS. Also, it's interesting to test against other
asymmetric proxy algorithms under heavy load. Can you get it to
compile and run on your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________
Booting threads...
Threads running...
Threads completed!
node_allocations = 92400000
node_deallocations = 92400000
dtor_collect = 3
release_collect = 121
quiesce_complete = 124
quiesce_begin = 124
quiesce_complete_nodes = 92400000
Test Completed!
Thanks! The numbers are more in line wrt what I usually get (refer to my
response to Louis Krupp) wrt quiesce_complete_nodes ==
node_deallocations. Only some of my test experiments result in
quiesce_complete_nodes != node_deallocations. Nothing is wrong, but I
forgot to account for nodes that are still there during the dtors of the
proxy and its collector objects.
To clarify, all nodes that were allocated are deallocated, so there is
no node leak. It's interesting to me when quiesce_complete_nodes !=
node_deallocations: it means there were nodes left in the defer lists
that only got destroyed when the proxy and collector dtors dumped their
defer lists. I need to add a new debug/sanity counter to account for
that condition. My sanity check should account for everything, not just
the node allocations and deallocations. These counters help me "see"
how the system is being used in various use cases.
Post by Chris M. Thomasson
Your run helped me for sure. Thank you Paavo and Louis.
:^)
jseigh
2024-12-03 15:03:10 UTC
Post by Chris M. Thomasson
Post by Chris M. Thomasson
Post by Louis Krupp
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that
does not use any CAS. Also, it's interesting to test against other
asymmetric proxy algorithms under heavy load. Can you get it to
compile and run on your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________
Booting threads...
Threads running...
Threads completed!
node_allocations = 92400000
node_deallocations = 92400000
dtor_collect = 3
release_collect = 121
quiesce_complete = 124
quiesce_begin = 124
quiesce_complete_nodes = 92400000
Test Completed!
Thanks! The numbers are more in line wrt what I usually get (refer to
my response to Louis Krupp) wrt quiesce_complete_nodes ==
node_deallocations. Only some of my test experiments result in
quiesce_complete_nodes != node_deallocations. Nothing is wrong, but I
forgot to account for nodes that are still there during the dtors of
the proxy and its collector objects.
To clarify, all nodes that were allocated are deallocated, so no node
leak. It's interesting to me when quiesce_complete_nodes !=
node_deallocations. It means there are nodes left in the defer lists
that get destroyed during the proxy and collector dtors where they dump
their defer lists. I need to add in a new debug/sanity counter to
account for that condition. My sanity check should account for
everything, not just the node allocations and deallocations. They help
me "see" how the system is being used during various use cases.
Post by Chris M. Thomasson
Your run helped me for sure. Thank you Paavo and Louis.
:^)
I use the following kind of output

testcase: smr
Statistics:
reader thread count = 4
read_count = 400,000,000
elapsed cpu read_time = 21,235,330,270 nsecs
avg cpu read_time = 53.088 nsecs
elapsed read_time = 5,319,022,275 nsecs
avg elapsed read time = 53.190 nsecs
data state counts:
live = 399,998,095
stale = 1,905
invalid = 0
other = 0
retire_count = 1,053
elapsed retire_time = 5,323,396,964 nsecs
avg retire_time = 5,055.458 usecs
data allocs = 1,053
data deletes = 1,053
voluntary context switches = 7
involuntary context switches = 37
user cpu time = 21,215,620,000 nsecs
system cpu time = 19,971,000 nsecs

The data state counts tell me whether the reads were valid;
"invalid" and "other" mean they were not.

The read time is for lock/access data/unlock. You need to
compare it to unlocked access to measure the locking overhead.

testcase: unsafe
Statistics:
reader thread count = 4
read_count = 400,000,000
elapsed cpu read_time = 20,833,077,755 nsecs
avg cpu read_time = 52.083 nsecs
elapsed read_time = 5,220,600,403 nsecs
avg elapsed read time = 52.206 nsecs
data state counts:
live = 399,998,426
stale = 1,567
invalid = 0
other = 7
retire_count = 1,034
elapsed retire_time = 39,858 nsecs
avg retire_time = 0.039 usecs
data allocs = 1,034
data deletes = 1,034
voluntary context switches = 4
involuntary context switches = 21
user cpu time = 20,813,376,000 nsecs
system cpu time = 19,974,000 nsecs
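
So, comparing the two runs above, the locking overhead here works out to
roughly 53.190 - 52.206 ~= 1 nsec per read, i.e. around 2%.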


Joe Seigh
Chris M. Thomasson
2024-12-03 20:49:49 UTC
Post by jseigh
Post by Chris M. Thomasson
Post by Chris M. Thomasson
Post by Louis Krupp
Post by Chris M. Thomasson
This is not using any thread locals. It's a proxy collector that
does not use any CAS. Also, it's interesting to test against other
asymmetric proxy algorithms under heavy load. Can you get it to
compile and run on your end? Thanks.
https://pastebin.com/raw/CYZ78gVj
(raw text link, no ads... :^)
Chris M. Thomassons Proxy Collector Port ver .0.0.2...
_______________________________________
Booting threads...
Threads running...
Threads completed!
node_allocations = 92400000
node_deallocations = 92400000
dtor_collect = 3
release_collect = 121
quiesce_complete = 124
quiesce_begin = 124
quiesce_complete_nodes = 92400000
Test Completed!
Thanks! The numbers are more in line wrt what I usually get (refer to
my response to Louis Krupp) wrt quiesce_complete_nodes ==
node_deallocations. Only some of my test experiments result in
quiesce_complete_nodes != node_deallocations. Nothing is wrong, but I
forgot to account for nodes that are still there during the dtors of
the proxy and its collector objects.
To clarify, all nodes that were allocated are deallocated, so no node
leak. It's interesting to me when quiesce_complete_nodes !=
node_deallocations. It means there are nodes left in the defer lists
that get destroyed during the proxy and collector dtors where they
dump their defer lists. I need to add in a new debug/sanity counter to
account for that condition. My sanity check should account for
everything, not just the node allocations and deallocations. They help
me "see" how the system is being used during various use cases.
Post by Chris M. Thomasson
Your run helped me for sure. Thank you Paavo and Louis.
:^)
I use the following kind of output
testcase: smr
  reader thread count = 4
  read_count = 400,000,000
  elapsed cpu read_time = 21,235,330,270 nsecs
  avg cpu read_time =    53.088 nsecs
  elapsed read_time = 5,319,022,275 nsecs
  avg elapsed read time =     53.190 nsecs
    live = 399,998,095
    stale = 1,905
    invalid = 0
    other = 0
  retire_count = 1,053
  elapsed retire_time = 5,323,396,964 nsecs
  avg retire_time = 5,055.458 usecs
  data allocs = 1,053
  data deletes = 1,053
  voluntary context switches = 7
  involuntary context switches = 37
  user cpu time = 21,215,620,000 nsecs
  system cpu time = 19,971,000 nsecs
The data state counts tell me if the reads were valid,
invalid and other being not.
The read time is for lock/access data/unlock.  You need to
compare it to unlocked access to measure the locking overhead.
Agreed. Fwiw, the fast paths of this older algorithm of mine, acquiring
a collector and releasing it, are pretty "good'ish": a single
"wait-free" fetch_add each. No CAS. No DWCAS for that matter:
_______________________
collector& acquire()
{
    // increment the master count _and_ obtain current collector.
    std::uint32_t current =
        m_current.fetch_add(ct_ref_inc, std::memory_order_acquire);

    // decode the collector index.
    return m_collectors[current & ct_proxy_mask];
}

void release(collector& c)
{
    // decrement the collector.
    std::uint32_t count =
        c.m_count.fetch_sub(ct_ref_inc, std::memory_order_release);

    // check for the completion of the quiescence process.
    if ((count & ct_ref_mask) == ct_ref_complete)
    {
        // odd reference count and drop-to-zero condition detected!
        g_debug_release_collect.fetch_add(1, std::memory_order_relaxed);

        prv_quiesce_complete(c);
    }
}
_______________________


However, this should not perform as well as a membar-free SMR load.
There is an interesting tweak that I never showed to the public that
puts the fetch_add's on a per-thread basis. Instead of hammering a
single cache line in my acquire function, they are spread out...
basically distributing the collector logic across the "registered"
threads, as sketched below. This makes it even better, but it still
should not be as fast as a membar-free SMR load, I think.
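
The basic shape of that tweak is just padded per-thread slots instead of
single shared words; a rough sketch only, not the actual code, with
made-up names:
_______________________
// each "registered" thread gets its own padded slot so the acquire/release
// fetch_add's hit different cache lines:
struct alignas(64) ct_per_thread_ref
{
    std::atomic<std::uint32_t> m_count{ 0 };
};

static ct_per_thread_ref g_refs[64]; // one slot per registered thread

void ref_acquire(std::size_t tid)
{
    g_refs[tid].m_count.fetch_add(ct_ref_inc, std::memory_order_acquire);
}

void ref_release(std::size_t tid)
{
    g_refs[tid].m_count.fetch_sub(ct_ref_inc, std::memory_order_release);
}

// the cost moves to the collector side: it has to sum all of the slots to
// detect the drop-to-zero condition instead of looking at a single word.
_______________________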
Post by jseigh
testcase: unsafe
  reader thread count = 4
  read_count = 400,000,000
  elapsed cpu read_time = 20,833,077,755 nsecs
  avg cpu read_time =    52.083 nsecs
  elapsed read_time = 5,220,600,403 nsecs
  avg elapsed read time =     52.206 nsecs
    live = 399,998,426
    stale = 1,567
    invalid = 0
    other = 7
  retire_count = 1,034
  elapsed retire_time = 39,858 nsecs
  avg retire_time =     0.039 usecs
  data allocs = 1,034
  data deletes = 1,034
  voluntary context switches = 4
  involuntary context switches = 21
  user cpu time = 20,813,376,000 nsecs
  system cpu time = 19,974,000 nsecs
Nice! Humm... I do remember back in the day when you were measuring,
iirc, reads per second per reader thread? Mutex-based solutions were so
bad they were simply terminated because they took WAY too long. That
measure of how many reads a reader thread can do within a time frame is
a good statistic for sure.

Think of a reader thread that counts how many reads it actually
executed before it is artificially stopped at a certain time, along the
lines of the sketch below. Those reads-per-reader-thread numbers are
very important.
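
Something along those lines, as a sketch against my test code above,
where g_stop, timed_reader and reads_out are made-up names:
_______________________
static std::atomic<bool> g_stop(false);

void timed_reader(ct_shared& shared, std::uint64_t& reads_out)
{
    std::uint64_t reads = 0;

    while (! g_stop.load(std::memory_order_relaxed))
    {
        ct_proxy_collector::collector& c = shared.m_proxy_gc.acquire();

        ct_node* n = shared.m_stack.get_head();

        while (n)
        {
            n = n->m_next.load(std::memory_order_relaxed);
        }

        shared.m_proxy_gc.release(c);

        ++reads; // one acquire/iterate/release cycle == one "read"
    }

    reads_out = reads; // reads per reader thread for the fixed time frame
}

// the driver just sleeps for the time frame and then stops everybody:
//   std::this_thread::sleep_for(std::chrono::seconds(10));
//   g_stop.store(true, std::memory_order_relaxed);
_______________________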
