Bonita Montero
2024-03-29 13:14:11 UTC
The following program simulates constant locking and unlocking by one
to jthread::hardware_concurrency() threads with a std::mutex and a
futex. On my 16 core / 32 thread Zen4 system the futex is faster with up
to 5 threads constantly contending, but beyond that the CPU time of the
futex explodes and the conventional mutex is faster both with Windows
and with Linux.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <functional>
#include <iostream>
#include <limits>
#include <mutex>
#include <thread>
#include <vector>
using namespace std;
using namespace chrono;
// Benchmark: average cost of a lock/unlock pair under rising contention,
// for a std::mutex versus a hand-rolled futex-style lock built on
// std::atomic wait/notify.
int main()
{
    // Lock/unlock rounds each thread performs per measurement.
    constexpr int64_t ROUNDS = 10000;
    // Runs fn with 1 .. hardware_concurrency() concurrent threads and
    // prints the average nanoseconds per round per thread for each count.
    // fn receives the thread's last observed lock word and returns the new one.
    auto bench = [&]( char const *head, auto fn )
    {
        cout << head << endl;
        vector<jthread> threads;
        int hc = jthread::hardware_concurrency();
        for( int nThreads = 1; nThreads <= hc; ++nThreads )
        {
            // BUG FIX: the accumulator must be reset for every thread count.
            // Previously it was declared once outside this loop, so each
            // printed average also contained the nanoseconds of all earlier
            // iterations, inflating every result after the first.
            atomic_int64_t tSum( 0 );
            for( int t = nThreads; t; --t )
                threads.emplace_back( [&]()
                {
                    auto start = high_resolution_clock::now();
                    int64_t cmp = 0;
                    for( int64_t r = ROUNDS; r; --r )
                        cmp = fn( cmp );
                    tSum += duration_cast<nanoseconds>(
                        high_resolution_clock::now() - start ).count();
                } );
            // Destroying the jthreads joins them, so every thread's time
            // is in tSum before we print.
            threads.resize( 0 );
            cout << "\t" << nThreads << ": "
                 << tSum / ((double)nThreads * ROUNDS) << endl;
        }
    };
    {
        // Baseline: plain std::mutex, immediately unlocked again.
        mutex mtx;
        bench( "mutex: ",
            [&]( int64_t )
            {
                mtx.lock();
                mtx.unlock();
                return 0;
            } );
    }
    {
        // Futex-style lock word: sign bit = locked, low bits = waiter count.
        atomic_int64_t futex( 0 );
        constexpr int64_t HIBIT = numeric_limits<int64_t>::min();
        bench( "futex:",
            [&]( int64_t cmp )
            {
                int64_t niu;
                // --- acquire ---
                for( ; ; )
                {
                    if( cmp >= 0 )
                    {
                        // Unlocked: try to set the lock bit.
                        if( futex.compare_exchange_weak( cmp, niu = cmp | HIBIT,
                                memory_order_acquire, memory_order_relaxed ) )
                        {
                            cmp = niu;
                            break;
                        }
                    }
                    else
                    {
                        // Locked: register as a waiter, then sleep until the
                        // word changes; reread it and retry.
                        if( futex.compare_exchange_weak( cmp, niu = cmp + 1,
                                memory_order_relaxed, memory_order_relaxed ) )
                        {
                            futex.wait( niu, memory_order_acquire );
                            cmp = futex.load( memory_order_relaxed );
                        }
                    }
                }
                // --- release ---
                for( ; ; )
                {
                    if( cmp & ~HIBIT )
                    {
                        // Waiters present: drop one waiter, clear the lock
                        // bit, and wake a sleeper.
                        if( futex.compare_exchange_weak( cmp,
                                niu = (cmp - 1) & ~HIBIT,
                                memory_order_release, memory_order_relaxed ) )
                        {
                            cmp = niu;
                            futex.notify_one();
                            break;
                        }
                    }
                    else
                    {
                        // No waiters: just clear the whole word.
                        if( futex.compare_exchange_weak( cmp, 0,
                                memory_order_release, memory_order_relaxed ) )
                        {
                            cmp = 0;
                            break;
                        }
                    }
                }
                return cmp;
            } );
    }
}
These are the results under Ubuntu 20.04 LTS:
mutex:
1: 3.9134
2: 5.8706
3: 33.1753
4: 48.63
5: 71.6777
6: 109.556
7: 151.271
8: 208.871
9: 288.031
10: 353.536
11: 446.28
12: 563.841
13: 701.134
14: 833.08
15: 983.734
16: 1138.48
17: 1297.53
18: 1481.15
19: 1664.51
20: 1857.88
21: 2053.83
22: 2285.07
23: 2546.67
24: 2782.53
25: 3065.25
26: 3349.4
27: 3652.06
28: 3971.25
29: 4339.13
30: 4727.31
31: 5129.95
32: 5499.05
futex:
1: 3.3654
2: 5.02155
3: 6.69107
4: 15.2522
5: 16.8824
6: 185.235
7: 162.176
8: 360.421
9: 2272.83
10: 4805.11
11: 8987.24
12: 14225.5
13: 20076.4
14: 27199.4
15: 36711.8
16: 49480.8
17: 57641.8
18: 75633.2
19: 97196.6
20: 122223
21: 147617
22: 180377
23: 216084
24: 256032
25: 299681
26: 345518
27: 398785
28: 445499
29: 506351
30: 572440
31: 643455
32: 721461
With all threads locking and unlocking, the conventional mutex is 131
times faster.
to jthread::hardware_concurrency() threads with a std::mutex and a
futex. On my 16 core / 32 thread Zen4 system the futex is faster with up
to 5 threads constantly contending, but beyond that the CPU time of the
futex explodes and the conventional mutex is faster both with Windows
and with Linux.
#include <iostream>
#include <thread>
#include <mutex>
#include <atomic>
#include <functional>
#include <chrono>
#include <vector>
using namespace std;
using namespace chrono;
// Benchmark: average cost of a lock/unlock pair under rising contention,
// for a std::mutex versus a hand-rolled futex-style lock built on
// std::atomic wait/notify.
int main()
{
    // Lock/unlock rounds each thread performs per measurement.
    constexpr int64_t ROUNDS = 10000;
    // Runs fn with 1 .. hardware_concurrency() concurrent threads and
    // prints the average nanoseconds per round per thread for each count.
    // fn receives the thread's last observed lock word and returns the new one.
    auto bench = [&]( char const *head, auto fn )
    {
        cout << head << endl;
        vector<jthread> threads;
        int hc = jthread::hardware_concurrency();
        for( int nThreads = 1; nThreads <= hc; ++nThreads )
        {
            // BUG FIX: the accumulator must be reset for every thread count.
            // Previously it was declared once outside this loop, so each
            // printed average also contained the nanoseconds of all earlier
            // iterations, inflating every result after the first.
            atomic_int64_t tSum( 0 );
            for( int t = nThreads; t; --t )
                threads.emplace_back( [&]()
                {
                    auto start = high_resolution_clock::now();
                    int64_t cmp = 0;
                    for( int64_t r = ROUNDS; r; --r )
                        cmp = fn( cmp );
                    tSum += duration_cast<nanoseconds>(
                        high_resolution_clock::now() - start ).count();
                } );
            // Destroying the jthreads joins them, so every thread's time
            // is in tSum before we print.
            threads.resize( 0 );
            cout << "\t" << nThreads << ": "
                 << tSum / ((double)nThreads * ROUNDS) << endl;
        }
    };
    {
        // Baseline: plain std::mutex, immediately unlocked again.
        mutex mtx;
        bench( "mutex: ",
            [&]( int64_t )
            {
                mtx.lock();
                mtx.unlock();
                return 0;
            } );
    }
    {
        // Futex-style lock word: sign bit = locked, low bits = waiter count.
        atomic_int64_t futex( 0 );
        constexpr int64_t HIBIT = numeric_limits<int64_t>::min();
        bench( "futex:",
            [&]( int64_t cmp )
            {
                int64_t niu;
                // --- acquire ---
                for( ; ; )
                {
                    if( cmp >= 0 )
                    {
                        // Unlocked: try to set the lock bit.
                        if( futex.compare_exchange_weak( cmp, niu = cmp | HIBIT,
                                memory_order_acquire, memory_order_relaxed ) )
                        {
                            cmp = niu;
                            break;
                        }
                    }
                    else
                    {
                        // Locked: register as a waiter, then sleep until the
                        // word changes; reread it and retry.
                        if( futex.compare_exchange_weak( cmp, niu = cmp + 1,
                                memory_order_relaxed, memory_order_relaxed ) )
                        {
                            futex.wait( niu, memory_order_acquire );
                            cmp = futex.load( memory_order_relaxed );
                        }
                    }
                }
                // --- release ---
                for( ; ; )
                {
                    if( cmp & ~HIBIT )
                    {
                        // Waiters present: drop one waiter, clear the lock
                        // bit, and wake a sleeper.
                        if( futex.compare_exchange_weak( cmp,
                                niu = (cmp - 1) & ~HIBIT,
                                memory_order_release, memory_order_relaxed ) )
                        {
                            cmp = niu;
                            futex.notify_one();
                            break;
                        }
                    }
                    else
                    {
                        // No waiters: just clear the whole word.
                        if( futex.compare_exchange_weak( cmp, 0,
                                memory_order_release, memory_order_relaxed ) )
                        {
                            cmp = 0;
                            break;
                        }
                    }
                }
                return cmp;
            } );
    }
}
These are the results under Ubuntu 20.04 LTS:
mutex:
1: 3.9134
2: 5.8706
3: 33.1753
4: 48.63
5: 71.6777
6: 109.556
7: 151.271
8: 208.871
9: 288.031
10: 353.536
11: 446.28
12: 563.841
13: 701.134
14: 833.08
15: 983.734
16: 1138.48
17: 1297.53
18: 1481.15
19: 1664.51
20: 1857.88
21: 2053.83
22: 2285.07
23: 2546.67
24: 2782.53
25: 3065.25
26: 3349.4
27: 3652.06
28: 3971.25
29: 4339.13
30: 4727.31
31: 5129.95
32: 5499.05
futex:
1: 3.3654
2: 5.02155
3: 6.69107
4: 15.2522
5: 16.8824
6: 185.235
7: 162.176
8: 360.421
9: 2272.83
10: 4805.11
11: 8987.24
12: 14225.5
13: 20076.4
14: 27199.4
15: 36711.8
16: 49480.8
17: 57641.8
18: 75633.2
19: 97196.6
20: 122223
21: 147617
22: 180377
23: 216084
24: 256032
25: 299681
26: 345518
27: 398785
28: 445499
29: 506351
30: 572440
31: 643455
32: 721461
With all threads locking and unlocking, the conventional mutex is 131
times faster.