Futexes ain't fast

Discussion:

Futexes ain't fast

Add Reply

Bonita Montero

2024-08-28 12:09:10 UTC

I tested the operating-system-specific mutex (CRITICAL_SECTION or
pthread_mutex_t) against a futex and a std::mutex. I guessed that std::mutex
uses the operating-system-specific mutex internally, but the times varied
so much across Windows and Linux that I guess std::mutex uses at
least a differently parametrized operating-system mutex, or maybe even
entirely its own code.
These are the times (average nanoseconds per lock/unlock pair); the first
column is the thread count, so each row adds one more contending thread:

os-mutex C++-futex std::mutex
1 4.538 6.746 6.867
2 14.084 37.033 21.505
3 28.533 111.212 48.5523
4 51.3995 233.28 82.1983
5 85.293 353.142 122.056
6 124.706 482.997 169.534
7 178.731 601.233 250.503
8 285.089 826.833 326.098
9 405.877 1023.48 368.619
10 425.566 1200.88 459.109
11 301.405 1384.84 567.276
12 365.924 1467.74 718.556
13 398.08 1674.63 1152.45
14 471.867 1815.81 1274.33
15 524.513 1949.76 1599.02
16 606.325 2086.62 1773.66
17 655.687 2121.09 2034.13
18 724.213 2267.87 2102.01
19 815.462 2365.32 2398.93
20 868.996 2520.45 2447.82
21 945.699 2588.83 2636.89
22 1093.38 2742.53 2858.13
23 1132.9 2873.63 3080.97
24 1234.79 2963.54 3274.75
25 1336.52 2906.26 3483.22
26 1447.84 3028.91 3676.8
27 1538.75 3227.37 3829.54
28 1624.9 3393.49 4023.94
29 1719.17 3516.76 4184.77
30 1822.47 3644.64 4363.56
31 1937.37 3818.5 4582.23
32 2018.14 3948.14 4705.01

#if defined(_WIN32)
#include <Windows.h>
#elif defined(__unix__)
#include <pthread.h>
#endif
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <latch>
#include <mutex>
#include <optional>
#include <sstream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using namespace std;
using namespace chrono;

int main()
{
using test_t = pair<char const *, function<void ()>>;
vector<jthread> threads;
unsigned hc = jthread::hardware_concurrency();
threads.reserve( hc );
vector<test_t> tests;
#if defined(_WIN32)
CRITICAL_SECTION cs;
InitializeCriticalSection( &cs );
tests.emplace_back( "os-mutex", [&]
{
EnterCriticalSection( &cs );
LeaveCriticalSection( &cs );
} );
#elif defined(__unix__)
pthread_mutex_t pm;
pthread_mutex_init( &pm, nullptr );
tests.emplace_back( "os-mutex", [&]
{
pthread_mutex_lock( &pm );
pthread_mutex_unlock( &pm );
} );
#endif
atomic_bool futex( false );
tests.emplace_back( "C++-futex", [&]
{
while( futex.exchange( true, memory_order_acquire ) )
futex.wait( true, memory_order_relaxed );
futex.exchange( false, memory_order_release );
futex.notify_one();
} );
mutex mtx;
tests.emplace_back( "std::mutex", [&]
{
mtx.lock();
mtx.unlock();
} );
constexpr int64_t ROUNDS = 100'000;
ostringstream oss;
auto print = [&]( auto const &param )
{
oss.str( "" );
oss << param;
string str( oss.str() );
size_t len = str.length();
str = string( len <= 11 ? 11 - len : 0, ' ' ) + str;
cout << str;
};
print( "" );
for( test_t const &test : tests )
print( test.first );
cout << endl;
for( unsigned nThreads = 1; nThreads <= hc; ++nThreads )
{
print( nThreads );
for( test_t const &test : tests )
{
atomic_int64_t tSum( 0 );
latch lat( nThreads + 1 );
for( unsigned t = 0; t < nThreads; ++t )
threads.emplace_back( [&]
{
lat.arrive_and_wait();
auto start = high_resolution_clock::now();
for( int64_t r = ROUNDS; r; --r )
test.second();
tSum.fetch_add( duration_cast<nanoseconds>(
high_resolution_clock::now() - start ).count(), memory_order_relaxed );
} );
lat.arrive_and_wait();
threads.resize( 0 );
print( tSum.load( memory_order_relaxed ) / ((double)nThreads * ROUNDS) );
}
cout << endl;
}
}

Bonita Montero

2024-08-28 12:14:11 UTC