Skip to content

Instantly share code, notes, and snippets.

@Jura-Z
Created August 27, 2025 22:18
Show Gist options
  • Select an option

  • Save Jura-Z/f1414bd5d01ac72c763667ec116edcf5 to your computer and use it in GitHub Desktop.

Select an option

Save Jura-Z/f1414bd5d01ac72c763667ec116edcf5 to your computer and use it in GitHub Desktop.
Demo of direct TLS slot (fast) vs pthread TLS (slow)
// License: Public Domain (www.unlicense.org)
// Build in release mode and run several times with TSL_SLOT set to 0, then several times with TSL_SLOT set to 1, and compare the reported elapsed times.
#define TSL_SLOT 0
// Release build, TSL_SLOT == 0, macOS 15.6.1, M2 Max: around 60+ msec.
// Release build, TSL_SLOT == 1, macOS 15.6.1, M2 Max: around 40 msec.
#include <iostream>
#include <pthread.h>
#include <vector>
#include <chrono>
#include <cstdlib>
// Returns the pointer stored at index `slot` of the calling thread's slot table.
// The table base is read from the tpidrro_el0 system register with its low
// three bits cleared, then treated as an array of void*.
// NOTE(review): arm64-only; presumably the low bits of the register carry
// unrelated data on the target OS, hence the mask - confirm.
// No bounds checking is performed on `slot`.
static inline void* tls_slot_get(size_t slot) noexcept
{
    void** slot_table;
    __asm__ volatile (
        "mrs %0, tpidrro_el0\n"
        "bic %0, %0, #7"
        : "=r" (slot_table));
    return *(slot_table + slot);
}
// Stores `value` at index `slot` of the calling thread's slot table.
// The table base is read from the tpidrro_el0 system register with its low
// three bits cleared, then treated as an array of void*.
// NOTE(review): arm64-only; the caller must pick a slot that nothing else in
// the process uses - nothing here checks for collisions or bounds.
static inline void tls_slot_set(size_t slot, void* value) noexcept
{
    void** slot_table;
    __asm__ volatile (
        "mrs %0, tpidrro_el0\n"
        "bic %0, %0, #7"
        : "=r" (slot_table));
    *(slot_table + slot) = value;
}
// Benchmark dimensions: N worker threads, each performing M increments.
constexpr int N = 142;
constexpr int M = 1000000;

// Key used by the pthread thread-specific-data path.
pthread_key_t tls_key;

// Statically initialized mutex.
// NOTE(review): nothing in the visible code locks or unlocks it.
pthread_mutex_t sum_mutex = PTHREAD_MUTEX_INITIALIZER;

// One zero-initialized result cell per worker thread, indexed by thread index.
std::vector<long> tls_results(N, 0L);
// Cleanup callback for thread-specific data: frees the heap-allocated long
// that `ptr` points to. Passed to pthread_key_create when the key is created.
void tls_destructor(void* ptr) {
    long* counter = static_cast<long*>(ptr);
    delete counter;
}
// Worker thread entry point.
//
// `arg` must point to a heap-allocated int holding this thread's index; the
// function takes ownership of it and frees it. The thread allocates a long
// counter, publishes it through the TLS mechanism selected by TSL_SLOT, then
// performs M rounds of "fetch pointer from TLS, increment, store pointer back"
// (the get/set pair per iteration is the operation being measured). The final
// count is written to tls_results[thread_index].
//
// Always returns nullptr.
void* thread_func(void* arg) {
    // Copy the index out and release the allocation made by the spawner.
    int* index_ptr = static_cast<int*>(arg);
    const int thread_index = *index_ptr;
    delete index_ptr;

    {
        // Allocate and publish the per-thread counter.
        long* local_value = new long(0);
#if TSL_SLOT
        tls_slot_set(112, local_value);
#else
        if (pthread_setspecific(tls_key, local_value) != 0) {
            // Nothing was stored, so the key destructor will never see this
            // pointer; free it here and bail out instead of dereferencing the
            // null that pthread_getspecific would hand back in the loop.
            delete local_value;
            return nullptr;
        }
#endif
    }

    for (int i = 0; i < M; ++i) {
#if TSL_SLOT
        long* local_value = static_cast<long*>(tls_slot_get(112));
#else
        long* local_value = static_cast<long*>(pthread_getspecific(tls_key));
#endif
        ++(*local_value);
#if TSL_SLOT
        tls_slot_set(112, local_value);
#else
        pthread_setspecific(tls_key, local_value);
#endif
    }

    {
#if TSL_SLOT
        long* local_value_read = static_cast<long*>(tls_slot_get(112));
#else
        long* local_value_read = static_cast<long*>(pthread_getspecific(tls_key));
#endif
        // Each thread writes only its own element, so no locking is needed.
        tls_results[thread_index] = *local_value_read;
#if TSL_SLOT
        // The raw slot has no destructor attached (tls_destructor is only
        // registered for tls_key), so release the counter explicitly and
        // clear the slot so no dangling pointer is left behind.
        tls_slot_set(112, nullptr);
        delete local_value_read;
#endif
    }

    // Returning from the start routine ends the thread just like
    // pthread_exit(nullptr), and lets C++ scopes unwind normally.
    return nullptr;
}
// Benchmark driver: creates the TLS key, spawns N worker threads running
// thread_func, joins them, then prints the sum of all per-thread counters and
// the wall-clock time spent between the first spawn and the last join.
//
// Returns 0 on success, 1 if the key or any thread could not be created.
int main() {
    // Create TLS key with destructor
    if (pthread_key_create(&tls_key, tls_destructor) != 0) {
        std::cerr << "Failed to create TLS key\n";
        return 1;
    }

    std::vector<pthread_t> threads(N);

    // steady_clock is monotonic, which is what interval timing needs;
    // high_resolution_clock may alias the adjustable system clock.
    const auto start_time = std::chrono::steady_clock::now();

    // Spawn threads. `started` counts the threads that were actually created
    // so that exactly those are joined below, even on failure.
    int started = 0;
    for (; started < N; ++started) {
        int* index = new int(started); // Ownership passes to the thread on success
        if (pthread_create(&threads[started], nullptr, thread_func, index) != 0) {
            std::cerr << "Failed to create thread " << started << "\n";
            delete index; // The thread never ran, so nobody else will free it
            break;
        }
    }

    // Join every thread that was started. On a spawn failure this keeps the
    // running workers from touching globals while the process tears down.
    for (int i = 0; i < started; ++i) {
        pthread_join(threads[i], nullptr);
    }

    if (started != N) {
        pthread_key_delete(tls_key);
        return 1;
    }

    const auto end_time = std::chrono::steady_clock::now();
    const auto elapsed =
        std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);

    // Sum all TLS values
    long total = 0;
    for (const auto& val : tls_results) {
        total += val;
    }

    std::cout << "Total TLS value from all threads: " << total << "\n";
    std::cout << "Elapsed time: " << elapsed.count() << " msec\n";

    // Clean up TLS key
    pthread_key_delete(tls_key);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment