5

Linux Thread Local Storage(TLS)

 2 years ago
source link: https://easeapi.com/blog/blog/159-pthread-tls.html
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

在C/C++程序中,全局变量默认是所有线程共享的,开发者需要处理多线程竞争问题。有些情况下我们需要保证一个线程独享一份数据,其它线程无法访问。典型的就是errno全局变量,它总是会保存当前线程最后一个调用的错误码,不会存在线程冲突。这个时候需要使用线程局部存储(TLS)来解决。

pthread的内存结构

在说明TLS之前,先了解下pthread的内存结构。glibc/nptl/descr.h中定义了线程重要的数据结构struct pthread,它描述了用户态线程的完整信息,每创建一个pthread线程,都在内存中有一个对应的pthread结构体。pthread结构非常复杂,与TLS有关的是specific_1stblock数组和specific二级数组,后面会做说明。

/* Number of entries in one second-level block of thread-specific data.  */
#define PTHREAD_KEY_2NDLEVEL_SIZE       32
/* Number of first-level slots: PTHREAD_KEYS_MAX divided by
   PTHREAD_KEY_2NDLEVEL_SIZE, rounded up.  */
#define PTHREAD_KEY_1STLEVEL_SIZE \
  ((PTHREAD_KEYS_MAX + PTHREAD_KEY_2NDLEVEL_SIZE - 1) \
   / PTHREAD_KEY_2NDLEVEL_SIZE)

struct pthread
{
    union
  {
#if !TLS_DTV_AT_TP
    /* This overlaps the TCB as used for TLS without threads (see tls.h).  */
    tcbhead_t header;
#else
    struct
    {
      int multiple_threads;
      int gscope_flag;
    } header;
#endif

    void *__padding[24];
  };

  list_t list;
  pid_t tid;

  ...
  struct pthread_key_data
  {
    /* Sequence number.  We use uintptr_t to not require padding on
       32- and 64-bit machines.  On 64-bit machines it helps to avoid
       wrapping, too.  */
    uintptr_t seq;

    /* Data pointer.  */
    void *data;
  } specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];

  /* Two-level array for the thread-specific data.  */
  struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];

  /* Flag which is set when specific data is set.  */
  bool specific_used;
  ...
}

__thread

在GCC/Clang编译环境中,可以使用__thread关键字来声明TLS变量,__thread关键字不是C标准,不同的编译器名字不同。

在Xcode 13.2上测试仅i386架构不支持__thread

/* __thread was found to be unsupported on i386 in the toolchain tested by
   the article, so that build falls back to an ordinary static variable,
   which is shared by all threads instead of being per-thread.  */
#if defined(__i386__)
static char *g_thread_data = NULL;
#else
static __thread char *g_thread_data = NULL;
#endif

使用__thread关键字声明的变量,存储在pthread结构体与栈空间之间的内存区域。也就是说,从内存布局上看,高地址到低地址的内存分布是:pthread结构、__thread变量区域、栈区(栈底和__thread变量区顶相接)。

下面以Xcode 13.2/arm64运行的程序来说明这点。

/* Thread-local sample values: each thread gets its own copy of both. */
__thread uint64_t g_tls_int = 6;
__thread char *g_tls_string = "easeapi.com";

/*
 * Reads the calling thread's copies of g_tls_int and g_tls_string and
 * prints them to stdout (no separator, no trailing newline).
 */
void tls_test(void)
{
    uint64_t value = g_tls_int;
    /* uint64_t is not unsigned long long on every ABI, so cast explicitly
       to the type that %llu expects. */
    printf("%llu", (unsigned long long)value);
    char *string = g_tls_string;
    printf("%s", string);
}

在tls_test入口处断点,查看对应的汇编程序,如下:

    0x104235240 <+0>:   sub    sp, sp, #0x40             ; =0x40 
    0x104235244 <+4>:   stp    x29, x30, [sp, #0x30]
    0x104235248 <+8>:   add    x29, sp, #0x30            ; =0x30 
    0x10423524c <+12>:  adrp   x0, 529
    0x104235250 <+16>:  add    x0, x0, #0xd70            ; =0xd70 
    0x104235254 <+20>:  ldr    x8, [x0]
    0x104235258 <+24>:  blr    x8
    0x10423525c <+28>:  str    x0, [sp, #0x10]
    0x104235260 <+32>:  adrp   x0, 529
    0x104235264 <+36>:  add    x0, x0, #0xd88            ; =0xd88 
    0x104235268 <+40>:  ldr    x8, [x0]
    0x10423526c <+44>:  blr    x8
    0x104235270 <+48>:  mov    x8, x0
    0x104235274 <+52>:  ldr    x0, [sp, #0x10]
    0x104235278 <+56>:  str    x8, [sp, #0x18]
    0x10423527c <+60>:  ldr    x8, [x0]
    0x104235280 <+64>:  stur   x8, [x29, #-0x8]
    0x104235284 <+68>:  ldur   x8, [x29, #-0x8]
    0x104235288 <+72>:  adrp   x0, 471
    0x10423528c <+76>:  add    x0, x0, #0x7fc            ; =0x7fc 
    0x104235290 <+80>:  mov    x9, sp
    0x104235294 <+84>:  str    x8, [x9]
    0x104235298 <+88>:  bl     0x104403be0               ; symbol stub for: printf
    0x10423529c <+92>:  ldr    x0, [sp, #0x18]
    0x1042352a0 <+96>:  ldr    x8, [x0]
    0x1042352a4 <+100>: stur   x8, [x29, #-0x10]
    0x1042352a8 <+104>: ldur   x8, [x29, #-0x10]
    0x1042352ac <+108>: adrp   x0, 471
    0x1042352b0 <+112>: add    x0, x0, #0x801            ; =0x801 
    0x1042352b4 <+116>: mov    x9, sp
    0x1042352b8 <+120>: str    x8, [x9]
    0x1042352bc <+124>: bl     0x104403be0               ; symbol stub for: printf
    0x1042352c0 <+128>: ldp    x29, x30, [sp, #0x30]
    0x1042352c4 <+132>: add    sp, sp, #0x40             ; =0x40 
    0x1042352c8 <+136>: ret 

0x104235274处,sp寄存器偏移0x10字节读取到x0。在0x104235278处读取x0寄存器的值(g_tls_int):

(lldb) register read x0
      x0 = 0x0000000281cf41a0
(lldb) memory read/1xg 0x0000000281cf41a0
0x281cf41a0: 0x0000000000000006

0x10423529c处,sp寄存器偏移0x18字节读取到x0。在0x1042352a0处读取x0寄存器的值(g_tls_string):

(lldb) register read x0
      x0 = 0x0000000281cf41a8
(lldb) memory read/1xg 0x0000000281cf41a8
0x281cf41a8: 0x000000010440c7f0
(lldb) memory read 0x000000010440c7f0
0x10440c7f0: 65 61 73 65 61 70 69 2e 63 6f 6d 00 25 6c 6c 75  easeapi.com.%llu
0x10440c800: 00 25 73 00 4d 79 41 70 70 6c 69 63 61 74 69 6f  .%s.MyApplicatio

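从上面的测试结果来看,读取__thread变量都是通过fp指针偏移(向高地址偏移)来完成的。(注:上面汇编中[sp, #0x10]与[sp, #0x18]处保存的地址,分别来自0x104235258和0x10423526c处blr x8调用的返回值x0,即变量地址是先经一次函数调用取得、再暂存到栈上的。)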

__thread修饰的变量必须是POD(Plain Old Data)类型,不支持class等高级语言特性。__thread变量在线程生命周期一直存在,在线程销毁时释放。需要注意的是,由于__thread并不能指定销毁方法,当我们定义一个__thread修饰的指针变量,并在线程运行中malloc内存后,线程结束仅会将__thread变量指针置NULL,需要开发者手动free内存。

__thread char *g_tls_string = NULL;

/*
 * Lazily allocates the calling thread's zero-filled 1024-byte buffer.
 * Does nothing if this thread already has a buffer.
 */
void tls_test(void)
{
    if (g_tls_string != NULL) {
        return;
    }

    /* The allocated memory must be freed manually when the thread is
       destroyed. */
    g_tls_string = calloc(1024, 1);
}

如果想要在线程结束时,自动完成malloc内存的释放,需要使用pthread specific相关的API。

pthread specific API

pthread同时提供了以下API实现TLS的功能:

//nptl/bits/pthreadtypes.h
/* Keys for thread-specific data */
typedef unsigned int pthread_key_t;

/* Creates a key: the first argument receives the new pthread_key_t, the
   second is an optional destructor for the data associated with it.  */
int pthread_key_create(pthread_key_t *, void (* _Nullable)(void *));
/* Deletes a key previously obtained from pthread_key_create.  */
int pthread_key_delete(pthread_key_t);

/* Stores the calling thread's value for the given key.  */
int pthread_setspecific(pthread_key_t , const void * _Nullable);
/* Returns the calling thread's value for the given key.  */
void* _Nullable pthread_getspecific(pthread_key_t);

pthread_key_create的第一个参数是pthread_key_t指针,用于接收创建成功返回的pthread_key_t,第二个参数是数据析构函数指针,会在线程销毁时执行。pthread_key_create成功后获得pthread_key_t,之后可通过pthread_key_t进行线程私有数据的读写。示例代码如下:

//create key
pthread_key_t key = 0;
pthread_key_create(&key, NULL); 

//write
struct easeapi_struct data;
pthread_setspecific(key, &struct_data);

//read
struct easeapi_struct* = (struct easeapi_struct *)pthread_getspecific(key)

每一个进程都有一个全局数组__pthread_keys来管理pthread_key_t。

//nptl/internaltypes.h:
/* Thread-local data handling: element type of the global __pthread_keys
   table.  */
struct pthread_key_struct
{
  /* Sequence numbers.  Even numbers indicate vacant entries.  Note
     that zero is even.  We use uintptr_t to not require padding on
     32- and 64-bit machines.  On 64-bit machines it helps to avoid
     wrapping, too.  */
  uintptr_t seq;

  /* Destructor for the data; set by ___pthread_key_create.  */
  void (*destr) (void *);
};

//sysdeps/unix/sysv/linux/bits/local_lim.h
/* This is the value this implementation supports; it is also the number
   of entries in the __pthread_keys table.  */
#define PTHREAD_KEYS_MAX 1024

//nptl/pthread_keys.c
/* Table of the key information, indexed by pthread_key_t value.  */
struct pthread_key_struct __pthread_keys[PTHREAD_KEYS_MAX];

struct pthread_key_struct结构中定义了seq和传入的析构函数的指针。一个程序同时最多可以创建PTHREAD_KEYS_MAX个pthread_key_t。pthread_key_t是全局的,但不同的线程通过pthread_key_t访问读写接口时,实际上操作的是不同的内存。

当执行pthread_key_create时,会从__pthread_keys数组中找到一个没有使用的pthread_key_struct结构,并对其seq加1。返回的pthread_key_t实际上就是这个pthread_key_struct在__pthread_keys数组中的序号。如下代码:

//nptl/pthread_key_create.c:
/* Allocate a new thread-specific-data key.

   Scans __pthread_keys from index 0 for a slot whose sequence number
   passes KEY_UNUSED and KEY_USABLE, claims it by atomically replacing
   seq with seq + 1, records DESTR as the slot's destructor and stores
   the slot index in *KEY.

   Returns 0 on success, or EAGAIN when no slot could be claimed.  */
int
___pthread_key_create (pthread_key_t *key, void (*destr) (void *))
{
  /* Find a slot in __pthread_keys which is unused.  */
  for (size_t cnt = 0; cnt < PTHREAD_KEYS_MAX; ++cnt)
    {
      uintptr_t seq = __pthread_keys[cnt].seq;

      if (KEY_UNUSED (seq) && KEY_USABLE (seq)
   /* We found an unused slot.  Try to allocate it.  */
   /* NOTE(review): the negation implies the macro yields zero when the
      exchange took place -- confirm against its definition.  */
   && ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[cnt].seq,
           seq + 1, seq))
 {
   /* Remember the destructor.  */
   __pthread_keys[cnt].destr = destr;

   /* Return the key to the caller.  */
   *key = cnt;

   /* The call succeeded.  */
   return 0;
 }
    }

  /* Every slot was either in use, unusable, or claimed concurrently.  */
  return EAGAIN;
}

当执行pthread_key_delete时,会根据pthread_key_t的序号,从__pthread_keys找到对应的pthread_key_struct,并对其seq加1。如下代码:

//nptl/pthread_key_delete.c
/* Delete the thread-specific-data key KEY.

   If KEY is below PTHREAD_KEYS_MAX and its slot in __pthread_keys is
   currently in use, the slot's sequence number is atomically replaced
   with seq + 1.

   Returns 0 when the slot was updated, EINVAL otherwise.  */
int
___pthread_key_delete (pthread_key_t key)
{
  int result = EINVAL;

  if (__glibc_likely (key < PTHREAD_KEYS_MAX))
    {
      /* NOTE(review): seq is unsigned int here while the field is
         uintptr_t; a value above UINT_MAX would be truncated and the
         exchange below could not match -- confirm this is intended.  */
      unsigned int seq = __pthread_keys[key].seq;

      if (__builtin_expect (! KEY_UNUSED (seq), 1)
   && ! atomic_compare_and_exchange_bool_acq (&__pthread_keys[key].seq,
           seq + 1, seq))
 /* We deleted a valid key.  */
 result = 0;
    }

  return result;
}

注意这里使用了atomic_compare_and_exchange_bool_acq来保证原子操作。

seq默认为0,无论是pthread_key_create还是pthread_key_delete都是对seq加1。当seq的值是偶数(包括0)时,表示当前pthread_key_struct未被使用,为奇数时表示在使用。

通过pthread_key_create分配pthread_key_t是全局的,但键值关联却是各线程独立的。在struct pthread结构体中有下面的定义:

 struct pthread_key_data
  {
    /* Sequence number.  We use uintptr_t to not require padding on
       32- and 64-bit machines.  On 64-bit machines it helps to avoid
       wrapping, too.  */
    uintptr_t seq;

    /* Data pointer.  */
    void *data;
  } specific_1stblock[PTHREAD_KEY_2NDLEVEL_SIZE];

  /* Two-level array for the thread-specific data.  */
  struct pthread_key_data *specific[PTHREAD_KEY_1STLEVEL_SIZE];

struct pthread_key_data结构定义了当前线程存储TLS数据的指针data,seq和struct pthread_key_struct的seq一样,标识了对应的key是否创建。

specific_1stblock并没有设置和PTHREAD_KEYS_MAX一样的大小,而是设置为PTHREAD_KEY_2NDLEVEL_SIZE(32)大小,这应该是从节省内存的角度设计的,大部分情况下我们并不会使用很多TLS变量。

执行pthread_setspecific时,当pthread_key_t的值小于PTHREAD_KEY_2NDLEVEL_SIZE时,直接使用specific_1stblock数组;当pthread_key_t的值大于等于PTHREAD_KEY_2NDLEVEL_SIZE时,再按需申请内存空间使用specific二级数组,值存储在specific[idx1st][idx2nd].data。

//nptl/pthread_setspecific.c
/* Associate VALUE with KEY for the calling thread.

   Keys below PTHREAD_KEY_2NDLEVEL_SIZE use the specific_1stblock array
   of the thread descriptor; larger keys go through the two-level
   `specific' table, whose second-level block is allocated with calloc
   the first time a non-NULL value is stored in it.

   Returns 0 on success, EINVAL if KEY is not below PTHREAD_KEYS_MAX or
   its slot in __pthread_keys is unused, and ENOMEM if the second-level
   block cannot be allocated.  */
int
___pthread_setspecific (pthread_key_t key, const void *value)
{
  struct pthread *self;
  unsigned int idx1st;
  unsigned int idx2nd;
  struct pthread_key_data *level2;
  /* NOTE(review): unsigned int, while both seq fields it is copied
     between are uintptr_t -- confirm truncation cannot matter.  */
  unsigned int seq;

  self = THREAD_SELF;

  /* Special case access to the first 2nd-level block.  This is the
     usual case.  */
  if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
    {
      /* Verify the key is sane.  */
      if (KEY_UNUSED ((seq = __pthread_keys[key].seq)))
 /* Not valid.  */
 return EINVAL;

      level2 = &self->specific_1stblock[key];

      /* Remember that we stored at least one set of data.  */
      if (value != NULL)
 THREAD_SETMEM (self, specific_used, true);
    }
  else
    {
      if (key >= PTHREAD_KEYS_MAX
   || KEY_UNUSED ((seq = __pthread_keys[key].seq)))
 /* Not valid.  */
 return EINVAL;

      /* Split the key into a first-level slot and an offset inside the
         second-level block.  */
      idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
      idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;

      /* This is the second level array.  Allocate it if necessary.  */
      level2 = THREAD_GETMEM_NC (self, specific, idx1st);
      if (level2 == NULL)
 {
   if (value == NULL)
     /* We don't have to do anything.  The value would in any case
        be NULL.  We can save the memory allocation.  */
     return 0;

   level2
     = (struct pthread_key_data *) calloc (PTHREAD_KEY_2NDLEVEL_SIZE,
        sizeof (*level2));
   if (level2 == NULL)
     return ENOMEM;

   THREAD_SETMEM_NC (self, specific, idx1st, level2);
 }

      /* Pointer to the right array element.  */
      level2 = &level2[idx2nd];

      /* Remember that we stored at least one set of data.  */
      THREAD_SETMEM (self, specific_used, true);
    }

  /* Store the data and the sequence number so that we can recognize
     stale data.  */
  level2->seq = seq;
  level2->data = (void *) value;

  return 0;
}

有了上面的分析,执行pthread_getspecific的逻辑就比较清晰了。

//nptl/pthread_getspecific.c

/* Return the calling thread's value for KEY, or NULL.

   NULL is returned when KEY is not below PTHREAD_KEYS_MAX, when the
   second-level block for KEY has not been allocated, when no value is
   stored, or when the stored sequence number differs from
   __pthread_keys[key].seq (the stale slot is then reset to NULL).  */
void *
___pthread_getspecific (pthread_key_t key)
{
  struct pthread_key_data *data;

  /* Special case access to the first 2nd-level block.  This is the
     usual case.  */
  if (__glibc_likely (key < PTHREAD_KEY_2NDLEVEL_SIZE))
    data = &THREAD_SELF->specific_1stblock[key];
  else
    {
      /* Verify the key is sane.  */
      if (key >= PTHREAD_KEYS_MAX)
 /* Not valid.  */
 return NULL;

      unsigned int idx1st = key / PTHREAD_KEY_2NDLEVEL_SIZE;
      unsigned int idx2nd = key % PTHREAD_KEY_2NDLEVEL_SIZE;

      /* If the sequence number doesn't match or the key cannot be defined
  for this thread since the second level array is not allocated
  return NULL, too.  */
      struct pthread_key_data *level2 = THREAD_GETMEM_NC (THREAD_SELF,
         specific, idx1st);
      if (level2 == NULL)
 /* Not allocated, therefore no data.  */
 return NULL;

      /* There is data.  */
      data = &level2[idx2nd];
    }

  void *result = data->data;
  if (result != NULL)
    {
      uintptr_t seq = data->seq;

      /* A mismatch means the stored value does not belong to the key's
         current sequence number; drop it.  */
      if (__glibc_unlikely (seq != __pthread_keys[key].seq))
 result = data->data = NULL;
    }

  return result;
}

按照glibc的实现,当执行pthread_key_create获取的pthread_key_t应该是比较小的值才能优先使用specific_1stblock数组。但笔者在macOS环境测试发现获取的pthread_key_t比较大,这里应该是macOS具体的实现有和glibc不一致的地方?

__thread和pthread specific API对比

  • 存储区域/寻址方式不同

pthread specific API定义的数据,是通过struct pthread结构体的specific_1stblock数组和specific二级数组寻址,而__thread变量则是通过fp寄存器偏移寻址。

  • 性能/效率不同

由于__thread是通过fp寄存器偏移寻址,性能比pthread specific API高。

  • 能存储的数据不同

__thread只能修饰POD类型变量,对于指针类型的数据,有申请内存时需要手动销毁;而pthread specific API支持传入销毁方法,支持所有数据类型。

  • 支持的数据个数不同

理论上只要栈不被占满,__thread可以无限定义(存疑?);而pthread specific API只能创建PTHREAD_KEYS_MAX个key,但可以通过结构体等的方式,使用一个key存储多个值。

参考:glibc源码

如何正确地获取线程ID?


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK