总结:
不论arm还是x86,但单核up情况下,关中断就能保证不被打断。
smp情况下,arm是以原子指令保证最终的执行正确性,x86是通过指令对操作内存锁定保证不会被打断。
代码来自linux 5.8.18,有点乱。
arm v6之前的版本不支持smp:
/*
* ARMv6 UP and SMP safe atomic ops. We use load exclusive and
* store exclusive to ensure that these are atomic. We may loop
* to ensure that the update happens. // 最后这句话说明了这里原子操作的本质是保持着一动作的完整性,而不会真正的“不可打断、中断”
*/
#define ATOMIC_OP(op, c_op, asm_op) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
unsigned long tmp; \
int result; \
\
prefetchw(&v->counter); \
__asm__ __volatile__("@ atomic_" #op "\n" \
"1: ldrex %0, [%3]\n" \
" " #asm_op " %0, %0, %4\n" \
" strex %1, %0, [%3]\n" \
" teq %1, #0\n" \
" bne 1b" \
: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) \
: "r" (&v->counter), "Ir" (i) \
: "cc"); \
} \
x86的原子操作是通过LOCK指令,使得其它指令无法获得
SMP,LOCK指令
static inline void atomic_add(int i, atomic_t *v)
{
atomic_add_return(i, v);
}
#ifndef atomic_add_return
ATOMIC_OP_RETURN(add, +)
#endif
1. 默认的动作(这里是说如果没有arch定义的情况下)
单核:这里就比较简单了,关中断就保证不会被打断了
#define ATOMIC_OP_RETURN(op, c_op) \
static inline int atomic_##op##_return(int i, atomic_t *v) \
{ \
unsigned long flags; \
int ret; \
\
raw_local_irq_save(flags); \
ret = (v->counter = v->counter c_op i); \
raw_local_irq_restore(flags); \
\
return ret; \
}
SMP:
#define ATOMIC_OP_RETURN(op, c_op) \
static inline int atomic_##op##_return(int i, atomic_t *v) \
{ \
int c, old; \
\
c = v->counter; \
while ((old = cmpxchg(&v->counter, c, c c_op i)) != c) \
c = old; \
\
return c c_op i; \
}
#define cmpxchg(ptr, o, n) cmpxchg_local((ptr), (o), (n))
注意,这里的call tree, 都会优先调用arch里定义的架构实现(ifndef)
atomic_add()
\_atomic_add_return()
\_cmpxchg()
2. arm(armv7)的atomic实现
仍然是ldrex、strex实现:
#define ATOMIC_OP_RETURN(op, c_op, asm_op) \
static inline int atomic_##op##_return_relaxed(int i, atomic_t *v) \
{ \
unsigned long tmp; \
int result; \
\
prefetchw(&v->counter); \
\
__asm__ __volatile__("@ atomic_" #op "_return\n" \
"1: ldrex %0, [%3]\n" \
" " #asm_op " %0, %0, %4\n" \
" strex %1, %0, [%3]\n" \
" teq %1, #0\n" \
" bne 1b" \
: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) \
: "r" (&v->counter), "Ir" (i) \
: "cc"); \
\
return result; \
}
这里引用网上来的一个表,说明发生smp并发访问同一个内存的情况下的状态变化:
LDREX <Rt>, [<Rn>]
<Rn>是base register,保存memory的address,LDREX指令从base register中获取memory address,并且将memory的内容加载到<Rt>(destination register)中。这些操作和ldr的操作是一样的,那么如何体现exclusive呢?其实,在执行这条指令的时候,还放出两条“狗”来负责观察特定地址的访问(就是保存在[<Rn>]中的地址了),这两条狗一条叫做local monitor,一条叫做global monitor。
STREX <Rd>, <Rt>, [<Rn>]
和LDREX指令类似,<Rn>是base register,保存memory的address,STREX指令从base register中获取memory address,并且将<Rt> (source register)中的内容加载到该memory中。这里的<Rd>保存了memeory 更新成功或者失败的结果,0表示memory更新成功,1表示失败。STREX指令是否能成功执行是和local monitor和global monitor的状态相关的。对于Non-shareable memory(该memory不是多个CPU之间共享的,只会被一个CPU访问),只需要放出该CPU的local monitor这条狗就OK了,
3. arm64(ARMv8)的atomic实现
这里使用的是ldxr、stxr
/*
* AArch64 UP and SMP safe atomic ops. We use load exclusive and
* store exclusive to ensure that these are atomic. We may loop
* to ensure that the update happens.
*/
#define ATOMIC_OP(op, asm_op, constraint) \
static inline void \
__ll_sc_atomic_##op(int i, atomic_t *v) \
{ \
unsigned long tmp; \
int result; \
\
asm volatile("// atomic_" #op "\n" \
__LL_SC_FALLBACK( \
" prfm pstl1strm, %2\n" \
"1: ldxr %w0, %2\n" \
" " #asm_op " %w0, %w0, %w3\n" \
" stxr %w1, %w0, %2\n" \
" cbnz %w1, 1b\n") \
: "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
: __stringify(constraint) "r" (i)); \
}
#define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
static inline int \
__ll_sc_atomic_##op##_return##name(int i, atomic_t *v) \
{ \
unsigned long tmp; \
int result; \
\
asm volatile("// atomic_" #op "_return" #name "\n" \
__LL_SC_FALLBACK( \
" prfm pstl1strm, %2\n" \
"1: ld" #acq "xr %w0, %2\n" \
" " #asm_op " %w0, %w0, %w3\n" \
" st" #rel "xr %w1, %w0, %2\n" \
" cbnz %w1, 1b\n" \
" " #mb ) \
: "=&r" (result), "=&r" (tmp), "+Q" (v->counter) \
: __stringify(constraint) "r" (i) \
: cl); \
\
return result; \
}
注:在ARMv8.1架构中引入atomic instruction, 例如LDADD (Atomic add),CAS(Compare and Swap)
LDXR/ STXR和一般的LDR/STR有什么区别:
-- 这个区别就在于LDXR除了向memory发起load请求外,还会记录该memory所在地址的状态
-- 一般ARM处理器在同一个cache line大小,也就是64 byte的地址范围内共用一个状态,就是Open和Exclusive
4)x86
x86提供了cmpxchg()实现:这里和预想的一致,以LOCK/LOCK_PREFIX解决锁住独占
#define __cmpxchg(ptr, old, new, size) \
__raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
#define cmpxchg(ptr, old, new) \
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
* store NEW in MEM. Return the initial value in MEM. Success is
* indicated by comparing RETURN with OLD.
*/
#define __raw_cmpxchg(ptr, old, new, size, lock) \
({ \
__typeof__(*(ptr)) __ret; \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
switch (size) { \
case __X86_CASE_B: \
{ \
volatile u8 *__ptr = (volatile u8 *)(ptr); \
asm volatile(lock "cmpxchgb %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "q" (__new), "0" (__old) \
: "memory"); \
break; \
} \
case __X86_CASE_W: \
{ \
volatile u16 *__ptr = (volatile u16 *)(ptr); \
asm volatile(lock "cmpxchgw %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
break; \
} \
case __X86_CASE_L: \
{ \
volatile u32 *__ptr = (volatile u32 *)(ptr); \
asm volatile(lock "cmpxchgl %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
break; \
} \
case __X86_CASE_Q: \
{ \
volatile u64 *__ptr = (volatile u64 *)(ptr); \
asm volatile(lock "cmpxchgq %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
break; \
} \
default: \
__cmpxchg_wrong_size(); \
} \
__ret; \
})
看一下LOCK_PREFIX:
#ifdef CONFIG_SMP
#define LOCK_PREFIX_HERE \
".pushsection .smp_locks,\"a\"\n" \
".balign 4\n" \
".long 671f - .\n" /* offset */ \
".popsection\n" \
"671:"
#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
#else /* ! CONFIG_SMP */
#define LOCK_PREFIX_HERE ""
#define LOCK_PREFIX ""
#endif
这段代码汇编后,在 .text 段生成一条 lock 指令前缀 0xf0,在 .smp_locks 段生成四个字节的 lock 前缀的地址链接的时候,所有的 .smp_locks 段合并起来,形成一个所有 lock 指令地址的数组。
在所有的 X86 CPU 上都具有锁定一个特定内存地址的能力,当这个特定内存地址被锁定后,它就可以阻止其他的系统总线读取或修改这个内存地址。这种能力是通过 LOCK 指令前缀再加上下面的汇编指令来实现的。当使用 LOCK 指令前缀时,它会使 CPU 宣告一个 LOCK# 信号,这样就能确保在多处理器系统或多线程竞争的环境下互斥地使用这个内存地址。当指令执行完毕,这个锁定动作也就会消失