| Message ID | 1395330365-9901-11-git-send-email-ian.campbell@citrix.com |
|---|---|
| State | Superseded |
On 03/20/2014 03:45 PM, Ian Campbell wrote:
> Xen, like Linux, expects full barrier semantics for bitops, atomics and
> cmpxchgs. This issue was discovered on Linux and we get our implementation of
> these from Linux so quoting Will Deacon in Linux commit 8e86f0b409a4 for the
> gory details:
> Linux requires a number of atomic operations to provide full barrier
> semantics, that is no memory accesses after the operation can be
> observed before any accesses up to and including the operation in
> program order.
>
> On arm64, these operations have been incorrectly implemented as follows:
>
>   // A, B, C are independent memory locations
>
>   <Access [A]>
>
>   // atomic_op (B)
>   1:  ldaxr   x0, [B]          // Exclusive load with acquire
>       <op(B)>
>       stlxr   w1, x0, [B]      // Exclusive store with release
>       cbnz    w1, 1b
>
>   <Access [C]>
>
> The assumption here being that two half barriers are equivalent to a
> full barrier, so the only permitted ordering would be A -> B -> C
> (where B is the atomic operation involving both a load and a store).
>
> Unfortunately, this is not the case by the letter of the architecture
> and, in fact, the accesses to A and C are permitted to pass their
> nearest half barrier resulting in orderings such as Bl -> A -> C -> Bs
> or Bl -> C -> A -> Bs (where Bl is the load-acquire on B and Bs is the
> store-release on B). This is a clear violation of the full barrier
> requirement.
>
> The simple way to fix this is to implement the same algorithm as ARMv7
> using explicit barriers:
>
>   <Access [A]>
>
>   // atomic_op (B)
>       dmb     ish              // Full barrier
>   1:  ldxr    x0, [B]          // Exclusive load
>       <op(B)>
>       stxr    w1, x0, [B]      // Exclusive store
>       cbnz    w1, 1b
>       dmb     ish              // Full barrier
>
>   <Access [C]>
>
> but this has the undesirable effect of introducing *two* full barrier
> instructions. A better approach is actually the following, non-intuitive
> sequence:
>
>   <Access [A]>
>
>   // atomic_op (B)
>   1:  ldxr    x0, [B]          // Exclusive load
>       <op(B)>
>       stlxr   w1, x0, [B]      // Exclusive store with release
>       cbnz    w1, 1b
>       dmb     ish              // Full barrier
>
>   <Access [C]>
>
> The simple observations here are:
>
>   - The dmb ensures that no subsequent accesses (e.g. the access to C)
>     can enter or pass the atomic sequence.
>
>   - The dmb also ensures that no prior accesses (e.g. the access to A)
>     can pass the atomic sequence.
>
>   - Therefore, no prior access can pass a subsequent access, or
>     vice-versa (i.e. A is strictly ordered before C).
>
>   - The stlxr ensures that no prior access can pass the store component
>     of the atomic operation.
>
> The only tricky part remaining is the ordering between the ldxr and the
> access to A, since the absence of the first dmb means that we're now
> permitting re-ordering between the ldxr and any prior accesses.
>
> From an (arbitrary) observer's point of view, there are two scenarios:
>
>   1. We have observed the ldxr. This means that if we perform a store to
>      [B], the ldxr will still return older data. If we can observe the
>      ldxr, then we can potentially observe the permitted re-ordering
>      with the access to A, which is clearly an issue when compared to
>      the dmb variant of the code. Thankfully, the exclusive monitor will
>      save us here since it will be cleared as a result of the store and
>      the ldxr will retry. Notice that any use of a later memory
>      observation to imply observation of the ldxr will also imply
>      observation of the access to A, since the stlxr/dmb ensure strict
>      ordering.
>
>   2. We have not observed the ldxr. This means we can perform a store
>      and influence the later ldxr. However, that doesn't actually tell
>      us anything about the access to [A], so we've not lost anything
>      here either when compared to the dmb variant.
>
> This patch implements this solution for our barriered atomic operations,
> ensuring that we satisfy the full barrier requirements where they are
> needed.
>
> Cc: <stable@vger.kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
>
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

Acked-by: Julien Grall <julien.grall@linaro.org>
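For reference, the "ldxr ... stlxr ... dmb ish" sequence described above can be written as a self-contained GCC extended-asm helper for aarch64. This is an illustrative sketch only, not part of the patch: the name `example_add_return` and the plain `int *counter` parameter are hypothetical, it does not use Xen's `atomic_t`, and it folds the trailing `dmb ish` into the asm block, whereas the patch below keeps the equivalent barrier outside the asm as `smp_mb()`.

```c
/*
 * Illustration only: the barriered read-modify-write sequence discussed
 * above, expressed as a standalone aarch64 inline-asm function.  Not Xen
 * code; names and the int* interface are invented for the sketch.
 */
static inline int example_add_return(int i, int *counter)
{
    unsigned long tmp;
    int result;

    asm volatile("// example_add_return\n"
        "1:  ldxr    %w0, %2\n"       /* exclusive load, no acquire        */
        "    add     %w0, %w0, %w3\n" /* <op(B)>                           */
        "    stlxr   %w1, %w0, %2\n"  /* exclusive store with release      */
        "    cbnz    %w1, 1b\n"       /* retry if the exclusive was lost   */
        "    dmb     ish\n"           /* full barrier completes the order  */
        : "=&r" (result), "=&r" (tmp), "+Q" (*counter)
        : "Ir" (i)
        : "cc", "memory");

    return result;
}
```

The same pattern, with the barrier spelled as smp_mb(), is what the hunks below apply to atomic_add_return(), atomic_sub_return(), atomic_cmpxchg(), __xchg() and the bitops.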
diff --git a/xen/arch/arm/arm64/lib/bitops.S b/xen/arch/arm/arm64/lib/bitops.S
index 80cc903..e1ad239 100644
--- a/xen/arch/arm/arm64/lib/bitops.S
+++ b/xen/arch/arm/arm64/lib/bitops.S
@@ -46,11 +46,12 @@ ENTRY( \name )
     mov     x2, #1
     add     x1, x1, x0, lsr #3  // Get word offset
     lsl     x4, x2, x3          // Create mask
-1:  ldaxr   w2, [x1]
+1:  ldxr    w2, [x1]
     lsr     w0, w2, w3          // Save old value of bit
     \instr  w2, w2, w4          // toggle bit
     stlxr   w5, w2, [x1]
     cbnz    w5, 1b
+    dmb     ish
     and     w0, w0, #1
 3:  ret
 ENDPROC(\name )
diff --git a/xen/include/asm-arm/arm64/atomic.h b/xen/include/asm-arm/arm64/atomic.h
index 6b37945..3f37ed5 100644
--- a/xen/include/asm-arm/arm64/atomic.h
+++ b/xen/include/asm-arm/arm64/atomic.h
@@ -48,7 +48,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
     int result;
 
     asm volatile("// atomic_add_return\n"
-"1: ldaxr   %w0, %2\n"
+"1: ldxr    %w0, %2\n"
 "   add     %w0, %w0, %w3\n"
 "   stlxr   %w1, %w0, %2\n"
 "   cbnz    %w1, 1b"
@@ -56,6 +56,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
     : "Ir" (i)
     : "cc", "memory");
 
+    smp_mb();
     return result;
 }
 
@@ -80,7 +81,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
     int result;
 
     asm volatile("// atomic_sub_return\n"
-"1: ldaxr   %w0, %2\n"
+"1: ldxr    %w0, %2\n"
 "   sub     %w0, %w0, %w3\n"
 "   stlxr   %w1, %w0, %2\n"
 "   cbnz    %w1, 1b"
@@ -88,6 +89,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
     : "Ir" (i)
     : "cc", "memory");
 
+    smp_mb();
     return result;
 }
 
@@ -96,17 +98,20 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
     unsigned long tmp;
     int oldval;
 
+    smp_mb();
+
     asm volatile("// atomic_cmpxchg\n"
-"1: ldaxr   %w1, %2\n"
+"1: ldxr    %w1, %2\n"
 "   cmp     %w1, %w3\n"
 "   b.ne    2f\n"
-"   stlxr   %w0, %w4, %2\n"
+"   stxr    %w0, %w4, %2\n"
 "   cbnz    %w0, 1b\n"
 "2:"
     : "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter)
     : "Ir" (old), "r" (new)
     : "cc", "memory");
 
+    smp_mb();
     return oldval;
 }
 
diff --git a/xen/include/asm-arm/arm64/system.h b/xen/include/asm-arm/arm64/system.h
index 570af5c..0db96e0 100644
--- a/xen/include/asm-arm/arm64/system.h
+++ b/xen/include/asm-arm/arm64/system.h
@@ -8,49 +8,50 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 {
     unsigned long ret, tmp;
 
-    switch (size) {
-    case 1:
-        asm volatile("// __xchg1\n"
-        "1: ldaxrb  %w0, %2\n"
-        "   stlxrb  %w1, %w3, %2\n"
-        "   cbnz    %w1, 1b\n"
-            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr)
+    switch (size) {
+    case 1:
+        asm volatile("// __xchg1\n"
+        "1: ldxrb   %w0, %2\n"
+        "   stlxrb  %w1, %w3, %2\n"
+        "   cbnz    %w1, 1b\n"
+            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr)
             : "r" (x)
             : "cc", "memory");
-        break;
-    case 2:
-        asm volatile("// __xchg2\n"
-        "1: ldaxrh  %w0, %2\n"
-        "   stlxrh  %w1, %w3, %2\n"
-        "   cbnz    %w1, 1b\n"
-            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u16 *)ptr)
+        break;
+    case 2:
+        asm volatile("// __xchg2\n"
+        "1: ldxrh   %w0, %2\n"
+        "   stlxrh  %w1, %w3, %2\n"
+        "   cbnz    %w1, 1b\n"
+            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u16 *)ptr)
             : "r" (x)
             : "cc", "memory");
-        break;
-    case 4:
-        asm volatile("// __xchg4\n"
-        "1: ldaxr   %w0, %2\n"
-        "   stlxr   %w1, %w3, %2\n"
-        "   cbnz    %w1, 1b\n"
-            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u32 *)ptr)
+        break;
+    case 4:
+        asm volatile("// __xchg4\n"
+        "1: ldxr    %w0, %2\n"
+        "   stlxr   %w1, %w3, %2\n"
+        "   cbnz    %w1, 1b\n"
+            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u32 *)ptr)
             : "r" (x)
             : "cc", "memory");
-        break;
-    case 8:
-        asm volatile("// __xchg8\n"
-        "1: ldaxr   %0, %2\n"
-        "   stlxr   %w1, %3, %2\n"
-        "   cbnz    %w1, 1b\n"
-            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u64 *)ptr)
+        break;
+    case 8:
+        asm volatile("// __xchg8\n"
+        "1: ldxr    %0, %2\n"
+        "   stlxr   %w1, %3, %2\n"
+        "   cbnz    %w1, 1b\n"
+            : "=&r" (ret), "=&r" (tmp), "+Q" (*(u64 *)ptr)
             : "r" (x)
             : "cc", "memory");
         break;
     default:
         __bad_xchg(ptr, size), ret = 0;
         break;
-    }
+    }
 
-    return ret;
+    smp_mb();
+    return ret;
 }
 
 #define xchg(ptr,x) \