On Mon, 2005-06-27 at 00:33 -0400, David S. Miller wrote:
> [. CC:'ing Josh Grebe, he could reproduce these SMP hangs on
> his box quite reliably ]
>
I'm not squash, but I have an update. System=U60(2x450), 2 disks
installed.
Kernel=2.6.12_rc6
Patch installs without any issues; however, system uptime was only about
4 hours before it died under heavy disk activity (while doing a reverse
dependency check for package rebuilds). The failure looked like it always
looks: everything stops, and the system responds only to the reset switch.
As an aside, in a test on an SB100(2x900), kernel uptime was about 90
seconds. After that, the system looked like it was still running, but
things like pinging it, ssh'ing to it, and using the keyboard were all
dead. X kept going, though.
Regards,
Ferris
> Ok folks, give this patch a try. It applies cleanly to
> 2.6.12 and 2.6.12.1
>
> [SPARC64]: Avoid membar instructions in delay slots.
>
> In particular, avoid membar instructions in the delay
> slot of a jmpl instruction.
>
> UltraSPARC-I, II, IIi, and IIe have a bug, documented in
> the UltraSPARC-IIi User's Manual, Appendix K, Erratum 51
>
> The long and short of it is that if the IMU unit misses
> on a branch or jmpl, and there is a store buffer synchronizing
> membar in the delay slot, the chip can stop fetching instructions.
>
> If interrupts are enabled or some other trap is enabled, the
> chip will unwedge itself, but performance will suffer.
>
> We already had a workaround for this bug in a few spots, but
> it's better to have the entire tree sanitized for this rule.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
> --- a/arch/sparc64/kernel/entry.S
> +++ b/arch/sparc64/kernel/entry.S
> @@ -271,8 +271,9 @@ cplus_fptrap_insn_1:
> fmuld %f0, %f2, %f26
> faddd %f0, %f2, %f28
> fmuld %f0, %f2, %f30
> + membar #Sync
> b,pt %xcc, fpdis_exit
> - membar #Sync
> + nop
> 2: andcc %g5, FPRS_DU, %g0
> bne,pt %icc, 3f
> fzero %f32
> @@ -301,8 +302,9 @@ cplus_fptrap_insn_2:
> fmuld %f32, %f34, %f58
> faddd %f32, %f34, %f60
> fmuld %f32, %f34, %f62
> + membar #Sync
> ba,pt %xcc, fpdis_exit
> - membar #Sync
> + nop
> 3: mov SECONDARY_CONTEXT, %g3
> add %g6, TI_FPREGS, %g1
> ldxa [%g3] ASI_DMMU, %g5
> diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
> --- a/arch/sparc64/kernel/semaphore.c
> +++ b/arch/sparc64/kernel/semaphore.c
> @@ -32,8 +32,9 @@ static __inline__ int __sem_update_count
> " add %1, %4, %1\n"
> " cas [%3], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
> : "r" (&sem->count), "r" (incr), "m" (sem->count)
> : "cc");
> @@ -71,8 +72,9 @@ void up(struct semaphore *sem)
> " cmp %%g1, %%g7\n"
> " bne,pn %%icc, 1b\n"
> " addcc %%g7, 1, %%g0\n"
> +" membar #StoreLoad | #StoreStore\n"
> " ble,pn %%icc, 3f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> "2:\n"
> " .subsection 2\n"
> "3: mov %0, %%g1\n"
> @@ -128,8 +130,9 @@ void __sched down(struct semaphore *sem)
> " cmp %%g1, %%g7\n"
> " bne,pn %%icc, 1b\n"
> " cmp %%g7, 1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bl,pn %%icc, 3f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> "2:\n"
> " .subsection 2\n"
> "3: mov %0, %%g1\n"
> @@ -233,8 +236,9 @@ int __sched down_interruptible(struct se
> " cmp %%g1, %%g7\n"
> " bne,pn %%icc, 1b\n"
> " cmp %%g7, 1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bl,pn %%icc, 3f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> "2:\n"
> " .subsection 2\n"
> "3: mov %2, %%g1\n"
> diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
> --- a/arch/sparc64/kernel/trampoline.S
> +++ b/arch/sparc64/kernel/trampoline.S
> @@ -98,8 +98,9 @@ startup_continue:
>
> sethi %hi(prom_entry_lock), %g2
> 1: ldstub [%g2 + %lo(prom_entry_lock)], %g1
> + membar #StoreLoad | #StoreStore
> brnz,pn %g1, 1b
> - membar #StoreLoad | #StoreStore
> + nop
>
> sethi %hi(p1275buf), %g2
> or %g2, %lo(p1275buf), %g2
> diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
> --- a/arch/sparc64/lib/U1memcpy.S
> +++ b/arch/sparc64/lib/U1memcpy.S
> @@ -87,14 +87,17 @@
> #define LOOP_CHUNK3(src, dest, len, branch_dest) \
> MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
>
> +#define DO_SYNC membar #Sync;
> #define STORE_SYNC(dest, fsrc) \
> EX_ST(STORE_BLK(%fsrc, %dest)); \
> - add %dest, 0x40, %dest;
> + add %dest, 0x40, %dest; \
> + DO_SYNC
>
> #define STORE_JUMP(dest, fsrc, target) \
> EX_ST(STORE_BLK(%fsrc, %dest)); \
> add %dest, 0x40, %dest; \
> - ba,pt %xcc, target;
> + ba,pt %xcc, target; \
> + nop;
>
> #define FINISH_VISCHUNK(dest, f0, f1, left) \
> subcc %left, 8, %left;\
> @@ -239,17 +242,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f0, %f2, %f48
> 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
> - STORE_JUMP(o0, f48, 40f) membar #Sync
> + STORE_JUMP(o0, f48, 40f)
> 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
> - STORE_JUMP(o0, f48, 48f) membar #Sync
> + STORE_JUMP(o0, f48, 48f)
> 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
> - STORE_JUMP(o0, f48, 56f) membar #Sync
> + STORE_JUMP(o0, f48, 56f)
>
> 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -260,17 +263,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f2, %f4, %f48
> 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
> - STORE_JUMP(o0, f48, 41f) membar #Sync
> + STORE_JUMP(o0, f48, 41f)
> 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> - STORE_JUMP(o0, f48, 49f) membar #Sync
> + STORE_JUMP(o0, f48, 49f)
> 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
> - STORE_JUMP(o0, f48, 57f) membar #Sync
> + STORE_JUMP(o0, f48, 57f)
>
> 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -281,17 +284,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f4, %f6, %f48
> 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
> - STORE_JUMP(o0, f48, 42f) membar #Sync
> + STORE_JUMP(o0, f48, 42f)
> 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> - STORE_JUMP(o0, f48, 50f) membar #Sync
> + STORE_JUMP(o0, f48, 50f)
> 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
> - STORE_JUMP(o0, f48, 58f) membar #Sync
> + STORE_JUMP(o0, f48, 58f)
>
> 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -302,17 +305,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f6, %f8, %f48
> 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
> - STORE_JUMP(o0, f48, 43f) membar #Sync
> + STORE_JUMP(o0, f48, 43f)
> 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> - STORE_JUMP(o0, f48, 51f) membar #Sync
> + STORE_JUMP(o0, f48, 51f)
> 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
> - STORE_JUMP(o0, f48, 59f) membar #Sync
> + STORE_JUMP(o0, f48, 59f)
>
> 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -323,17 +326,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f8, %f10, %f48
> 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
> - STORE_JUMP(o0, f48, 44f) membar #Sync
> + STORE_JUMP(o0, f48, 44f)
> 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> - STORE_JUMP(o0, f48, 52f) membar #Sync
> + STORE_JUMP(o0, f48, 52f)
> 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
> - STORE_JUMP(o0, f48, 60f) membar #Sync
> + STORE_JUMP(o0, f48, 60f)
>
> 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -344,17 +347,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f10, %f12, %f48
> 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
> - STORE_JUMP(o0, f48, 45f) membar #Sync
> + STORE_JUMP(o0, f48, 45f)
> 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> - STORE_JUMP(o0, f48, 53f) membar #Sync
> + STORE_JUMP(o0, f48, 53f)
> 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
> - STORE_JUMP(o0, f48, 61f) membar #Sync
> + STORE_JUMP(o0, f48, 61f)
>
> 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -365,17 +368,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f12, %f14, %f48
> 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
> - STORE_JUMP(o0, f48, 46f) membar #Sync
> + STORE_JUMP(o0, f48, 46f)
> 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> - STORE_JUMP(o0, f48, 54f) membar #Sync
> + STORE_JUMP(o0, f48, 54f)
> 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
> - STORE_JUMP(o0, f48, 62f) membar #Sync
> + STORE_JUMP(o0, f48, 62f)
>
> 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
> @@ -386,17 +389,17 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len
> ba,pt %xcc, 1b+4
> faligndata %f14, %f16, %f48
> 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
> - STORE_JUMP(o0, f48, 47f) membar #Sync
> + STORE_JUMP(o0, f48, 47f)
> 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> - STORE_JUMP(o0, f48, 55f) membar #Sync
> + STORE_JUMP(o0, f48, 55f)
> 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
> - STORE_SYNC(o0, f48) membar #Sync
> + STORE_SYNC(o0, f48)
> FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
> - STORE_JUMP(o0, f48, 63f) membar #Sync
> + STORE_JUMP(o0, f48, 63f)
>
> 40: FINISH_VISCHUNK(o0, f0, f2, g3)
> 41: FINISH_VISCHUNK(o0, f2, f4, g3)
> diff --git a/arch/sparc64/lib/VISsave.S b/arch/sparc64/lib/VISsave.S
> --- a/arch/sparc64/lib/VISsave.S
> +++ b/arch/sparc64/lib/VISsave.S
> @@ -72,7 +72,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
>
> stda %f48, [%g3 + %g1] ASI_BLK_P
> 5: membar #Sync
> - jmpl %g7 + %g0, %g0
> + ba,pt %xcc, 80f
> + nop
> +
> + .align 32
> +80: jmpl %g7 + %g0, %g0
> nop
>
> 6: ldub [%g3 + TI_FPSAVED], %o5
> @@ -87,8 +91,11 @@ vis1: ldub [%g6 + TI_FPSAVED], %g3
> stda %f32, [%g2 + %g1] ASI_BLK_P
> stda %f48, [%g3 + %g1] ASI_BLK_P
> membar #Sync
> - jmpl %g7 + %g0, %g0
> + ba,pt %xcc, 80f
> + nop
>
> + .align 32
> +80: jmpl %g7 + %g0, %g0
> nop
>
> .align 32
> @@ -126,6 +133,10 @@ VISenterhalf:
> stda %f0, [%g2 + %g1] ASI_BLK_P
> stda %f16, [%g3 + %g1] ASI_BLK_P
> membar #Sync
> + ba,pt %xcc, 4f
> + nop
> +
> + .align 32
> 4: and %o5, FPRS_DU, %o5
> jmpl %g7 + %g0, %g0
> wr %o5, FPRS_FEF, %fprs
> diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S
> --- a/arch/sparc64/lib/atomic.S
> +++ b/arch/sparc64/lib/atomic.S
> @@ -7,18 +7,6 @@
> #include <linux/config.h>
> #include <asm/asi.h>
>
> - /* On SMP we need to use memory barriers to ensure
> - * correct memory operation ordering, nop these out
> - * for uniprocessor.
> - */
> -#ifdef CONFIG_SMP
> -#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad
> -#define ATOMIC_POST_BARRIER membar #StoreLoad | #StoreStore
> -#else
> -#define ATOMIC_PRE_BARRIER nop
> -#define ATOMIC_POST_BARRIER nop
> -#endif
> -
> .text
>
> /* Two versions of the atomic routines, one that
> @@ -52,6 +40,24 @@ atomic_sub: /* %o0 = decrement, %o1 = at
> nop
> .size atomic_sub, .-atomic_sub
>
> + /* On SMP we need to use memory barriers to ensure
> + * correct memory operation ordering, nop these out
> + * for uniprocessor.
> + */
> +#ifdef CONFIG_SMP
> +
> +#define ATOMIC_PRE_BARRIER membar #StoreLoad | #LoadLoad;
> +#define ATOMIC_POST_BARRIER \
> + ba,pt %xcc, 80b; \
> + membar #StoreLoad | #StoreStore
> +
> +80: retl
> + nop
> +#else
> +#define ATOMIC_PRE_BARRIER
> +#define ATOMIC_POST_BARRIER
> +#endif
> +
> .globl atomic_add_ret
> .type atomic_add_ret,#function
> atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
> @@ -62,9 +68,10 @@ atomic_add_ret: /* %o0 = increment, %o1
> cmp %g1, %g7
> bne,pn %icc, 1b
> add %g7, %o0, %g7
> + sra %g7, 0, %o0
> ATOMIC_POST_BARRIER
> retl
> - sra %g7, 0, %o0
> + nop
> .size atomic_add_ret, .-atomic_add_ret
>
> .globl atomic_sub_ret
> @@ -77,9 +84,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1
> cmp %g1, %g7
> bne,pn %icc, 1b
> sub %g7, %o0, %g7
> + sra %g7, 0, %o0
> ATOMIC_POST_BARRIER
> retl
> - sra %g7, 0, %o0
> + nop
> .size atomic_sub_ret, .-atomic_sub_ret
>
> .globl atomic64_add
> @@ -118,9 +126,10 @@ atomic64_add_ret: /* %o0 = increment, %o
> cmp %g1, %g7
> bne,pn %xcc, 1b
> add %g7, %o0, %g7
> + mov %g7, %o0
> ATOMIC_POST_BARRIER
> retl
> - mov %g7, %o0
> + nop
> .size atomic64_add_ret, .-atomic64_add_ret
>
> .globl atomic64_sub_ret
> @@ -133,7 +142,8 @@ atomic64_sub_ret: /* %o0 = decrement, %o
> cmp %g1, %g7
> bne,pn %xcc, 1b
> sub %g7, %o0, %g7
> + mov %g7, %o0
> ATOMIC_POST_BARRIER
> retl
> - mov %g7, %o0
> + nop
> .size atomic64_sub_ret, .-atomic64_sub_ret
> diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S
> --- a/arch/sparc64/lib/bitops.S
> +++ b/arch/sparc64/lib/bitops.S
> @@ -7,20 +7,26 @@
> #include <linux/config.h>
> #include <asm/asi.h>
>
> + .text
> +
> /* On SMP we need to use memory barriers to ensure
> * correct memory operation ordering, nop these out
> * for uniprocessor.
> */
> +
> #ifdef CONFIG_SMP
> #define BITOP_PRE_BARRIER membar #StoreLoad | #LoadLoad
> -#define BITOP_POST_BARRIER membar #StoreLoad | #StoreStore
> +#define BITOP_POST_BARRIER \
> + ba,pt %xcc, 80b; \
> + membar #StoreLoad | #StoreStore
> +
> +80: retl
> + nop
> #else
> -#define BITOP_PRE_BARRIER nop
> -#define BITOP_POST_BARRIER nop
> +#define BITOP_PRE_BARRIER
> +#define BITOP_POST_BARRIER
> #endif
>
> - .text
> -
> .globl test_and_set_bit
> .type test_and_set_bit,#function
> test_and_set_bit: /* %o0=nr, %o1=addr */
> @@ -37,10 +43,11 @@ test_and_set_bit: /* %o0=nr, %o1=addr */
> cmp %g7, %g1
> bne,pn %xcc, 1b
> and %g7, %o2, %g2
> - BITOP_POST_BARRIER
> clr %o0
> + movrne %g2, 1, %o0
> + BITOP_POST_BARRIER
> retl
> - movrne %g2, 1, %o0
> + nop
> .size test_and_set_bit, .-test_and_set_bit
>
> .globl test_and_clear_bit
> @@ -59,10 +66,11 @@ test_and_clear_bit: /* %o0=nr, %o1=addr
> cmp %g7, %g1
> bne,pn %xcc, 1b
> and %g7, %o2, %g2
> - BITOP_POST_BARRIER
> clr %o0
> + movrne %g2, 1, %o0
> + BITOP_POST_BARRIER
> retl
> - movrne %g2, 1, %o0
> + nop
> .size test_and_clear_bit, .-test_and_clear_bit
>
> .globl test_and_change_bit
> @@ -81,10 +89,11 @@ test_and_change_bit: /* %o0=nr, %o1=addr
> cmp %g7, %g1
> bne,pn %xcc, 1b
> and %g7, %o2, %g2
> - BITOP_POST_BARRIER
> clr %o0
> + movrne %g2, 1, %o0
> + BITOP_POST_BARRIER
> retl
> - movrne %g2, 1, %o0
> + nop
> .size test_and_change_bit, .-test_and_change_bit
>
> .globl set_bit
> diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
> --- a/arch/sparc64/lib/debuglocks.c
> +++ b/arch/sparc64/lib/debuglocks.c
> @@ -252,8 +252,9 @@ wlock_again:
> " andn %%g1, %%g3, %%g7\n"
> " casx [%0], %%g1, %%g7\n"
> " cmp %%g1, %%g7\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%xcc, 1b\n"
> -" membar #StoreLoad | #StoreStore"
> +" nop"
> : /* no outputs */
> : "r" (&(rw->lock))
> : "g3", "g1", "g7", "cc", "memory");
> @@ -351,8 +352,9 @@ int _do_write_trylock (rwlock_t *rw, cha
> " andn %%g1, %%g3, %%g7\n"
> " casx [%0], %%g1, %%g7\n"
> " cmp %%g1, %%g7\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%xcc, 1b\n"
> -" membar #StoreLoad | #StoreStore"
> +" nop"
> : /* no outputs */
> : "r" (&(rw->lock))
> : "g3", "g1", "g7", "cc", "memory");
> diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S
> --- a/arch/sparc64/lib/dec_and_lock.S
> +++ b/arch/sparc64/lib/dec_and_lock.S
> @@ -48,8 +48,9 @@ start_to_zero:
> #endif
> to_zero:
> ldstub [%o1], %g3
> + membar #StoreLoad | #StoreStore
> brnz,pn %g3, spin_on_lock
> - membar #StoreLoad | #StoreStore
> + nop
> loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */
> cmp %g2, %g7
>
> @@ -71,8 +72,9 @@ loop2: cas [%o0], %g2, %g7 /* ASSERT(g7
> nop
> spin_on_lock:
> ldub [%o1], %g3
> + membar #LoadLoad
> brnz,pt %g3, spin_on_lock
> - membar #LoadLoad
> + nop
> ba,pt %xcc, to_zero
> nop
> nop
> diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S
> --- a/arch/sparc64/lib/rwsem.S
> +++ b/arch/sparc64/lib/rwsem.S
> @@ -17,8 +17,9 @@ __down_read:
> bne,pn %icc, 1b
> add %g7, 1, %g7
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2:
> retl
> nop
> @@ -57,8 +58,9 @@ __down_write:
> cmp %g3, %g7
> bne,pn %icc, 1b
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bne,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2: retl
> nop
> 3:
> @@ -97,8 +99,9 @@ __up_read:
> cmp %g1, %g7
> bne,pn %icc, 1b
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2: retl
> nop
> 3: sethi %hi(RWSEM_ACTIVE_MASK), %g1
> @@ -126,8 +129,9 @@ __up_write:
> bne,pn %icc, 1b
> sub %g7, %g1, %g7
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2:
> retl
> nop
> @@ -151,8 +155,9 @@ __downgrade_write:
> bne,pn %icc, 1b
> sub %g7, %g1, %g7
> cmp %g7, 0
> + membar #StoreLoad | #StoreStore
> bl,pn %icc, 3f
> - membar #StoreLoad | #StoreStore
> + nop
> 2:
> retl
> nop
> diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
> --- a/arch/sparc64/mm/init.c
> +++ b/arch/sparc64/mm/init.c
> @@ -136,8 +136,9 @@ static __inline__ void set_dcache_dirty(
> "or %%g1, %0, %%g1\n\t"
> "casx [%2], %%g7, %%g1\n\t"
> "cmp %%g7, %%g1\n\t"
> + "membar #StoreLoad | #StoreStore\n\t"
> "bne,pn %%xcc, 1b\n\t"
> - " membar #StoreLoad | #StoreStore"
> + " nop"
> : /* no outputs */
> : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
> : "g1", "g7");
> @@ -157,8 +158,9 @@ static __inline__ void clear_dcache_dirt
> " andn %%g7, %1, %%g1\n\t"
> "casx [%2], %%g7, %%g1\n\t"
> "cmp %%g7, %%g1\n\t"
> + "membar #StoreLoad | #StoreStore\n\t"
> "bne,pn %%xcc, 1b\n\t"
> - " membar #StoreLoad | #StoreStore\n"
> + " nop\n"
> "2:"
> : /* no outputs */
> : "r" (cpu), "r" (mask), "r" (&page->flags),
> diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
> --- a/arch/sparc64/mm/ultra.S
> +++ b/arch/sparc64/mm/ultra.S
> @@ -266,8 +266,9 @@ __cheetah_flush_tlb_pending: /* 22 insns
> andn %o3, 1, %o3
> stxa %g0, [%o3] ASI_IMMU_DEMAP
> 2: stxa %g0, [%o3] ASI_DMMU_DEMAP
> + membar #Sync
> brnz,pt %o1, 1b
> - membar #Sync
> + nop
> stxa %g2, [%o4] ASI_DMMU
> flush %g6
> wrpr %g0, 0, %tl
> diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
> --- a/include/asm-sparc64/rwsem.h
> +++ b/include/asm-sparc64/rwsem.h
> @@ -55,8 +55,9 @@ static __inline__ int rwsem_atomic_updat
> "add %%g1, %1, %%g7\n\t"
> "cas [%2], %%g1, %%g7\n\t"
> "cmp %%g1, %%g7\n\t"
> + "membar #StoreLoad | #StoreStore\n\t"
> "bne,pn %%icc, 1b\n\t"
> - " membar #StoreLoad | #StoreStore\n\t"
> + " nop\n\t"
> "mov %%g7, %0\n\t"
> : "=&r" (tmp)
> : "0" (tmp), "r" (sem)
> diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h
> --- a/include/asm-sparc64/spinlock.h
> +++ b/include/asm-sparc64/spinlock.h
> @@ -52,12 +52,14 @@ static inline void _raw_spin_lock(spinlo
>
> __asm__ __volatile__(
> "1: ldstub [%1], %0\n"
> +" membar #StoreLoad | #StoreStore\n"
> " brnz,pn %0, 2f\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " .subsection 2\n"
> "2: ldub [%1], %0\n"
> +" membar #LoadLoad\n"
> " brnz,pt %0, 2b\n"
> -" membar #LoadLoad\n"
> +" nop\n"
> " ba,a,pt %%xcc, 1b\n"
> " .previous"
> : "=&r" (tmp)
> @@ -95,16 +97,18 @@ static inline void _raw_spin_lock_flags(
>
> __asm__ __volatile__(
> "1: ldstub [%2], %0\n"
> -" brnz,pn %0, 2f\n"
> " membar #StoreLoad | #StoreStore\n"
> +" brnz,pn %0, 2f\n"
> +" nop\n"
> " .subsection 2\n"
> "2: rdpr %%pil, %1\n"
> " wrpr %3, %%pil\n"
> "3: ldub [%2], %0\n"
> -" brnz,pt %0, 3b\n"
> " membar #LoadLoad\n"
> +" brnz,pt %0, 3b\n"
> +" nop\n"
> " ba,pt %%xcc, 1b\n"
> -" wrpr %1, %%pil\n"
> +" wrpr %1, %%pil\n"
> " .previous"
> : "=&r" (tmp1), "=&r" (tmp2)
> : "r"(lock), "r"(flags)
> @@ -162,12 +166,14 @@ static void inline __read_lock(rwlock_t
> "4: add %0, 1, %1\n"
> " cas [%2], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " .subsection 2\n"
> "2: ldsw [%2], %0\n"
> +" membar #LoadLoad\n"
> " brlz,pt %0, 2b\n"
> -" membar #LoadLoad\n"
> +" nop\n"
> " ba,a,pt %%xcc, 4b\n"
> " .previous"
> : "=&r" (tmp1), "=&r" (tmp2)
> @@ -204,12 +210,14 @@ static void inline __write_lock(rwlock_t
> "4: or %0, %3, %1\n"
> " cas [%2], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " .subsection 2\n"
> "2: lduw [%2], %0\n"
> +" membar #LoadLoad\n"
> " brnz,pt %0, 2b\n"
> -" membar #LoadLoad\n"
> +" nop\n"
> " ba,a,pt %%xcc, 4b\n"
> " .previous"
> : "=&r" (tmp1), "=&r" (tmp2)
> @@ -240,8 +248,9 @@ static int inline __write_trylock(rwlock
> " or %0, %4, %1\n"
> " cas [%3], %0, %1\n"
> " cmp %0, %1\n"
> +" membar #StoreLoad | #StoreStore\n"
> " bne,pn %%icc, 1b\n"
> -" membar #StoreLoad | #StoreStore\n"
> +" nop\n"
> " mov 1, %2\n"
> "2:"
> : "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
> diff --git a/include/asm-sparc64/spitfire.h b/include/asm-sparc64/spitfire.h
> --- a/include/asm-sparc64/spitfire.h
> +++ b/include/asm-sparc64/spitfire.h
> @@ -111,7 +111,6 @@ static __inline__ void spitfire_put_dcac
> "membar #Sync"
> : /* No outputs */
> : "r" (tag), "r" (addr), "i" (ASI_DCACHE_TAG));
> - __asm__ __volatile__ ("membar #Sync" : : : "memory");
> }
>
> /* The instruction cache lines are flushed with this, but note that
--
Ferris McCormick (P44646, MI) <fmccor@gentoo.org>
Developer, Gentoo Linux (Sparc, Devrel)