MINOR: threads/atomic: implement pl_mb() in asm on x86

[ plock commit 44081ea493dd78dab48076980e881748e9b33db5 ]

Older compilers (eg: gcc 3.4) don't provide __sync_synchronize(), so let's
implement it by hand in asm on this platform.
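
For context (an illustration, not part of the patch): pl_mb() is a full
barrier, i.e. it also orders stores before subsequent loads, which is the
one reordering x86 performs through its store buffer. Below is a minimal
sketch of the classic store-buffering test showing where such a barrier
goes. It assumes building with -Iinclude so that import/atomic-ops.h
resolves; a single run proves nothing, the point is only the placement
of pl_mb():

    #include <pthread.h>
    #include <stdio.h>
    #include "import/atomic-ops.h"   /* provides pl_mb() */

    static volatile int x, y, r0, r1;

    static void *t0(void *arg)
    {
        x = 1;
        pl_mb();   /* store to x globally visible before the load of y */
        r0 = y;
        return arg;
    }

    static void *t1(void *arg)
    {
        y = 1;
        pl_mb();
        r1 = x;
        return arg;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, t0, NULL);
        pthread_create(&b, NULL, t1, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* with both barriers, r0 == 0 && r1 == 0 is impossible;
         * without them, x86 permits that outcome.
         */
        printf("r0=%d r1=%d\n", r0, r1);
        return 0;
    }
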
diff --git a/include/import/atomic-ops.h b/include/import/atomic-ops.h
index 9ee7da7..f613a0b 100644
--- a/include/import/atomic-ops.h
+++ b/include/import/atomic-ops.h
@@ -8,14 +8,22 @@
 	asm volatile("" ::: "memory");
 }
 
-/* full memory barrier */
+#if defined(__i386__) || defined (__i486__) || defined (__i586__) || defined (__i686__) || defined (__x86_64__)
+
+/* full memory barrier using mfence when SSE2 is supported, falling back
+ * to "lock addl $0,0(%esp)" otherwise (gcc uses "lock add" or "lock or").
+ */
 static inline void pl_mb()
 {
-	__sync_synchronize();
+#if defined(__SSE2__)
+	asm volatile("mfence" ::: "memory");
+#elif defined(__x86_64__)
+	asm volatile("lock addl $0,0 (%%rsp)" ::: "memory", "cc");
+#else
+	asm volatile("lock addl $0,0 (%%esp)" ::: "memory", "cc");
+#endif
 }
 
-#if defined(__i386__) || defined (__i486__) || defined (__i586__) || defined (__i686__) || defined (__x86_64__)
-
 /*
  * Generic functions common to the x86 family
  */
@@ -488,6 +496,12 @@
 	asm volatile("");
 }
 
+/* full memory barrier */
+static inline void pl_mb()
+{
+	__sync_synchronize();
+}
+
 #define pl_inc_noret(ptr)     ({ __sync_add_and_fetch((ptr), 1);   })
 #define pl_dec_noret(ptr)     ({ __sync_sub_and_fetch((ptr), 1);   })
 #define pl_inc(ptr)           ({ __sync_add_and_fetch((ptr), 1);   })
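
As a side note (illustration only, not part of the patch): the variant
that ends up in pl_mb() can be checked by mirroring the header's
preprocessor tests, in the same order, in a tiny probe and building it
with different flags, eg plain gcc vs "gcc -m32 -mno-sse2" on a multilib
toolchain:

    #include <stdio.h>

    int main(void)
    {
    /* same tests, same order as the patched header */
    #if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(__x86_64__)
    # if defined(__SSE2__)
        puts("pl_mb: mfence");
    # elif defined(__x86_64__)
        puts("pl_mb: lock addl $0,0(%rsp)");
    # else
        puts("pl_mb: lock addl $0,0(%esp)");
    # endif
    #else
        puts("pl_mb: generic __sync_synchronize()");
    #endif
        return 0;
    }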