BUILD: threads: fix again the __ha_cas_dw() definition

This low-level asm implementation of a double CAS was implemented only
for certain architectures (x86_64, armv7, armv8). When threads are not
used, they were not defined, but since they were called directly from
a few locations, they were causing build issues on certain platforms
with threads disabled. This was addressed in commit f4436e1 ("BUILD:
threads: Add __ha_cas_dw fallback for single threaded builds") by
making it fall back to HA_ATOMIC_CAS() when threads are not defined,
but this actually made the situation worse by breaking other cases.

This patch fixes this by creating a high-level macro HA_ATOMIC_DWCAS()
which is similar to HA_ATOMIC_CAS() except that it's intended to work
on a double word, and which rely on the asm implementations when threads
are in use, and uses its own open-coded implementation when threads are
not used. The 3 call places relying on __ha_cas_dw() were updated to
use HA_ATOMIC_DWCAS() instead.

This change was tested on i586, x86_64, armv7, armv8 with and without
threads with gcc 4.7, armv8 with gcc 5.4 with and without threads, as
well as i586 with gcc-3.4 without threads. It will need to be backported
to 1.9 along with the fix above to fix build on armv7 with threads
disabled.
diff --git a/include/common/hathreads.h b/include/common/hathreads.h
index ef7ba7f..a1a7fae 100644
--- a/include/common/hathreads.h
+++ b/include/common/hathreads.h
@@ -51,6 +51,7 @@
 #define __decl_aligned_rwlock(lock)
 
 #define HA_ATOMIC_CAS(val, old, new) ({((*val) == (*old)) ? (*(val) = (new) , 1) : (*(old) = *(val), 0);})
+#define HA_ATOMIC_DWCAS(val, o, n)   ({((*val) == (*o))   ? (*(val) = (n)   , 1) : (*(o)   = *(val), 0);})
 #define HA_ATOMIC_ADD(val, i)        ({*(val) += (i);})
 #define HA_ATOMIC_SUB(val, i)        ({*(val) -= (i);})
 #define HA_ATOMIC_XADD(val, i)						\
@@ -153,11 +154,6 @@
 {
 }
 
-static inline int __ha_cas_dw(void *target, void *compare, void *set)
-{
-	return HA_ATOMIC_CAS(target, compare, set);
-}
-
 static inline void thread_harmless_now()
 {
 }
@@ -251,6 +247,8 @@
 		__ret_cas;						\
 	})
 
+#define HA_ATOMIC_DWCAS(val, o, n) __ha_cas_dw(val, o, n)
+
 #define HA_ATOMIC_XCHG(val, new)					\
 	({								\
 		typeof((val)) __val_xchg = (val);			\
@@ -293,6 +291,7 @@
 #else
 /* gcc >= 4.7 */
 #define HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define HA_ATOMIC_DWCAS(val, o, n)   __ha_cas_dw(val, o, n)
 #define HA_ATOMIC_ADD(val, i)        __atomic_add_fetch(val, i, __ATOMIC_SEQ_CST)
 #define HA_ATOMIC_XADD(val, i)       __atomic_fetch_add(val, i, __ATOMIC_SEQ_CST)
 #define HA_ATOMIC_SUB(val, i)        __atomic_sub_fetch(val, i, __ATOMIC_SEQ_CST)
@@ -321,6 +320,7 @@
  * ie updating a counter. Otherwise a barrier is required.
  */
 #define _HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)
+#define _HA_ATOMIC_DWCAS(val, o, n)   __ha_cas_dw(val, o, n)
 #define _HA_ATOMIC_ADD(val, i)        __atomic_add_fetch(val, i, __ATOMIC_RELAXED)
 #define _HA_ATOMIC_XADD(val, i)       __atomic_fetch_add(val, i, __ATOMIC_RELAXED)
 #define _HA_ATOMIC_SUB(val, i)        __atomic_sub_fetch(val, i, __ATOMIC_RELAXED)
@@ -1109,6 +1109,10 @@
 #define _HA_ATOMIC_CAS HA_ATOMIC_CAS
 #endif /* !_HA_ATOMIC_CAS */
 
+#ifndef _HA_ATOMIC_DWCAS
+#define _HA_ATOMIC_DWCAS HA_ATOMIC_DWCAS
+#endif /* !_HA_ATOMIC_CAS */
+
 #ifndef _HA_ATOMIC_ADD
 #define _HA_ATOMIC_ADD HA_ATOMIC_ADD
 #endif /* !_HA_ATOMIC_ADD */