BUILD: threads: fix again the __ha_cas_dw() definition

This low-level asm implementation of a double CAS was implemented only
for certain architectures (x86_64, armv7, armv8). When threads are not
used, they were not defined, but since they were called directly from
a few locations, they were causing build issues on certain platforms
with threads disabled. This was addressed in commit f4436e1 ("BUILD:
threads: Add __ha_cas_dw fallback for single threaded builds") by
making it fall back to HA_ATOMIC_CAS() when threads are not defined,
but this actually made the situation worse by breaking other cases.

This patch fixes this by creating a high-level macro HA_ATOMIC_DWCAS()
which is similar to HA_ATOMIC_CAS() except that it's intended to work
on a double word, and which rely on the asm implementations when threads
are in use, and uses its own open-coded implementation when threads are
not used. The 3 call places relying on __ha_cas_dw() were updated to
use HA_ATOMIC_DWCAS() instead.

This change was tested on i586, x86_64, armv7, armv8 with and without
threads with gcc 4.7, armv8 with gcc 5.4 with and without threads, as
well as i586 with gcc-3.4 without threads. It will need to be backported
to 1.9 along with the fix above to fix build on armv7 with threads
disabled.
diff --git a/include/common/hathreads.h b/include/common/hathreads.h
index ef7ba7f..a1a7fae 100644
--- a/include/common/hathreads.h
+++ b/include/common/hathreads.h
@@ -51,6 +51,7 @@
 #define __decl_aligned_rwlock(lock)
 
 #define HA_ATOMIC_CAS(val, old, new) ({((*val) == (*old)) ? (*(val) = (new) , 1) : (*(old) = *(val), 0);})
+#define HA_ATOMIC_DWCAS(val, o, n)   ({((*val) == (*o))   ? (*(val) = (n)   , 1) : (*(o)   = *(val), 0);})
 #define HA_ATOMIC_ADD(val, i)        ({*(val) += (i);})
 #define HA_ATOMIC_SUB(val, i)        ({*(val) -= (i);})
 #define HA_ATOMIC_XADD(val, i)						\
@@ -153,11 +154,6 @@
 {
 }
 
-static inline int __ha_cas_dw(void *target, void *compare, void *set)
-{
-	return HA_ATOMIC_CAS(target, compare, set);
-}
-
 static inline void thread_harmless_now()
 {
 }
@@ -251,6 +247,8 @@
 		__ret_cas;						\
 	})
 
+#define HA_ATOMIC_DWCAS(val, o, n) __ha_cas_dw(val, o, n)
+
 #define HA_ATOMIC_XCHG(val, new)					\
 	({								\
 		typeof((val)) __val_xchg = (val);			\
@@ -293,6 +291,7 @@
 #else
 /* gcc >= 4.7 */
 #define HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+#define HA_ATOMIC_DWCAS(val, o, n)   __ha_cas_dw(val, o, n)
 #define HA_ATOMIC_ADD(val, i)        __atomic_add_fetch(val, i, __ATOMIC_SEQ_CST)
 #define HA_ATOMIC_XADD(val, i)       __atomic_fetch_add(val, i, __ATOMIC_SEQ_CST)
 #define HA_ATOMIC_SUB(val, i)        __atomic_sub_fetch(val, i, __ATOMIC_SEQ_CST)
@@ -321,6 +320,7 @@
  * ie updating a counter. Otherwise a barrier is required.
  */
 #define _HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)
+#define _HA_ATOMIC_DWCAS(val, o, n)   __ha_cas_dw(val, o, n)
 #define _HA_ATOMIC_ADD(val, i)        __atomic_add_fetch(val, i, __ATOMIC_RELAXED)
 #define _HA_ATOMIC_XADD(val, i)       __atomic_fetch_add(val, i, __ATOMIC_RELAXED)
 #define _HA_ATOMIC_SUB(val, i)        __atomic_sub_fetch(val, i, __ATOMIC_RELAXED)
@@ -1109,6 +1109,10 @@
 #define _HA_ATOMIC_CAS HA_ATOMIC_CAS
 #endif /* !_HA_ATOMIC_CAS */
 
+#ifndef _HA_ATOMIC_DWCAS
+#define _HA_ATOMIC_DWCAS HA_ATOMIC_DWCAS
+#endif /* !_HA_ATOMIC_CAS */
+
 #ifndef _HA_ATOMIC_ADD
 #define _HA_ATOMIC_ADD HA_ATOMIC_ADD
 #endif /* !_HA_ATOMIC_ADD */
diff --git a/include/common/memory.h b/include/common/memory.h
index e7599dc..8e61156 100644
--- a/include/common/memory.h
+++ b/include/common/memory.h
@@ -228,7 +228,7 @@
 		new.seq = cmp.seq + 1;
 		__ha_barrier_load();
 		new.free_list = *POOL_LINK(pool, cmp.free_list);
-	} while (__ha_cas_dw((void *)&pool->free_list, (void *)&cmp, (void *)&new) == 0);
+	} while (HA_ATOMIC_DWCAS((void *)&pool->free_list, (void *)&cmp, (void *)&new) == 0);
 	__ha_barrier_atomic_store();
 
 	_HA_ATOMIC_ADD(&pool->used, 1);
diff --git a/src/fd.c b/src/fd.c
index fef4457..3827e83 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -272,7 +272,7 @@
 #ifdef HA_CAS_IS_8B
 	    unlikely(!_HA_ATOMIC_CAS(((void **)(void *)&_GET_NEXT(fd, off)), ((void **)(void *)&cur_list), (*(void **)(void *)&next_list))))
 #else
-	    unlikely(!__ha_cas_dw((void *)&_GET_NEXT(fd, off), (void *)&cur_list, (void *)&next_list)))
+	    unlikely(!_HA_ATOMIC_DWCAS(((void **)(void *)&_GET_NEXT(fd, off)), ((void **)(void *)&cur_list), (*(void **)(void *)&next_list))))
 #endif
 	    ;
 	next = cur_list.next;
diff --git a/src/memory.c b/src/memory.c
index ef7ec93..140671b 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -252,7 +252,7 @@
 				break;
 			new.free_list = *POOL_LINK(entry, cmp.free_list);
 			new.seq = cmp.seq + 1;
-			if (__ha_cas_dw(&entry->free_list, &cmp, &new) == 0)
+			if (HA_ATOMIC_DWCAS(&entry->free_list, &cmp, &new) == 0)
 				continue;
 			free(cmp.free_list);
 			_HA_ATOMIC_SUB(&entry->allocated, 1);