BUG/MEDIUM: threads: fix double-word CAS on non-optimized 32-bit platforms

On armv7 haproxy doesn't work because of the fixes on the double-word
CAS. There are two issues. The first one is that the last argument in
case of dwcas is a pointer to the set of value and not a value ; the
second is that it's not enough to cast the data as (void*) since it will
be a single word. Let's fix this by using the pointers as an array of
long. This was tested on i386, armv7, x86_64 and aarch64 and it is now
fine. An alternate approach using a struct was attempted as well but it
used to produce less optimal code.

This fix must be backported to 1.9. This fixes github issue #105.

Cc: Olivier Houchard <ohouchard@haproxy.com>
diff --git a/include/common/hathreads.h b/include/common/hathreads.h
index 268dd63..0ba56d0 100644
--- a/include/common/hathreads.h
+++ b/include/common/hathreads.h
@@ -77,7 +77,19 @@
 #define __decl_aligned_rwlock(lock)
 
 #define HA_ATOMIC_CAS(val, old, new) ({((*val) == (*old)) ? (*(val) = (new) , 1) : (*(old) = *(val), 0);})
-#define HA_ATOMIC_DWCAS(val, o, n)   ({((*val) == (*o))   ? (*(val) = (n)   , 1) : (*(o)   = *(val), 0);})
+
+/* warning, n is a pointer to the double value for dwcas */
+#define HA_ATOMIC_DWCAS(val, o, n)				       \
+	({                                                             \
+		long *_v = (long*)(val);                               \
+		long *_o = (long*)(o);				       \
+		long *_n = (long*)(n);				       \
+		long _v0 = _v[0], _v1 = _v[1];			       \
+		(_v0 == _o[0] && _v1 == _o[1]) ?                       \
+			(_v[0] = _n[0], _v[1] = _n[1], 1) :	       \
+			(_o[0] = _v0,   _o[1] = _v1,   0);	       \
+	})
+
 #define HA_ATOMIC_ADD(val, i)        ({*(val) += (i);})
 #define HA_ATOMIC_SUB(val, i)        ({*(val) -= (i);})
 #define HA_ATOMIC_XADD(val, i)						\
@@ -293,6 +305,7 @@
 		__ret_cas;						\
 	})
 
+/* warning, n is a pointer to the double value for dwcas */
 #define HA_ATOMIC_DWCAS(val, o, n) __ha_cas_dw(val, o, n)
 
 #define HA_ATOMIC_XCHG(val, new)					\
@@ -337,6 +350,7 @@
 #else
 /* gcc >= 4.7 */
 #define HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+/* warning, n is a pointer to the double value for dwcas */
 #define HA_ATOMIC_DWCAS(val, o, n)   __ha_cas_dw(val, o, n)
 #define HA_ATOMIC_ADD(val, i)        __atomic_add_fetch(val, i, __ATOMIC_SEQ_CST)
 #define HA_ATOMIC_XADD(val, i)       __atomic_fetch_add(val, i, __ATOMIC_SEQ_CST)
@@ -366,6 +380,7 @@
  * ie updating a counter. Otherwise a barrier is required.
  */
 #define _HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)
+/* warning, n is a pointer to the double value for dwcas */
 #define _HA_ATOMIC_DWCAS(val, o, n)   __ha_cas_dw(val, o, n)
 #define _HA_ATOMIC_ADD(val, i)        __atomic_add_fetch(val, i, __ATOMIC_RELAXED)
 #define _HA_ATOMIC_XADD(val, i)       __atomic_fetch_add(val, i, __ATOMIC_RELAXED)
diff --git a/src/fd.c b/src/fd.c
index d87dddc..e71c2ee 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -272,7 +272,7 @@
 #ifdef HA_CAS_IS_8B
 	    unlikely(!_HA_ATOMIC_CAS(((void **)(void *)&_GET_NEXT(fd, off)), ((void **)(void *)&cur_list), (*(void **)(void *)&next_list))))
 #else
-	    unlikely(!_HA_ATOMIC_DWCAS(((void **)(void *)&_GET_NEXT(fd, off)), ((void **)(void *)&cur_list), (*(void **)(void *)&next_list))))
+	    unlikely(!_HA_ATOMIC_DWCAS(((void *)&_GET_NEXT(fd, off)), ((void *)&cur_list), ((void *)&next_list))))
 #endif
 	    ;
 	next = cur_list.next;