MEDIUM: lua-thread: use atomics for memory accounting

Let's switch memory accounting to atomics so that the allocator function
may safely be used from concurrent Lua states.

Given that this function is extremely hot on the call path, we try to
optimize it for the most common case, which is:
  - no limit
  - there's enough memory

The accounting is what is particularly expensive with threads since all
CPUs compete for the same cache line, so when the limit is not used we
don't want to perform any accounting. However, we need to preserve it
during the boot phase until a "tune.lua.maxmem" value may have been
parsed. For this, we turn the unlimited "0" value into ~0 at the end of
the boot phase to mark the definite end of accounting. The function then
detects this value and jumps directly to realloc() in this case.
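
For reference, a hedged example of how such a limit may be set in the
configuration (the value is assumed here to be expressed in megabytes,
as is usual for this tunable; check the documentation of your version):

  global
      tune.lua.maxmem 200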

When the limit is enforced, however, we use a CAS to check and reserve
our share of memory, and we roll back on failure. The CAS is used both
for increments and decrements so that a single operation is enough to
update the counter.
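
Purely as an illustration of the pattern (this is not part of the patch
and the names below are hypothetical, they only mirror the intent of the
code in the diff), the reserve-then-roll-back scheme could be sketched
with standard C11 atomics roughly like this:

  #include <stdatomic.h>
  #include <stddef.h>
  #include <stdlib.h>

  struct mem_zone {
      _Atomic size_t allocated;  /* bytes currently accounted for */
      size_t limit;              /* 0: not limited yet, ~0: unlimited for good */
  };

  static void *zone_realloc(struct mem_zone *zone, void *ptr,
                            size_t osize, size_t nsize)
  {
      size_t old, new;

      /* fast path: boot is over and no limit was set, skip accounting */
      if (~zone->limit == 0)
          return realloc(ptr, nsize);

      if (!ptr)
          osize = 0;

      /* reserve our share before allocating; the CAS retries if another
       * thread updated the counter in between.
       */
      old = atomic_load(&zone->allocated);
      do {
          new = old + nsize - osize;
          if (nsize && zone->limit && new > zone->limit)
              return NULL; /* would exceed the configured limit */
      } while (!atomic_compare_exchange_weak(&zone->allocated, &old, new));

      ptr = realloc(ptr, nsize);
      if (!ptr && nsize) /* allocation failed: roll the reservation back */
          atomic_fetch_sub(&zone->allocated, nsize - osize);
      return ptr;
  }
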
diff --git a/src/hlua.c b/src/hlua.c
index 176e6f7..0497041 100644
--- a/src/hlua.c
+++ b/src/hlua.c
@@ -225,15 +225,18 @@
  */
 static unsigned int hlua_nb_instruction = 10000;
 
-/* Descriptor for the memory allocation state. If limit is not null, it will
- * be enforced on any memory allocation.
+/* Descriptor for the memory allocation state. The limit is pre-initialised to
+ * 0 until it is replaced by "tune.lua.maxmem" during the config parsing, or it
+ * is replaced with ~0 during post_init after everything was loaded. This way
+ * it is guaranteed that if limit is ~0 the boot is complete and that if it's
+ * zero it's not yet limited and proper accounting is required.
  */
 struct hlua_mem_allocator {
 	size_t allocated;
 	size_t limit;
 };
 
-static struct hlua_mem_allocator hlua_global_allocator;
+static struct hlua_mem_allocator hlua_global_allocator THREAD_ALIGNED(64);
 
 /* These functions converts types between HAProxy internal args or
  * sample and LUA types. Another function permits to check if the
@@ -8180,6 +8183,10 @@
 	enum hlua_exec ret;
 	const char *error;
 
+	/* disable memory limit checks if limit is not set */
+	if (!hlua_global_allocator.limit)
+		hlua_global_allocator.limit = ~hlua_global_allocator.limit;
+
 	/* Call post initialisation function in safe environment. */
 	if (!SET_SAFE_LJMP(gL.T)) {
 		if (lua_type(gL.T, -1) == LUA_TSTRING)
@@ -8231,21 +8238,38 @@
  * is the previously allocated size or the kind of object in case of a new
  * allocation. <nsize> is the requested new size. A new allocation is
  * indicated by <ptr> being NULL. A free is indicated by <nsize> being
- * zero.
+ * zero. This one verifies that the limits are respected but is optimized
+ * for the fast case where limits are not used, hence stats are not updated.
  */
 static void *hlua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
 {
 	struct hlua_mem_allocator *zone = ud;
+	size_t limit, old, new;
+
+	/* a limit of ~0 means unlimited and boot complete, so there's no need
+	 * for accounting anymore.
+	 */
+	if (likely(~zone->limit == 0))
+		return realloc(ptr, nsize);
 
 	if (!ptr)
 		osize = 0;
 
-	if (zone->limit && zone->allocated + nsize - osize > zone->limit)
-		return NULL;
+	/* enforce strict limits across all threads */
+	limit = zone->limit;
+	old = _HA_ATOMIC_LOAD(&zone->allocated);
+	do {
+		new = old + nsize - osize;
+		if (unlikely(nsize && limit && new > limit))
+			return NULL;
+	} while (!_HA_ATOMIC_CAS(&zone->allocated, &old, new));
 
 	ptr = realloc(ptr, nsize);
-	if (ptr || !nsize)
-		zone->allocated += nsize - osize;
+
+	if (unlikely(!ptr && nsize)) // failed
+		_HA_ATOMIC_SUB(&zone->allocated, nsize - osize);
+
+	__ha_barrier_atomic_store();
 	return ptr;
 }