BUG/MEDIUM: peers: re-work refcnt on table to protect against flush
In proxy.c, when process is stopping we try to flush tables content
using 'stktable_trash_oldest'. A check on a counter "table->syncing" was
made to verify if there is no pending resync in progress.
But using multiple threads this counter can be increased by an other thread
only after some delay, so the content of some tables can be trashed earlier and
won't be pushed to the new process (after reload, some tables appear reset and
others don't).
This patch re-names the counter "table->syncing" to "table->refcnt" and
the counter is increased during configuration parsing (registering a table to
a peer section) to protect tables during runtime and until resync of a new
process has succeeded or failed.
The inc/dec operations are now made using atomic operations
because multiple peer sections could refer to the same table in futur.
This fix addresses github #1216.
This patch should be backported on all branches multi-thread support (v >= 1.8)
diff --git a/include/haproxy/stick_table-t.h b/include/haproxy/stick_table-t.h
index bae32d9..237bfbe 100644
--- a/include/haproxy/stick_table-t.h
+++ b/include/haproxy/stick_table-t.h
@@ -192,7 +192,8 @@
unsigned int localupdate;
unsigned int commitupdate;/* used to identify the latest local updates
pending for sync */
- unsigned int syncing; /* number of sync tasks watching this table now */
+ unsigned int refcnt; /* number of local peer over all peers sections
+ attached to this table */
union {
struct peers *p; /* sync peers */
char *name;
diff --git a/src/peers.c b/src/peers.c
index dd1d259..89d193d 100644
--- a/src/peers.c
+++ b/src/peers.c
@@ -3038,9 +3038,6 @@
/* add DO NOT STOP flag if not present */
_HA_ATOMIC_INC(&jobs);
peers->flags |= PEERS_F_DONOTSTOP;
- ps = peers->local;
- for (st = ps->tables; st ; st = st->next)
- st->table->syncing++;
/* disconnect all connected peers to process a local sync
* this must be done only the first time we are switching
@@ -3066,7 +3063,7 @@
_HA_ATOMIC_DEC(&jobs);
peers->flags &= ~PEERS_F_DONOTSTOP;
for (st = ps->tables; st ; st = st->next)
- st->table->syncing--;
+ HA_ATOMIC_DEC(&st->table->refcnt);
}
}
else if (!ps->appctx) {
@@ -3092,7 +3089,7 @@
_HA_ATOMIC_DEC(&jobs);
peers->flags &= ~PEERS_F_DONOTSTOP;
for (st = ps->tables; st ; st = st->next)
- st->table->syncing--;
+ HA_ATOMIC_DEC(&st->table->refcnt);
}
}
}
@@ -3306,6 +3303,13 @@
id = curpeer->tables->local_id;
st->local_id = id + 1;
+ /* If peer is local we inc table
+ * refcnt to protect against flush
+ * until this process pushed all
+ * table content to the new one
+ */
+ if (curpeer->local)
+ HA_ATOMIC_INC(&st->table->refcnt);
curpeer->tables = st;
}
@@ -3493,8 +3497,8 @@
st->last_acked, st->last_pushed, st->last_get,
st->teaching_origin, st->update);
chunk_appendf(&trash, "\n table:%p id=%s update=%u localupdate=%u"
- " commitupdate=%u syncing=%u",
- t, t->id, t->update, t->localupdate, t->commitupdate, t->syncing);
+ " commitupdate=%u refcnt=%u",
+ t, t->id, t->update, t->localupdate, t->commitupdate, t->refcnt);
if (flags & PEERS_SHOW_F_DICT) {
chunk_appendf(&trash, "\n TX dictionary cache:");
count = 0;
diff --git a/src/proxy.c b/src/proxy.c
index c3859e6..c209b23 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -1851,7 +1851,13 @@
* However we protect tables that are being synced to peers.
*/
if (unlikely(stopping && p->disabled && p->table && p->table->current)) {
- if (!p->table->syncing) {
+
+ if (!p->table->refcnt) {
+ /* !table->refcnt means there
+ * is no more pending full resync
+ * to push to a new process and
+ * we are free to flush the table.
+ */
stktable_trash_oldest(p->table, p->table->current);
pool_gc(NULL);
}