OPTIM: global: move byte counts out of global and make them per-thread
During multiple tests we've already noticed that shared stats counters
have become a real bottleneck under large thread counts. With QUIC it's
pretty visible, with qc_snd_buf() taking 2.5% of the CPU on a 48-thread
machine at only 25 Gbps, and this CPU time is entirely spent in the atomic
increment of the byte count and byte rate. It's also visible in H1/H2
but slightly less since we're working with larger buffers, hence less
frequent updates. These counters are exclusively used to report the
byte count in "show info" and the byte rate in the stats.
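As a minimal standalone sketch of the contention pattern (not HAProxy
code: MAX_THREADS, account_shared() and account_local() are made up
for the illustration), a single shared counter forces every writer to
pull the same cache line in exclusive mode, while a per-thread counter
padded to its own cache line keeps the same relaxed atomic add
core-local:

    #include <stdatomic.h>

    #define MAX_THREADS 64   /* made-up bound for the sketch */

    /* contended: every thread's add bounces this one cache line */
    static atomic_ullong shared_out_bytes;

    static inline void account_shared(unsigned long long n)
    {
        atomic_fetch_add_explicit(&shared_out_bytes, n,
                                  memory_order_relaxed);
    }

    /* uncontended: one counter per thread, each on its own line */
    struct per_thread_bytes {
        atomic_ullong out_bytes;
    } __attribute__((aligned(128)));

    static struct per_thread_bytes pts[MAX_THREADS];

    static inline void account_local(int tid, unsigned long long n)
    {
        /* same relaxed add, but no other thread writes this line,
         * so it stays in this core's cache */
        atomic_fetch_add_explicit(&pts[tid].out_bytes, n,
                                  memory_order_relaxed);
    }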
Let's move them to the thread_ctx struct and make the stats reader
just collect each thread's stats when requested. That's way more
efficient than competing on a single cache line.
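The reader side of the same sketch (again illustrative, not the patch
itself) only materializes the total when someone asks for it, exactly
like the stats.c hunks below do by walking ha_thread_ctx[]:

    /* on-demand total: walk every thread's private counter */
    static unsigned long long total_out_bytes(int nbthread)
    {
        unsigned long long sum = 0;
        int thr;

        for (thr = 0; thr < nbthread; thr++)
            sum += atomic_load_explicit(&pts[thr].out_bytes,
                                        memory_order_relaxed);
        return sum;
    }

The sum is not a consistent cross-thread snapshot, but for monitoring
counters that is perfectly acceptable, and reads are rare compared to
writes.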
After this change, qc_snd_buf() has completely disappeared from the
perf profile, and tests in H1 show a roughly 1% performance increase
on small objects.
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index 92827c6..dec16a2 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -198,10 +198,7 @@
struct freq_ctr ssl_be_keys_per_sec;
struct freq_ctr comp_bps_in; /* bytes per second, before http compression */
struct freq_ctr comp_bps_out; /* bytes per second, after http compression */
- struct freq_ctr out_32bps; /* #of 32-byte blocks emitted per second */
uint sslconns, totalsslconns; /* active, total # of SSL conns */
- unsigned long long out_bytes; /* total #of bytes emitted */
- unsigned long long spliced_out_bytes; /* total #of bytes emitted though a kernel pipe */
int cps_lim, cps_max;
int sps_lim, sps_max;
int ssl_lim, ssl_max;
diff --git a/include/haproxy/tinfo-t.h b/include/haproxy/tinfo-t.h
index 4c2cef5..ddf0317 100644
--- a/include/haproxy/tinfo-t.h
+++ b/include/haproxy/tinfo-t.h
@@ -25,6 +25,7 @@
#include <import/ebtree-t.h>
#include <haproxy/api-t.h>
+#include <haproxy/freq_ctr-t.h>
#include <haproxy/thread-t.h>
/* tasklet classes */
@@ -138,6 +139,10 @@
struct eb_root rqueue_shared; /* run queue fed by other threads */
__decl_thread(HA_SPINLOCK_T rqsh_lock); /* lock protecting the shared runqueue */
+ struct freq_ctr out_32bps; /* #of 32-byte blocks emitted per second */
+ unsigned long long out_bytes; /* total #of bytes emitted */
+ unsigned long long spliced_out_bytes; /* total #of bytes emitted through a kernel pipe */
+
ALWAYS_ALIGN(128);
};
diff --git a/src/quic_sock.c b/src/quic_sock.c
index 9d5c5be..ba8d36f 100644
--- a/src/quic_sock.c
+++ b/src/quic_sock.c
@@ -552,8 +552,8 @@
* The reason for the latter is that freq_ctr are limited to 4GB and
* that it's not enough per second.
*/
- _HA_ATOMIC_ADD(&global.out_bytes, ret);
- update_freq_ctr(&global.out_32bps, (ret + 16) / 32);
+ _HA_ATOMIC_ADD(&th_ctx->out_bytes, ret);
+ update_freq_ctr(&th_ctx->out_32bps, (ret + 16) / 32);
return 0;
}
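
The 32-byte scaling in the comment above deserves a quick worked
example: a freq_ctr stores its per-period count in a 32-bit field, so
counting raw bytes caps the measurable rate at 2^32 B/s, about 4.3 GB/s
or 34 Gbps, uncomfortably close to the 25 Gbps test above. Counting
32-byte blocks raises the ceiling to about 137 GB/s (~1.1 Tbps), and
the +16 simply rounds the division to nearest (blocks32() below is
only an illustration):

    /* round to the nearest 32-byte block: add half the divisor */
    static inline unsigned long long blocks32(unsigned long long bytes)
    {
        return (bytes + 16) / 32;   /* 15 -> 0, 16 -> 1, 48 -> 2 */
    }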
diff --git a/src/raw_sock.c b/src/raw_sock.c
index af95c82..9901f9b 100644
--- a/src/raw_sock.c
+++ b/src/raw_sock.c
@@ -152,9 +152,9 @@
* blocks. The reason for the latter is that freq_ctr are
* limited to 4GB and that it's not enough per second.
*/
- _HA_ATOMIC_ADD(&global.out_bytes, retval);
- _HA_ATOMIC_ADD(&global.spliced_out_bytes, retval);
- update_freq_ctr(&global.out_32bps, (retval + 16) / 32);
+ _HA_ATOMIC_ADD(&th_ctx->out_bytes, retval);
+ _HA_ATOMIC_ADD(&th_ctx->spliced_out_bytes, retval);
+ update_freq_ctr(&th_ctx->out_32bps, (retval + 16) / 32);
}
return retval;
@@ -421,8 +421,8 @@
* blocks. The reason for the latter is that freq_ctr are
* limited to 4GB and that it's not enough per second.
*/
- _HA_ATOMIC_ADD(&global.out_bytes, done);
- update_freq_ctr(&global.out_32bps, (done + 16) / 32);
+ _HA_ATOMIC_ADD(&th_ctx->out_bytes, done);
+ update_freq_ctr(&th_ctx->out_32bps, (done + 16) / 32);
}
return done;
}
diff --git a/src/stats.c b/src/stats.c
index 47b6c06..64631a2 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -3461,7 +3461,11 @@
unsigned int up = (now.tv_sec - start_date.tv_sec);
char scope_txt[STAT_SCOPE_TXT_MAXLEN + sizeof STAT_SCOPE_PATTERN];
const char *scope_ptr = stats_scope_ptr(appctx, sc);
- unsigned long long bps = (unsigned long long)read_freq_ctr(&global.out_32bps) * 32;
+ unsigned long long bps;
+ int thr;
+
+ for (bps = thr = 0; thr < global.nbthread; thr++)
+ bps += 32ULL * read_freq_ctr(&ha_thread_ctx[thr].out_32bps);
/* Turn the bytes per second to bits per second and take care of the
* usual ethernet overhead in order to help figure how far we are from
@@ -4505,6 +4509,8 @@
{
struct timeval up;
struct buffer *out = get_trash_chunk();
+ uint64_t glob_out_bytes, glob_spl_bytes, glob_out_b32;
+ int thr;
#ifdef USE_OPENSSL
double ssl_sess_rate = read_freq_ctr_flt(&global.ssl_per_sec);
@@ -4515,6 +4521,15 @@
ssl_reuse = 100.0 * (1.0 - ssl_key_rate / ssl_sess_rate);
#endif
+ /* sum certain per-thread totals (mostly byte counts) */
+ glob_out_bytes = glob_spl_bytes = glob_out_b32 = 0;
+ for (thr = 0; thr < global.nbthread; thr++) {
+ glob_out_bytes += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].out_bytes);
+ glob_spl_bytes += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].spliced_out_bytes);
+ glob_out_b32 += read_freq_ctr(&ha_thread_ctx[thr].out_32bps);
+ }
+ glob_out_b32 *= 32; // values are 32-byte units
+
tv_remain(&start_date, &now, &up);
if (len < INF_TOTAL_FIELDS)
@@ -4601,9 +4616,9 @@
info[INF_DROPPED_LOGS] = mkf_u32(0, dropped_logs);
info[INF_BUSY_POLLING] = mkf_u32(0, !!(global.tune.options & GTUNE_BUSY_POLLING));
info[INF_FAILED_RESOLUTIONS] = mkf_u32(0, resolv_failed_resolutions);
- info[INF_TOTAL_BYTES_OUT] = mkf_u64(0, global.out_bytes);
- info[INF_TOTAL_SPLICED_BYTES_OUT] = mkf_u64(0, global.spliced_out_bytes);
- info[INF_BYTES_OUT_RATE] = mkf_u64(FN_RATE, (unsigned long long)read_freq_ctr(&global.out_32bps) * 32);
+ info[INF_TOTAL_BYTES_OUT] = mkf_u64(0, glob_out_bytes);
+ info[INF_TOTAL_SPLICED_BYTES_OUT] = mkf_u64(0, glob_spl_bytes);
+ info[INF_BYTES_OUT_RATE] = mkf_u64(FN_RATE, glob_out_b32);
info[INF_DEBUG_COMMANDS_ISSUED] = mkf_u32(0, debug_commands_issued);
info[INF_CUM_LOG_MSGS] = mkf_u32(FN_COUNTER, cum_log_messages);