CLEANUP: quic: no need for atomics on packet refcnt
This is a leftover from the implementation's history, but the
quic_rx_packet and quic_tx_packet ref counts were still atomically
updated. It was found in perf top that the cost of the atomic inc
in quic_tx_packet_refinc() alone was responsible for 1% of the CPU
usage at 135 Gbps. Given that packets are only processed on their
assigned thread, we don't need that anymore and this can be replaced
with regular non-atomic operations.
Doing this alone has reduced the CPU usage of qc_do_build_pkt()
from 3.6% to 2.5% and increased the overall bit rate by about 1%.
diff --git a/include/haproxy/quic_conn.h b/include/haproxy/quic_conn.h
index 60afccd..b5f25d8 100644
--- a/include/haproxy/quic_conn.h
+++ b/include/haproxy/quic_conn.h
@@ -502,13 +502,13 @@
/* Increment the reference counter of <pkt> */
static inline void quic_tx_packet_refinc(struct quic_tx_packet *pkt)
{
- HA_ATOMIC_ADD(&pkt->refcnt, 1);
+ pkt->refcnt++;
}
/* Decrement the reference counter of <pkt> */
static inline void quic_tx_packet_refdec(struct quic_tx_packet *pkt)
{
- if (!HA_ATOMIC_SUB_FETCH(&pkt->refcnt, 1)) {
+ if (--pkt->refcnt == 0) {
BUG_ON(!LIST_ISEMPTY(&pkt->frms));
/* If there are others packet in the same datagram <pkt> is attached to,
* detach the previous one and the next one from <pkt>.
@@ -670,7 +670,7 @@
break;
}
- if (HA_ATOMIC_LOAD(&pkt->refcnt))
+ if (pkt->refcnt)
break;
b_del(&qc->rx.buf, pkt->raw_len);
@@ -685,17 +685,14 @@
/* Increment the reference counter of <pkt> */
static inline void quic_rx_packet_refinc(struct quic_rx_packet *pkt)
{
- HA_ATOMIC_ADD(&pkt->refcnt, 1);
+ pkt->refcnt++;
}
/* Decrement the reference counter of <pkt> while remaining positive */
static inline void quic_rx_packet_refdec(struct quic_rx_packet *pkt)
{
- unsigned int refcnt;
-
- do {
- refcnt = HA_ATOMIC_LOAD(&pkt->refcnt);
- } while (refcnt && !HA_ATOMIC_CAS(&pkt->refcnt, &refcnt, refcnt - 1));
+ if (pkt->refcnt)
+ pkt->refcnt--;
}
/* Delete all RX packets for <qel> QUIC encryption level */