MINOR: sample: add cpu_calls, cpu_ns_avg, cpu_ns_tot, lat_ns_avg, lat_ns_tot

These sample fetch keywords report performance metrics about the task calling
them. They are useful to report in logs which requests consume too much CPU
time and what negative performance impact it has on other requests. Typically
logging cpu_ns_avg and lat_ns_avg will show culprits and victims.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 180bfc8..2ce9616 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -13968,6 +13968,42 @@
   then this fetch clearly does not make sense, in which case the value returned
   will be -1.
 
+cpu_calls : integer
+  Returns the number of calls to the task processing the stream or current
+  request since it was allocated. This number is reset for each new request on
+  the same connections in case of HTTP keep-alive. This value should usually be
+  low and stable (around 2 calls for a typically simple request) but may become
+  high if some processing (compression, caching or analysis) is performed. This
+  is purely for performance monitoring purposes.
+
+cpu_ns_avg : integer
+  Returns the average number of nanoseconds spent in each call to the task
+  processing the stream or current request. This number is reset for each new
+  request on the same connections in case of HTTP keep-alive. This value
+  indicates the overall cost of processing the request or the connection for
+  each call. There is no good nor bad value but the time spent in a call
+  automatically causes latency for other processing (see lat_ns_avg below),
+  and may affect other connections' apparent response time. Certain operations
+  like compression, complex regex matching or heavy Lua operations may directly
+  affect this value, and having it in the logs will make it easier to spot the
+  faulty processing that needs to be fixed to recover decent performance.
+  Note: this value is exactly cpu_ns_tot divided by cpu_calls.
+
+cpu_ns_tot : integer
+  Returns the total number of nanoseconds spent in all calls to the task
+  processing the stream or current request. This number is reset for each new
+  request on the same connections in case of HTTP keep-alive. This value
+  indicates the overall cost of processing the request or the connection for
+  each call. There is no good nor bad value but the time spent in a call
+  automatically causes latency for other processing (see lat_ns_avg below),
+  induces CPU costs on the machine, and may affect other connections' apparent
+  response time. Certain operations like compression, complex regex matching or
+  heavy Lua operations may directly affect this value, and having it in the
+  logs will make it easier to spot the faulty processing that needs to be fixed
+  to recover decent performance. The value may be artificially high due to a
+  high cpu_calls count, for example when processing many HTTP chunks, and for
+  this reason it is often preferred to log cpu_ns_avg instead.
+
 date([<offset>]) : integer
   Returns the current date as the epoch (number of seconds since 01/01/1970).
   If an offset value is specified, then it is a number of seconds that is added
@@ -14076,6 +14112,43 @@
 ipv6(<ipv6>) : ipv6
   Returns an ipv6.
 
+lat_ns_avg : integer
+  Returns the average number of nanoseconds spent between the moment the task
+  handling the stream is woken up and the moment it is effectively called. This
+  number is reset for each new request on the same connections in case of HTTP
+  keep-alive. This value indicates the overall latency inflicted to the current
+  request by all other requests being processed in parallel, and is a direct
+  indicator of perceived performance due to noisy neighbours. In order to keep
+  the value low, it is possible to reduce the scheduler's run queue depth using
+  "tune.runqueue-depth", to reduce the number of concurrent events processed at
+  once using "tune.maxpollevents", to decrease the stream's nice value using
+  the "nice" option on the "bind" lines or in the frontend, or to look for
+  other heavy requests in logs (those exhibiting large values of "cpu_ns_avg"),
+  whose processing needs to be adjusted or fixed. Compression of large buffers
+  could be a culprit, like heavy regex or long lists of regex.
+  Note: this value is exactly lat_ns_tot divided by cpu_calls.
+
+lat_ns_tot : integer
+  Returns the total number of nanoseconds spent between the moment the task
+  handling the stream is woken up and the moment it is effectively called. This
+  number is reset for each new request on the same connections in case of HTTP
+  keep-alive. This value indicates the overall latency inflicted to the current
+  request by all other requests being processed in parallel, and is a direct
+  indicator of perceived performance due to noisy neighbours. In order to keep
+  the value low, it is possible to reduce the scheduler's run queue depth using
+  "tune.runqueue-depth", to reduce the number of concurrent events processed at
+  once using "tune.maxpollevents", to decrease the stream's nice value using
+  the "nice" option on the "bind" lines or in the frontend, or to look for
+  other heavy requests in logs (those exhibiting large values of "cpu_ns_avg"),
+  whose processing needs to be adjusted or fixed. Compression of large buffers
+  could be a culprit, like heavy regex or long lists of regex. Note: while it
+  may intuitively seem that the total latency adds to a transfer time, it is
+  almost never true because while a task waits for the CPU, network buffers
+  continue to fill up and the next call will process more at once. The value
+  may be artificially high due to a high cpu_calls count, for example when
+  processing many HTTP chunks, and for this reason it is often preferred to log
+  lat_ns_avg instead, which is a more relevant performance indicator.
+
 meth(<method>) : method
   Returns a method.
 
diff --git a/src/sample.c b/src/sample.c
index 659c5f8..8ca4540 100644
--- a/src/sample.c
+++ b/src/sample.c
@@ -2889,6 +2889,61 @@
 	return 1;
 }
 
+/* returns the number of calls of the current stream's process_stream() */
+static int
+smp_fetch_cpu_calls(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->task->calls;
+	return 1;
+}
+
+/* returns the average number of nanoseconds spent processing the stream per call */
+static int
+smp_fetch_cpu_ns_avg(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->task->calls ? smp->strm->task->cpu_time / smp->strm->task->calls : 0;
+	return 1;
+}
+
+/* returns the total number of nanoseconds spent processing the stream */
+static int
+smp_fetch_cpu_ns_tot(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->task->cpu_time;
+	return 1;
+}
+
+/* returns the average number of nanoseconds per call spent waiting for other tasks to be processed */
+static int
+smp_fetch_lat_ns_avg(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->task->calls ? smp->strm->task->lat_time / smp->strm->task->calls : 0;
+	return 1;
+}
+
+/* returns the total number of nanoseconds spent waiting for other tasks to be processed */
+static int
+smp_fetch_lat_ns_tot(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->task->lat_time;
+	return 1;
+}
+
 static int smp_fetch_const_str(const struct arg *args, struct sample *smp, const char *kw, void *private)
 {
 	smp->flags |= SMP_F_CONST;
@@ -3025,6 +3070,12 @@
 	{ "thread",       smp_fetch_thread,  0,          NULL, SMP_T_SINT, SMP_USE_INTRN },
 	{ "rand",         smp_fetch_rand,  ARG1(0,SINT), NULL, SMP_T_SINT, SMP_USE_INTRN },
 	{ "stopping",     smp_fetch_stopping, 0,         NULL, SMP_T_BOOL, SMP_USE_INTRN },
+
+	{ "cpu_calls",    smp_fetch_cpu_calls,  0,       NULL, SMP_T_SINT, SMP_USE_INTRN },
+	{ "cpu_ns_avg",   smp_fetch_cpu_ns_avg, 0,       NULL, SMP_T_SINT, SMP_USE_INTRN },
+	{ "cpu_ns_tot",   smp_fetch_cpu_ns_tot, 0,       NULL, SMP_T_SINT, SMP_USE_INTRN },
+	{ "lat_ns_avg",   smp_fetch_lat_ns_avg, 0,       NULL, SMP_T_SINT, SMP_USE_INTRN },
+	{ "lat_ns_tot",   smp_fetch_lat_ns_tot, 0,       NULL, SMP_T_SINT, SMP_USE_INTRN },
 
 	{ "str",  smp_fetch_const_str,  ARG1(1,STR),  NULL                , SMP_T_STR,  SMP_USE_INTRN },
 	{ "bool", smp_fetch_const_bool, ARG1(1,STR),  smp_check_const_bool, SMP_T_BOOL, SMP_USE_INTRN },