MEDIUM: tasks: collect per-task CPU time and latency

Right now we measure for each task the cumulated time spent waiting for
the CPU and using it. The timestamp uses a 64-bit integer to report a
nanosecond-level date. This is only enabled when "profiling.tasks" is
enabled, and consumes less than 1% extra CPU on x86_64 when enabled.
The cumulated processing time and wait time are reported in "show sess".

The task's counters are also reset when an HTTP transaction is reset
since the HTTP part pretends to restart on a fresh new stream. This
will make sure we always report correct numbers for each request in
the logs.
diff --git a/include/proto/task.h b/include/proto/task.h
index cc18bab..264899e 100644
--- a/include/proto/task.h
+++ b/include/proto/task.h
@@ -299,6 +299,9 @@
 	t->thread_mask = thread_mask;
 	t->nice = 0;
 	t->calls = 0;
+	t->call_date = 0;
+	t->cpu_time = 0;
+	t->lat_time = 0;
 	t->expire = TICK_ETERNITY;
 	return t;
 }
diff --git a/include/types/task.h b/include/types/task.h
index bc64f43..e8ade67 100644
--- a/include/types/task.h
+++ b/include/types/task.h
@@ -64,6 +64,7 @@
 		unsigned short state; /* task state : bitfield of TASK_	*/ \
 		short nice; /* task prio from -1024 to +1024, or -32768 for tasklets */ \
 		unsigned int calls; /* number of times process was called */ \
+		uint64_t cpu_time; /* total CPU time consumed */            \
 		struct task *(*process)(struct task *t, void *ctx, unsigned short state); /* the function which processes the task */ \
 		void *context; /* the task's context */			\
 	}
@@ -75,6 +76,8 @@
 	struct eb32_node wq;		/* ebtree node used to hold the task in the wait queue */
 	int expire;			/* next expiration date for this task, in ticks */
 	unsigned long thread_mask;	/* mask of thread IDs authorized to process the task */
+	uint64_t call_date;		/* date of the last task wakeup or call */
+	uint64_t lat_time;		/* total latency time experienced */
 };
 
 /* lightweight tasks, without priority, mainly used for I/Os */
diff --git a/src/proto_http.c b/src/proto_http.c
index f7222cd..019556c 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -3809,6 +3809,12 @@
 	stream_stop_content_counters(s);
 	stream_update_time_stats(s);
 
+	/* reset the profiling counter */
+	s->task->calls     = 0;
+	s->task->cpu_time  = 0;
+	s->task->lat_time  = 0;
+	s->task->call_date = (profiling & HA_PROF_TASKS) ? now_mono_time() : 0;
+
 	s->logs.accept_date = date; /* user-visible date for logging */
 	s->logs.tv_accept = now;  /* corrected date for internal use */
 	s->logs.t_handshake = 0; /* There are no handshake in keep alive connection. */
diff --git a/src/stream.c b/src/stream.c
index 3c84cbe..96e7f88 100644
--- a/src/stream.c
+++ b/src/stream.c
@@ -2984,14 +2984,15 @@
 		}
 		else if ((tmpctx = objt_appctx(strm->si[0].end)) != NULL) {
 			chunk_appendf(&trash,
-			              "  app0=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx nice=%d calls=%u\n",
+			              "  app0=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx nice=%d calls=%u cpu=%llu lat=%llu\n",
 				      tmpctx,
 				      tmpctx->st0,
 				      tmpctx->st1,
 				      tmpctx->st2,
 			              tmpctx->applet->name,
 			              tmpctx->thread_mask,
-			              tmpctx->t->nice, tmpctx->t->calls);
+			              tmpctx->t->nice, tmpctx->t->calls,
+			              (unsigned long long)tmpctx->t->cpu_time, (unsigned long long)tmpctx->t->lat_time);
 		}
 
 		if ((cs = objt_cs(strm->si[1].end)) != NULL) {
@@ -3018,14 +3019,15 @@
 		}
 		else if ((tmpctx = objt_appctx(strm->si[1].end)) != NULL) {
 			chunk_appendf(&trash,
-			              "  app1=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx, nice=%d, calls=%u\n",
+			              "  app1=%p st0=%d st1=%d st2=%d applet=%s tmask=0x%lx, nice=%d, calls=%u, cpu=%llu, lat=%llu\n",
 				      tmpctx,
 				      tmpctx->st0,
 				      tmpctx->st1,
 				      tmpctx->st2,
 			              tmpctx->applet->name,
 			              tmpctx->thread_mask,
-			              tmpctx->t->nice, tmpctx->t->calls);
+			              tmpctx->t->nice, tmpctx->t->calls,
+			              (unsigned long long)tmpctx->t->cpu_time, (unsigned long long)tmpctx->t->lat_time);
 		}
 
 		chunk_appendf(&trash,
@@ -3225,10 +3227,11 @@
 			}
 
 			chunk_appendf(&trash,
-				     " ts=%02x age=%s calls=%d",
+				     " ts=%02x age=%s calls=%d cpu=%llu lat=%llu",
 				     curr_strm->task->state,
 				     human_time(now.tv_sec - curr_strm->logs.tv_accept.tv_sec, 1),
-				     curr_strm->task->calls);
+			             curr_strm->task->calls,
+			             (unsigned long long)curr_strm->task->cpu_time, (unsigned long long)curr_strm->task->lat_time);
 
 			chunk_appendf(&trash,
 				     " rq[f=%06xh,i=%u,an=%02xh,rx=%s",
diff --git a/src/task.c b/src/task.c
index 779dbf5..5c5e104 100644
--- a/src/task.c
+++ b/src/task.c
@@ -140,6 +140,9 @@
 		t->rq.key += offset;
 	}
 
+	if (profiling & HA_PROF_TASKS)
+		t->call_date = now_mono_time();
+
 	eb32sc_insert(root, &t->rq, t->thread_mask);
 #ifdef USE_THREAD
 	if (root == &rqueue) {
@@ -416,6 +419,14 @@
 		ctx = t->context;
 		process = t->process;
 		t->calls++;
+
+		if (unlikely(!TASK_IS_TASKLET(t) && t->call_date)) {
+			uint64_t now_ns = now_mono_time();
+
+			t->lat_time += now_ns - t->call_date;
+			t->call_date = now_ns;
+		}
+
 		curr_task = (struct task *)t;
 		if (likely(process == process_stream))
 			t = process_stream(t, ctx, state);
@@ -432,6 +443,11 @@
 		 * immediately, else we defer it into wait queue
 		 */
 		if (t != NULL) {
+			if (unlikely(!TASK_IS_TASKLET(t) && t->call_date)) {
+				t->cpu_time += now_mono_time() - t->call_date;
+				t->call_date = 0;
+			}
+
 			state = HA_ATOMIC_AND(&t->state, ~TASK_RUNNING);
 			if (state)
 #ifdef USE_THREAD