MINOR: add a new function call tracer for debugging purposes

This feature relies on GCC's ability to call helpers at function entry/exit
points. We define these helpers to quickly dump the minimum info into a trace
file that can be converted to a human readable format using a script in the
contrib/trace directory. This has only been implemented in the GNU makefile
for now on as it is unsure whether it's supported on all OSes.

The feature is enabled by building with "TRACE=1". The performance impact is
huge, so this feature should only be used when debugging. To limit the loss
of performance, fprintf() has been disabled and the output is hand-crafted
and emitted using fwrite(), resulting in doubling the performance. Using the
TSC instead of gettimeofday() also doubles the performance. Around 1200 conns/s
may be achieved on a Pentium-M 1.7 GHz which leads to around 50 MB/s of traces.

The entry and exits of all functions will be dumped into a file designated
by the HAPROXY_TRACE environment variable, or by default "trace.out". If the
trace file name is empty or "/dev/null", then traces are disabled. If
opening the trace file fails, then stderr is used. If HAPROXY_TRACE_FAST is
used, then the time is taken from the global <now> variable. Last, if
HAPROXY_TRACE_TSC is used, then the machine's TSC is used instead of the
real time (almost twice as fast).

The output format is :

  <sec.usec> <level> <caller_ptr> <dir> <callee_ptr>
or :
  <tsc> <level> <caller_ptr> <dir> <callee_ptr>

where <dir> is '>' when entering a function and '<' when leaving.

The awk script in contrib/trace provides a nicer indented output :

6f74989e6f8 ->->->   run_poll_loop > signal_process_queue [src/haproxy.c:1097:0x804bd69] > [include/proto/signal.h:32:0x8049cd0]
6f74989eb00          run_poll_loop < signal_process_queue [src/haproxy.c:1097:0x804bd69] < [include/proto/signal.h:32:0x8049cd0]
6f74989ef44 ->->->   run_poll_loop > wake_expired_tasks [src/haproxy.c:1100:0x804bd72] > [src/task.c:123:0x8055060]
6f74989f3a6 ->->->->   wake_expired_tasks > eb32_lookup_ge [src/task.c:128:0x8055091] > [ebtree/eb32tree.c:138:0x80a8c70]
6f74989f7e9            wake_expired_tasks < eb32_lookup_ge [src/task.c:128:0x8055091] < [ebtree/eb32tree.c:138:0x80a8c70]
6f74989fc0d ->->->->   wake_expired_tasks > eb32_first [src/task.c:134:0x80550d5] > [ebtree/eb32tree.h:55:0x8054ad0]
6f7498a003d ->->->->->   eb32_first > eb_first [ebtree/eb32tree.h:56:0x8054af1] > [ebtree/ebtree.h:520:0x8054a10]
6f7498a0436 ->->->->->->   eb_first > eb_walk_down [ebtree/ebtree.h:521:0x8054a33] > [ebtree/ebtree.h:442:0x80549a0]
6f7498a0843 ->->->->->->->   eb_walk_down > eb_gettag [ebtree/ebtree.h:445:0x80549d6] > [ebtree/ebtree.h:418:0x80548e0]
6f7498a0c2b                  eb_walk_down < eb_gettag [ebtree/ebtree.h:445:0x80549d6] < [ebtree/ebtree.h:418:0x80548e0]
6f7498a1042 ->->->->->->->   eb_walk_down > eb_untag [ebtree/ebtree.h:447:0x80549e2] > [ebtree/ebtree.h:412:0x80548a0]
6f7498a1498                  eb_walk_down < eb_untag [ebtree/ebtree.h:447:0x80549e2] < [ebtree/ebtree.h:412:0x80548a0]
6f7498a18c6 ->->->->->->->   eb_walk_down > eb_root_to_node [ebtree/ebtree.h:448:0x80549e7] > [ebtree/ebtree.h:432:0x8054960]
6f7498a1cd4                  eb_walk_down < eb_root_to_node [ebtree/ebtree.h:448:0x80549e7] < [ebtree/ebtree.h:432:0x8054960]
6f7498a20c4                eb_first < eb_walk_down [ebtree/ebtree.h:521:0x8054a33] < [ebtree/ebtree.h:442:0x80549a0]
6f7498a24b4              eb32_first < eb_first [ebtree/eb32tree.h:56:0x8054af1] < [ebtree/ebtree.h:520:0x8054a10]
6f7498a289c            wake_expired_tasks < eb32_first [src/task.c:134:0x80550d5] < [ebtree/eb32tree.h:55:0x8054ad0]
6f7498a2c8c          run_poll_loop < wake_expired_tasks [src/haproxy.c:1100:0x804bd72] < [src/task.c:123:0x8055060]
6f7498a3095 ->->->   run_poll_loop > process_runnable_tasks [src/haproxy.c:1103:0x804bd7a] > [src/task.c:190:0x8055150]

A nice improvement would possibly consist in trying to get the function's
arguments in the stack and to dump a few more infor for some well-known
functions (eg: the session's status for process_session).
diff --git a/Makefile b/Makefile
index a6c7962..c70dc94 100644
--- a/Makefile
+++ b/Makefile
@@ -122,6 +122,11 @@
 # use at all.
 DEBUG =
 
+#### Trace options
+# Use TRACE=1 to trace function calls to file "trace.out" or to stderr if not
+# possible.
+TRACE =
+
 #### Additional include and library dirs
 # Redefine this if you want to add some special PATH to include/libs
 ADDINC =
@@ -478,6 +483,12 @@
 COPTS += -DCONFIG_HAPROXY_DATE=\"$(VERDATE)\"
 endif
 
+ifneq ($(TRACE),)
+# if tracing is enabled, we want it to be as fast as possible
+TRACE_COPTS := $(filter-out -O0 -O1 -O2 -pg -finstrument-functions,$(COPTS)) -O3 -fomit-frame-pointer
+COPTS += -finstrument-functions
+endif
+
 #### Global link options
 # These options are added at the end of the "ld" command line. Use LDFLAGS to
 # add options at the beginning of the "ld" command line if needed.
@@ -523,6 +534,10 @@
               $(EBTREE_DIR)/ebmbtree.o $(EBTREE_DIR)/ebsttree.o \
               $(EBTREE_DIR)/ebimtree.o $(EBTREE_DIR)/ebistree.o
 
+ifneq ($(TRACE),)
+OBJS += src/trace.o
+endif
+
 # Not used right now
 LIB_EBTREE = $(EBTREE_DIR)/libebtree.a
 
@@ -538,6 +553,9 @@
 %.o:	%.c
 	$(CC) $(COPTS) -c -o $@ $<
 
+src/trace.o: src/trace.c
+	$(CC) $(TRACE_COPTS) -c -o $@ $<
+
 src/haproxy.o:	src/haproxy.c
 	$(CC) $(COPTS) \
 	      -DBUILD_TARGET='"$(strip $(TARGET))"' \
diff --git a/contrib/trace/trace.awk b/contrib/trace/trace.awk
new file mode 100755
index 0000000..8c8854a
--- /dev/null
+++ b/contrib/trace/trace.awk
@@ -0,0 +1,70 @@
+#!/bin/sh
+#
+# trace.awk - Fast trace symbol resolver - w@1wt.eu - 2012/05/25
+#
+# Principle: this program launches reads pointers from a trace file and if not
+# found in its cache, it passes them over a pipe to addr2line which is forked
+# in a coprocess, then stores the result in the cache.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version
+# 2 of the License, or (at your option) any later version.
+#
+# usage: $0 exec_file < trace.out
+#
+
+if [ $# -lt 1 ]; then
+  echo "Usage:   ${0##*/} exec_file < trace.out"
+  echo "Example: ${0##*/} ./haproxy < trace.out"
+  echo "Example: HAPROXY_TRACE=/dev/stdout ./haproxy -f cfg | ${0##*/} ./haproxy"
+  exit 1
+fi
+
+if [ ! -s "$1" ]; then
+  echo "$1 is not a valid executable file"
+  exit 1
+fi
+
+exec awk -v prog="$1" \
+'
+BEGIN {
+  if (cmd == "")
+    cmd=ENVIRON["ADDR2LINE"];
+  if (cmd == "")
+    cmd="addr2line";
+
+  if (prog == "")
+    prog=ENVIRON["PROG"];
+
+  cmd=cmd " -f -e " prog;
+
+  for (i = 1; i < 100; i++) {
+    indents[">",i] = indents[">",i-1] "->"
+    indents[">",i-1] = indents[">",i-1] " "
+    indents["<",i] = indents["<",i-1] "  "
+    indents["<",i-1] = indents["<",i-1] " "
+  }
+}
+
+function getptr(ptr)
+{
+  loc=locs[ptr];
+  name=names[ptr];
+  if (loc == "" || name == "") {
+    print ptr |& cmd;
+    cmd |& getline name;
+    cmd |& getline loc;
+    names[ptr]=name
+    locs[ptr]=loc
+  }
+}
+
+{
+  # input format: <timestamp> <level> <caller> <dir> <callee>
+  getptr($3); caller_loc=loc; caller_name=name
+  getptr($5); callee_loc=loc; callee_name=name
+  printf "%s %s  %s %s %s [%s:%s] %s [%s:%s]\n",
+    $1, indents[$4,$2], caller_name, $4, callee_name, caller_loc, $3, $4, callee_loc, $5
+}
+'
diff --git a/src/trace.c b/src/trace.c
new file mode 100644
index 0000000..ab3827d
--- /dev/null
+++ b/src/trace.c
@@ -0,0 +1,247 @@
+/*
+ * Function call tracing for gcc >= 2.95
+ *
+ * Copyright 2012 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * gcc is able to call a specific function when entering and leaving any
+ * function when compiled with -finstrument-functions. This code must not
+ * be built with this argument. The performance impact is huge, so this
+ * feature should only be used when debugging.
+ *
+ * The entry and exits of all functions will be dumped into a file designated
+ * by the HAPROXY_TRACE environment variable, or by default "trace.out". If the
+ * trace file name is empty or "/dev/null", then traces are disabled. If
+ * opening the trace file fails, then stderr is used. If HAPROXY_TRACE_FAST is
+ * used, then the time is taken from the global <now> variable. Last, if
+ * HAPROXY_TRACE_TSC is used, then the machine's TSC is used instead of the
+ * real time (almost twice as fast).
+ *
+ * The output format is :
+ *
+ *   <sec.usec> <level> <caller_ptr> <dir> <callee_ptr>
+ *  or :
+ *   <tsc> <level> <caller_ptr> <dir> <callee_ptr>
+ *
+ * where <dir> is '>' when entering a function and '<' when leaving.
+ *
+ * The article below is a nice explanation of how this works :
+ *   http://balau82.wordpress.com/2010/10/06/trace-and-profile-function-calls-with-gcc/
+ */
+
+#include <sys/time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <common/compiler.h>
+#include <common/time.h>
+
+static FILE *log;
+static int level;
+static int disabled;
+static int fast_time;
+static int use_tsc;
+static struct timeval trace_now;
+static struct timeval *now_ptr;
+static char line[128]; /* more than enough for a message (9+1+6+1+3+1+18+1+1+18+1+1) */
+
+static int open_trace()
+{
+	const char *output = getenv("HAPROXY_TRACE");
+
+	if (!output)
+		output = "trace.out";
+
+	if (!*output || strcmp(output, "/dev/null") == 0) {
+		disabled = 1;
+		return 0;
+	}
+
+	log = fopen(output, "w");
+	if (!log)
+		log = stderr;
+
+	now_ptr = &now;
+	if (getenv("HAPROXY_TRACE_FAST") != NULL) {
+		fast_time = 1;
+		now_ptr = &trace_now;
+	}
+	if (getenv("HAPROXY_TRACE_TSC") != NULL) {
+		fast_time = 1;
+		use_tsc = 1;
+	}
+	return 1;
+}
+
+/* This function first divides the number by 100M then iteratively multiplies it
+ * by 100 (using adds and shifts). The trick is that dividing by 100M is equivalent
+ * to multiplying by 1/100M, which approximates to 1441151881/2^57. All local
+ * variables fit in registers on x86. This version outputs two digits per round.
+ * <min_pairs> indicates the minimum number of pairs of digits that have to be
+ * emitted, which might be left-padded with zeroes.
+ * It returns the pointer to the ending '\0'.
+ */
+static char *ultoad2(unsigned int x, char *out, int min_pairs)
+{
+	unsigned int q;
+	char *p = out;
+	int pos = 4;
+	unsigned long long y;
+
+	static const unsigned short bcd[100] = {
+		0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
+		0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
+		0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
+		0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
+		0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
+		0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
+		0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
+		0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
+		0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
+		0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939 };
+
+	y = x * 1441151881ULL;  /* y>>57 will be the integer part of x/100M */
+	while (1) {
+		q = y >> 57;
+		/* Q is composed of the first digit in the lower byte and the second
+		 * digit in the higher byte.
+		 */
+		if (p != out || q > 9 || pos < min_pairs) {
+#if defined(__i386__) || defined(__x86_64__)
+			/* unaligned accesses are fast on x86 */
+			*(unsigned short *)p = bcd[q];
+			p += 2;
+#else
+			*(p++) = bcd[q];
+			*(p++) = bcd[q] >> 8;
+#endif
+		}
+		else if (q || !pos) {
+			/* only at most one digit */
+			*(p++) = bcd[q] >> 8;
+		}
+		if (--pos < 0)
+			break;
+
+		y &= 0x1FFFFFFFFFFFFFFULL;  // remainder
+
+		if (sizeof(long) >= sizeof(long long)) {
+			/* shifting is preferred on 64-bit archs, while mult is faster on 32-bit.
+			 * We multiply by 100 by doing *5, *5 and *4, all of which are trivial.
+			 */
+			y += (y << 2);
+			y += (y << 2);
+			y <<= 2;
+		}
+		else
+			y *= 100;
+	}
+
+	*p = '\0';
+	return p;
+}
+
+/* Send <h> as hex into <out>. Returns the pointer to the ending '\0'. */
+static char *emit_hex(unsigned long h, char *out)
+{
+	static unsigned char hextab[16] = "0123456789abcdef";
+	int shift = sizeof(h) * 8 - 4;
+	unsigned int idx;
+
+	do {
+		idx = (h >> shift);
+		if (idx || !shift)
+			*out++ = hextab[idx & 15];
+		shift -= 4;
+	} while (shift >= 0);
+	*out = '\0';
+	return out;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+static inline unsigned long long rdtsc()
+{
+     unsigned int a, d;
+     asm volatile("rdtsc" : "=a" (a), "=d" (d));
+     return a + ((unsigned long long)d << 32);
+}
+#else
+static inline unsigned long long rdtsc()
+{
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+#endif
+
+static void make_line(void *from, void *to, int level, char dir)
+{
+	char *p = line;
+
+	if (unlikely(!log) && !open_trace())
+		return;
+
+	if (unlikely(!fast_time))
+		gettimeofday(now_ptr, NULL);
+
+#ifdef USE_SLOW_FPRINTF
+	if (!use_tsc)
+		fprintf(log, "%u.%06u %d %p %c %p\n",
+			(unsigned int)now_ptr->tv_sec,
+			(unsigned int)now_ptr->tv_usec,
+			level, from, dir, to);
+	else
+		fprintf(log, "%llx %d %p %c %p\n",
+			rdtsc(), level, from, dir, to);
+	return;
+#endif
+
+	if (unlikely(!use_tsc)) {
+		/* "%u.06u", tv_sec, tv_usec */
+		p = ultoad2(now_ptr->tv_sec, p, 0);
+		*p++ = '.';
+		p = ultoad2(now_ptr->tv_usec, p, 3);
+	} else {
+		/* "%08x%08x", high, low */
+		unsigned long long t = rdtsc();
+		if (sizeof(long) < sizeof(long long))
+			p = emit_hex((unsigned long)(t >> 32U), p);
+		p = emit_hex((unsigned long)(t), p);
+	}
+
+	/* " %u", level */
+	*p++ = ' ';
+	p = ultoad2(level, p, 0);
+
+	/* " %p", from */
+	*p++ = ' '; *p++ = '0'; *p++ = 'x';
+	p = emit_hex((unsigned long)from, p);
+
+	/* " %c", dir */
+	*p++ = ' '; *p++ = dir;
+
+	/* " %p", to */
+	*p++ = ' '; *p++ = '0'; *p++ = 'x';
+	p = emit_hex((unsigned long)to, p);
+
+	*p++ = '\n';
+
+	fwrite(line, p - line, 1, log);
+}
+
+/* These are the functions GCC calls */
+void __cyg_profile_func_enter(void *to,  void *from)
+{
+	if (!disabled)
+		return make_line(from, to, ++level, '>');
+}
+
+void __cyg_profile_func_exit(void *to,  void *from)
+{
+	if (!disabled)
+		return make_line(from, to, level--, '<');
+}