Merge branch 'tcpsplice'
diff --git a/Makefile b/Makefile
index 195e6a9..48f4775 100644
--- a/Makefile
+++ b/Makefile
@@ -31,6 +31,9 @@
 PCREDIR	:= $(shell pcre-config --prefix 2>/dev/null || :)
 #PCREDIR=/usr/local
 
+# This is the directory hosting libtcpsplice.[ah] when USE_TCPSPLICE is set
+TCPSPLICEDIR :=
+
 # This is for standard Linux 2.6 with netfilter and epoll()
 COPTS.linux26 = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL
 LIBS.linux26 =
@@ -117,6 +120,10 @@
 # do not change this one, enable USE_* variables instead.
 OPTIONS =
 
+ifneq ($(USE_TCPSPLICE),)
+OPTIONS += -DCONFIG_HAP_TCPSPLICE
+endif
+
 ifneq ($(USE_CTTPROXY),)
 OPTIONS += -DCONFIG_HAP_CTTPROXY
 endif
@@ -169,8 +176,16 @@
 REGEX_OPTS=$(COPTS.$(REGEX))
 CPU_OPTS=$(COPTS.$(CPU))
 
-COPTS=-Iinclude $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) $(OPTIONS)
-LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) $(ADDLIB)
+COPTS = -Iinclude $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) $(OPTIONS)
+LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX))
+
+# Skip -I/-L when TCPSPLICEDIR is empty, else they would swallow the next flag.
+ifneq ($(USE_TCPSPLICE),)
+COPTS += $(if $(strip $(TCPSPLICEDIR)),-I$(TCPSPLICEDIR))
+LIBS  += $(if $(strip $(TCPSPLICEDIR)),-L$(TCPSPLICEDIR)) -ltcpsplice
+endif
+COPTS += $(ADDINC)
+LIBS += $(ADDLIB)
 
 CFLAGS = -Wall $(COPTS) $(DEBUG)
 LDFLAGS = -g
diff --git a/doc/tcp-splicing.txt b/doc/tcp-splicing.txt
new file mode 100644
index 0000000..ffdb256
--- /dev/null
+++ b/doc/tcp-splicing.txt
@@ -0,0 +1,196 @@
+                     Using Linux TCP Splicing with HAProxy
+                           Willy Tarreau <w@1wt.eu>
+                                - 2007/01/06 -
+
+
+Alexandre Cassen has started a project called Linux Layer7 Switching (L7SW),
+whose goal is to provide kernel services to help userland proxies achieve
+very high performance. Right now, the project consists of a loadable kernel
+module providing TCP Splicing under Linux.
+
+TCP Splicing is a method by which a userland proxy can tell the kernel that
+it considers it has no added value on the data part of a connection, and that
+the kernel can perform the transfers itself, thus relieving the proxy from
+a potentially heavy job. There are two advantages to this method :
+
+  - it reduces the number of process wakeups
+  - it reduces the number of data copies between user-space and kernel buffers
+
+This method is particularly suited to protocols in which data is sent till
+the end of the session. This is the case for FTP data for instance, and it
+is also the case for the BODY part of HTTP/1.0.
+
+The great news is that haproxy has been designed from the beginning with a
+clear distinction between the headers and the DATA phase, so it was a child's
+game to add hooks to Alex's library in it.
+
+Be careful! Both versions are to be considered BETA software ! Run them on
+your systems if you want, but do not complain if it crashes twice a day !
+Anyway, it seems stable on our test machines.
+
+In order to use TCP Splicing on haproxy, you need :
+
+  - Linux Layer7 Switching code version 0.1.1 : [ http://linux-l7sw.sf.net/ ]
+  - Haproxy version 1.3.5 : [ http://haproxy.1wt.eu/download/1.3/src/ ]
+
+Then, you must untar both packages in any location, let's assume you'll
+be using /tmp. First extract l7sw and :
+
+  $ cd /tmp
+  $ tar zxf layer7switch-0.1.1.tar.gz
+  $ cd layer7switch-0.1.1
+
+L7SW currently only supports Linux kernel 2.6.19+. If you prefer to use it
+on a more stable kernel, such as 2.6.16.X, you can apply this patch to the
+L7SW directory :
+
+  [ http://haproxy.1wt.eu/download/patches/tcp_splice-0.1.1-linux-2.6.16.diff ]
+
+  $ patch -p1 -d kernel < tcp_splice-0.1.1-linux-2.6.16.diff
+
+Alternatively, if you prefer to run it on 2.4.33+, you can apply this patch
+to the L7SW directory :
+
+  [ http://haproxy.1wt.eu/download/patches/tcp_splice-0.1.1-linux-2.4.33.diff ]
+
+  $ patch -p1 -d kernel < tcp_splice-0.1.1-linux-2.4.33.diff
+
+Then build the kernel module as described in the L7SW README. Basically, you
+just have to do this once your tree has been patched :
+
+  $ cd kernel
+  $ make
+
+You can either install the resulting module (tcp_splice) or load it now. During
+early testing periods, it might be preferable to avoid installing anything and
+just load it manually :
+
+  $ sudo insmod tcp_splice.*o
+  $ cd ..
+
+Now that the module is loaded, you need to build the libtcpsplice library on
+which haproxy currently relies :
+
+  $ cd userland/libtcpsplice
+  $ make
+  $ cd ..
+
+For the adventurous, there's also a proof of concept in the userland/switchd
+directory, it may be useful if you encounter problems with haproxy for
+instance. But it is not needed at all here.
+
+OK, L7SW is ready. Now you have to extract haproxy and tell it to build using
+libtcpsplice :
+
+  $ cd /tmp
+  $ tar zxf haproxy-1.3.5.tar.gz
+  $ cd haproxy-1.3.5
+  $ make USE_TCPSPLICE=1 TCPSPLICEDIR=/tmp/layer7switch-0.1.1/userland/libtcpsplice
+
+There are other options to make, which are highly recommended, such as
+CPU=, REGEX=, and above all, TARGET= so that you use the best syscalls and
+functions for your system. Generally you will use TARGET=linux26, but 2.4 users
+with an epoll-patched kernel will use TARGET=linux24e. This is very important
+because failing to specify those options will disable important optimizations
+which might hide the tcpsplice benefits ! Please consult haproxy's README.
+
+Now that you have haproxy built with support for tcpsplice, and that the module
+is loaded, you have to write a config. There is an example in the 'examples'
+directory. Basically, you just have to add the "option tcpsplice" keyword BOTH
+in the frontend AND in the backend sections that you want to accelerate.
+
+If the option is specified only in the frontend or in the backend, then no
+acceleration will be used. It is designed this way to allow some front-back
+combinations to use it without forcing others to use it. Of course, if you use
+a single "listen" section, you just have to specify it once.
+
+As of now (l7sw-0.1.1 and haproxy-1.3.5), you need the CAP_NET_ADMIN capability
+to START and to RUN. For human beings, it means that you have to start haproxy
+as root and keep it running as root, so it must not drop its privileges. This
+is somewhat annoying, but we'll try to find a solution later.
+
+Also, l7sw-0.1.1 does not yet support TCP window scaling nor SACK. So you have
+to disable both features on the proxy :
+
+  $ sudo sysctl -w net.ipv4.tcp_window_scaling=0
+  $ sudo sysctl -w net.ipv4.tcp_sack=0
+  $ sudo sysctl -w net.ipv4.tcp_dsack=0
+  $ sudo sysctl -w net.ipv4.tcp_tw_recycle=1
+
+You can now check that everything works as expected. Run "vmstat 1" or "top"
+in one terminal, and haproxy in another one :
+
+  $ sudo ./haproxy -f examples/tcp-splicing-sample.cfg
+
+Transferring large files through it should not affect it much. You should observe
+something like 10% CPU instead of 95% when transferring 1 MB files at full
+speed. You can play with the tcpsplice option in the configuration to see the
+effects.
+
+
+Troubleshooting
+---------------
+
+This software is still beta, and you will probably encounter some caveats.
+I personally ran into a few issues that we'll try to address with Alex. First
+of all, I had occasional lockups on my SMP machine which I never had on an UP
+one. So if you get problems on an SMP machine, please reboot it in UP and do
+not waste your time on this.
+
+I also noticed that sometimes, some sessions remained established even after
+the end of the program. You might also see some situations where even after
+the proxy's exit, the traffic still passes through the system. It may happen
+when you have a limited source port range and you reuse a TIME_WAIT
+session matching exactly the same source and destinations. This will need
+to be addressed too.
+
+You can play with tcp_splice variables and timeouts here in /proc/sys/net/ :
+
+  $ ls /proc/sys/net/tcp_splice/
+  debug_level        timeout_established  timeout_listen   timeout_synsent
+  timeout_close      timeout_finwait      timeout_synack   timeout_timewait
+  timeout_closewait  timeout_lastack      timeout_synrecv
+  
+  $ sysctl net/tcp_splice
+  net.tcp_splice.debug_level = 0
+  net.tcp_splice.timeout_synack = 120
+  net.tcp_splice.timeout_listen = 120
+  net.tcp_splice.timeout_lastack = 30
+  net.tcp_splice.timeout_closewait = 60
+  net.tcp_splice.timeout_close = 10
+  net.tcp_splice.timeout_timewait = 120
+  net.tcp_splice.timeout_finwait = 120
+  net.tcp_splice.timeout_synrecv = 60
+  net.tcp_splice.timeout_synsent = 120
+  net.tcp_splice.timeout_established = 900
+
+You can also consult the full session list here :
+
+$ head /proc/net/tcp_splice_conn 
+FromIP   FPrt ToIP     TPrt LocalIP  LPrt DestIP   DPrt State       Expires
+0A000301 4EBB 0A000302 1F40 0A000302 817B 0A000301 0050 CLOSE             7
+0A000301 4E9B 0A000302 1F40 0A000302 8165 0A000301 0050 CLOSE             7
+
+Since a session exists at least in CLOSE state for 10 seconds, you just have
+to consult this entry less than 10 seconds after a test to see a session.
+
+Please report your successes, failures, suggestions or fixes to the L7SW
+mailing list here (do not use the list to report other haproxy bugs) :
+
+  https://lists.sourceforge.net/lists/listinfo/linux-l7sw-devel
+
+
+Motivations
+-----------
+
+I've always wanted haproxy to be the fastest and most reliable software load
+balancer available. L7SW is an opportunity to get a huge performance boost
+on high traffic sites (eg: photo sharing, streaming, ...). In turn, I find it a
+shame that Alex wastes his time redeveloping a proxy as a proof of concept for
+his kernel code. While it is a fun game to enter into, it really becomes harder
+when you need to get close to customers' needs. So by porting haproxy early to
+L7SW, I get both the opportunity to get an idea of what it will soon be capable
+of, and help Alex spend more time on the complex kernel part.
+
+Have fun !
+Willy
diff --git a/examples/tcp-splicing-sample.cfg b/examples/tcp-splicing-sample.cfg
new file mode 100644
index 0000000..84d55a3
--- /dev/null
+++ b/examples/tcp-splicing-sample.cfg
@@ -0,0 +1,82 @@
+#
+# This is a sample configuration
+# haproxy >= 1.3.5 required.
+#
+# It listens on 192.168.1.10:80, and directs all requests for Host 'img' or
+# URIs starting with /img or /css to a dedicated group of servers. URIs
+# starting with /admin/stats are directed to a backend dedicated to statistics.
+# TCP splicing is used on static objects to relieve the process from the heavy
+# job.
+#
+
+global
+        maxconn         10000
+        log             127.0.0.1 local0
+        uid             200
+        gid             200
+        chroot          /var/empty
+        daemon
+
+
+# The public 'www' address in the DMZ
+frontend public
+	bind		192.168.1.10:80
+        mode            http
+	log		global
+        option		httplog
+        option          dontlognull
+        option		httpclose
+        option		tcpsplice
+	monitor-uri	/monitoruri
+        maxconn		8000
+	clitimeout	30000
+
+	# Host: will use a specific keyword soon
+	reqisetbe	^Host:\ img		static
+
+	# The URI will use a specific keyword soon
+	reqisetbe	^[^\ ]*\ /(img|css)/	static
+	reqisetbe	^[^\ ]*\ /admin/stats	stats
+
+	default_backend	dynamic
+
+
+# The static backend for 'Host: img', /img and /css.
+# TCP splicing is enabled on this backend because we don't expect to do
+# anything interesting with static objects, but we know they can eat much
+# bandwidth.
+backend static
+	mode		http
+	balance		roundrobin
+        option		tcpsplice
+        contimeout      5000
+        srvtimeout      5000
+	redispatch
+	retries		2
+	option		httpchk HEAD /favicon.ico
+        server		statsrv1 192.168.1.8:80 check inter 1000
+        server		statsrv2 192.168.1.9:80 check inter 1000
+
+
+backend dynamic
+	mode		http
+	balance		roundrobin
+	contimeout	30000
+	srvtimeout	30000
+	redispatch
+	retries		2
+	option		httpchk HEAD /login.php
+	cookie		DYNSRV insert indirect nocache
+	fullconn	4000 # the servers will be used at full load above this number of connections
+        server		dynsrv1 192.168.1.1:80 minconn 50 maxconn 500 cookie s1 check inter 1000
+        server		dynsrv2 192.168.1.2:80 minconn 50 maxconn 500 cookie s2 check inter 1000
+        server		dynsrv3 192.168.1.3:80 minconn 50 maxconn 500 cookie s3 check inter 1000
+        server		dynsrv4 192.168.1.4:80 minconn 50 maxconn 500 cookie s4 check inter 1000
+
+
+backend stats
+        log             global
+        mode            http
+        stats           uri /
+        balance         roundrobin
+
diff --git a/include/types/backend.h b/include/types/backend.h
index f70a370..d6079ad 100644
--- a/include/types/backend.h
+++ b/include/types/backend.h
@@ -57,6 +57,7 @@
 #define	PR_O_TPXY_CIP	0x04000000	/* bind to the client's IP address when connect()ing */
 #define	PR_O_TPXY_CLI	0x06000000	/* bind to the client's IP+port when connect()ing */
 #define	PR_O_TPXY_MASK	0x06000000	/* bind to a non-local address when connect()ing */
+#define	PR_O_TCPSPLICE	0x08000000      /* delegate data transfer to linux kernel's tcp_splice */
 
 
 #endif /* _TYPES_BACKEND_H */
diff --git a/include/types/global.h b/include/types/global.h
index 40c6c99..222d4fe 100644
--- a/include/types/global.h
+++ b/include/types/global.h
@@ -42,6 +42,7 @@
 #define LSTCHK_CAP_BIND	0x00000001	/* check that we can bind to any port */
 #define LSTCHK_CTTPROXY	0x00000002	/* check that tproxy is enabled */
 #define LSTCHK_NETADM	0x00000004	/* check that we have CAP_NET_ADMIN */
+#define LSTCHK_TCPSPLICE	0x00000008	/* check that linux tcp_splice is enabled */
 
 /* FIXME : this will have to be redefined correctly */
 struct global {
diff --git a/src/backend.c b/src/backend.c
index f7dd675..c0283d2 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -41,6 +41,10 @@
 #include <import/ip_tproxy.h>
 #endif
 
+#ifdef CONFIG_HAP_TCPSPLICE
+#include <libtcpsplice.h>
+#endif
+
 /*
  * This function recounts the number of usable active and backup servers for
  * proxy <p>. These numbers are returned into the p->srv_act and p->srv_bck.
@@ -364,6 +368,13 @@
 		return SN_ERR_PRXCOND; /* it is a configuration limit */
 	}
 
+#ifdef CONFIG_HAP_TCPSPLICE
+	if ((s->fe->options & s->be->beprm->options) & PR_O_TCPSPLICE) {
+		/* TCP splicing supported by both FE and BE */
+		tcp_splice_initfd(s->cli_fd, fd);
+	}
+#endif
+
 	if ((fcntl(fd, F_SETFL, O_NONBLOCK)==-1) ||
 	    (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)) {
 		qfprintf(stderr,"Cannot set client socket to non blocking mode.\n");
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 5017d50..8b6e94d 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -94,6 +94,10 @@
 	{ "allbackups",   PR_O_USE_ALL_BK, PR_CAP_BE, 0 },
 	{ "persist",      PR_O_PERSIST,    PR_CAP_BE, 0 },
 	{ "forceclose",   PR_O_FORCE_CLO | PR_O_HTTP_CLOSE, PR_CAP_BE, 0 },
+#ifdef CONFIG_HAP_TCPSPLICE
+	{ "tcpsplice",    PR_O_TCPSPLICE , PR_CAP_BE|PR_CAP_FE, LSTCHK_TCPSPLICE|LSTCHK_NETADM },
+#endif
+
 	{ NULL, 0, 0 }
 };
 
diff --git a/src/haproxy.c b/src/haproxy.c
index 0471364..19fffd9 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -90,6 +90,10 @@
 #include <proto/stream_sock.h>
 #include <proto/task.h>
 
+#ifdef CONFIG_HAP_TCPSPLICE
+#include <libtcpsplice.h>
+#endif
+
 /*********************************************************************/
 
 /*********************************************************************/
@@ -751,6 +755,18 @@
 #endif
 	}
 
+#ifdef CONFIG_HAP_TCPSPLICE
+	/* a last check for tcp_splice was requested: start it now or refuse to run */
+	if (global.last_checks & LSTCHK_TCPSPLICE) {
+		if (tcp_splice_start() < 0) {
+			Alert("[%s.main()] Cannot enable tcp_splice.\n"
+			      "  Make sure you have enough permissions and that the module is loadable.\n"
+			      "  Alternatively, you may disable the 'tcpsplice' options in the configuration.\n", argv[0]);
+			exit(1);
+		}
+	}
+#endif
+
 	if (nb_oldpids)
 		tell_old_pids(oldpids_sig);
 
diff --git a/src/proto_http.c b/src/proto_http.c
index 184d91a..b5f974a 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -51,6 +51,9 @@
 #include <proto/session.h>
 #include <proto/task.h>
 
+#ifdef CONFIG_HAP_TCPSPLICE
+#include <libtcpsplice.h>
+#endif
 
 #define DEBUG_PARSE_NO_SPEEDUP
 #undef DEBUG_PARSE_NO_SPEEDUP
@@ -1800,6 +1803,12 @@
 					t->logs.t_close = t->logs.t_connect; /* to get a valid end date */
 					sess_log(t);
 				}
+#ifdef CONFIG_HAP_TCPSPLICE
+				if ((t->fe->options & t->be->beprm->options) & PR_O_TCPSPLICE) {
+					/* TCP splicing supported by both FE and BE */
+					tcp_splice_splicefd(t->cli_fd, t->srv_fd, 0);
+				}
+#endif
 			}
 			else {
 				t->srv_state = SV_STHEADERS;
@@ -1956,6 +1965,12 @@
 					t->srv_state = SV_STSHUTW;
 				}
 
+#ifdef CONFIG_HAP_TCPSPLICE
+				if ((t->fe->options & t->be->beprm->options) & PR_O_TCPSPLICE) {
+					/* TCP splicing supported by both FE and BE */
+					tcp_splice_splicefd(t->cli_fd, t->srv_fd, 0);
+				}
+#endif
 				/* if the user wants to log as soon as possible, without counting
 				   bytes from the server, then this is the right moment. */
 				if (t->fe->to_log && !(t->logs.logwait & LW_BYTES)) {