[MEDIUM] limit the number of events returned by *poll*

By default, epoll/kqueue used to return as many events as possible.
This could sometimes cause huge latencies (latencies of up to 400 ms
have been observed with many thousands of fds at once). Limiting the
number of events returned also reduces the latency by avoiding too
much blind processing. The value is set to 200 by default and can be
changed in the global section using the tune.maxpollevents parameter.
diff --git a/doc/haproxy-en.txt b/doc/haproxy-en.txt
index 865bf8e..73372e3 100644
--- a/doc/haproxy-en.txt
+++ b/doc/haproxy-en.txt
@@ -128,6 +128,7 @@
   - pidfile <file>
   - ulimit-n <number>
   - stats
+  - tune.maxpollevents <number>
 
 
 1.1) Event logging
@@ -338,6 +339,12 @@
 Version 1.3.9 introduced kqueue() for FreeBSD/OpenBSD, and speculative epoll()
 which consists in trying to perform I/O before queuing the events via syscalls.
 
+In order to optimize latency, it is now possible to limit the number of events
+returned by a single call to poll. The limit is set to 200 by default. If a
+lower latency is sought, it may be useful to reduce this value by using the
+'tune.maxpollevents' parameter in the 'global' section. Increasing it will
+save a few CPU cycles in the presence of a large number of connections.
+
 Haproxy will use kqueue() or speculative epoll() when available, then epoll(),
 and will fall back to poll(), then to select(). However, if for any reason you
 need to disable epoll() or poll() (eg. because of a bug or just to compare
@@ -351,6 +358,7 @@
         # use only select()
         noepoll
         nopoll
+        tune.maxpollevents 100
 
 Note :
 ------
diff --git a/doc/haproxy-fr.txt b/doc/haproxy-fr.txt
index 63ac9a4..ec74611 100644
--- a/doc/haproxy-fr.txt
+++ b/doc/haproxy-fr.txt
@@ -134,6 +134,7 @@
   - quiet
   - pidfile <fichier>
   - ulimit-n <nombre>
+  - tune.maxpollevents <nombre>
 
 
 1.1) Journalisation des événements
@@ -362,6 +363,13 @@
 les opérations d'entrées/sorties avant de chaîner les événements par les appels
 système.
 
+Afin d'optimiser la latence, il est désormais possible de limiter le nombre
+d'événements remontés à chaque appel. La limite par défaut est fixée à 200. Si
+une latence plus petite est recherchée, il peut être justifié d'abaisser cette
+limite par l'utilisation du paramètre 'tune.maxpollevents' dans la section
+'global'. L'augmenter permettra d'économiser un peu le processeur en présence
+de très grands nombres de connexions simultanées.
+
 Haproxy utilisera kqueue() ou speculative epoll() lorsque ce sera disponible,
 puis epoll(), et se repliera sur poll(), puis en dernier lieu sur select().
 Cependant, si pour une raison quelconque il s'avérait nécessaire de désactiver
@@ -375,6 +383,7 @@
         # utiliser seulement select()
         noepoll
         nopoll
+        tune.maxpollevents 100
 
 Remarque :
 ----------
diff --git a/include/common/defaults.h b/include/common/defaults.h
index cfe60e8..198288f 100644
--- a/include/common/defaults.h
+++ b/include/common/defaults.h
@@ -2,7 +2,7 @@
   include/common/defaults.h
   Miscellaneous default values.
 
-  Copyright (C) 2000-2006 Willy Tarreau - w@1wt.eu
+  Copyright (C) 2000-2007 Willy Tarreau - w@1wt.eu
   
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -78,6 +78,12 @@
 #define MIN_RET_FOR_READ_LOOP 1460
 #endif
 
+// the max number of events returned in one call to poll/epoll. Too small a
+// value will cause lots of calls, and too high a value may cause high latency.
+#ifndef MAX_POLL_EVENTS
+#define MAX_POLL_EVENTS 200
+#endif
+
 // cookie delimitor in "prefix" mode. This character is inserted between the
 // persistence cookie and the original value. The '~' is allowed by RFC2965,
 // and should not be too common in server names.
diff --git a/include/types/global.h b/include/types/global.h
index 222d4fe..b259954 100644
--- a/include/types/global.h
+++ b/include/types/global.h
@@ -60,6 +60,9 @@
 	int logfac1, logfac2;
 	int loglev1, loglev2;
 	struct sockaddr_in logsrv1, logsrv2;
+	struct {
+		int maxpollevents; /* max number of poll events at once */
+	} tune;
 };
 
 extern struct global global;
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 5045ed2..69f11e0 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -276,6 +276,17 @@
 	else if (!strcmp(args[0], "stats")) {
 		global.mode |= MODE_STATS;
 	}
+	else if (!strcmp(args[0], "tune.maxpollevents")) {
+		if (global.tune.maxpollevents != 0) {
+			Alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]);
+			return 0;
+		}
+		if (*(args[1]) == 0) {
+			Alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]);
+			return -1;
+		}
+		global.tune.maxpollevents = atol(args[1]);
+	}
 	else if (!strcmp(args[0], "uid")) {
 		if (global.uid != 0) {
 			Alert("parsing [%s:%d] : user/uid already specified. Continuing.\n", file, linenum);
diff --git a/src/ev_epoll.c b/src/ev_epoll.c
index 9dccb35..a0c48b1 100644
--- a/src/ev_epoll.c
+++ b/src/ev_epoll.c
@@ -18,6 +18,7 @@
 #include <common/config.h>
 #include <common/standard.h>
 #include <common/time.h>
+#include <common/tools.h>
 
 #include <types/fd.h>
 #include <types/global.h>
@@ -238,7 +239,8 @@
 	else
 		wait_time = __tv_ms_elapsed(&now, exp) + 1;
 
-	status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
+	fd = MIN(maxfd, global.tune.maxpollevents);
+	status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
 	tv_now(&now);
 
 	for (count = 0; count < status; count++) {
@@ -278,7 +280,7 @@
 		goto fail_fd;
 
 	epoll_events = (struct epoll_event*)
-		calloc(1, sizeof(struct epoll_event) * global.maxsock);
+		calloc(1, sizeof(struct epoll_event) * global.tune.maxpollevents);
 
 	if (epoll_events == NULL)
 		goto fail_ee;
diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c
index 8740217..773db74 100644
--- a/src/ev_kqueue.c
+++ b/src/ev_kqueue.c
@@ -24,6 +24,7 @@
 #include <common/compat.h>
 #include <common/config.h>
 #include <common/time.h>
+#include <common/tools.h>
 
 #include <types/fd.h>
 #include <types/global.h>
@@ -118,11 +119,12 @@
 		to_ptr = &timeout;
 	}
 
+	fd = MIN(maxfd, global.tune.maxpollevents);
 	status = kevent(kqueue_fd, // int kq
 			NULL,      // const struct kevent *changelist
 			0,         // int nchanges
 			kev,       // struct kevent *eventlist
-			maxfd,     // int nevents
+			fd,        // int nevents
 			to_ptr);   // const struct timespec *timeout
 	tv_now(&now);
 
@@ -161,7 +163,7 @@
 	if (kqueue_fd < 0)
 		goto fail_fd;
 
-	kev = (struct kevent*)calloc(1, sizeof(struct kevent) * global.maxsock);
+	kev = (struct kevent*)calloc(1, sizeof(struct kevent) * global.tune.maxpollevents);
 
 	if (kev == NULL)
 		goto fail_kev;
diff --git a/src/ev_sepoll.c b/src/ev_sepoll.c
index d304c12..852577c 100644
--- a/src/ev_sepoll.c
+++ b/src/ev_sepoll.c
@@ -18,6 +18,7 @@
 #include <common/config.h>
 #include <common/standard.h>
 #include <common/time.h>
+#include <common/tools.h>
 
 #include <types/fd.h>
 #include <types/global.h>
@@ -389,7 +390,8 @@
 	}
 
 	/* now let's wait for real events */
-	status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
+	fd = MIN(maxfd, global.tune.maxpollevents);
+	status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
 
 	tv_now(&now);
 
@@ -439,7 +441,7 @@
 		goto fail_fd;
 
 	epoll_events = (struct epoll_event*)
-		calloc(1, sizeof(struct epoll_event) * global.maxsock);
+		calloc(1, sizeof(struct epoll_event) * global.tune.maxpollevents);
 
 	if (epoll_events == NULL)
 		goto fail_ee;
diff --git a/src/haproxy.c b/src/haproxy.c
index a86abeb..e0e42b2 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -520,6 +520,9 @@
 
 	global.maxsock += global.maxconn * 2; /* each connection needs two sockets */
 
+	if (global.tune.maxpollevents <= 0)
+		global.tune.maxpollevents = MAX_POLL_EVENTS;
+
 	if (arg_mode & (MODE_DEBUG | MODE_FOREGROUND)) {
 		/* command line debug mode inhibits configuration mode */
 		global.mode &= ~(MODE_DAEMON | MODE_QUIET);