[MEDIUM] further improve monotonic clock by checking for forward jumps The first implementation of the monotonic clock did not verify forward jumps. The consequence is that a fast-changing time may expire a lot of tasks. While it does seem minor, it is in fact problematic because most machines which boot with a wrong date are in the past and suddenly see their time jump several years into the future. The solution is to check whether we spent more apparent time in a poller than allowed (with a margin applied). The margin is currently set to 1000 ms. It should be large enough for any poll() to complete. Tests with a randomly jumping clock show that the result is quite accurate (error of less than 1 second at every change of more than one second).

commit: b0b37bcd657142d942e7ee8b1c7f1d3651db2d29 [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Mon Jun 23 14:00:57 2008 +0200
committer: Willy Tarreau <w@1wt.eu> Mon Jun 23 14:00:57 2008 +0200
tree: 0755226a6ca35d5c04b9ecc6b415efd79aadaba4
parent: b7f694f20e2eeb8f9020955bc64b0a55e9d33f50 [diff]
diff --git a/include/common/time.h b/include/common/time.h
index d615526..82ec402 100644
--- a/include/common/time.h
+++ b/include/common/time.h

@@ -44,6 +44,11 @@
 
 #define TIME_ETERNITY   (TV_ETERNITY_MS)
 
+/* we want to be able to detect time jumps. Fix the maximum wait time to a low
+ * value so that we know the time has changed if we wait longer.
+ */
+#define MAX_DELAY_MS    1000
+
 
 /* returns the lowest delay amongst <old> and <new>, and respects TIME_ETERNITY */
 #define MINTIME(old, new)	(((new)<0)?(old):(((old)<0||(new)<(old))?(new):(old)))
@@ -84,13 +89,15 @@
 	return tv;
 }
 
-/* tv_now_mono: sets <date> to the current time (wall clock), <mono> to a value
- * following a monotonic function, and applies any required correction if the
- * time goes backwards. Note that while we could improve it a bit by checking
- * that the new date is not too far in the future, it is not much necessary to
- * do so. 
+/* tv_update_date: sets <date> to system time, and sets <now> to something as
+ * close as possible to real time, following a monotonic function. The main
+ * principle consists in detecting backwards and forwards time jumps and adjusting
+ * an offset to correct them. This function should be called only once after
+ * each poll. The poll's timeout should be passed in <max_wait>, and the return
+ * value in <interrupted> (a non-zero value means that we have not expired the
+ * timeout).
  */
-REGPRM2 struct timeval *tv_now_mono(struct timeval *mono, struct timeval *wall);
+REGPRM2 void tv_update_date(int max_wait, int interrupted);
 
 /*
  * sets a struct timeval to its highest value so that it can never happen

diff --git a/src/cfgparse.c b/src/cfgparse.c
index 13d2a89..90611cd 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c

@@ -2831,7 +2831,7 @@
 	 */
 
 	/* will be needed further to delay some tasks */
-	tv_now_mono(&now, &date);
+	tv_update_date(0,1);
 
 	if ((curproxy = proxy) == NULL) {
 		Alert("parsing %s : no <listen> line. Nothing to do !\n",

diff --git a/src/ev_epoll.c b/src/ev_epoll.c
index e7aea93..adb0fd6 100644
--- a/src/ev_epoll.c
+++ b/src/ev_epoll.c

@@ -235,15 +235,18 @@
 	if (run_queue)
 		wait_time = 0;
 	else if (tv_iseternity(exp))
-		wait_time = -1;
+		wait_time = MAX_DELAY_MS;
 	else if (tv_isge(&now, exp))
 		wait_time = 0;
-	else
+	else {
 		wait_time = __tv_ms_elapsed(&now, exp) + 1;
+		if (wait_time > MAX_DELAY_MS)
+			wait_time = MAX_DELAY_MS;
+	}
 
 	fd = MIN(maxfd, global.tune.maxpollevents);
 	status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
-	tv_now_mono(&now, &date);
+	tv_update_date(wait_time, status);
 
 	for (count = 0; count < status; count++) {
 		fd = epoll_events[count].data.fd;

diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c
index f22aa5b..71e9ecf 100644
--- a/src/ev_kqueue.c
+++ b/src/ev_kqueue.c

@@ -102,25 +102,41 @@
 REGPRM2 static void _do_poll(struct poller *p, struct timeval *exp)
 {
 	int status;
-	int count, fd;
-	struct timespec timeout, *to_ptr;
+	int count, fd, delta_ms;
+	struct timespec timeout;
 
-	to_ptr = NULL;	// no timeout
 	if (run_queue) {
 		timeout.tv_sec = timeout.tv_nsec = 0;
-		to_ptr = &timeout;
+		delta_ms = 0;
 	}
 	else if (tv_isset(exp)) {
+		const struct timeval max_delay = {
+			.tv_sec  = MAX_DELAY_MS / 1000,
+			.tv_usec = (MAX_DELAY_MS % 1000) * 1000
+		};
 		struct timeval delta;
 
-		if (tv_isge(&now, exp))
+		if (tv_isge(&now, exp)) {
 			delta.tv_sec = delta.tv_usec = 0;
-		else
+			delta_ms = 0;
+		}
+		else {
 			tv_remain(&now, exp, &delta);
+			if (__tv_isgt(&delta, &max_delay)) {
+				delta    = max_delay;
+				delta_ms = MAX_DELAY_MS;
+			} else {
+				delta_ms = delta.tv_sec * 1000 + delta.tv_usec / 1000;
+			}
+		}
 
 		timeout.tv_sec  = delta.tv_sec;
 		timeout.tv_nsec = delta.tv_usec * 1000;
-		to_ptr = &timeout;
+	}
+	else {
+		delta_ms = MAX_DELAY_MS;
+		timeout.tv_sec  = MAX_DELAY_MS / 1000;
+		timeout.tv_nsec = (MAX_DELAY_MS % 1000) * 1000000;
 	}
 
 	fd = MIN(maxfd, global.tune.maxpollevents);
@@ -129,8 +145,8 @@
 			0,         // int nchanges
 			kev,       // struct kevent *eventlist
 			fd,        // int nevents
-			to_ptr);   // const struct timespec *timeout
-	tv_now_mono(&now, &date);
+			&timeout); // const struct timespec *timeout
+	tv_update_date(delta_ms, status);
 
 	for (count = 0; count < status; count++) {
 		fd = kev[count].ident;

diff --git a/src/ev_poll.c b/src/ev_poll.c
index bfbe999..a0355aa 100644
--- a/src/ev_poll.c
+++ b/src/ev_poll.c

@@ -127,14 +127,17 @@
 	if (run_queue)
 		wait_time = 0;
 	else if (tv_iseternity(exp))
-		wait_time = -1;
+		wait_time = MAX_DELAY_MS;
 	else if (tv_isge(&now, exp))
 		wait_time = 0;
-	else
+	else {
 		wait_time = __tv_ms_elapsed(&now, exp) + 1;
+		if (wait_time > MAX_DELAY_MS)
+			wait_time = MAX_DELAY_MS;
+	}
 
 	status = poll(poll_events, nbfd, wait_time);
-	tv_now_mono(&now, &date);
+	tv_update_date(wait_time, status);
 
 	for (count = 0; status > 0 && count < nbfd; count++) {
 		fd = poll_events[count].fd;

diff --git a/src/ev_select.c b/src/ev_select.c
index 25bd3ec..30df928 100644
--- a/src/ev_select.c
+++ b/src/ev_select.c

@@ -80,17 +80,27 @@
  */
 REGPRM2 static void _do_poll(struct poller *p, struct timeval *exp)
 {
+	const struct timeval max_delay = {
+		.tv_sec  = MAX_DELAY_MS / 1000,
+		.tv_usec = (MAX_DELAY_MS % 1000) * 1000
+	};
 	int status;
 	int fd, i;
 	struct timeval delta;
+	int delta_ms;
 	int readnotnull, writenotnull;
 	int fds;
 	char count;
 		
 	/* allow select to return immediately when needed */
 	delta.tv_sec = delta.tv_usec = 0;
-	if (!run_queue && tv_isset(exp)) {
-		if (tv_islt(&now, exp)) {
+	delta_ms = 0;
+	if (!run_queue) {
+		if (!tv_isset(exp)) {
+			delta = max_delay;
+			delta_ms = MAX_DELAY_MS;
+		}
+		else if (tv_islt(&now, exp)) {
 			tv_remain(&now, exp, &delta);
 			/* To avoid eventual select loops due to timer precision */
 			delta.tv_usec += SCHEDULER_RESOLUTION * 1000;
@@ -98,6 +108,12 @@
 				delta.tv_usec -= 1000000;
 				delta.tv_sec ++;
 			}
+			if (__tv_isge(&delta, &max_delay)) {
+				delta = max_delay;
+				delta_ms = MAX_DELAY_MS;
+			} else {
+				delta_ms = delta.tv_sec * 1000 + delta.tv_usec / 1000;
+			}
 		}
 	}
 
@@ -122,9 +138,9 @@
 			readnotnull ? tmp_evts[DIR_RD] : NULL,
 			writenotnull ? tmp_evts[DIR_WR] : NULL,
 			NULL,
-			tv_isset(exp) ? &delta : NULL);
+			&delta);
       
-	tv_now_mono(&now, &date);
+	tv_update_date(delta_ms, status);
 
 	if (status <= 0)
 		return;

diff --git a/src/ev_sepoll.c b/src/ev_sepoll.c
index ed2103c..f42a97f 100644
--- a/src/ev_sepoll.c
+++ b/src/ev_sepoll.c

@@ -418,7 +418,7 @@
 		 * returning now without checking epoll_wait().
 		 */
 		if (++last_skipped <= 1) {
-			tv_now_mono(&now, &date);
+			tv_update_date(0, 1);
 			return;
 		}
 	}
@@ -435,11 +435,14 @@
 	}
 	else {
 		if (tv_iseternity(exp))
-			wait_time = -1;
+			wait_time = MAX_DELAY_MS;
 		else if (tv_isge(&now, exp))
 			wait_time = 0;
-		else
+		else {
 			wait_time = __tv_ms_elapsed(&now, exp) + 1;
+			if (wait_time > MAX_DELAY_MS)
+				wait_time = MAX_DELAY_MS;
+		}
 	}
 
 	/* now let's wait for real events. We normally use maxpollevents as a
@@ -451,8 +454,7 @@
 	fd = MIN(maxfd, fd);
 	spec_processed = 0;
 	status = epoll_wait(epoll_fd, epoll_events, fd, wait_time);
-
-	tv_now_mono(&now, &date);
+	tv_update_date(wait_time, status);
 
 	for (count = 0; count < status; count++) {
 		int e = epoll_events[count].events;

diff --git a/src/haproxy.c b/src/haproxy.c
index f10e47d..45e4852 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c

@@ -415,7 +415,7 @@
 	global.rlimit_memmax = HAPROXY_MEMMAX;
 #endif
 
-	tv_now_mono(&now, &date);
+	tv_update_date(-1,-1);
 	start_date = now;
 
 	init_task();
@@ -897,7 +897,7 @@
 {
 	struct timeval next;
 
-	tv_now_mono(&now, &date);
+	tv_update_date(0,1);
 	while (1) {
 		process_runnable_tasks(&next);
 

diff --git a/src/proxy.c b/src/proxy.c
index 16804f9..a7b4efc 100644
--- a/src/proxy.c
+++ b/src/proxy.c

@@ -385,7 +385,7 @@
 
 	stopping = 1;
 	p = proxy;
-	tv_now_mono(&now, &date); /* else, the old time before select will be used */
+	tv_update_date(0,1); /* else, the old time before select will be used */
 	while (p) {
 		if (p->state != PR_STSTOPPED) {
 			Warning("Stopping proxy %s in %d ms.\n", p->id, p->grace);
@@ -434,7 +434,7 @@
 
 	err = 0;
 	p = proxy;
-	tv_now_mono(&now, &date); /* else, the old time before select will be used */
+	tv_update_date(0,1); /* else, the old time before select will be used */
 	while (p) {
 		if (p->state != PR_STERROR &&
 		    p->state != PR_STSTOPPED &&
@@ -469,7 +469,7 @@
 	struct listener *l;
 
 	p = proxy;
-	tv_now_mono(&now, &date); /* else, the old time before select will be used */
+	tv_update_date(0,1); /* else, the old time before select will be used */
 	while (p) {
 		if (p->state == PR_STPAUSED) {
 			Warning("Enabling proxy %s.\n", p->id);

diff --git a/src/time.c b/src/time.c
index ccb30b2..f637f6c 100644
--- a/src/time.c
+++ b/src/time.c

@@ -143,25 +143,56 @@
 	return __tv_isgt(tv1, tv2);
 }
 
-/* tv_now_mono: sets <date> to the current time (wall clock), <mono> to a value
- * following a monotonic function, and applies any required correction if the
- * time goes backwards. Note that while we could improve it a bit by checking
- * that the new date is not too far in the future, it is not much necessary to
- * do so. 
+/* tv_update_date: sets <date> to system time, and sets <now> to something as
+ * close as possible to real time, following a monotonic function. The main
+ * principle consists in detecting backwards and forwards time jumps and adjusting
+ * an offset to correct them. This function should be called once after each
+ * poll, and never farther apart than MAX_DELAY_MS*2. The poll's timeout should
+ * be passed in <max_wait>, and the return value in <interrupted> (a non-zero
+ * value means that we have not expired the timeout). Calling it with (-1,*)
+ * sets both <date> and <now> to current date, and calling it with (0,1) simply
+ * updates the values.
  */
-REGPRM2 struct timeval *tv_now_mono(struct timeval *mono, struct timeval *wall)
+REGPRM2 void tv_update_date(int max_wait, int interrupted)
 {
-	static struct timeval tv_offset;
-	struct timeval adjusted;
+	static struct timeval tv_offset; /* warning: signed offset! */
+	struct timeval adjusted, deadline;
 
-	gettimeofday(wall, NULL);
-	__tv_add(&adjusted, wall, &tv_offset);
-	if (unlikely(__tv_islt(&adjusted, mono))) {
-		__tv_remain(wall, mono, &tv_offset);
-		return mono;
+	gettimeofday(&date, NULL);
+	if (unlikely(max_wait < 0)) {
+		tv_zero(&tv_offset);
+		now = date;
+		return;
+	}
+	__tv_add(&adjusted, &date, &tv_offset);
+	if (unlikely(__tv_islt(&adjusted, &now))) {
+		goto fixup; /* jump in the past */
+	}
+
+	/* OK we did not jump backwards, let's see if we have jumped too far
+	 * forwards. The poll value was in <max_wait>, we accept that plus
+	 * MAX_DELAY_MS to cover additional time.
+	 */
+	_tv_ms_add(&deadline, &now, max_wait + MAX_DELAY_MS);
+	if (unlikely(__tv_isge(&adjusted, &deadline))) {
+		goto fixup; /* jump in the future */
+	}
+	now = adjusted;
+	return;
+ fixup:
+	/* Large jump. If the poll was interrupted, we consider that the date
+	 * has not changed (immediate wake-up), otherwise we add the poll
+	 * time-out to the previous date. The new offset is recomputed.
+	 */
+	if (!interrupted)
+		_tv_ms_add(&now, &now, max_wait);
+	tv_offset.tv_sec  = now.tv_sec  - date.tv_sec;
+	tv_offset.tv_usec = now.tv_usec - date.tv_usec;
+	if (tv_offset.tv_usec < 0) {
+		tv_offset.tv_usec += 1000000;
+		tv_offset.tv_sec--;
 	}
-	*mono = adjusted;
-	return mono;
+	return;
 }
 
 char *human_time(int t, short hz_div) {
commit	b0b37bcd657142d942e7ee8b1c7f1d3651db2d29	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Mon Jun 23 14:00:57 2008 +0200
committer	Willy Tarreau <w@1wt.eu>	Mon Jun 23 14:00:57 2008 +0200
tree	0755226a6ca35d5c04b9ecc6b415efd79aadaba4
parent	b7f694f20e2eeb8f9020955bc64b0a55e9d33f50 [diff]