BUG/MEDIUM: threads/time: maintain a common time reference between all threads During high loads it becomes visible that the time drifts between threads, sometimes showing tens of seconds after several minutes. The root cause is the per-thread correction which is performed based on a local offset and local time. But we can't use a unique global time either as we need the thread-local time to be stable between two poll() calls. This commit takes a stab at this problem by proceeding this way : - a global "global_now" date is monotonous and common between all threads. - each thread has its own local <now> which is resynced with <global_now> on each invocation of tv_update_date() - each thread detects its own drift based on its poll() timeout and its local <now>, and recalculates its adjusted local time - each thread then ensures its new local time is no older than the current global time, otherwise it readjusts its local time to match this one - finally threads do atomically update the global time to match its own local one This guarantees a monotonous global time and a monotonous+stable local time. It is still possible by definition for two threads to report a minor time variation on subsequent events but that variation will only be caused by the moment they watched the time and are very small. When a common global time is needed between all threads, global_now could be used as a reference (with care). The wallclock time used in logs is still <date> anyway.

commit: 9fefc51c569b692a09229182a76fae78883b1cd8 [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Thu Nov 23 14:52:28 2017 +0100
committer: Willy Tarreau <w@1wt.eu> Thu Nov 23 16:32:32 2017 +0100
tree: 5cd5c567da7275f959ea4babd21be7cbd1f2b63a
parent: 7649aacf7f17410b88f258b2b2eddea9e8445922 [diff]
diff --git a/src/time.c b/src/time.c
index bcd6a72..789b47c 100644
--- a/src/time.c
+++ b/src/time.c

@@ -28,6 +28,9 @@
 THREAD_LOCAL struct timeval before_poll;     /* system date before calling poll() */
 THREAD_LOCAL struct timeval after_poll;      /* system date after leaving poll() */
 
+static THREAD_LOCAL struct timeval tv_offset;  /* per-thread time ofsset relative to global time */
+volatile unsigned long long global_now;        /* common date between all threads (32:32) */
+
 /*
  * adds <ms> ms to <from>, set the result to <tv> and returns a pointer <tv>
  */
@@ -170,55 +173,70 @@
  */
 REGPRM2 void tv_update_date(int max_wait, int interrupted)
 {
-	volatile static long long offset = 0;
-	struct timeval adjusted, deadline;
+	struct timeval adjusted, deadline, tmp_now;
 	unsigned int   curr_sec_ms;     /* millisecond of current second (0..999) */
-	long long new_ofs;
+	unsigned long long old_now;
+	unsigned long long new_now;
 
 	gettimeofday(&date, NULL);
 	if (unlikely(max_wait < 0)) {
-		HA_ATOMIC_STORE(&offset, 0);
+		tv_zero(&tv_offset);
 		adjusted = date;
 		after_poll = date;
 		samp_time = idle_time = 0;
 		idle_pct = 100;
+		global_now = (((unsigned long long)adjusted.tv_sec) << 32) +
+		             (unsigned int)adjusted.tv_usec;
 		goto to_ms;
 	}
 
-	new_ofs = offset;
+	__tv_add(&adjusted, &date, &tv_offset);
 
-	adjusted.tv_sec  = date.tv_sec  + (int)(new_ofs >> 32);
-	adjusted.tv_usec = date.tv_usec + (int)(new_ofs & 0xFFFFFFFFU);
-	if (adjusted.tv_usec > 999999) {
-		adjusted.tv_usec -= 1000000;
-		adjusted.tv_sec  += 1;
-	}
+	/* compute the minimum and maximum local date we may have reached based
+	 * on our past date and the associated timeout.
+	 */
+	_tv_ms_add(&deadline, &now, max_wait + MAX_DELAY_MS);
 
-	if (unlikely(__tv_islt(&adjusted, &now))) {
-		goto fixup; /* jump in the past */
+	if (unlikely(__tv_islt(&adjusted, &now) || __tv_islt(&deadline, &adjusted))) {
+		/* Large jump. If the poll was interrupted, we consider that the
+		 * date has not changed (immediate wake-up), otherwise we add
+		 * the poll time-out to the previous date. The new offset is
+		 * recomputed.
+		 */
+		_tv_ms_add(&adjusted, &now, interrupted ? 0 : max_wait);
 	}
 
-	/* OK we did not jump backwards, let's see if we have jumped too far
-	 * forwards. The poll value was in <max_wait>, we accept that plus
-	 * MAX_DELAY_MS to cover additional time.
-	 */
-	_tv_ms_add(&deadline, &now, max_wait + MAX_DELAY_MS);
-	if (likely(__tv_islt(&adjusted, &deadline)))
-		goto to_ms; /* OK time is within expected range */
- fixup:
-	/* Large jump. If the poll was interrupted, we consider that the date
-	 * has not changed (immediate wake-up), otherwise we add the poll
-	 * time-out to the previous date. The new offset is recomputed.
+	/* now that we have bounded the local time, let's check if it's
+	 * realistic regarding the global date, which only moves forward,
+	 * otherwise catch up.
 	 */
-	_tv_ms_add(&adjusted, &now, interrupted ? 0 : max_wait);
+	old_now = global_now;
 
-	new_ofs = (((long)(adjusted.tv_sec  - date.tv_sec)) << 32) +
-	          (unsigned int)(adjusted.tv_usec - date.tv_usec);
+	do {
+		tmp_now.tv_sec  = (unsigned int)(old_now >> 32);
+		tmp_now.tv_usec = old_now & 0xFFFFFFFFU;
 
-	if ((int)(new_ofs & 0xFFFFFFFFU) < 0)
-		new_ofs = new_ofs + 1000000 - 0x100000000UL;
+		if (__tv_islt(&adjusted, &tmp_now))
+			adjusted = tmp_now;
+
+		/* now <adjusted> is expected to be the most accurate date,
+		 * equal to <global_now> or newer.
+		 */
+		new_now = (((unsigned long long)adjusted.tv_sec) << 32) + (unsigned int)adjusted.tv_usec;
+
+		/* let's try to update the global <now> or loop again */
+	} while (!HA_ATOMIC_CAS(&global_now, &old_now, new_now));
+
+	/* the new global date when we looked was old_now, and the new one is
+	 * new_now == adjusted. We can recompute our local offset.
+	 */
+	tv_offset.tv_sec  = adjusted.tv_sec  - date.tv_sec;
+	tv_offset.tv_usec = adjusted.tv_usec - date.tv_usec;
+	if (tv_offset.tv_usec < 0) {
+		tv_offset.tv_usec += 1000000;
+		tv_offset.tv_sec--;
+	}
 
-	HA_ATOMIC_STORE(&offset, new_ofs);
  to_ms:
 	now = adjusted;
 	curr_sec_ms = now.tv_usec / 1000;            /* ms of current second */
commit	9fefc51c569b692a09229182a76fae78883b1cd8	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Thu Nov 23 14:52:28 2017 +0100
committer	Willy Tarreau <w@1wt.eu>	Thu Nov 23 16:32:32 2017 +0100
tree	5cd5c567da7275f959ea4babd21be7cbd1f2b63a
parent	7649aacf7f17410b88f258b2b2eddea9e8445922 [diff]