OPTIM: tools: optimize my_ffsl() for x86_64

This call is now used quite a bit in the fd cache, to decide which cache
to add/remove the fd to/from, when waking up a task for a single thread
in __task_wakeup(), in fd_cant_recv() and in fd_process_cached_events().
On x86_64 it can be replaced with a single instruction, which removes
~30 instructions and ~80 bytes from the inner loop of some of these
functions.

In addition, the test for a zero value was replaced with a comment saying
that calling the function with zero is illegal and leads to undefined
behaviour. The code does not rely on this useless case today.
diff --git a/include/common/standard.h b/include/common/standard.h
index 5c7d152..fabe972 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h
@@ -802,13 +802,16 @@
 }
 
 /* Simple ffs implementation. It returns the position of the lowest bit set to
- * one. */
+ * one, starting at 1. It is illegal to call it with a==0 (undefined result).
+ */
 static inline unsigned int my_ffsl(unsigned long a)
 {
-	unsigned int cnt;
+	unsigned long cnt;
 
-	if (!a)
-		return 0;
+#if defined(__x86_64__)
+	__asm__("bsf %1,%0\n" : "=r" (cnt) : "rm" (a));
+	cnt++;
+#else
 
 	cnt = 1;
 #if LONG_MAX > 0x7FFFFFFFL /* 64bits */
@@ -837,6 +840,7 @@
 		a >>= 1;
 		cnt += 1;
 	}
+#endif /* x86_64 */
 
 	return cnt;
 }