OPTIM: tools: optimize my_ffsl() for x86_64
This call is now used quite a bit in the fd cache, to decide which cache
to add/remove the fd to/from, when waking up a task for a single thread
in __task_wakeup(), in fd_cant_recv() and in fd_process_cached_events(),
and we can replace it with a single instruction, removing ~30 instructions
and ~80 bytes from the inner loop of some of these functions.
In addition, the test for a zero value was replaced with a comment stating
that calling the function with zero is illegal and leads to undefined
behaviour. No caller in the code relies on the zero case today.
diff --git a/include/common/standard.h b/include/common/standard.h
index 5c7d152..fabe972 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h
@@ -802,13 +802,16 @@
}
/* Simple ffs implementation. It returns the position of the lowest bit set to
- * one. */
+ * one. It is illegal to call it with a==0 (undefined result).
+ */
static inline unsigned int my_ffsl(unsigned long a)
{
- unsigned int cnt;
+ unsigned long cnt;
- if (!a)
- return 0;
+#if defined(__x86_64__)
+ __asm__("bsf %1,%0\n" : "=r" (cnt) : "rm" (a));
+ cnt++;
+#else
cnt = 1;
#if LONG_MAX > 0x7FFFFFFFL /* 64bits */
@@ -837,6 +840,7 @@
a >>= 1;
cnt += 1;
}
+#endif /* x86_64 */
return cnt;
}