MINOR: tools: make my_htonll() more efficient on x86_64

The current construct was written while developing on a 32-bit machine.
Replacing a single bswap operation with 2 bswaps, 2 shifts and 2 ORs
is quite a waste of precious cycles... Let's provide a trivial
asm-based implementation for x86_64.
diff --git a/include/common/standard.h b/include/common/standard.h
index 74dc041..2645c44 100644
--- a/include/common/standard.h
+++ b/include/common/standard.h
@@ -1201,6 +1201,10 @@
  */
 static inline unsigned long long my_htonll(unsigned long long a)
 {
+#if defined(__x86_64__)
+	__asm__ volatile("bswap %0" : "+r"(a)); /* "+r": <a> is read, then overwritten in place */
+	return a;
+#else
 	union {
 		struct {
 			unsigned int w1;
@@ -1209,6 +1213,7 @@
 		unsigned long long by64;
 	} w = { .by64 = a };
 	return ((unsigned long long)htonl(w.by32.w1) << 32) | htonl(w.by32.w2);
+#endif
 }
 
 /* Turns 64-bit value <a> from network byte order to host byte order. */