[OPTIM] halog: speed up fgets2-64 by about 10%
This version uses more 64-bit lookups and two 32-bit lookups
to converge faster. This improves performance by about 10%.
diff --git a/contrib/halog/fgets2-64.c b/contrib/halog/fgets2-64.c
index 9209c08..236b970 100644
--- a/contrib/halog/fgets2-64.c
+++ b/contrib/halog/fgets2-64.c
@@ -62,7 +62,7 @@
#define FGETS2_BUFSIZE (256*1024)
const char *fgets2(FILE *stream)
{
- static char buffer[FGETS2_BUFSIZE + 5];
+ static char buffer[FGETS2_BUFSIZE + 9]; // +9 to have zeroes past the end
static char *end = buffer;
static char *line = buffer;
@@ -72,15 +72,32 @@
next = line;
while (1) {
- /* this is a speed-up, we read 32 bits at once and check for an
+ /* this is a speed-up, we read 64 bits at once and check for an
* LF character there. We stop if found then continue one at a
* time.
*/
- while (next < end && (((unsigned long)next) & 7) && *next != '\n')
- next++;
- /* now next is multiple of 4 or equal to end */
- while (next <= (end-32)) {
+ if (next <= (end-12)) {
+ /* max 3 bytes tested here */
+ while ((((unsigned long)next) & 3) && *next != '\n')
+ next++;
+
+ /* maybe we can skip 4 more bytes */
+ if ((((unsigned long)next) & 4) && !has_zero(*(unsigned int *)next ^ 0x0A0A0A0AU))
+ next += 4;
+ }
+
+ /* now next is multiple of 8 or equal to end */
+ while (next <= (end-68)) {
+ if (has_zero64(*(unsigned long long *)next ^ 0x0A0A0A0A0A0A0A0AULL))
+ break;
+ next += 8;
+ if (has_zero64(*(unsigned long long *)next ^ 0x0A0A0A0A0A0A0A0AULL))
+ break;
+ next += 8;
+ if (has_zero64(*(unsigned long long *)next ^ 0x0A0A0A0A0A0A0A0AULL))
+ break;
+ next += 8;
if (has_zero64(*(unsigned long long *)next ^ 0x0A0A0A0A0A0A0A0AULL))
break;
next += 8;
@@ -93,8 +110,15 @@
if (has_zero64(*(unsigned long long *)next ^ 0x0A0A0A0A0A0A0A0AULL))
break;
next += 8;
+ if (has_zero64(*(unsigned long long *)next ^ 0x0A0A0A0A0A0A0A0AULL))
+ break;
+ next += 8;
}
+ /* maybe we can skip 4 more bytes */
+ if (!has_zero(*(unsigned int *)next ^ 0x0A0A0A0AU))
+ next += 4;
+
/* we finish if needed. Note that next might be slightly higher
* than end here because we might have gone past it above.
*/