MINOR: tools: improve word fingerprinting by counting presence The distance between two words can be high due to a sub-word being missing and in this case it happens that other totally unrelated words are proposed because their average score looks lower thanks to being shorter. Here we're introducing the notion of presence of each character so that word sequences that contain existing sub-words are favored against the shorter ones having nothing in common. In addition we do not distinguish begin/end from a regular delimiter anymore. That made it harder to spot inverted words.

commit: 9294e8822f6f359ea9dba6f19b97fb75a72c433e [log] [tgz]
author: Willy Tarreau <w@1wt.eu> Mon Mar 15 09:34:27 2021 +0100
committer: Willy Tarreau <w@1wt.eu> Mon Mar 15 09:38:42 2021 +0100
tree: a634cd557ccf951d6858b1583167362aed4308d8
parent: 101df31503e7bef59cd6096cd9eb2d708de7471b [diff]
diff --git a/include/haproxy/tools.h b/include/haproxy/tools.h
index 901dca0..3121fea 100644
--- a/include/haproxy/tools.h
+++ b/include/haproxy/tools.h

@@ -1077,28 +1077,30 @@
  * is zero, it's assumed that <curr> is the first character. If <curr> is zero
  * its assumed to mark the end. Both may be zero. <fp> is a 1024-entries array
  * indexed as 32*from+to. Positions for 'from' and 'to' are:
- *   0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
+ *   1..26=letter, 27=digit, 28=other/begin/end.
+ * Row "from=0" is used to mark the character's presence. Others unused.
  */
 static inline void update_char_fingerprint(uint8_t *fp, char prev, char curr)
 {
 	int from, to;
 
 	switch (prev) {
-	case 0:         from = 26; break; // begin
-	case 'a'...'z': from = prev - 'a'; break;
-	case 'A'...'Z': from = tolower(prev) - 'a'; break;
-	case '0'...'9': from = 26; break;
-	default:        from = 27; break;
+	case 0:         from = 28; break; // begin
+	case 'a'...'z': from = prev - 'a' + 1; break;
+	case 'A'...'Z': from = tolower(prev) - 'a' + 1; break;
+	case '0'...'9': from = 27; break;
+	default:        from = 28; break;
 	}
 
 	switch (curr) {
 	case 0:         to = 28; break; // end
-	case 'a'...'z': to = curr - 'a'; break;
-	case 'A'...'Z': to = tolower(curr) - 'a'; break;
-	case '0'...'9': to = 26; break;
-	default:        to = 27; break;
+	case 'a'...'z': to = curr - 'a' + 1; break;
+	case 'A'...'Z': to = tolower(curr) - 'a' + 1; break;
+	case '0'...'9': to = 27; break;
+	default:        to = 28; break;
 	}
-
+	if (curr)
+		fp[to] = 1;
 	fp[32 * from + to]++;
 }
 

diff --git a/src/tools.c b/src/tools.c
index 1255e74..ffd167a 100644
--- a/src/tools.c
+++ b/src/tools.c

@@ -5372,7 +5372,8 @@
 /* Update array <fp> with the fingerprint of word <word> by counting the
  * transitions between characters. <fp> is a 1024-entries array indexed as
  * 32*from+to. Positions for 'from' and 'to' are:
- *   0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
+ *   1..26=letter, 27=digit, 28=other/begin/end.
+ * Row "from=0" is used to mark the character's presence. Others unused.
  */
 void update_word_fingerprint(uint8_t *fp, const char *word)
 {
@@ -5384,11 +5385,12 @@
 	for (p = word; *p; p++) {
 		c = tolower(*p);
 		switch(c) {
-		case 'a'...'z': to = c - 'a'; break;
-		case 'A'...'Z': to = tolower(c) - 'a'; break;
-		case '0'...'9': to = 26; break;
-		default: to = 27; break;
+		case 'a'...'z': to = c - 'a' + 1; break;
+		case 'A'...'Z': to = tolower(c) - 'a' + 1; break;
+		case '0'...'9': to = 27; break;
+		default:        to = 28; break;
 		}
+		fp[to] = 1;
 		fp[32 * from + to]++;
 		from = to;
 	}
commit	9294e8822f6f359ea9dba6f19b97fb75a72c433e	[log] [tgz]
author	Willy Tarreau <w@1wt.eu>	Mon Mar 15 09:34:27 2021 +0100
committer	Willy Tarreau <w@1wt.eu>	Mon Mar 15 09:38:42 2021 +0100
tree	a634cd557ccf951d6858b1583167362aed4308d8
parent	101df31503e7bef59cd6096cd9eb2d708de7471b [diff]