MINOR: tools: add the ability to update a word fingerprint
Instead of making a new one from scratch, let's support updating the
existing fingerprint without wiping it, and doing the same char by char.
The word-by-word one will still result in multiple beginnings and ends,
but that will accurately translate word boundaries. The char-based one
has more flexibility and requires that the caller maintains the previous
char to indicate the transition, which also makes it possible to insert
delimiters, for example.
diff --git a/src/tools.c b/src/tools.c
index 0fd3ede..1255e74 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -5369,18 +5369,17 @@
return pos - shift;
}
-/* Initialize array <fp> with the fingerprint of word <word> by counting the
+/* Update array <fp> with the fingerprint of word <word> by counting the
* transitions between characters. <fp> is a 1024-entries array indexed as
* 32*from+to. Positions for 'from' and 'to' are:
* 0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
*/
-void make_word_fingerprint(uint8_t *fp, const char *word)
+void update_word_fingerprint(uint8_t *fp, const char *word)
{
const char *p;
int from, to;
int c;
- memset(fp, 0, 1024);
from = 28; // begin
for (p = word; *p; p++) {
c = tolower(*p);
@@ -5397,6 +5396,17 @@
fp[32 * from + to]++;
}
+/* Initialize array <fp> with the fingerprint of word <word> by counting the
+ * transitions between characters. <fp> is a 1024-entries array indexed as
+ * 32*from+to. Positions for 'from' and 'to' are:
+ * 0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
+ */
+void make_word_fingerprint(uint8_t *fp, const char *word)
+{
+ memset(fp, 0, 1024);
+ update_word_fingerprint(fp, word);
+}
+
/* Return the distance between two word fingerprints created by function
* make_word_fingerprint(). It's a positive integer calculated as the sum of
* the squares of the differences between each location.