MAJOR: pattern: add LRU-based cache on pattern matching
The principle of this cache is to have a global cache for all pattern
matching operations which rely on lists (reg, sub, dir, dom, ...). The
input data, the expression and a random seed are used as a hashing key.
The cached entries contains a pointer to the expression and a revision
number for that expression so that we don't accidently used obsolete
data after a pattern update or a very unlikely hash collision.
Regarding the risk of collisions, 10k entries at 10k req/s mean 1% risk
of a collision after 60 years, that's already much less than the memory's
reliability in most machines and more durable than most admin's life
expectancy. A collision will result in a valid result to be returned
for a different entry from the same list. If this is not acceptable,
the cache can be disabled using tune.pattern.cache-size.
A test on a file containing 10k small regex showed that the regex
matching was limited to 6k/s instead of 70k with regular strings.
When enabling the LRU cache, the performance was back to 70k/s.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index b6b3f08..f72339a 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -493,6 +493,7 @@
- tune.maxaccept
- tune.maxpollevents
- tune.maxrewrite
+ - tune.pattern.cache-size
- tune.pipesize
- tune.rcvbuf.client
- tune.rcvbuf.server
@@ -1050,6 +1051,25 @@
larger than that. This means you don't have to worry about it when changing
bufsize.
+tune.pattern.cache-size <number>
+ Sets the size of the pattern lookup cache to <number> entries. This is an LRU
+ cache which reminds previous lookups and their results. It is used by ACLs
+ and maps on slow pattern lookups, namely the ones using the "sub", "reg",
+ "dir", "dom", "end", "bin" match methods as well as the case-insensitive
+ strings. It applies to pattern expressions which means that it will be able
+ to memorize the result of a lookup among all the patterns specified on a
+ configuration line (including all those loaded from files). It automatically
+ invalidates entries which are updated using HTTP actions or on the CLI. The
+ default cache size is set to 10000 entries, which limits its footprint to
+ about 5 MB on 32-bit systems and 8 MB on 64-bit systems. There is a very low
+ risk of collision in this cache, which is in the order of the size of the
+ cache divided by 2^64. Typically, at 10000 requests per second with the
+ default cache size of 10000 entries, there's 1% chance that a brute force
+ attack could cause a single collision after 60 years, or 0.1% after 6 years.
+ This is considered much lower than the risk of a memory corruption caused by
+ aging components. If this is not acceptable, the cache can be disabled by
+ setting this parameter to 0.
+
tune.pipesize <number>
Sets the kernel pipe buffer size to this size (in bytes). By default, pipes
are the default size for the system. But sometimes when using TCP splicing,
diff --git a/include/common/defaults.h b/include/common/defaults.h
index 63b2b89..6193bdc 100644
--- a/include/common/defaults.h
+++ b/include/common/defaults.h
@@ -295,4 +295,15 @@
#ifndef TLS_TICKETS_NO
#define TLS_TICKETS_NO 3
#endif
+
+/* pattern lookup default cache size, in number of entries :
+ * 10k entries at 10k req/s mean 1% risk of a collision after 60 years, that's
+ * already much less than the memory's reliability in most machines and more
+ * durable than most admin's life expectancy. A collision will result in a
+ * valid result to be returned for a different entry from the same list.
+ */
+#ifndef DEFAULT_PAT_LRU_SIZE
+#define DEFAULT_PAT_LRU_SIZE 10000
+#endif
+
#endif /* _COMMON_DEFAULTS_H */
diff --git a/include/types/global.h b/include/types/global.h
index afd7aef..7533bb0 100644
--- a/include/types/global.h
+++ b/include/types/global.h
@@ -142,6 +142,7 @@
int pipesize; /* pipe size in bytes, system defaults if zero */
int max_http_hdr; /* max number of HTTP headers, use MAX_HTTP_HDR if zero */
int cookie_len; /* max length of cookie captures */
+ int pattern_cache; /* max number of entries in the pattern cache. */
int sslcachesize; /* SSL cache size in session, defaults to 20000 */
#ifdef USE_OPENSSL
int sslprivatecache; /* Force to use a private session cache even if nbproc > 1 */
diff --git a/src/cfgparse.c b/src/cfgparse.c
index edaee56..4f75256 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -907,6 +907,22 @@
goto out;
}
}
+ else if (!strcmp(args[0], "tune.pattern.cache-size")) {
+ if (*args[1]) {
+ global.tune.pattern_cache = atoi(args[1]);
+ if (global.tune.pattern_cache < 0) {
+ Alert("parsing [%s:%d] : '%s' expects a positive numeric value\n",
+ file, linenum, args[0]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ } else {
+ Alert("parsing [%s:%d] : '%s' expects a positive numeric value\n",
+ file, linenum, args[0]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ }
else if (!strcmp(args[0], "uid")) {
if (global.uid != 0) {
Alert("parsing [%s:%d] : user/uid already specified. Continuing.\n", file, linenum);
diff --git a/src/haproxy.c b/src/haproxy.c
index 474179c..5822a8b 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -146,6 +146,7 @@
.maxrewrite = MAXREWRITE,
.chksize = BUFSIZE,
.reserved_bufs = RESERVED_BUFS,
+ .pattern_cache = DEFAULT_PAT_LRU_SIZE,
#ifdef USE_OPENSSL
.sslcachesize = SSLCACHESIZE,
.ssl_default_dh_param = SSL_DEFAULT_DH_PARAM,
diff --git a/src/pattern.c b/src/pattern.c
index ebae85d..cbfa20d 100644
--- a/src/pattern.c
+++ b/src/pattern.c
@@ -24,6 +24,8 @@
#include <proto/sample.h>
#include <ebsttree.h>
+#include <import/lru.h>
+#include <import/xxhash.h>
char *pat_match_names[PAT_MATCH_NUM] = {
[PAT_MATCH_FOUND] = "found",
@@ -144,6 +146,9 @@
/* This is the root of the list of all pattern_ref avalaibles. */
struct list pattern_reference = LIST_HEAD_INIT(pattern_reference);
+static struct lru64_head *pat_lru_tree;
+static unsigned long long pat_lru_seed;
+
/*
*
* The following functions are not exported and are used by internals process
@@ -443,6 +448,8 @@
struct pattern_tree *elt;
struct pattern_list *lst;
struct pattern *pattern;
+ struct pattern *ret = NULL;
+ struct lru64 *lru = NULL;
/* Lookup a string in the expression's pattern tree. */
if (!eb_is_empty(&expr->pattern_tree)) {
@@ -468,6 +475,15 @@
}
/* look in the list */
+ if (pat_lru_tree) {
+ unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
+
+ lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
+ pat_lru_tree, expr, expr->revision);
+ if (lru && lru->domain)
+ return lru->data;
+ }
+
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@@ -476,11 +492,16 @@
icase = expr->mflags & PAT_MF_IGNORE_CASE;
if ((icase && strncasecmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0) ||
- (!icase && strncmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0))
- return pattern;
+ (!icase && strncmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0)) {
+ ret = pattern;
+ break;
+ }
}
- return NULL;
+ if (lru)
+ lru64_commit(lru, ret, expr, expr->revision);
+
+ return ret;
}
/* NB: For two binaries buf to be identical, it is required that their lengths match */
@@ -488,19 +509,34 @@
{
struct pattern_list *lst;
struct pattern *pattern;
+ struct pattern *ret = NULL;
+ struct lru64 *lru = NULL;
+
+ if (pat_lru_tree) {
+ unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
- /* Look in the list. */
+ lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
+ pat_lru_tree, expr, expr->revision);
+ if (lru && lru->domain)
+ return lru->data;
+ }
+
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
if (pattern->len != smp->data.str.len)
continue;
- if (memcmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0)
- return pattern;
+ if (memcmp(pattern->ptr.str, smp->data.str.str, smp->data.str.len) == 0) {
+ ret = pattern;
+ break;
+ }
}
- return NULL;
+ if (lru)
+ lru64_commit(lru, ret, expr, expr->revision);
+
+ return ret;
}
/* Executes a regex. It temporarily changes the data to add a trailing zero,
@@ -510,15 +546,31 @@
{
struct pattern_list *lst;
struct pattern *pattern;
+ struct pattern *ret = NULL;
+ struct lru64 *lru = NULL;
- /* look in the list */
+ if (pat_lru_tree) {
+ unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
+
+ lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
+ pat_lru_tree, expr, expr->revision);
+ if (lru && lru->domain)
+ return lru->data;
+ }
+
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
- if (regex_exec2(pattern->ptr.reg, smp->data.str.str, smp->data.str.len))
- return pattern;
+ if (regex_exec2(pattern->ptr.reg, smp->data.str.str, smp->data.str.len)) {
+ ret = pattern;
+ break;
+ }
}
- return NULL;
+
+ if (lru)
+ lru64_commit(lru, ret, expr, expr->revision);
+
+ return ret;
}
/* Checks that the pattern matches the beginning of the tested string. */
@@ -530,6 +582,8 @@
struct pattern_tree *elt;
struct pattern_list *lst;
struct pattern *pattern;
+ struct pattern *ret = NULL;
+ struct lru64 *lru = NULL;
/* Lookup a string in the expression's pattern tree. */
if (!eb_is_empty(&expr->pattern_tree)) {
@@ -555,6 +609,15 @@
}
/* look in the list */
+ if (pat_lru_tree) {
+ unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
+
+ lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
+ pat_lru_tree, expr, expr->revision);
+ if (lru && lru->domain)
+ return lru->data;
+ }
+
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@@ -566,9 +629,14 @@
(!icase && strncmp(pattern->ptr.str, smp->data.str.str, pattern->len) != 0))
continue;
- return pattern;
+ ret = pattern;
+ break;
}
- return NULL;
+
+ if (lru)
+ lru64_commit(lru, ret, expr, expr->revision);
+
+ return ret;
}
/* Checks that the pattern matches the end of the tested string. */
@@ -577,6 +645,17 @@
int icase;
struct pattern_list *lst;
struct pattern *pattern;
+ struct pattern *ret = NULL;
+ struct lru64 *lru = NULL;
+
+ if (pat_lru_tree) {
+ unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
+
+ lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
+ pat_lru_tree, expr, expr->revision);
+ if (lru && lru->domain)
+ return lru->data;
+ }
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@@ -589,9 +668,14 @@
(!icase && strncmp(pattern->ptr.str, smp->data.str.str + smp->data.str.len - pattern->len, pattern->len) != 0))
continue;
- return pattern;
+ ret = pattern;
+ break;
}
- return NULL;
+
+ if (lru)
+ lru64_commit(lru, ret, expr, expr->revision);
+
+ return ret;
}
/* Checks that the pattern is included inside the tested string.
@@ -604,6 +688,17 @@
char *c;
struct pattern_list *lst;
struct pattern *pattern;
+ struct pattern *ret = NULL;
+ struct lru64 *lru = NULL;
+
+ if (pat_lru_tree) {
+ unsigned long long seed = pat_lru_seed ^ (unsigned long long)expr;
+
+ lru = lru64_get(XXH64(smp->data.str.str, smp->data.str.len, seed),
+ pat_lru_tree, expr, expr->revision);
+ if (lru && lru->domain)
+ return lru->data;
+ }
list_for_each_entry(lst, &expr->patterns, list) {
pattern = &lst->pat;
@@ -617,19 +712,27 @@
for (c = smp->data.str.str; c <= end; c++) {
if (tolower(*c) != tolower(*pattern->ptr.str))
continue;
- if (strncasecmp(pattern->ptr.str, c, pattern->len) == 0)
- return pattern;
+ if (strncasecmp(pattern->ptr.str, c, pattern->len) == 0) {
+ ret = pattern;
+ goto leave;
+ }
}
} else {
for (c = smp->data.str.str; c <= end; c++) {
if (*c != *pattern->ptr.str)
continue;
- if (strncmp(pattern->ptr.str, c, pattern->len) == 0)
- return pattern;
+ if (strncmp(pattern->ptr.str, c, pattern->len) == 0) {
+ ret = pattern;
+ goto leave;
+ }
}
}
}
- return NULL;
+ leave:
+ if (lru)
+ lru64_commit(lru, ret, expr, expr->revision);
+
+ return ret;
}
/* This one is used by other real functions. It checks that the pattern is
@@ -2321,6 +2424,10 @@
struct pat_ref *ref, *ref2, *ref3;
struct list pr = LIST_HEAD_INIT(pr);
+ pat_lru_seed = random();
+ if (global.tune.pattern_cache)
+ pat_lru_tree = lru64_new(global.tune.pattern_cache);
+
list_for_each_entry(ref, &pattern_reference, list) {
if (ref->unique_id == -1) {
/* Look for the first free id. */