MEDIUM: pattern: add the "base" sample fetch method

This one returns the concatenation of the first Host header entry with
the path. It can make content-switching rules easier, help with fighting
DDoS on certain URLs and improve shared cache efficiency.
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 0d748e4..66abe40 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -7968,6 +7968,47 @@
 read, and are only evaluated then. They may require slightly more CPU resources
 than the layer 4 ones, but not much since the request and response are indexed.
 
+base <string>
+  Returns true when the concatenation of the first Host header and the path
+  part of the request, which starts at the first slash and ends before the
+  question mark, equals one of the strings. It may be used to match known
+  files in virtual hosting environments, such as "www.example.com/favicon.ico".
+  See also "path" and "uri".
+
+base_beg <string>
+  Returns true when the base (see above) begins with one of the strings. This
+  can be used to send certain directory names to alternative backends. See also
+  "path_beg".
+
+base_dir <string>
+  Returns true when one of the strings is found isolated or delimited with
+  slashes in the base (see above). Probably of little use, see "url_dir" and
+  "path_dir" instead.
+
+base_dom <string>
+  Returns true when one of the strings is found isolated or delimited with dots
+  in the base (see above). Probably of little use, see "path_dom" and "url_dom"
+  instead.
+
+base_end <string>
+  Returns true when the base (see above) ends with one of the strings. This may
+  be used to check file name extensions, though "path_end" is cheaper.
+
+base_len <integer>
+  Returns true when the base (see above) length matches the values or ranges
+  specified. This may be used to detect abusive requests, for instance.
+
+base_reg <regex>
+  Returns true when the base (see above) matches one of the regular
+  expressions. It can be used any time, but it is important to remember that
+  regex matching is slower than other methods. See also "path_reg", "url_reg"
+  and all "base_" criteria.
+
+base_sub <string>
+  Returns true when the base (see above) contains one of the strings. It can be
+  used to detect particular patterns in paths, such as "../" for example. See
+  also "base_dir".
+
 cook(<name>) <string>
   All "cook*" matching criteria inspect all "Cookie" headers to find a cookie
   with the name between parenthesis. If multiple occurrences of the cookie are
@@ -8209,11 +8250,12 @@
 
 url <string>
   Applies to the whole URL passed in the request. The only real use is to match
-  "*", for which there already is a predefined ACL.
+  "*", for which there already is a predefined ACL. See also "base".
 
 url_beg <string>
   Returns true when the URL begins with one of the strings. This can be used to
-  check whether a URL begins with a slash or with a protocol scheme.
+  check whether a URL begins with a slash or with a protocol scheme. See also
+  "base_beg".
 
 url_dir <string>
   Returns true when one of the strings is found isolated or delimited with
@@ -8248,7 +8290,7 @@
 url_reg <regex>
   Returns true when the URL matches one of the regular expressions. It can be
   used any time, but it is important to remember that regex matching is slower
-  than other methods. See also "path_reg" and all "url_" criteria.
+  than other methods. See also "base_reg", "path_reg" and all "url_" criteria.
 
 url_sub <string>
   Returns true when the URL contains one of the strings. It can be used to
@@ -8421,6 +8463,14 @@
 
 The list of currently supported pattern fetch functions is the following :
 
+  base         This returns the concatenation of the first Host header and the
+               path part of the request, which starts at the first slash and
+               ends before the question mark. It can be useful in virtual
+               hosting environments to detect URL abuse as well as to improve
+               shared cache efficiency. Using this with a limited-size stick
+               table also allows one to collect statistics about the most
+               commonly requested objects by host/path.
+
   src          This is the source IPv4 address of the client of the session.
                It is of type IPv4 and works on both IPv4 and IPv6 tables.
                On IPv6 tables, IPv4 address is mapped to its IPv6 equivalent,
diff --git a/src/proto_http.c b/src/proto_http.c
index 082b530..820f643 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -8041,6 +8041,50 @@
 	return 1;
 }
 
+/* Builds the first Host header value immediately followed by the request
+ * path, stopping before the first '?'. The path is appended only when it is
+ * non-empty and begins with '/', so a "*" URI yields the Host value alone.
+ * If no Host header is found or its value is empty, smp_fetch_path()'s result
+ * is returned instead. Otherwise the string is built in trash and 1 returned.
+ */
+static int
+smp_fetch_base(struct proxy *px, struct session *l4, void *l7, unsigned int opt,
+               const struct arg *args, struct sample *smp)
+{
+	struct http_txn *txn = l7;
+	char *ptr, *end, *beg;
+	struct hdr_ctx ctx;
+
+	CHECK_HTTP_MESSAGE_FIRST();
+
+	ctx.idx = 0;
+	if (!http_find_header2("Host", 4, txn->req.buf->p + txn->req.sol, &txn->hdr_idx, &ctx) ||
+	    !ctx.vlen)
+		return smp_fetch_path(px, l4, l7, opt, args, smp);
+
+	/* the Host value is at ctx.line + ctx.val for ctx.vlen bytes: copy it to trash */
+	memcpy(trash, ctx.line + ctx.val, ctx.vlen);
+	smp->type = SMP_T_STR;
+	smp->data.str.str = trash;
+	smp->data.str.len = ctx.vlen;
+
+	/* now retrieve the path; <end> presumably marks the end of the URI (u + u_l) */
+	end = txn->req.buf->p + txn->req.sol + txn->req.sl.rq.u + txn->req.sl.rq.u_l;
+	beg = http_get_path(txn);
+	if (!beg)
+		beg = end; /* no path: empty range, so nothing gets appended below */
+
+	for (ptr = beg; ptr < end && *ptr != '?'; ptr++); /* ptr stops at '?' or <end> */
+
+	if (beg < ptr && *beg == '/') { /* only a non-empty path starting with '/' */
+		memcpy(smp->data.str.str + smp->data.str.len, beg, ptr - beg);
+		smp->data.str.len += ptr - beg;
+	}
+
+	smp->flags = SMP_F_VOL_1ST;
+	return 1;
+}
+
 static int
 acl_fetch_proto_http(struct proxy *px, struct session *l4, void *l7, unsigned int opt,
                      const struct arg *args, struct sample *smp)
@@ -8530,6 +8574,15 @@
  * Please take care of keeping this list alphabetically sorted.
  */
 static struct acl_kw_list acl_kws = {{ },{
+	{ "base",            acl_parse_str,     smp_fetch_base,           acl_match_str,     ACL_USE_L7REQ_VOLATILE|ACL_MAY_LOOKUP, 0 },
+	{ "base_beg",        acl_parse_str,     smp_fetch_base,           acl_match_beg,     ACL_USE_L7REQ_VOLATILE, 0 },
+	{ "base_dir",        acl_parse_str,     smp_fetch_base,           acl_match_dir,     ACL_USE_L7REQ_VOLATILE, 0 },
+	{ "base_dom",        acl_parse_str,     smp_fetch_base,           acl_match_dom,     ACL_USE_L7REQ_VOLATILE, 0 },
+	{ "base_end",        acl_parse_str,     smp_fetch_base,           acl_match_end,     ACL_USE_L7REQ_VOLATILE, 0 },
+	{ "base_len",        acl_parse_int,     smp_fetch_base,           acl_match_len,     ACL_USE_L7REQ_VOLATILE, 0 },
+	{ "base_reg",        acl_parse_reg,     smp_fetch_base,           acl_match_reg,     ACL_USE_L7REQ_VOLATILE, 0 },
+	{ "base_sub",        acl_parse_str,     smp_fetch_base,           acl_match_sub,     ACL_USE_L7REQ_VOLATILE, 0 },
+
 	{ "cook",            acl_parse_str,     smp_fetch_cookie,         acl_match_str,     ACL_USE_L7REQ_VOLATILE|ACL_MAY_LOOKUP, ARG1(0,STR) },
 	{ "cook_beg",        acl_parse_str,     smp_fetch_cookie,         acl_match_beg,     ACL_USE_L7REQ_VOLATILE, ARG1(0,STR) },
 	{ "cook_cnt",        acl_parse_int,     acl_fetch_cookie_cnt,     acl_match_int,     ACL_USE_L7REQ_VOLATILE, ARG1(0,STR) },
@@ -8627,6 +8680,7 @@
 /* Note: must not be declared <const> as its list will be overwritten */
 static struct sample_fetch_kw_list sample_fetch_keywords = {{ },{
 	{ "hdr",        smp_fetch_hdr,            ARG2(1,STR,SINT), val_hdr, SMP_T_CSTR, SMP_CAP_REQ },
+	{ "base",       smp_fetch_base,           0,           NULL, SMP_T_CSTR, SMP_CAP_REQ },
 	{ "path",       smp_fetch_path,           0,           NULL, SMP_T_CSTR, SMP_CAP_REQ },
 	{ "url",        smp_fetch_url,            0,           NULL, SMP_T_CSTR, SMP_CAP_REQ },
 	{ "url_ip",     smp_fetch_url_ip,         0,           NULL, SMP_T_IPV4, SMP_CAP_REQ },