Blame - src/uri_normalizer.c - haproxy

blob: bc793f2f17a051a2a277b31f3b3a09a5055d484a [file] [log] [blame]

Tim Duesterhus	dbd25c3	2021-04-15 21:45:55 +0200	[diff] [blame]	1	/*
				2	* HTTP request URI normalization.
				3	*
				4	* Copyright 2021 Tim Duesterhus <tim@bastelstu.be>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	*/
				12
Tim Duesterhus	d371e99	2021-04-15 21:45:58 +0200	[diff] [blame]	13	#include <import/ist.h>
				14
Tim Duesterhus	dbd25c3	2021-04-15 21:45:55 +0200	[diff] [blame]	15	#include <haproxy/api.h>
Tim Duesterhus	d7b89be	2021-04-15 21:46:01 +0200	[diff] [blame]	16	#include <haproxy/buf.h>
				17	#include <haproxy/chunk.h>
Tim Duesterhus	a407193	2021-04-15 21:46:02 +0200	[diff] [blame]	18	#include <haproxy/tools.h>
Tim Duesterhus	dbd25c3	2021-04-15 21:45:55 +0200	[diff] [blame]	19	#include <haproxy/uri_normalizer.h>
				20
Tim Duesterhus	dec1c36	2021-05-10 17:28:26 +0200	[diff] [blame]	21	/* Encodes '#' as '%23'. */
				22	enum uri_normalizer_err uri_normalizer_fragment_encode(const struct ist input, struct ist *dst)
				23	{
				24	enum uri_normalizer_err err;
				25
				26	const size_t size = istclear(dst);
				27	struct ist output = *dst;
				28
				29	struct ist scanner = input;
				30
				31	while (istlen(scanner)) {
				32	const struct ist before_hash = istsplit(&scanner, '#');
				33
				34	if (istcat(&output, before_hash, size) < 0) {
				35	err = URI_NORMALIZER_ERR_ALLOC;
				36	goto fail;
				37	}
				38
				39	if (istend(before_hash) != istend(scanner)) {
				40	if (istcat(&output, ist("%23"), size) < 0) {
				41	err = URI_NORMALIZER_ERR_ALLOC;
				42	goto fail;
				43	}
				44	}
				45	}
				46
				47	*dst = output;
				48
				49	return URI_NORMALIZER_ERR_NONE;
				50
				51	fail:
				52
				53	return err;
				54	}
				55
/* Tells whether the given character belongs to the 'unreserved' set of the
 * RFC 3986 ABNF: ALPHA / DIGIT / "-" / "." / "_" / "~".
 *
 * Returns 1 if it does, 0 otherwise.
 */
static int is_unreserved_character(unsigned char c)
{
	/* ALPHA */
	if (c >= 'A' && c <= 'Z')
		return 1;

	if (c >= 'a' && c <= 'z')
		return 1;

	/* DIGIT */
	if (c >= '0' && c <= '9')
		return 1;

	return c == '-' || c == '.' || c == '_' || c == '~';
}
				75
				76	/* Decodes percent encoded characters that are part of the 'unreserved' set.
				77	*
				78	* RFC 3986, section 2.3:
				79	* > URIs that differ in the replacement of an unreserved character with
				80	* > its corresponding percent-encoded US-ASCII octet are equivalent [...]
				81	* > when found in a URI, should be decoded to their corresponding unreserved
				82	* > characters by URI normalizers.
				83	*
				84	* If `strict` is set to 0 then percent characters that are not followed by a
				85	* hexadecimal digit are returned as-is without performing any decoding.
				86	* If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned
				87	* for invalid sequences.
				88	*/
				89	enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct ist input, int strict, struct ist *dst)
				90	{
				91	enum uri_normalizer_err err;
				92
				93	const size_t size = istclear(dst);
				94	struct ist output = *dst;
				95
				96	struct ist scanner = input;
				97
				98	/* The output will either be shortened or have the same length. */
				99	if (size < istlen(input)) {
				100	err = URI_NORMALIZER_ERR_ALLOC;
				101	goto fail;
				102	}
				103
				104	while (istlen(scanner)) {
				105	const char current = istshift(&scanner);
				106
				107	if (current == '%') {
				108	if (istlen(scanner) >= 2) {
				109	if (ishex(istptr(scanner)[0]) && ishex(istptr(scanner)[1])) {
				110	char hex1, hex2, c;
				111
				112	hex1 = istshift(&scanner);
				113	hex2 = istshift(&scanner);
				114	c = (hex2i(hex1) << 4) + hex2i(hex2);
				115
				116	if (is_unreserved_character(c)) {
				117	output = __istappend(output, c);
				118	}
				119	else {
				120	output = __istappend(output, current);
				121	output = __istappend(output, hex1);
				122	output = __istappend(output, hex2);
				123	}
				124
				125	continue;
				126	}
				127	}
				128
				129	if (strict) {
				130	err = URI_NORMALIZER_ERR_INVALID_INPUT;
				131	goto fail;
				132	}
				133	else {
				134	output = __istappend(output, current);
				135	}
				136	}
				137	else {
				138	output = __istappend(output, current);
				139	}
				140	}
				141
				142	*dst = output;
				143
				144	return URI_NORMALIZER_ERR_NONE;
				145
				146	fail:
				147
				148	return err;
				149	}
				150
Tim Duesterhus	a407193	2021-04-15 21:46:02 +0200	[diff] [blame]	151	/* Uppercases letters used in percent encoding.
				152	*
				153	* If `strict` is set to 0 then percent characters that are not followed by a
				154	* hexadecimal digit are returned as-is without modifying the following letters.
				155	* If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned
				156	* for invalid sequences.
				157	*/
				158	enum uri_normalizer_err uri_normalizer_percent_upper(const struct ist input, int strict, struct ist *dst)
				159	{
				160	enum uri_normalizer_err err;
				161
				162	const size_t size = istclear(dst);
				163	struct ist output = *dst;
				164
				165	struct ist scanner = input;
				166
				167	/* The output will have the same length. */
				168	if (size < istlen(input)) {
				169	err = URI_NORMALIZER_ERR_ALLOC;
				170	goto fail;
				171	}
				172
				173	while (istlen(scanner)) {
				174	const char current = istshift(&scanner);
				175
				176	if (current == '%') {
				177	if (istlen(scanner) >= 2) {
				178	if (ishex(istptr(scanner)[0]) && ishex(istptr(scanner)[1])) {
				179	output = __istappend(output, current);
				180	output = __istappend(output, toupper(istshift(&scanner)));
				181	output = __istappend(output, toupper(istshift(&scanner)));
				182	continue;
				183	}
				184	}
				185
				186	if (strict) {
				187	err = URI_NORMALIZER_ERR_INVALID_INPUT;
				188	goto fail;
				189	}
				190	else {
				191	output = __istappend(output, current);
				192	}
				193	}
				194	else {
				195	output = __istappend(output, current);
				196	}
				197	}
				198
				199	*dst = output;
				200
				201	return URI_NORMALIZER_ERR_NONE;
				202
				203	fail:
				204
				205	return err;
				206	}
				207
Maximilian Mader	ff3bb8b	2021-04-21 00:22:50 +0200	[diff] [blame]	208	/* Removes `/./` from the given path. */
				209	enum uri_normalizer_err uri_normalizer_path_dot(const struct ist path, struct ist *dst)
				210	{
				211	enum uri_normalizer_err err;
				212
				213	const size_t size = istclear(dst);
				214	struct ist newpath = *dst;
				215
				216	struct ist scanner = path;
				217
				218	/* The path will either be shortened or have the same length. */
				219	if (size < istlen(path)) {
				220	err = URI_NORMALIZER_ERR_ALLOC;
				221	goto fail;
				222	}
				223
				224	while (istlen(scanner) > 0) {
				225	const struct ist segment = istsplit(&scanner, '/');
				226
				227	if (!isteq(segment, ist("."))) {
				228	if (istcat(&newpath, segment, size) < 0) {
				229	/* This is impossible, because we checked the size of the destination buffer. */
				230	my_unreachable();
				231	err = URI_NORMALIZER_ERR_INTERNAL_ERROR;
				232	goto fail;
				233	}
				234
				235	if (istend(segment) != istend(scanner))
				236	newpath = __istappend(newpath, '/');
				237	}
				238	}
				239
				240	*dst = newpath;
				241
				242	return URI_NORMALIZER_ERR_NONE;
				243
				244	fail:
				245
				246	return err;
				247	}
				248
Tim Duesterhus	560e1a6	2021-04-15 21:46:00 +0200	[diff] [blame]	249	/* Merges `/../` with preceding path segments.
				250	*
				251	* If `full` is set to `0` then `/../` will be printed at the start of the resulting
				252	* path if the number of `/../` exceeds the number of other segments. If `full` is
				253	* set to `1` these will not be printed.
				254	*/
				255	enum uri_normalizer_err uri_normalizer_path_dotdot(const struct ist path, int full, struct ist *dst)
Tim Duesterhus	9982fc2	2021-04-15 21:45:59 +0200	[diff] [blame]	256	{
				257	enum uri_normalizer_err err;
				258
				259	const size_t size = istclear(dst);
				260	char * const tail = istptr(*dst) + size;
				261	char *head = tail;
				262
				263	ssize_t offset = istlen(path) - 1;
				264
				265	int up = 0;
				266
				267	/* The path will either be shortened or have the same length. */
				268	if (size < istlen(path)) {
				269	err = URI_NORMALIZER_ERR_ALLOC;
				270	goto fail;
				271	}
				272
				273	/* Handle `/..` at the end of the path without a trailing slash. */
				274	if (offset >= 2 && istmatch(istadv(path, offset - 2), ist("/.."))) {
				275	up++;
				276	offset -= 2;
				277	}
				278
				279	while (offset >= 0) {
				280	if (offset >= 3 && istmatch(istadv(path, offset - 3), ist("/../"))) {
				281	up++;
				282	offset -= 3;
				283	continue;
				284	}
				285
				286	if (up > 0) {
				287	/* Skip the slash. */
				288	offset--;
				289
				290	/* First check whether we already reached the start of the path,
				291	* before popping the current `/../`.
				292	*/
				293	if (offset >= 0) {
				294	up--;
				295
				296	/* Skip the current path segment. */
				297	while (offset >= 0 && istptr(path)[offset] != '/')
				298	offset--;
				299	}
				300	}
				301	else {
				302	/* Prepend the slash. */
				303	*(--head) = istptr(path)[offset];
				304	offset--;
				305
				306	/* Prepend the current path segment. */
				307	while (offset >= 0 && istptr(path)[offset] != '/') {
				308	*(--head) = istptr(path)[offset];
				309	offset--;
				310	}
				311	}
				312	}
				313
				314	if (up > 0) {
				315	/* Prepend a trailing slash. */
				316	*(--head) = '/';
				317
Tim Duesterhus	560e1a6	2021-04-15 21:46:00 +0200	[diff] [blame]	318	if (!full) {
				319	/* Prepend unconsumed `/..`. */
				320	do {
				321	*(--head) = '.';
				322	*(--head) = '.';
				323	*(--head) = '/';
				324	up--;
				325	} while (up > 0);
				326	}
Tim Duesterhus	9982fc2	2021-04-15 21:45:59 +0200	[diff] [blame]	327	}
				328
				329	*dst = ist2(head, tail - head);
				330
				331	return URI_NORMALIZER_ERR_NONE;
				332
				333	fail:
				334
				335	return err;
				336	}
				337
Tim Duesterhus	d371e99	2021-04-15 21:45:58 +0200	[diff] [blame]	338	/* Merges adjacent slashes in the given path. */
				339	enum uri_normalizer_err uri_normalizer_path_merge_slashes(const struct ist path, struct ist *dst)
				340	{
				341	enum uri_normalizer_err err;
				342
				343	const size_t size = istclear(dst);
				344	struct ist newpath = *dst;
				345
				346	struct ist scanner = path;
				347
				348	/* The path will either be shortened or have the same length. */
				349	if (size < istlen(path)) {
				350	err = URI_NORMALIZER_ERR_ALLOC;
				351	goto fail;
				352	}
				353
				354	while (istlen(scanner) > 0) {
				355	const char current = istshift(&scanner);
				356
				357	if (current == '/') {
				358	while (istlen(scanner) > 0 && *istptr(scanner) == '/')
				359	scanner = istnext(scanner);
				360	}
				361
				362	newpath = __istappend(newpath, current);
				363	}
				364
				365	*dst = newpath;
				366
				367	return URI_NORMALIZER_ERR_NONE;
				368
				369	fail:
				370
				371	return err;
				372	}
				373
Tim Duesterhus	d7b89be	2021-04-15 21:46:01 +0200	[diff] [blame]	374	/* Compares two query parameters by name. Query parameters are ordered
				375	* as with memcmp. Shorter parameter names are ordered lower. Identical
				376	* parameter names are compared by their pointer to maintain a stable
				377	* sort.
				378	*/
				379	static int query_param_cmp(const void a, const void b)
				380	{
				381	const struct ist param_a = (struct ist)a;
				382	const struct ist param_b = (struct ist)b;
				383	const struct ist param_a_name = iststop(param_a, '=');
				384	const struct ist param_b_name = iststop(param_b, '=');
				385
				386	int cmp = istdiff(param_a_name, param_b_name);
				387
				388	if (cmp != 0)
				389	return cmp;
				390
				391	/* The contents are identical: Compare the pointer. */
				392	if (istptr(param_a) < istptr(param_b))
				393	return -1;
				394
				395	if (istptr(param_a) > istptr(param_b))
				396	return 1;
				397
				398	return 0;
				399	}
				400
				401	/* Sorts the parameters within the given query string. */
				402	enum uri_normalizer_err uri_normalizer_query_sort(const struct ist query, const char delim, struct ist *dst)
				403	{
				404	enum uri_normalizer_err err;
				405
				406	const size_t size = istclear(dst);
				407	struct ist newquery = *dst;
				408
				409	struct ist scanner = query;
				410
				411	const struct buffer *trash = get_trash_chunk();
				412	struct ist params = (struct ist )b_orig(trash);
Maximilian Mader	c9c7957	2021-04-21 00:22:49 +0200	[diff] [blame]	413	const size_t max_param = b_size(trash) / sizeof(*params);
Tim Duesterhus	d7b89be	2021-04-15 21:46:01 +0200	[diff] [blame]	414	size_t param_count = 0;
				415
				416	size_t i;
				417
				418	/* The query will have the same length. */
				419	if (size < istlen(query)) {
				420	err = URI_NORMALIZER_ERR_ALLOC;
				421	goto fail;
				422	}
				423
				424	/* Handle the leading '?'. */
				425	newquery = __istappend(newquery, istshift(&scanner));
				426
				427	while (istlen(scanner) > 0) {
				428	const struct ist param = istsplit(&scanner, delim);
				429
				430	if (param_count + 1 > max_param) {
				431	err = URI_NORMALIZER_ERR_ALLOC;
				432	goto fail;
				433	}
				434
				435	params[param_count] = param;
				436	param_count++;
				437	}
				438
				439	qsort(params, param_count, sizeof(*params), query_param_cmp);
				440
				441	for (i = 0; i < param_count; i++) {
				442	if (i > 0)
Maximilian Mader	11f6f85	2021-04-21 00:22:48 +0200	[diff] [blame]	443	newquery = __istappend(newquery, delim);
Tim Duesterhus	d7b89be	2021-04-15 21:46:01 +0200	[diff] [blame]	444
				445	if (istcat(&newquery, params[i], size) < 0) {
				446	/* This is impossible, because we checked the size of the destination buffer. */
				447	my_unreachable();
				448	err = URI_NORMALIZER_ERR_INTERNAL_ERROR;
				449	goto fail;
				450	}
				451	}
				452
				453	*dst = newquery;
				454
				455	return URI_NORMALIZER_ERR_NONE;
				456
				457	fail:
				458
				459	return err;
				460	}
Tim Duesterhus	d371e99	2021-04-15 21:45:58 +0200	[diff] [blame]	461
Tim Duesterhus	dbd25c3	2021-04-15 21:45:55 +0200	[diff] [blame]	462	/*
				463	* Local variables:
				464	* c-indent-level: 8
				465	* c-basic-offset: 8
				466	* End:
				467	*/