blob: bc40b29e808042f32333b2f11969685116b1be6e [file] [log] [blame]
Baptiste Assmann325137d2015-04-13 23:40:55 +02001/*
2 * Name server resolution
3 *
Emeric Brunc9437992021-02-12 19:42:55 +01004 * Copyright 2020 Haproxy Technologies
Baptiste Assmann325137d2015-04-13 23:40:55 +02005 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <unistd.h>
19
20#include <sys/types.h>
21
Willy Tarreau122eba92020-06-04 10:15:32 +020022#include <haproxy/action.h>
Willy Tarreau4c7e4b72020-05-27 12:58:42 +020023#include <haproxy/api.h>
Willy Tarreau6be78492020-06-05 00:00:29 +020024#include <haproxy/cfgparse.h>
Willy Tarreauf1d32c42020-06-04 21:07:02 +020025#include <haproxy/channel.h>
Willy Tarreaub2551052020-06-09 09:07:15 +020026#include <haproxy/check.h>
Willy Tarreau83487a82020-06-04 20:19:54 +020027#include <haproxy/cli.h>
Willy Tarreau7c18b542020-06-11 09:23:02 +020028#include <haproxy/dgram.h>
Willy Tarreaueb92deb2020-06-04 10:53:16 +020029#include <haproxy/dns.h>
Willy Tarreau8d366972020-05-27 16:10:29 +020030#include <haproxy/errors.h>
Willy Tarreaub2551052020-06-09 09:07:15 +020031#include <haproxy/fd.h>
Willy Tarreauaeed4a82020-06-04 22:01:04 +020032#include <haproxy/log.h>
Emeric Brund26a6232021-01-04 13:32:20 +010033#include <haproxy/ring.h>
Emeric Brunfd647d52021-02-12 20:03:38 +010034#include <haproxy/stream.h>
35#include <haproxy/stream_interface.h>
Baptiste Assmann325137d2015-04-13 23:40:55 +020036
/* Per-thread scratch area: dns_resolve_send() and dns_process_req() copy each
 * message out of a request ring into it before handling it. Its allocation is
 * not visible in this part of the file.
 */
static THREAD_LOCAL char *dns_msg_trash;

/* Pool of TCP sessions, see dns_session_new() and dns_session_free() */
DECLARE_STATIC_POOL(dns_session_pool, "dns_session", sizeof(struct dns_session));
/* Pool of query id mappings kept while a TCP query waits for its response */
DECLARE_STATIC_POOL(dns_query_pool, "dns_query", sizeof(struct dns_query));
/* Pool of DNS_TCP_MSG_RING_MAX_SIZE bytes buffers, used for both the response
 * area and the request ring storage of each session.
 */
DECLARE_STATIC_POOL(dns_msg_buf, "dns_msg_buf", DNS_TCP_MSG_RING_MAX_SIZE);
42
Christopher Faulet67957bd2017-09-27 11:00:59 +020043/* Opens an UDP socket on the namesaver's IP/Port, if required. Returns 0 on
44 * success, -1 otherwise.
Baptiste Assmann325137d2015-04-13 23:40:55 +020045 */
Emeric Brund26a6232021-01-04 13:32:20 +010046static int dns_connect_nameserver(struct dns_nameserver *ns)
Baptiste Assmann325137d2015-04-13 23:40:55 +020047{
Emeric Brund26a6232021-01-04 13:32:20 +010048 if (ns->dgram) {
49 struct dgram_conn *dgram = &ns->dgram->conn;
50 int fd;
Baptiste Assmann325137d2015-04-13 23:40:55 +020051
Emeric Brund26a6232021-01-04 13:32:20 +010052 /* Already connected */
53 if (dgram->t.sock.fd != -1)
54 return 0;
Baptiste Assmann325137d2015-04-13 23:40:55 +020055
Emeric Brund26a6232021-01-04 13:32:20 +010056 /* Create an UDP socket and connect it on the nameserver's IP/Port */
57 if ((fd = socket(dgram->addr.to.ss_family, SOCK_DGRAM, IPPROTO_UDP)) == -1) {
58 send_log(NULL, LOG_WARNING,
Emeric Brunc9437992021-02-12 19:42:55 +010059 "DNS : section '%s': can't create socket for nameserver '%s'.\n",
Emeric Brund26a6232021-01-04 13:32:20 +010060 ns->counters->pid, ns->id);
61 return -1;
62 }
63 if (connect(fd, (struct sockaddr*)&dgram->addr.to, get_addr_len(&dgram->addr.to)) == -1) {
64 send_log(NULL, LOG_WARNING,
Emeric Brunc9437992021-02-12 19:42:55 +010065 "DNS : section '%s': can't connect socket for nameserver '%s'.\n",
Emeric Brund26a6232021-01-04 13:32:20 +010066 ns->counters->id, ns->id);
Emeric Brunc9437992021-02-12 19:42:55 +010067 close(fd);
Emeric Brund26a6232021-01-04 13:32:20 +010068 return -1;
69 }
Baptiste Assmann325137d2015-04-13 23:40:55 +020070
Emeric Brund26a6232021-01-04 13:32:20 +010071 /* Make the socket non blocking */
72 fcntl(fd, F_SETFL, O_NONBLOCK);
Olivier Houcharda8c6db82017-07-06 18:46:47 +020073
Emeric Brund26a6232021-01-04 13:32:20 +010074 /* Add the fd in the fd list and update its parameters */
75 dgram->t.sock.fd = fd;
76 fd_insert(fd, dgram, dgram_fd_handler, MAX_THREADS_MASK);
77 fd_want_recv(fd);
Emeric Brunc9437992021-02-12 19:42:55 +010078 }
79 return 0;
Baptiste Assmann325137d2015-04-13 23:40:55 +020080}
81
Emeric Brund26a6232021-01-04 13:32:20 +010082/* Sends a message to a name server
83 * It returns message length on success
84 * or -1 in error case
85 * 0 is returned in case of output ring buffer is full
86 */
87int dns_send_nameserver(struct dns_nameserver *ns, void *buf, size_t len)
88{
89 int ret = -1;
90
91 if (ns->dgram) {
92 struct dgram_conn *dgram = &ns->dgram->conn;
93 int fd = dgram->t.sock.fd;
94
95 if (dgram->t.sock.fd == -1) {
96 if (dns_connect_nameserver(ns) == -1)
97 return -1;
98 fd = dgram->t.sock.fd;
99 }
100
101 ret = send(fd, buf, len, 0);
102 if (ret < 0) {
103 if (errno == EAGAIN) {
104 struct ist myist;
105
106 myist.ptr = buf;
107 myist.len = len;
108 ret = ring_write(ns->dgram->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
109 if (!ret) {
110 ns->counters->snd_error++;
111 return -1;
112 }
113 fd_cant_send(fd);
114 return ret;
115 }
116 ns->counters->snd_error++;
117 fd_delete(fd);
118 close(fd);
119 dgram->t.sock.fd = -1;
120 return -1;
121 }
122 ns->counters->sent++;
123 }
Emeric Brunfd647d52021-02-12 20:03:38 +0100124 else if (ns->stream) {
125 struct ist myist;
126
127 myist.ptr = buf;
128 myist.len = len;
129 ret = ring_write(ns->stream->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
130 if (!ret) {
131 ns->counters->snd_error++;
132 return -1;
133 }
134 task_wakeup(ns->stream->task_req, TASK_WOKEN_MSG);
135 return ret;
136 }
Emeric Brund26a6232021-01-04 13:32:20 +0100137
138 return ret;
139}
140
Emeric Brunfd647d52021-02-12 20:03:38 +0100141void dns_session_free(struct dns_session *);
142
Emeric Brund26a6232021-01-04 13:32:20 +0100143/* Receives a dns message
144 * Returns message length
145 * 0 is returned if no more message available
146 * -1 in error case
147 */
148ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size)
149{
150 ssize_t ret = -1;
151
152 if (ns->dgram) {
153 struct dgram_conn *dgram = &ns->dgram->conn;
154 int fd = dgram->t.sock.fd;
155
156 if (fd == -1)
157 return -1;
158
159 if ((ret = recv(fd, data, size, 0)) < 0) {
160 if (errno == EAGAIN) {
161 fd_cant_recv(fd);
162 return 0;
163 }
164 fd_delete(fd);
165 close(fd);
166 dgram->t.sock.fd = -1;
167 return -1;
168 }
169 }
Emeric Brunfd647d52021-02-12 20:03:38 +0100170 else if (ns->stream) {
171 struct dns_stream_server *dss = ns->stream;
172 struct dns_session *ds;
173
174 HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
175
176 if (!LIST_ISEMPTY(&dss->wait_sess)) {
177 ds = LIST_NEXT(&dss->wait_sess, struct dns_session *, waiter);
178 fprintf(stderr, "ds: %p\n", ds);
179 ret = ds->rx_msg.len < size ? ds->rx_msg.len : size;
180 memcpy(data, ds->rx_msg.area, ret);
181
182 ds->rx_msg.len = 0;
183
184 /* This barrier is here to ensure that all data is
185 * stored if the appctx detect the elem is out of the list */
186 __ha_barrier_store();
187
188 LIST_DEL_INIT(&ds->waiter);
189
190 if (ds->appctx) {
191 /* This second barrier is here to ensure that
192 * the waked up appctx won't miss that the
193 * elem is removed from the list */
194 __ha_barrier_store();
195
196 /* awake appctx beacause it may have other
197 * message to receive
198 */
199 appctx_wakeup(ds->appctx);
200
201 /* dns_session could already be into free_sess list
202 * so we firstly remove it */
203 LIST_DEL_INIT(&ds->list);
204
205 /* decrease nb_queries to free a slot for a new query on that sess */
206 ds->nb_queries--;
207 if (ds->nb_queries) {
208 /* it remains pipelined unanswered request
209 * into this session but we just decrease
210 * the counter so the session
211 * can not be full of pipelined requests
212 * so we can add if to free_sess list
213 * to receive a new request
214 */
215 LIST_ADD(&ds->dss->free_sess, &ds->list);
216 }
217 else {
218 /* there is no more pipelined requests
219 * into this session, so we move it
220 * to idle_sess list */
221 LIST_ADD(&ds->dss->idle_sess, &ds->list);
222
223 /* update the counter of idle sessions */
224 ds->dss->idle_conns++;
225
226 /* Note: this is useless there to update
227 * the max_active_conns since we increase
228 * the idle count */
229 }
230 }
231 else {
232 /* there is no more appctx for this session
233 * it means it is ready to die
234 */
235 dns_session_free(ds);
236 }
237
238
239 }
240
241 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
242 }
Emeric Brund26a6232021-01-04 13:32:20 +0100243
244 return ret;
245}
246
247static void dns_resolve_recv(struct dgram_conn *dgram)
248{
249 struct dns_nameserver *ns;
250 int fd;
251
252 fd = dgram->t.sock.fd;
253
254 /* check if ready for reading */
255 if (!fd_recv_ready(fd))
256 return;
257
258 /* no need to go further if we can't retrieve the nameserver */
259 if ((ns = dgram->owner) == NULL) {
260 _HA_ATOMIC_AND(&fdtab[fd].ev, ~(FD_POLL_HUP|FD_POLL_ERR));
261 fd_stop_recv(fd);
262 return;
263 }
264
265 ns->process_responses(ns);
266}
267
268/* Called when a dns network socket is ready to send data */
269static void dns_resolve_send(struct dgram_conn *dgram)
270{
271 int fd;
272 struct dns_nameserver *ns;
273 struct ring *ring;
274 struct buffer *buf;
275 uint64_t msg_len;
276 size_t len, cnt, ofs;
277
278 fd = dgram->t.sock.fd;
279
280 /* check if ready for sending */
281 if (!fd_send_ready(fd))
282 return;
283
284 /* no need to go further if we can't retrieve the nameserver */
285 if ((ns = dgram->owner) == NULL) {
286 _HA_ATOMIC_AND(&fdtab[fd].ev, ~(FD_POLL_HUP|FD_POLL_ERR));
287 fd_stop_send(fd);
288 return;
289 }
290
291 ring = ns->dgram->ring_req;
292 buf = &ring->buf;
293
294 HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);
295 ofs = ns->dgram->ofs_req;
296
297 /* explanation for the initialization below: it would be better to do
298 * this in the parsing function but this would occasionally result in
299 * dropped events because we'd take a reference on the oldest message
300 * and keep it while being scheduled. Thus instead let's take it the
301 * first time we enter here so that we have a chance to pass many
302 * existing messages before grabbing a reference to a location. This
303 * value cannot be produced after initialization.
304 */
305 if (unlikely(ofs == ~0)) {
306 ofs = 0;
307 HA_ATOMIC_ADD(b_peek(buf, ofs), 1);
308 ofs += ring->ofs;
309 }
310
311 /* we were already there, adjust the offset to be relative to
312 * the buffer's head and remove us from the counter.
313 */
314 ofs -= ring->ofs;
315 BUG_ON(ofs >= buf->size);
316 HA_ATOMIC_SUB(b_peek(buf, ofs), 1);
317
318 while (ofs + 1 < b_data(buf)) {
319 int ret;
320
321 cnt = 1;
322 len = b_peek_varint(buf, ofs + cnt, &msg_len);
323 if (!len)
324 break;
325 cnt += len;
326 BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf));
327 if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) {
328 /* too large a message to ever fit, let's skip it */
329 ofs += cnt + msg_len;
330 continue;
331 }
332
333 len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt);
334
335 ret = send(fd, dns_msg_trash, len, 0);
336 if (ret < 0) {
337 if (errno == EAGAIN) {
338 fd_cant_send(fd);
339 goto out;
340 }
341 ns->counters->snd_error++;
342 fd_delete(fd);
343 close(fd);
344 fd = dgram->t.sock.fd = -1;
345 goto out;
346 }
347 ns->counters->sent++;
348
349 ofs += cnt + len;
350 }
351
352 /* we don't want/need to be waked up any more for sending
353 * because all ring content is sent */
354 fd_stop_send(fd);
355
356out:
357
358 HA_ATOMIC_ADD(b_peek(buf, ofs), 1);
359 ofs += ring->ofs;
360 ns->dgram->ofs_req = ofs;
361 HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);
362
363}
364
Emeric Brunc9437992021-02-12 19:42:55 +0100365/* proto_udp callback functions for a DNS resolution */
366struct dgram_data_cb dns_dgram_cb = {
367 .recv = dns_resolve_recv,
368 .send = dns_resolve_send,
369};
Baptiste Assmann325137d2015-04-13 23:40:55 +0200370
Emeric Brunc9437992021-02-12 19:42:55 +0100371int dns_dgram_init(struct dns_nameserver *ns, struct sockaddr_storage *sk)
Baptiste Assmann325137d2015-04-13 23:40:55 +0200372{
Emeric Brunc9437992021-02-12 19:42:55 +0100373 struct dns_dgram_server *dgram;
Baptiste Assmann201c07f2017-05-22 15:17:15 +0200374
Emeric Brunc9437992021-02-12 19:42:55 +0100375 if ((dgram = calloc(1, sizeof(*dgram))) == NULL)
Christopher Faulet67957bd2017-09-27 11:00:59 +0200376 return -1;
Baptiste Assmannc1ce5f32016-05-14 11:26:22 +0200377
Emeric Brunc9437992021-02-12 19:42:55 +0100378 /* Leave dgram partially initialized, no FD attached for
379 * now. */
380 dgram->conn.owner = ns;
381 dgram->conn.data = &dns_dgram_cb;
382 dgram->conn.t.sock.fd = -1;
383 dgram->conn.addr.to = *sk;
384 ns->dgram = dgram;
Baptiste Assmannc1ce5f32016-05-14 11:26:22 +0200385
Emeric Brunc9437992021-02-12 19:42:55 +0100386 dgram->ofs_req = ~0; /* init ring offset */
387 dgram->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE);
388 if (!dgram->ring_req) {
389 ha_alert("memory allocation error initializing the ring for nameserver.\n");
390 goto out;
Baptiste Assmannc1ce5f32016-05-14 11:26:22 +0200391 }
392
Emeric Brunc9437992021-02-12 19:42:55 +0100393 /* attach the task as reader */
394 if (!ring_attach(dgram->ring_req)) {
395 /* mark server attached to the ring */
396 ha_alert("nameserver sets too many watchers > 255 on ring. This is a bug and should not happen.\n");
397 goto out;
Baptiste Assmannc1ce5f32016-05-14 11:26:22 +0200398 }
Baptiste Assmannc1ce5f32016-05-14 11:26:22 +0200399 return 0;
Emeric Brunc9437992021-02-12 19:42:55 +0100400out:
401 if (dgram->ring_req)
402 ring_free(dgram->ring_req);
Christopher Fauletd6c6b5f2020-09-08 10:27:24 +0200403
Emeric Brunc9437992021-02-12 19:42:55 +0100404 free(dgram);
Olivier Houchard2ec2db92018-01-08 16:28:57 +0100405
Emeric Brunfd647d52021-02-12 20:03:38 +0100406 return -1;
407}
408
409/*
410 * IO Handler to handle message push to dns tcp server
411 */
412static void dns_session_io_handler(struct appctx *appctx)
413{
414 struct stream_interface *si = appctx->owner;
415 struct dns_session *ds = appctx->ctx.sft.ptr;
416 struct ring *ring = &ds->ring;
417 struct buffer *buf = &ring->buf;
418 uint64_t msg_len;
419 int available_room;
420 size_t len, cnt, ofs;
421 int ret = 0;
422
423 /* if stopping was requested, close immediately */
424 if (unlikely(stopping))
425 goto close;
426
427 /* we want to be sure to not miss that we have been awaked for a shutdown */
428 __ha_barrier_load();
429
430 /* that means the connection was requested to shutdown
431 * for instance idle expire */
432 if (ds->shutdown)
433 goto close;
434
435 /* an error was detected */
436 if (unlikely(si_ic(si)->flags & (CF_WRITE_ERROR|CF_SHUTW)))
437 goto close;
438
439 /* con closed by server side, we will skip data write and drain data from channel */
440 if ((si_oc(si)->flags & CF_SHUTW)) {
441 goto read;
442 }
443
444 /* if the connection is not established, inform the stream that we want
445 * to be notified whenever the connection completes.
446 */
447 if (si_opposite(si)->state < SI_ST_EST) {
448 si_cant_get(si);
449 si_rx_conn_blk(si);
450 si_rx_endp_more(si);
451 return;
452 }
453
454
455 ofs = ds->ofs;
456
457 HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock);
458 LIST_DEL_INIT(&appctx->wait_entry);
459 HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock);
460
461 HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);
462
463 /* explanation for the initialization below: it would be better to do
464 * this in the parsing function but this would occasionally result in
465 * dropped events because we'd take a reference on the oldest message
466 * and keep it while being scheduled. Thus instead let's take it the
467 * first time we enter here so that we have a chance to pass many
468 * existing messages before grabbing a reference to a location. This
469 * value cannot be produced after initialization.
470 */
471 if (unlikely(ofs == ~0)) {
472 ofs = 0;
473
474 HA_ATOMIC_ADD(b_peek(buf, ofs), 1);
475 ofs += ring->ofs;
476 }
477
478 /* in this loop, ofs always points to the counter byte that precedes
479 * the message so that we can take our reference there if we have to
480 * stop before the end (ret=0).
481 */
482 if (si_opposite(si)->state == SI_ST_EST) {
483 /* we were already there, adjust the offset to be relative to
484 * the buffer's head and remove us from the counter.
485 */
486 ofs -= ring->ofs;
487 BUG_ON(ofs >= buf->size);
488 HA_ATOMIC_SUB(b_peek(buf, ofs), 1);
489
490 ret = 1;
491 while (ofs + 1 < b_data(buf)) {
492 struct dns_query *query;
493 uint16_t original_qid;
494 uint16_t new_qid;
495
496 cnt = 1;
497 len = b_peek_varint(buf, ofs + cnt, &msg_len);
498 if (!len)
499 break;
500 cnt += len;
501 BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf));
502
503 /* retrieve available room on output channel */
504 available_room = channel_recv_max(si_ic(si));
505
506 /* tx_msg_offset null means we are at the start of a new message */
507 if (!ds->tx_msg_offset) {
508 uint16_t slen;
509
510 /* check if there is enough room to put message len and query id */
511 if (available_room < sizeof(slen) + sizeof(new_qid)) {
512 si_rx_room_blk(si);
513 ret = 0;
514 break;
515 }
516
517 /* put msg len into then channel */
518 slen = (uint16_t)msg_len;
519 slen = htons(slen);
520 ci_putblk(si_ic(si), (char *)&slen, sizeof(slen));
521 available_room -= sizeof(slen);
522
523 /* backup original query id */
524 len = b_getblk(buf, (char *)&original_qid, sizeof(original_qid), ofs + cnt);
Emeric Brun538bb042021-02-15 13:58:06 +0100525 if (!len) {
526 /* should never happen since messages are atomically
527 * written into ring
528 */
529 ret = 0;
530 break;
531 }
Emeric Brunfd647d52021-02-12 20:03:38 +0100532
533 /* generates new query id */
534 new_qid = ++ds->query_counter;
535 new_qid = htons(new_qid);
536
537 /* put new query id into the channel */
538 ci_putblk(si_ic(si), (char *)&new_qid, sizeof(new_qid));
539 available_room -= sizeof(new_qid);
540
541 /* keep query id mapping */
542
543 query = pool_alloc(dns_query_pool);
544 if (query) {
545 query->qid.key = new_qid;
546 query->original_qid = original_qid;
547 query->expire = tick_add(now_ms, 5000);
548 LIST_INIT(&query->list);
549 if (LIST_ISEMPTY(&ds->queries)) {
550 /* enable task to handle expire */
551 ds->task_exp->expire = query->expire;
552 /* ensure this will be executed by the same
553 * thread than ds_session_release
554 * to ensure session_release is free
555 * to destroy the task */
556 task_queue(ds->task_exp);
557 }
558 LIST_ADDQ(&ds->queries, &query->list);
559 eb32_insert(&ds->query_ids, &query->qid);
560 ds->onfly_queries++;
561 }
562
563 /* update the tx_offset to handle output in 16k streams */
564 ds->tx_msg_offset = sizeof(original_qid);
565
566 }
567
568 /* check if it remains available room on output chan */
569 if (unlikely(!available_room)) {
570 si_rx_room_blk(si);
571 ret = 0;
572 break;
573 }
574
575 chunk_reset(&trash);
576 if ((msg_len - ds->tx_msg_offset) > available_room) {
577 /* remaining msg data is too large to be written in output channel at one time */
578
579 len = b_getblk(buf, trash.area, available_room, ofs + cnt + ds->tx_msg_offset);
580
581 /* update offset to complete mesg forwarding later */
582 ds->tx_msg_offset += len;
583 }
584 else {
585 /* remaining msg data can be written in output channel at one time */
586 len = b_getblk(buf, trash.area, msg_len - ds->tx_msg_offset, ofs + cnt + ds->tx_msg_offset);
587
588 /* reset tx_msg_offset to mark forward fully processed */
589 ds->tx_msg_offset = 0;
590 }
591 trash.data += len;
592
593 ci_putchk(si_ic(si), &trash);
594
595 if (ds->tx_msg_offset) {
596 /* msg was not fully processed, we must be awake to drain pending data */
597
598 si_rx_room_blk(si);
599 ret = 0;
600 break;
601 }
602 /* switch to next message */
603 ofs += cnt + msg_len;
604 }
605
606 HA_ATOMIC_ADD(b_peek(buf, ofs), 1);
607 ofs += ring->ofs;
608 ds->ofs = ofs;
609 }
610 HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);
611
612 if (ret) {
613 /* let's be woken up once new request to write arrived */
614 HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock);
615 LIST_ADDQ(&ring->waiters, &appctx->wait_entry);
616 HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock);
617 si_rx_endp_done(si);
618 }
619
620read:
621
622 /* if session is not a waiter it means there is no commited
623 * message into rx_buf and we are free to use it
624 * Note: we need a load barrier here to not miss the
625 * delete from the list
626 */
627 __ha_barrier_load();
628 if (!LIST_ADDED(&ds->waiter)) {
629 while (1) {
630 uint16_t query_id;
631 struct eb32_node *eb;
632 struct dns_query *query;
633
634 if (!ds->rx_msg.len) {
635 /* next message len is not fully available into the channel */
636 if (co_data(si_oc(si)) < 2)
637 break;
638
639 /* retrieve message len */
640 co_getblk(si_oc(si), (char *)&msg_len, 2, 0);
641
642 /* mark as consumed */
643 co_skip(si_oc(si), 2);
644
645 /* store message len */
646 ds->rx_msg.len = ntohs(msg_len);
647 }
648
649 if (!co_data(si_oc(si))) {
650 /* we need more data but nothing is available */
651 break;
652 }
653
654 if (co_data(si_oc(si)) + ds->rx_msg.offset < ds->rx_msg.len) {
655 /* message only partially available */
656
657 /* read available data */
658 co_getblk(si_oc(si), ds->rx_msg.area + ds->rx_msg.offset, co_data(si_oc(si)), 0);
659
660 /* update message offset */
661 ds->rx_msg.offset += co_data(si_oc(si));
662
663 /* consume all pending data from the channel */
664 co_skip(si_oc(si), co_data(si_oc(si)));
665
666 /* we need to wait for more data */
667 break;
668 }
669
670 /* enougth data is available into the channel to read the message until the end */
671
672 /* read from the channel until the end of the message */
673 co_getblk(si_oc(si), ds->rx_msg.area + ds->rx_msg.offset, ds->rx_msg.len - ds->rx_msg.offset, 0);
674
675 /* consume all data until the end of the message from the channel */
676 co_skip(si_oc(si), ds->rx_msg.len - ds->rx_msg.offset);
677
678 /* reset reader offset to 0 for next message reand */
679 ds->rx_msg.offset = 0;
680
681 /* try remap query id to original */
682 memcpy(&query_id, ds->rx_msg.area, sizeof(query_id));
683 eb = eb32_lookup(&ds->query_ids, query_id);
684 if (!eb) {
685 /* query id not found means we have an unknown corresponding
686 * request, perhaps server's bug or or the query reached
687 * timeout
688 */
689 ds->rx_msg.len = 0;
690 continue;
691 }
692
693 /* re-map the original query id set by the requester */
694 query = eb32_entry(eb, struct dns_query, qid);
695 memcpy(ds->rx_msg.area, &query->original_qid, sizeof(query->original_qid));
696
697 /* remove query ids mapping from pending queries list/tree */
698 eb32_delete(&query->qid);
699 LIST_DEL(&query->list);
700 pool_free(dns_query_pool, query);
701 ds->onfly_queries--;
702
703 /* lock the dns_stream_server containing lists heads */
704 HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock);
705
706 /* the dns_session is also added in queue of the
707 * wait_sess list where the task processing
708 * response will pop available responses
709 */
710 LIST_ADDQ(&ds->dss->wait_sess, &ds->waiter);
711
712 /* lock the dns_stream_server containing lists heads */
713 HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
714
715 /* awake the task processing the responses */
716 task_wakeup(ds->dss->task_rsp, TASK_WOKEN_INIT);
717
718 break;
719 }
720
721 if (!LIST_ADDED(&ds->waiter)) {
722 /* there is no more pending data to read and the con was closed by the server side */
723 if (!co_data(si_oc(si)) && (si_oc(si)->flags & CF_SHUTW)) {
724 goto close;
725 }
726 }
727
728 }
729
730
731 return;
732close:
733 si_shutw(si);
734 si_shutr(si);
735 si_ic(si)->flags |= CF_READ_NULL;
736}
737
738void dns_queries_flush(struct dns_session *ds)
739{
740 struct dns_query *query, *queryb;
741
742 list_for_each_entry_safe(query, queryb, &ds->queries, list) {
743 eb32_delete(&query->qid);
744 LIST_DEL(&query->list);
745 pool_free(dns_query_pool, query);
746 }
747}
748
749void dns_session_free(struct dns_session *ds)
750{
751 if (ds->rx_msg.area)
752 pool_free(dns_msg_buf, ds->rx_msg.area);
753 if (ds->tx_ring_area)
754 pool_free(dns_msg_buf, ds->tx_ring_area);
755 if (ds->task_exp)
756 task_destroy(ds->task_exp);
757
758 dns_queries_flush(ds);
759
760 ds->dss->cur_conns--;
761 /* Note: this is useless to update
762 * max_active_conns here because
763 * we decrease the value
764 */
765 pool_free(dns_session_pool, ds);
766}
767
768static struct appctx *dns_session_create(struct dns_session *ds);
769
770/*
771 * Function to release a DNS tcp session
772 */
773static void dns_session_release(struct appctx *appctx)
774{
775 struct dns_session *ds = appctx->ctx.sft.ptr;
776 struct dns_stream_server *dss;
777
778 if (!ds)
779 return;
780
781 dss = ds->dss;
782
783 HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
784 LIST_DEL_INIT(&ds->list);
785
786 if (stopping) {
787 dns_session_free(ds);
788 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
789 return;
790 }
791
792 if (!ds->nb_queries) {
793 /* this is an idle session */
794 /* Note: this is useless to update max_active_sess
795 * here because we decrease idle_conns but
796 * dns_session_free decrease curconns
797 */
798
799 ds->dss->idle_conns--;
800 dns_session_free(ds);
801 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
802 return;
803 }
804
805 if (ds->onfly_queries == ds->nb_queries) {
806 /* the session can be released because
807 * it means that all queries AND
808 * responses are in fly */
809 dns_session_free(ds);
810 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
811 return;
812 }
813
814 /* We do not call ring_appctx_detach here
815 * because we want to keep readers counters
816 * to retry a con with a different appctx*/
817 HA_RWLOCK_WRLOCK(DNS_LOCK, &ds->ring.lock);
818 LIST_DEL_INIT(&appctx->wait_entry);
819 HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ds->ring.lock);
820
821 /* if there is no pending complete response
822 * message, ensure to reset
823 * message offsets if the session
824 * was closed with an incomplete pending response
825 */
826 if (!LIST_ADDED(&ds->waiter))
827 ds->rx_msg.len = ds->rx_msg.offset = 0;
828
829 /* we flush pending sent queries because we never
830 * have responses
831 */
832 ds->nb_queries -= ds->onfly_queries;
833 dns_queries_flush(ds);
834
835 /* reset offset to be sure to start from message start */
836 ds->tx_msg_offset = 0;
837
838 /* here the ofs and the attached counter
839 * are kept unchanged
840 */
841
842 /* Create a new appctx, We hope we can
843 * create from the release callback! */
844 ds->appctx = dns_session_create(ds);
845 if (!ds->appctx) {
846 dns_session_free(ds);
847 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
848 return;
849 }
850
851 if (ds->nb_queries < DNS_STREAM_MAX_PIPELINED_REQ)
852 LIST_ADD(&ds->dss->free_sess, &ds->list);
853
854 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
855}
856
857/* DNS tcp session applet */
858static struct applet dns_session_applet = {
859 .obj_type = OBJ_TYPE_APPLET,
860 .name = "<STRMDNS>", /* used for logging */
861 .fct = dns_session_io_handler,
862 .release = dns_session_release,
863};
864
865/*
866 * Function used to create an appctx for a DNS session
867 */
868static struct appctx *dns_session_create(struct dns_session *ds)
869{
870 struct appctx *appctx;
871 struct session *sess;
872 struct stream *s;
873 struct applet *applet = &dns_session_applet;
874
875 appctx = appctx_new(applet, tid_bit);
876 if (!appctx)
877 goto out_close;
878
879 appctx->ctx.sft.ptr = (void *)ds;
880
881 sess = session_new(ds->dss->srv->proxy, NULL, &appctx->obj_type);
882 if (!sess) {
883 ha_alert("out of memory in peer_session_create().\n");
884 goto out_free_appctx;
885 }
886
887 if ((s = stream_new(sess, &appctx->obj_type, &BUF_NULL)) == NULL) {
888 ha_alert("Failed to initialize stream in peer_session_create().\n");
889 goto out_free_sess;
890 }
891
892
893 s->target = &ds->dss->srv->obj_type;
894 if (!sockaddr_alloc(&s->target_addr, &ds->dss->srv->addr, sizeof(ds->dss->srv->addr)))
895 goto out_free_strm;
896 s->flags = SF_ASSIGNED|SF_ADDR_SET;
897 s->si[1].flags |= SI_FL_NOLINGER;
898
899 s->do_log = NULL;
900 s->uniq_id = 0;
901
902 s->res.flags |= CF_READ_DONTWAIT;
903 /* for rto and rex to eternity to not expire on idle recv:
904 * We are using a syslog server.
905 */
906 s->res.rto = TICK_ETERNITY;
907 s->res.rex = TICK_ETERNITY;
908 ds->appctx = appctx;
909 task_wakeup(s->task, TASK_WOKEN_INIT);
910 return appctx;
911
912 /* Error unrolling */
913 out_free_strm:
914 LIST_DEL(&s->list);
915 pool_free(pool_head_stream, s);
916 out_free_sess:
917 session_free(sess);
918 out_free_appctx:
919 appctx_free(appctx);
920 out_close:
921 return NULL;
922}
923
924/* Task processing expiration of unresponded queries, this one is supposed
925 * to be stuck on the same thread than the appctx handler
926 */
927static struct task *dns_process_query_exp(struct task *t, void *context, unsigned short state)
928{
929 struct dns_session *ds = (struct dns_session *)context;
930 struct dns_query *query, *queryb;
931
932 t->expire = TICK_ETERNITY;
933
934 list_for_each_entry_safe(query, queryb, &ds->queries, list) {
935 if (tick_is_expired(query->expire, now_ms)) {
936 eb32_delete(&query->qid);
937 LIST_DEL(&query->list);
938 pool_free(dns_query_pool, query);
939 ds->onfly_queries--;
940 }
941 else {
942 t->expire = query->expire;
943 break;
944 }
945 }
946
947 return t;
948}
949
950/* Task processing expiration of idle sessions */
951static struct task *dns_process_idle_exp(struct task *t, void *context, unsigned short state)
952{
953 struct dns_stream_server *dss = (struct dns_stream_server *)context;
954 struct dns_session *ds, *dsb;
955 int target = 0;
956 int cur_active_conns;
957
958 HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
959
960
961 cur_active_conns = dss->cur_conns - dss->idle_conns;
962 if (cur_active_conns > dss->max_active_conns)
963 dss->max_active_conns = cur_active_conns;
964
965 target = (dss->max_active_conns - cur_active_conns) / 2;
966 list_for_each_entry_safe(ds, dsb, &dss->idle_sess, list) {
967 if (!target)
968 break;
969
970 /* remove conn to pending list to ensure it won't be reused */
971 LIST_DEL_INIT(&ds->list);
972
973 /* force session shutdown */
974 ds->shutdown = 1;
975
976 /* to be sure that the appctx wont miss shutdown */
977 __ha_barrier_store();
978
979 /* wake appctx to perform the shutdown */
980 appctx_wakeup(ds->appctx);
981 }
982
983 /* reset max to current active conns */
984 dss->max_active_conns = cur_active_conns;
985
986 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
987
988 t->expire = tick_add(now_ms, 5000);
989
990 return t;
991}
992
993struct dns_session *dns_session_new(struct dns_stream_server *dss)
994{
995 struct dns_session *ds;
996
997 if (dss->maxconn && (dss->maxconn <= dss->cur_conns))
998 return NULL;
999
1000 ds = pool_alloc(dns_session_pool);
1001 if (!ds)
1002 return NULL;
1003
1004 ds->ofs = ~0;
1005 ds->dss = dss;
1006 LIST_INIT(&ds->list);
1007 LIST_INIT(&ds->queries);
1008 LIST_INIT(&ds->waiter);
1009 ds->rx_msg.offset = ds->rx_msg.len = 0;
1010 ds->rx_msg.area = NULL;
1011 ds->tx_ring_area = NULL;
1012 ds->task_exp = NULL;
1013 ds->appctx = NULL;
1014 ds->shutdown = 0;
1015 ds->nb_queries = 0;
1016 ds->query_ids = EB_ROOT_UNIQUE;
1017 ds->rx_msg.area = pool_alloc(dns_msg_buf);
1018 if (!ds->rx_msg.area)
1019 goto error;
1020
1021 ds->tx_ring_area = pool_alloc(dns_msg_buf);
1022 if (!ds->tx_ring_area)
1023 goto error;
1024
1025 ring_init(&ds->ring, ds->tx_ring_area, DNS_TCP_MSG_RING_MAX_SIZE);
1026 ring_attach(&ds->ring);
1027
1028 if ((ds->task_exp = task_new(tid_bit)) == NULL)
1029 goto error;
1030
1031 ds->task_exp->process = dns_process_query_exp;
1032 ds->task_exp->context = ds;
1033
1034 ds->appctx = dns_session_create(ds);
1035 if (!ds->appctx)
1036 goto error;
1037
1038 dss->cur_conns++;
1039
1040 return ds;
1041
1042error:
1043 if (ds->task_exp)
1044 task_destroy(ds->task_exp);
1045 if (ds->rx_msg.area)
1046 pool_free(dns_msg_buf, ds->rx_msg.area);
1047 if (ds->tx_ring_area)
1048 pool_free(dns_msg_buf, ds->tx_ring_area);
1049
1050 pool_free(dns_session_pool, ds);
1051
1052 return NULL;
1053}
1054
1055/*
1056 * Task used to consume pending messages from nameserver ring
1057 * and forward them to dns_session ring.
1058 * Note: If no slot found a new dns_session is allocated
1059 */
1060static struct task *dns_process_req(struct task *t, void *context, unsigned short state)
1061{
1062 struct dns_nameserver *ns = (struct dns_nameserver *)context;
1063 struct dns_stream_server *dss = ns->stream;
1064 struct ring *ring = dss->ring_req;
1065 struct buffer *buf = &ring->buf;
1066 uint64_t msg_len;
1067 size_t len, cnt, ofs;
1068 struct dns_session *ds, *ads;
1069 HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
1070
1071 ofs = dss->ofs_req;
1072
1073 HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);
1074
1075 /* explanation for the initialization below: it would be better to do
1076 * this in the parsing function but this would occasionally result in
1077 * dropped events because we'd take a reference on the oldest message
1078 * and keep it while being scheduled. Thus instead let's take it the
1079 * first time we enter here so that we have a chance to pass many
1080 * existing messages before grabbing a reference to a location. This
1081 * value cannot be produced after initialization.
1082 */
1083 if (unlikely(ofs == ~0)) {
1084 ofs = 0;
1085 HA_ATOMIC_ADD(b_peek(buf, ofs), 1);
1086 ofs += ring->ofs;
1087 }
1088
1089 /* we were already there, adjust the offset to be relative to
1090 * the buffer's head and remove us from the counter.
1091 */
1092 ofs -= ring->ofs;
1093 BUG_ON(ofs >= buf->size);
1094 HA_ATOMIC_SUB(b_peek(buf, ofs), 1);
1095
1096 while (ofs + 1 < b_data(buf)) {
1097 struct ist myist;
1098
1099 cnt = 1;
1100 len = b_peek_varint(buf, ofs + cnt, &msg_len);
1101 if (!len)
1102 break;
1103 cnt += len;
1104 BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf));
1105 if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) {
1106 /* too large a message to ever fit, let's skip it */
1107 ofs += cnt + msg_len;
1108 continue;
1109 }
1110
1111 len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt);
1112
1113 myist.ptr = dns_msg_trash;
1114 myist.len = len;
1115
1116 ads = NULL;
1117 /* try to push request into activ sess with free slot */
1118 if (!LIST_ISEMPTY(&dss->free_sess)) {
1119 ds = LIST_NEXT(&dss->free_sess, struct dns_session *, list);
1120
1121 if (ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1) > 0) {
1122 ds->nb_queries++;
1123 if (ds->nb_queries >= DNS_STREAM_MAX_PIPELINED_REQ)
1124 LIST_DEL_INIT(&ds->list);
1125 ads = ds;
1126 }
1127 else {
1128 /* it means we were unable to put a request in this slot,
1129 * it may be close to be full so we put it at the end
1130 * of free conn list */
1131 LIST_DEL_INIT(&ds->list);
1132 LIST_ADDQ(&dss->free_sess, &ds->list);
1133 }
1134 }
1135
1136 if (!ads) {
1137 /* try to push request into idle, this one should have enought free space */
1138 if (!LIST_ISEMPTY(&dss->idle_sess)) {
1139 ds = LIST_NEXT(&dss->idle_sess, struct dns_session *, list);
1140
1141 /* ring is empty so this ring_write should never fail */
1142 ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
1143 ds->nb_queries++;
1144 LIST_DEL_INIT(&ds->list);
1145
1146 ds->dss->idle_conns--;
1147
1148 /* we may have to update the max_active_conns */
1149 if (ds->dss->max_active_conns < ds->dss->cur_conns - ds->dss->idle_conns)
1150 ds->dss->max_active_conns = ds->dss->cur_conns - ds->dss->idle_conns;
1151
1152 /* since we may unable to find a free list to handle
1153 * this request, this request may be large and fill
1154 * the ring buffer so we prefer to put at the end of free
1155 * list. */
1156 LIST_ADDQ(&dss->free_sess, &ds->list);
1157 ads = ds;
1158 }
1159 }
1160
1161 /* we didn't find a session avalaible with large enough room */
1162 if (!ads) {
1163 /* allocate a new session */
1164 ads = dns_session_new(dss);
1165 if (ads) {
1166 /* ring is empty so this ring_write should never fail */
1167 ring_write(&ads->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
1168 ads->nb_queries++;
1169 LIST_ADD(&dss->free_sess, &ads->list);
1170 }
1171 else
1172 ns->counters->snd_error++;
1173 }
1174
1175 if (ads)
1176 ns->counters->sent++;
1177
1178 ofs += cnt + len;
1179 }
1180
1181 HA_ATOMIC_ADD(b_peek(buf, ofs), 1);
1182 ofs += ring->ofs;
1183 dss->ofs_req = ofs;
1184 HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);
1185
1186
1187 HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
1188 return t;
1189}
1190
1191/*
1192 * Task used to consume response
1193 * Note: upper layer callback is called
1194 */
1195static struct task *dns_process_rsp(struct task *t, void *context, unsigned short state)
1196{
1197 struct dns_nameserver *ns = (struct dns_nameserver *)context;
1198
1199 ns->process_responses(ns);
1200
1201 return t;
1202}
1203
1204/* Function used to initialize an TCP nameserver */
1205int dns_stream_init(struct dns_nameserver *ns, struct server *srv)
1206{
1207 struct dns_stream_server *dss = NULL;
1208
1209 dss = calloc(1, sizeof(*dss));
1210 if (!dss) {
1211 ha_alert("memory allocation error initializing dns tcp server '%s'.\n", srv->id);
1212 goto out;
1213 }
1214
1215 dss->srv = srv;
1216 dss->maxconn = srv->maxconn;
1217
1218 dss->ofs_req = ~0; /* init ring offset */
1219 dss->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE);
1220 if (!dss->ring_req) {
1221 ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
1222 goto out;
1223 }
1224 /* Create the task associated to the resolver target handling conns */
1225 if ((dss->task_req = task_new(MAX_THREADS_MASK)) == NULL) {
1226 ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
1227 goto out;
1228 }
1229
1230 /* Update task's parameters */
1231 dss->task_req->process = dns_process_req;
1232 dss->task_req->context = ns;
1233
1234 /* attach the task as reader */
1235 if (!ring_attach(dss->ring_req)) {
1236 /* mark server attached to the ring */
1237 ha_alert("server '%s': too many watchers for ring. this should never happen.\n", srv->id);
1238 goto out;
1239 }
1240
1241 /* Create the task associated to the resolver target handling conns */
1242 if ((dss->task_rsp = task_new(MAX_THREADS_MASK)) == NULL) {
1243 ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
1244 goto out;
1245 }
1246
1247 /* Update task's parameters */
1248 dss->task_rsp->process = dns_process_rsp;
1249 dss->task_rsp->context = ns;
1250
1251 /* Create the task associated to the resolver target handling conns */
1252 if ((dss->task_idle = task_new(MAX_THREADS_MASK)) == NULL) {
1253 ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
1254 goto out;
1255 }
1256
1257 /* Update task's parameters */
1258 dss->task_idle->process = dns_process_idle_exp;
1259 dss->task_idle->context = dss;
1260 dss->task_idle->expire = tick_add(now_ms, 5000);
1261
1262 /* let start the task to free idle conns immediatly */
1263 task_queue(dss->task_idle);
1264
1265 LIST_INIT(&dss->free_sess);
1266 LIST_INIT(&dss->idle_sess);
1267 LIST_INIT(&dss->wait_sess);
1268 HA_SPIN_INIT(&dss->lock);
1269 ns->stream = dss;
1270 return 0;
1271out:
1272 if (dss && dss->task_rsp)
1273 task_destroy(dss->task_rsp);
1274 if (dss && dss->task_req)
1275 task_destroy(dss->task_req);
1276 if (dss && dss->ring_req)
1277 ring_free(dss->ring_req);
1278
1279 free(dss);
Emeric Brunc9437992021-02-12 19:42:55 +01001280 return -1;
Christopher Faulet67957bd2017-09-27 11:00:59 +02001281}
1282
Emeric Brunc9437992021-02-12 19:42:55 +01001283int init_dns_buffers()
Baptiste Assmann325137d2015-04-13 23:40:55 +02001284{
Emeric Brunc9437992021-02-12 19:42:55 +01001285 dns_msg_trash = malloc(DNS_TCP_MSG_MAX_SIZE);
1286 if (!dns_msg_trash)
1287 return 0;
Baptiste Assmann325137d2015-04-13 23:40:55 +02001288
Emeric Brunc9437992021-02-12 19:42:55 +01001289 return 1;
1290}
Baptiste Assmannc1ce5f32016-05-14 11:26:22 +02001291
Emeric Brunc9437992021-02-12 19:42:55 +01001292void deinit_dns_buffers()
1293{
1294 free(dns_msg_trash);
1295 dns_msg_trash = NULL;
1296}
Emeric Brund26a6232021-01-04 13:32:20 +01001297
/* Register the allocation and release functions of the DNS trash buffer.
 * NOTE(review): presumably these are then called once per thread at startup
 * and shutdown respectively - confirm against the macros' definitions.
 */
REGISTER_PER_THREAD_ALLOC(init_dns_buffers);
REGISTER_PER_THREAD_FREE(deinit_dns_buffers);