blob: 1ef5e87005079ce1f12ed220adb85c14a921ba93 [file] [log] [blame]
/*
* Name server resolution
*
* Copyright 2020 HAProxy Technologies
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <haproxy/action.h>
#include <haproxy/api.h>
#include <haproxy/cfgparse.h>
#include <haproxy/channel.h>
#include <haproxy/check.h>
#include <haproxy/cli.h>
#include <haproxy/dgram.h>
#include <haproxy/dns.h>
#include <haproxy/errors.h>
#include <haproxy/fd.h>
#include <haproxy/log.h>
#include <haproxy/ring.h>
#include <haproxy/stream.h>
#include <haproxy/stream_interface.h>
#include <haproxy/tools.h>
/* Per-thread scratch area into which one message is copied out of a request
* ring before being sent or forwarded. It is allocated by init_dns_buffers()
* and released by deinit_dns_buffers().
*/
static THREAD_LOCAL char *dns_msg_trash;
/* Pools used for the TCP sessions, for the query id mappings kept while a
* query is awaiting its response, and for the rx message / tx ring areas
* attached to each session.
*/
DECLARE_STATIC_POOL(dns_session_pool, "dns_session", sizeof(struct dns_session));
DECLARE_STATIC_POOL(dns_query_pool, "dns_query", sizeof(struct dns_query));
DECLARE_STATIC_POOL(dns_msg_buf, "dns_msg_buf", DNS_TCP_MSG_RING_MAX_SIZE);
/* Opens a UDP socket connected to the nameserver's IP/port, if required.
 * <ns>->dgram must be defined. When a socket is already attached nothing is
 * done. On success the new fd is made non-blocking, stored into the datagram
 * connection, inserted into the fd list and subscribed for receipt.
 * Returns 0 on success, -1 otherwise (a warning is logged and no fd is left
 * open in this case).
 */
static int dns_connect_nameserver(struct dns_nameserver *ns)
{
	struct dgram_conn *dgram = &ns->dgram->conn;
	int fd;

	/* Already connected */
	if (dgram->t.sock.fd != -1)
		return 0;

	/* Create an UDP socket and connect it on the nameserver's IP/Port */
	if ((fd = socket(dgram->addr.to.ss_family, SOCK_DGRAM, IPPROTO_UDP)) == -1) {
		send_log(NULL, LOG_WARNING,
			 "DNS : section '%s': can't create socket for nameserver '%s'.\n",
			 ns->counters->pid, ns->id);
		return -1;
	}

	if (connect(fd, (struct sockaddr*)&dgram->addr.to, get_addr_len(&dgram->addr.to)) == -1) {
		/* report the same identifier for the section as the message
		 * above does, so that both warnings designate the same section
		 */
		send_log(NULL, LOG_WARNING,
			 "DNS : section '%s': can't connect socket for nameserver '%s'.\n",
			 ns->counters->pid, ns->id);
		close(fd);
		return -1;
	}

	/* Make the socket non blocking. A socket left in blocking mode must
	 * not be handed over to the poller, so give up on failure.
	 */
	if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) {
		send_log(NULL, LOG_WARNING,
			 "DNS : section '%s': can't set socket non-blocking for nameserver '%s'.\n",
			 ns->counters->pid, ns->id);
		close(fd);
		return -1;
	}

	/* Add the fd in the fd list and update its parameters */
	dgram->t.sock.fd = fd;
	fd_insert(fd, dgram, dgram_fd_handler, MAX_THREADS_MASK);
	fd_want_recv(fd);
	return 0;
}
/* Sends a message to a name server
* It returns message length on success
* or -1 in error case
* 0 is returned in case of output ring buffer is full
*/
int dns_send_nameserver(struct dns_nameserver *ns, void *buf, size_t len)
{
int ret = -1;
if (ns->dgram) {
struct dgram_conn *dgram = &ns->dgram->conn;
int fd;
HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
fd = dgram->t.sock.fd;
if (fd == -1) {
if (dns_connect_nameserver(ns) == -1) {
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return -1;
}
fd = dgram->t.sock.fd;
}
ret = send(fd, buf, len, 0);
if (ret < 0) {
if (errno == EAGAIN) {
struct ist myist;
myist = ist2(buf, len);
ret = ring_write(ns->dgram->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
if (!ret) {
ns->counters->snd_error++;
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return -1;
}
fd_cant_send(fd);
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return ret;
}
ns->counters->snd_error++;
fd_delete(fd);
dgram->t.sock.fd = -1;
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return -1;
}
ns->counters->sent++;
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
}
else if (ns->stream) {
struct ist myist;
myist = ist2(buf, len);
ret = ring_write(ns->stream->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
if (!ret) {
ns->counters->snd_error++;
return -1;
}
task_wakeup(ns->stream->task_req, TASK_WOKEN_MSG);
return ret;
}
return ret;
}
/* forward declaration: the definition comes later in this file but
* dns_recv_nameserver() needs to call it
*/
void dns_session_free(struct dns_session *);
/* Receives a dns message
* Returns message length
* 0 is returned if no more message available
* -1 in error case
*/
ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size)
{
ssize_t ret = -1;
if (ns->dgram) {
struct dgram_conn *dgram = &ns->dgram->conn;
int fd;
HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
fd = dgram->t.sock.fd;
if (fd == -1) {
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return -1;
}
if ((ret = recv(fd, data, size, 0)) < 0) {
if (errno == EAGAIN) {
fd_cant_recv(fd);
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return 0;
}
fd_delete(fd);
dgram->t.sock.fd = -1;
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return -1;
}
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
}
else if (ns->stream) {
struct dns_stream_server *dss = ns->stream;
struct dns_session *ds;
HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
if (!LIST_ISEMPTY(&dss->wait_sess)) {
ds = LIST_NEXT(&dss->wait_sess, struct dns_session *, waiter);
ret = ds->rx_msg.len < size ? ds->rx_msg.len : size;
memcpy(data, ds->rx_msg.area, ret);
ds->rx_msg.len = 0;
LIST_DEL_INIT(&ds->waiter);
if (ds->appctx) {
/* awake appctx because it may have other
* message to receive
*/
appctx_wakeup(ds->appctx);
/* dns_session could already be into free_sess list
* so we firstly remove it */
LIST_DEL_INIT(&ds->list);
/* decrease nb_queries to free a slot for a new query on that sess */
ds->nb_queries--;
if (ds->nb_queries) {
/* it remains pipelined unanswered request
* into this session but we just decrease
* the counter so the session
* can not be full of pipelined requests
* so we can add if to free_sess list
* to receive a new request
*/
LIST_INSERT(&ds->dss->free_sess, &ds->list);
}
else {
/* there is no more pipelined requests
* into this session, so we move it
* to idle_sess list */
LIST_INSERT(&ds->dss->idle_sess, &ds->list);
/* update the counter of idle sessions */
ds->dss->idle_conns++;
/* Note: this is useless there to update
* the max_active_conns since we increase
* the idle count */
}
}
else {
/* there is no more appctx for this session
* it means it is ready to die
*/
dns_session_free(ds);
}
}
HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
}
return ret;
}
/* Called when a dns network socket is ready to receive data. The nameserver
 * owning <dgram> is looked up under the connection's lock and its
 * process_responses() callback is called once the lock is released. Nothing
 * is done when no fd is attached or when it's not ready for reading. When
 * the connection has no owner, the HUP/ERR poll flags are cleared and
 * receipt is stopped on the fd.
 */
static void dns_resolve_recv(struct dgram_conn *dgram)
{
	struct dns_nameserver *owner = NULL;
	int sock;

	HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);

	sock = dgram->t.sock.fd;
	if (sock != -1 && fd_recv_ready(sock)) {
		owner = dgram->owner;
		if (!owner) {
			/* no need to go further if we can't retrieve the nameserver */
			_HA_ATOMIC_AND(&fdtab[sock].state, ~(FD_POLL_HUP|FD_POLL_ERR));
			fd_stop_recv(sock);
		}
	}

	HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);

	/* the callback is run out of the lock */
	if (owner)
		owner->process_responses(owner);
}
/* Called when a dns network socket is ready to send data.
* Under <dgram>->lock and the request ring's read lock, it walks the messages
* queued in the nameserver's request ring starting from the saved reader
* offset (ofs_req) and sends each of them on the UDP socket. It stops on
* EAGAIN (the fd is marked as unable to send) or on any other send error (the
* error is counted, the fd is deleted and detached). When the whole ring
* content was sent, sending is stopped on the fd. In every case the reader's
* reference is put back on the message it stopped at and the offset is saved.
*/
static void dns_resolve_send(struct dgram_conn *dgram)
{
int fd;
struct dns_nameserver *ns;
struct ring *ring;
struct buffer *buf;
uint64_t msg_len;
size_t len, cnt, ofs;
HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
fd = dgram->t.sock.fd;
/* check if ready for sending */
if ((fd == -1) || !fd_send_ready(fd)) {
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return;
}
/* no need to go further if we can't retrieve the nameserver */
if ((ns = dgram->owner) == NULL) {
_HA_ATOMIC_AND(&fdtab[fd].state, ~(FD_POLL_HUP|FD_POLL_ERR));
fd_stop_send(fd);
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
return;
}
ring = ns->dgram->ring_req;
buf = &ring->buf;
HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);
ofs = ns->dgram->ofs_req;
/* explanation for the initialization below: it would be better to do
* this in the parsing function but this would occasionally result in
* dropped events because we'd take a reference on the oldest message
* and keep it while being scheduled. Thus instead let's take it the
* first time we enter here so that we have a chance to pass many
* existing messages before grabbing a reference to a location. This
* value cannot be produced after initialization.
*/
if (unlikely(ofs == ~0)) {
ofs = 0;
HA_ATOMIC_INC(b_peek(buf, ofs));
ofs += ring->ofs;
}
/* we were already there, adjust the offset to be relative to
* the buffer's head and remove us from the counter.
*/
ofs -= ring->ofs;
BUG_ON(ofs >= buf->size);
HA_ATOMIC_DEC(b_peek(buf, ofs));
/* in this loop, <ofs> designates the counter byte preceding a message,
* <cnt> covers that byte plus the varint-encoded length which follows it
*/
while (ofs + 1 < b_data(buf)) {
int ret;
cnt = 1;
len = b_peek_varint(buf, ofs + cnt, &msg_len);
if (!len)
break;
cnt += len;
BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf));
if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) {
/* too large a message to ever fit, let's skip it */
ofs += cnt + msg_len;
continue;
}
/* copy the message into the per-thread trash before sending it */
len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt);
ret = send(fd, dns_msg_trash, len, 0);
if (ret < 0) {
if (errno == EAGAIN) {
/* the message stays in the ring and will be retried */
fd_cant_send(fd);
goto out;
}
/* any other error: count it and get rid of the socket */
ns->counters->snd_error++;
fd_delete(fd);
fd = dgram->t.sock.fd = -1;
goto out;
}
ns->counters->sent++;
ofs += cnt + len;
}
/* we don't want/need to be woken up any more for sending
* because all ring content is sent */
fd_stop_send(fd);
out:
/* take our reference back at the place we stopped and save the offset */
HA_ATOMIC_INC(b_peek(buf, ofs));
ofs += ring->ofs;
ns->dgram->ofs_req = ofs;
HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);
HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
}
/* proto_udp callback functions for a DNS resolution: dns_resolve_recv() is
* used as the receive callback and dns_resolve_send() as the send callback.
* dns_dgram_init() attaches this structure to the datagram connection.
*/
struct dgram_data_cb dns_dgram_cb = {
.recv = dns_resolve_recv,
.send = dns_resolve_send,
};
/* Allocates and initializes the datagram (UDP) part of nameserver <ns>, using
 * <sk> as the destination address. No socket is opened here: the fd is left
 * to -1. A request ring is allocated and a reader is attached to it.
 * Returns 0 on success with <ns>->dgram set, or -1 on failure, in which case
 * everything allocated here is released and <ns>->dgram is left untouched.
 */
int dns_dgram_init(struct dns_nameserver *ns, struct sockaddr_storage *sk)
{
	struct dns_dgram_server *dgram;

	if ((dgram = calloc(1, sizeof(*dgram))) == NULL)
		return -1;

	/* Leave dgram partially initialized, no FD attached for
	 * now. */
	dgram->conn.owner     = ns;
	dgram->conn.data      = &dns_dgram_cb;
	dgram->conn.t.sock.fd = -1;
	dgram->conn.addr.to   = *sk;
	HA_SPIN_INIT(&dgram->conn.lock);

	dgram->ofs_req = ~0; /* init ring offset */
	dgram->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE);
	if (!dgram->ring_req) {
		ha_alert("memory allocation error initializing the ring for nameserver.\n");
		goto out;
	}

	/* attach the task as reader */
	if (!ring_attach(dgram->ring_req)) {
		/* mark server attached to the ring */
		ha_alert("nameserver sets too many watchers > 255 on ring. This is a bug and should not happen.\n");
		goto out;
	}

	/* only publish the structure once it is fully set up, so that the
	 * nameserver never references an area released by the error path
	 */
	ns->dgram = dgram;
	return 0;

 out:
	if (dgram->ring_req)
		ring_free(dgram->ring_req);
	free(dgram);
	return -1;
}
/*
 * IO Handler to handle message push to dns tcp server.
 *
 * Write side: the requests queued in the session's ring are copied into the
 * stream interface's input channel. Each of them is prefixed by its length on
 * 2 bytes in network order and its query id is replaced by a session-local
 * one; the mapping to the original id is kept in <ds>->queries/query_ids
 * with an expiration date. A message which doesn't fit in the channel is
 * forwarded by chunks, <ds>->tx_msg_offset tracking the progress.
 *
 * Read side: length-prefixed responses are read from the output channel
 * into <ds>->rx_msg. Once a response is complete and its query id is known,
 * the original id is restored, the session is queued into the stream
 * server's wait_sess list and the response task is woken up. Only one
 * complete response is held at a time.
 *
 * The stream interface is shut in both directions when the process is
 * stopping, when a shutdown of the session was requested, on channel error,
 * or when the server closed and no more data is pending.
 */
static void dns_session_io_handler(struct appctx *appctx)
{
	struct stream_interface *si = appctx->owner;
	struct dns_session *ds = appctx->ctx.sft.ptr;
	struct ring *ring = &ds->ring;
	struct buffer *buf = &ring->buf;
	uint64_t msg_len;
	int available_room;
	size_t len, cnt, ofs;
	int ret = 0;

	/* if stopping was requested, close immediately */
	if (unlikely(stopping))
		goto close;

	/* we want to be sure to not miss that we have been awaked for a shutdown */
	__ha_barrier_load();

	/* that means the connection was requested to shutdown
	 * for instance idle expire */
	if (ds->shutdown)
		goto close;

	/* an error was detected */
	if (unlikely(si_ic(si)->flags & (CF_WRITE_ERROR|CF_SHUTW)))
		goto close;

	/* con closed by server side, we will skip data write and drain data from channel */
	if ((si_oc(si)->flags & CF_SHUTW)) {
		goto read;
	}

	/* if the connection is not established, inform the stream that we want
	 * to be notified whenever the connection completes.
	 */
	if (si_opposite(si)->state < SI_ST_EST) {
		si_cant_get(si);
		si_rx_conn_blk(si);
		si_rx_endp_more(si);
		return;
	}

	ofs = ds->ofs;

	HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock);
	LIST_DEL_INIT(&appctx->wait_entry);
	HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock);

	HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);

	/* explanation for the initialization below: it would be better to do
	 * this in the parsing function but this would occasionally result in
	 * dropped events because we'd take a reference on the oldest message
	 * and keep it while being scheduled. Thus instead let's take it the
	 * first time we enter here so that we have a chance to pass many
	 * existing messages before grabbing a reference to a location. This
	 * value cannot be produced after initialization.
	 */
	if (unlikely(ofs == ~0)) {
		ofs = 0;
		HA_ATOMIC_INC(b_peek(buf, ofs));
		ofs += ring->ofs;
	}

	/* in this loop, ofs always points to the counter byte that precedes
	 * the message so that we can take our reference there if we have to
	 * stop before the end (ret=0).
	 */
	if (si_opposite(si)->state == SI_ST_EST) {
		/* we were already there, adjust the offset to be relative to
		 * the buffer's head and remove us from the counter.
		 */
		ofs -= ring->ofs;
		BUG_ON(ofs >= buf->size);
		HA_ATOMIC_DEC(b_peek(buf, ofs));

		ret = 1;
		while (ofs + 1 < b_data(buf)) {
			struct dns_query *query;
			uint16_t original_qid;
			uint16_t new_qid;

			cnt = 1;
			len = b_peek_varint(buf, ofs + cnt, &msg_len);
			if (!len)
				break;
			cnt += len;
			BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf));

			/* retrieve available room on output channel */
			available_room = channel_recv_max(si_ic(si));

			/* tx_msg_offset null means we are at the start of a new message */
			if (!ds->tx_msg_offset) {
				uint16_t slen;

				/* check if there is enough room to put message len and query id */
				if (available_room < sizeof(slen) + sizeof(new_qid)) {
					si_rx_room_blk(si);
					ret = 0;
					break;
				}

				/* put msg len into then channel */
				slen = (uint16_t)msg_len;
				slen = htons(slen);
				ci_putblk(si_ic(si), (char *)&slen, sizeof(slen));
				available_room -= sizeof(slen);

				/* backup original query id */
				len = b_getblk(buf, (char *)&original_qid, sizeof(original_qid), ofs + cnt);
				if (!len) {
					/* should never happen since messages are atomically
					 * written into ring
					 */
					ret = 0;
					break;
				}

				/* generates new query id */
				new_qid = ++ds->query_counter;
				new_qid = htons(new_qid);

				/* put new query id into the channel */
				ci_putblk(si_ic(si), (char *)&new_qid, sizeof(new_qid));
				available_room -= sizeof(new_qid);

				/* keep query id mapping */
				query = pool_alloc(dns_query_pool);
				if (query) {
					query->qid.key = new_qid;
					query->original_qid = original_qid;
					query->expire = tick_add(now_ms, 5000);
					LIST_INIT(&query->list);
					if (LIST_ISEMPTY(&ds->queries)) {
						/* enable task to handle expire */
						ds->task_exp->expire = query->expire;
						/* ensure this will be executed by the same
						 * thread than ds_session_release
						 * to ensure session_release is free
						 * to destroy the task */
						task_queue(ds->task_exp);
					}
					LIST_APPEND(&ds->queries, &query->list);
					eb32_insert(&ds->query_ids, &query->qid);
					ds->onfly_queries++;
				}

				/* update the tx_offset to handle output in 16k streams */
				ds->tx_msg_offset = sizeof(original_qid);
			}

			/* check if it remains available room on output chan */
			if (unlikely(!available_room)) {
				si_rx_room_blk(si);
				ret = 0;
				break;
			}

			chunk_reset(&trash);
			if ((msg_len - ds->tx_msg_offset) > available_room) {
				/* remaining msg data is too large to be written in output channel at one time */
				len = b_getblk(buf, trash.area, available_room, ofs + cnt + ds->tx_msg_offset);

				/* update offset to complete mesg forwarding later */
				ds->tx_msg_offset += len;
			}
			else {
				/* remaining msg data can be written in output channel at one time */
				len = b_getblk(buf, trash.area, msg_len - ds->tx_msg_offset, ofs + cnt + ds->tx_msg_offset);

				/* reset tx_msg_offset to mark forward fully processed */
				ds->tx_msg_offset = 0;
			}
			trash.data += len;

			if (ci_putchk(si_ic(si), &trash) == -1) {
				/* should never happen since we
				 * check available_room is large
				 * enough here.
				 */
				si_rx_room_blk(si);
				ret = 0;
				break;
			}

			if (ds->tx_msg_offset) {
				/* msg was not fully processed, we must be awake to drain pending data */
				si_rx_room_blk(si);
				ret = 0;
				break;
			}

			/* switch to next message */
			ofs += cnt + msg_len;
		}

		HA_ATOMIC_INC(b_peek(buf, ofs));
		ofs += ring->ofs;
		ds->ofs = ofs;
	}
	HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);

	if (ret) {
		/* let's be woken up once new request to write arrived */
		HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock);
		LIST_APPEND(&ring->waiters, &appctx->wait_entry);
		HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock);
		si_rx_endp_done(si);
	}

 read:
	/* if session is not a waiter it means there is no committed
	 * message into rx_buf and we are free to use it
	 * Note: we need a load barrier here to not miss the
	 * delete from the list
	 */

	/* lock the dns_stream_server containing lists heads */
	HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock);

	if (!LIST_INLIST(&ds->waiter)) {
		while (1) {
			uint16_t query_id;
			struct eb32_node *eb;
			struct dns_query *query;

			if (!ds->rx_msg.len) {
				/* the length prefix is read into a variable
				 * of exactly its size: reading 2 bytes into
				 * a wider integer and passing it to ntohs()
				 * only gives the right value on little
				 * endian hosts.
				 */
				uint16_t wire_len;

				/* next message len is not fully available into the channel */
				if (co_data(si_oc(si)) < 2)
					break;

				/* retrieve message len */
				co_getblk(si_oc(si), (char *)&wire_len, sizeof(wire_len), 0);

				/* mark as consumed */
				co_skip(si_oc(si), sizeof(wire_len));

				/* store message len */
				ds->rx_msg.len = ntohs(wire_len);
			}

			if (!co_data(si_oc(si))) {
				/* we need more data but nothing is available */
				break;
			}

			if (co_data(si_oc(si)) + ds->rx_msg.offset < ds->rx_msg.len) {
				/* message only partially available */

				/* read available data */
				co_getblk(si_oc(si), ds->rx_msg.area + ds->rx_msg.offset, co_data(si_oc(si)), 0);

				/* update message offset */
				ds->rx_msg.offset += co_data(si_oc(si));

				/* consume all pending data from the channel */
				co_skip(si_oc(si), co_data(si_oc(si)));

				/* we need to wait for more data */
				break;
			}

			/* enough data is available into the channel to read the message until the end */

			/* read from the channel until the end of the message */
			co_getblk(si_oc(si), ds->rx_msg.area + ds->rx_msg.offset, ds->rx_msg.len - ds->rx_msg.offset, 0);

			/* consume all data until the end of the message from the channel */
			co_skip(si_oc(si), ds->rx_msg.len - ds->rx_msg.offset);

			/* reset reader offset to 0 for next message read */
			ds->rx_msg.offset = 0;

			/* try remap query id to original */
			memcpy(&query_id, ds->rx_msg.area, sizeof(query_id));
			eb = eb32_lookup(&ds->query_ids, query_id);
			if (!eb) {
				/* query id not found means we have an unknown corresponding
				 * request, perhaps server's bug or the query reached
				 * timeout
				 */
				ds->rx_msg.len = 0;
				continue;
			}

			/* re-map the original query id set by the requester */
			query = eb32_entry(eb, struct dns_query, qid);
			memcpy(ds->rx_msg.area, &query->original_qid, sizeof(query->original_qid));

			/* remove query ids mapping from pending queries list/tree */
			eb32_delete(&query->qid);
			LIST_DELETE(&query->list);
			pool_free(dns_query_pool, query);
			ds->onfly_queries--;

			/* the dns_session is also added in queue of the
			 * wait_sess list where the task processing
			 * response will pop available responses
			 */
			LIST_APPEND(&ds->dss->wait_sess, &ds->waiter);

			/* awake the task processing the responses */
			task_wakeup(ds->dss->task_rsp, TASK_WOKEN_INIT);

			break;
		}

		if (!LIST_INLIST(&ds->waiter)) {
			/* there is no more pending data to read and the con was closed by the server side */
			if (!co_data(si_oc(si)) && (si_oc(si)->flags & CF_SHUTW)) {
				HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
				goto close;
			}
		}
	}

	HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
	return;

 close:
	si_shutw(si);
	si_shutr(si);
	si_ic(si)->flags |= CF_READ_NULL;
}
void dns_queries_flush(struct dns_session *ds)
{
struct dns_query *query, *queryb;
list_for_each_entry_safe(query, queryb, &ds->queries, list) {
eb32_delete(&query->qid);
LIST_DELETE(&query->list);
pool_free(dns_query_pool, query);
}
}
void dns_session_free(struct dns_session *ds)
{
if (ds->rx_msg.area)
pool_free(dns_msg_buf, ds->rx_msg.area);
if (ds->tx_ring_area)
pool_free(dns_msg_buf, ds->tx_ring_area);
if (ds->task_exp)
task_destroy(ds->task_exp);
dns_queries_flush(ds);
/* Ensure to remove this session from external lists
* Note: we are under the lock of dns_stream_server
* which own the heads of those lists.
*/
LIST_DEL_INIT(&ds->waiter);
LIST_DEL_INIT(&ds->list);
ds->dss->cur_conns--;
/* Note: this is useless to update
* max_active_conns here because
* we decrease the value
*/
pool_free(dns_session_pool, ds);
}
/* forward declaration: the definition comes later in this file but
* dns_session_release() needs it to create a replacement appctx
*/
static struct appctx *dns_session_create(struct dns_session *ds);
/*
 * Function to release a DNS tcp session. It is the applet's release callback
 * and runs under the lock of the stream server owning the session.
 *
 * The session is freed when the process is stopping, when it is idle (no
 * query assigned), or when every assigned query was already sent and is
 * awaiting its response. Otherwise requests remain in the session's ring:
 * the sent queries are forgotten since their responses will never come and a
 * new appctx is created to retry with the remaining ones. The session is
 * freed as well if that appctx can't be created.
 */
static void dns_session_release(struct appctx *appctx)
{
	struct dns_session *ds = appctx->ctx.sft.ptr;
	struct dns_stream_server *dss __maybe_unused;

	if (!ds)
		return;

	dss = ds->dss;

	HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
	LIST_DEL_INIT(&ds->list);

	if (stopping) {
		dns_session_free(ds);
		HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
		return;
	}

	if (!ds->nb_queries) {
		/* this is an idle session */
		/* Note: this is useless to update max_active_sess
		 * here because we decrease idle_conns but
		 * dns_session_free decrease curconns
		 */
		ds->dss->idle_conns--;
		dns_session_free(ds);
		HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
		return;
	}

	if (ds->onfly_queries == ds->nb_queries) {
		/* the session can be released because
		 * it means that all queries AND
		 * responses are in fly */
		dns_session_free(ds);
		HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
		return;
	}

	/* We do not call ring_appctx_detach here
	 * because we want to keep readers counters
	 * to retry a con with a different appctx */
	HA_RWLOCK_WRLOCK(DNS_LOCK, &ds->ring.lock);
	LIST_DEL_INIT(&appctx->wait_entry);
	HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ds->ring.lock);

	/* if there is no pending complete response
	 * message, ensure to reset
	 * message offsets if the session
	 * was closed with an incomplete pending response
	 */
	if (!LIST_INLIST(&ds->waiter))
		ds->rx_msg.len = ds->rx_msg.offset = 0;

	/* we flush pending sent queries because we never
	 * have responses
	 */
	ds->nb_queries -= ds->onfly_queries;
	dns_queries_flush(ds);

	/* the list of sent queries is now empty, so the counter following
	 * it must restart from zero. Otherwise the queries sent by the next
	 * appctx would be added to a stale value, and the comparison and
	 * subtraction above would be wrong on the next release.
	 */
	ds->onfly_queries = 0;

	/* reset offset to be sure to start from message start */
	ds->tx_msg_offset = 0;

	/* here the ofs and the attached counter
	 * are kept unchanged
	 */

	/* Create a new appctx, We hope we can
	 * create from the release callback! */
	ds->appctx = dns_session_create(ds);
	if (!ds->appctx) {
		dns_session_free(ds);
		HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
		return;
	}

	/* the session may accept new requests if it is not full */
	if (ds->nb_queries < DNS_STREAM_MAX_PIPELINED_REQ)
		LIST_INSERT(&ds->dss->free_sess, &ds->list);

	HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
}
/* DNS tcp session applet: dns_session_io_handler() is its I/O handler and
* dns_session_release() its release callback. dns_session_create()
* instantiates it for each session.
*/
static struct applet dns_session_applet = {
.obj_type = OBJ_TYPE_APPLET,
.name = "<STRMDNS>", /* used for logging */
.fct = dns_session_io_handler,
.release = dns_session_release,
};
/*
 * Function used to create an appctx for a DNS session. A session and a
 * stream are created on top of the appctx, the stream targets the server of
 * the stream nameserver owning <ds>, and its task is woken up.
 * Returns the new appctx, which is also stored into <ds>->appctx, or NULL on
 * failure, in which case everything created here is released.
 */
static struct appctx *dns_session_create(struct dns_session *ds)
{
	struct appctx *appctx;
	struct session *sess;
	struct stream *s;
	struct applet *applet = &dns_session_applet;

	appctx = appctx_new(applet, tid_bit);
	if (!appctx)
		goto out_close;

	appctx->ctx.sft.ptr = (void *)ds;

	sess = session_new(ds->dss->srv->proxy, NULL, &appctx->obj_type);
	if (!sess) {
		ha_alert("out of memory in dns_session_create().\n");
		goto out_free_appctx;
	}

	if ((s = stream_new(sess, &appctx->obj_type, &BUF_NULL)) == NULL) {
		ha_alert("Failed to initialize stream in dns_session_create().\n");
		goto out_free_sess;
	}

	/* the stream connects to the server of the stream nameserver */
	s->target = &ds->dss->srv->obj_type;
	if (!sockaddr_alloc(&s->target_addr, &ds->dss->srv->addr, sizeof(ds->dss->srv->addr)))
		goto out_free_strm;
	s->flags = SF_ASSIGNED|SF_ADDR_SET;
	s->si[1].flags |= SI_FL_NOLINGER;

	s->do_log = NULL;
	s->uniq_id = 0;

	s->res.flags |= CF_READ_DONTWAIT;
	/* set rto and rex to eternity so that the stream does not expire on
	 * idle recv
	 */
	s->res.rto = TICK_ETERNITY;
	s->res.rex = TICK_ETERNITY;

	ds->appctx = appctx;
	task_wakeup(s->task, TASK_WOKEN_INIT);
	return appctx;

	/* Error unrolling */
 out_free_strm:
	LIST_DELETE(&s->list);
	pool_free(pool_head_stream, s);
 out_free_sess:
	session_free(sess);
 out_free_appctx:
	appctx_free(appctx);
 out_close:
	return NULL;
}
/* Task processing expiration of unresponded queries, this one is supposed
* to be stuck on the same thread than the appctx handler
*/
static struct task *dns_process_query_exp(struct task *t, void *context, unsigned int state)
{
struct dns_session *ds = (struct dns_session *)context;
struct dns_query *query, *queryb;
t->expire = TICK_ETERNITY;
list_for_each_entry_safe(query, queryb, &ds->queries, list) {
if (tick_is_expired(query->expire, now_ms)) {
eb32_delete(&query->qid);
LIST_DELETE(&query->list);
pool_free(dns_query_pool, query);
ds->onfly_queries--;
}
else {
t->expire = query->expire;
break;
}
}
return t;
}
/* Task processing expiration of idle sessions. Every 5 seconds it compares
 * the current number of active connections (cur_conns - idle_conns) with the
 * highest one seen since the previous run, and requests the shutdown of a
 * number of idle sessions equal to half of the difference. The maximum is
 * then reset to the current number of active connections.
 */
static struct task *dns_process_idle_exp(struct task *t, void *context, unsigned int state)
{
	struct dns_stream_server *dss = (struct dns_stream_server *)context;
	struct dns_session *ds, *dsb;
	int target = 0;
	int cur_active_conns;

	HA_SPIN_LOCK(DNS_LOCK, &dss->lock);

	cur_active_conns = dss->cur_conns - dss->idle_conns;
	if (cur_active_conns > dss->max_active_conns)
		dss->max_active_conns = cur_active_conns;

	/* number of idle sessions to get rid of during this run */
	target = (dss->max_active_conns - cur_active_conns) / 2;
	list_for_each_entry_safe(ds, dsb, &dss->idle_sess, list) {
		if (!target)
			break;

		/* remove conn to pending list to ensure it won't be reused */
		LIST_DEL_INIT(&ds->list);

		/* force session shutdown */
		ds->shutdown = 1;

		/* to be sure that the appctx won't miss shutdown */
		__ha_barrier_store();

		/* wake appctx to perform the shutdown */
		appctx_wakeup(ds->appctx);

		/* one less to kill: without this, <target> would only act as
		 * a flag and all idle sessions would be shut down at once
		 */
		target--;
	}

	/* reset max to current active conns */
	dss->max_active_conns = cur_active_conns;

	HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);

	t->expire = tick_add(now_ms, 5000);
	return t;
}
struct dns_session *dns_session_new(struct dns_stream_server *dss)
{
struct dns_session *ds;
if (dss->maxconn && (dss->maxconn <= dss->cur_conns))
return NULL;
ds = pool_alloc(dns_session_pool);
if (!ds)
return NULL;
ds->ofs = ~0;
ds->dss = dss;
LIST_INIT(&ds->list);
LIST_INIT(&ds->queries);
LIST_INIT(&ds->waiter);
ds->rx_msg.offset = ds->rx_msg.len = 0;
ds->rx_msg.area = NULL;
ds->tx_ring_area = NULL;
ds->task_exp = NULL;
ds->appctx = NULL;
ds->shutdown = 0;
ds->nb_queries = 0;
ds->query_ids = EB_ROOT_UNIQUE;
ds->rx_msg.area = pool_alloc(dns_msg_buf);
if (!ds->rx_msg.area)
goto error;
ds->tx_ring_area = pool_alloc(dns_msg_buf);
if (!ds->tx_ring_area)
goto error;
ring_init(&ds->ring, ds->tx_ring_area, DNS_TCP_MSG_RING_MAX_SIZE);
/* never fail because it is the first watcher attached to the ring */
DISGUISE(ring_attach(&ds->ring));
if ((ds->task_exp = task_new(tid_bit)) == NULL)
goto error;
ds->task_exp->process = dns_process_query_exp;
ds->task_exp->context = ds;
ds->appctx = dns_session_create(ds);
if (!ds->appctx)
goto error;
dss->cur_conns++;
return ds;
error:
if (ds->task_exp)
task_destroy(ds->task_exp);
if (ds->rx_msg.area)
pool_free(dns_msg_buf, ds->rx_msg.area);
if (ds->tx_ring_area)
pool_free(dns_msg_buf, ds->tx_ring_area);
pool_free(dns_session_pool, ds);
return NULL;
}
/*
* Task used to consume pending messages from nameserver ring
* and forward them to dns_session ring.
* For each message, a session is looked up in this order: the first session
* of the free_sess list, then the first session of the idle_sess list, and
* finally a newly allocated session. The nameserver's "sent" counter is
* incremented when a session took the message, and its "snd_error" counter
* when no session could be allocated; the message is consumed in both cases.
* The whole processing is done under the stream server's lock and the
* request ring's read lock.
*/
static struct task *dns_process_req(struct task *t, void *context, unsigned int state)
{
struct dns_nameserver *ns = (struct dns_nameserver *)context;
struct dns_stream_server *dss = ns->stream;
struct ring *ring = dss->ring_req;
struct buffer *buf = &ring->buf;
uint64_t msg_len;
size_t len, cnt, ofs;
struct dns_session *ds, *ads;
HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
ofs = dss->ofs_req;
HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);
/* explanation for the initialization below: it would be better to do
* this in the parsing function but this would occasionally result in
* dropped events because we'd take a reference on the oldest message
* and keep it while being scheduled. Thus instead let's take it the
* first time we enter here so that we have a chance to pass many
* existing messages before grabbing a reference to a location. This
* value cannot be produced after initialization.
*/
if (unlikely(ofs == ~0)) {
ofs = 0;
HA_ATOMIC_INC(b_peek(buf, ofs));
ofs += ring->ofs;
}
/* we were already there, adjust the offset to be relative to
* the buffer's head and remove us from the counter.
*/
ofs -= ring->ofs;
BUG_ON(ofs >= buf->size);
HA_ATOMIC_DEC(b_peek(buf, ofs));
/* in this loop, <ofs> designates the counter byte preceding a message,
* <cnt> covers that byte plus the varint-encoded length which follows it
*/
while (ofs + 1 < b_data(buf)) {
struct ist myist;
cnt = 1;
len = b_peek_varint(buf, ofs + cnt, &msg_len);
if (!len)
break;
cnt += len;
BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf));
if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) {
/* too large a message to ever fit, let's skip it */
ofs += cnt + msg_len;
continue;
}
/* copy the message into the per-thread trash before forwarding it */
len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt);
myist = ist2(dns_msg_trash, len);
/* <ads> will designate the session which accepted the message */
ads = NULL;
/* try to push request into active sess with free slot */
if (!LIST_ISEMPTY(&dss->free_sess)) {
ds = LIST_NEXT(&dss->free_sess, struct dns_session *, list);
if (ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1) > 0) {
ds->nb_queries++;
/* a session which reached the pipelining limit leaves the free list */
if (ds->nb_queries >= DNS_STREAM_MAX_PIPELINED_REQ)
LIST_DEL_INIT(&ds->list);
ads = ds;
}
else {
/* it means we were unable to put a request in this slot,
* it may be close to be full so we put it at the end
* of free conn list */
LIST_DEL_INIT(&ds->list);
LIST_APPEND(&dss->free_sess, &ds->list);
}
}
if (!ads) {
/* try to push request into idle, this one should have enough free space */
if (!LIST_ISEMPTY(&dss->idle_sess)) {
ds = LIST_NEXT(&dss->idle_sess, struct dns_session *, list);
/* ring is empty so this ring_write should never fail */
ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
ds->nb_queries++;
LIST_DEL_INIT(&ds->list);
ds->dss->idle_conns--;
/* we may have to update the max_active_conns */
if (ds->dss->max_active_conns < ds->dss->cur_conns - ds->dss->idle_conns)
ds->dss->max_active_conns = ds->dss->cur_conns - ds->dss->idle_conns;
/* since we may unable to find a free list to handle
* this request, this request may be large and fill
* the ring buffer so we prefer to put at the end of free
* list. */
LIST_APPEND(&dss->free_sess, &ds->list);
ads = ds;
}
}
/* we didn't find a session available with large enough room */
if (!ads) {
/* allocate a new session */
ads = dns_session_new(dss);
if (ads) {
/* ring is empty so this ring_write should never fail */
ring_write(&ads->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
ads->nb_queries++;
LIST_INSERT(&dss->free_sess, &ads->list);
}
else
ns->counters->snd_error++;
}
if (ads)
ns->counters->sent++;
/* the message is consumed whether or not a session took it */
ofs += cnt + len;
}
/* take our reference back at the place we stopped and save the offset */
HA_ATOMIC_INC(b_peek(buf, ofs));
ofs += ring->ofs;
dss->ofs_req = ofs;
HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);
HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
return t;
}
/*
 * Task used to consume responses: it only calls the process_responses()
 * callback of the nameserver passed as <context>, i.e. the upper layer is
 * in charge of fetching the available responses.
 */
static struct task *dns_process_rsp(struct task *t, void *context, unsigned int state)
{
	struct dns_nameserver *nameserver = context;

	nameserver->process_responses(nameserver);
	return t;
}
/* Function used to initialize an TCP nameserver */
int dns_stream_init(struct dns_nameserver *ns, struct server *srv)
{
struct dns_stream_server *dss = NULL;
dss = calloc(1, sizeof(*dss));
if (!dss) {
ha_alert("memory allocation error initializing dns tcp server '%s'.\n", srv->id);
goto out;
}
dss->srv = srv;
dss->maxconn = srv->maxconn;
dss->ofs_req = ~0; /* init ring offset */
dss->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE);
if (!dss->ring_req) {
ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
goto out;
}
/* Create the task associated to the resolver target handling conns */
if ((dss->task_req = task_new(MAX_THREADS_MASK)) == NULL) {
ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
goto out;
}
/* Update task's parameters */
dss->task_req->process = dns_process_req;
dss->task_req->context = ns;
/* attach the task as reader */
if (!ring_attach(dss->ring_req)) {
/* mark server attached to the ring */
ha_alert("server '%s': too many watchers for ring. this should never happen.\n", srv->id);
goto out;
}
/* Create the task associated to the resolver target handling conns */
if ((dss->task_rsp = task_new(MAX_THREADS_MASK)) == NULL) {
ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
goto out;
}
/* Update task's parameters */
dss->task_rsp->process = dns_process_rsp;
dss->task_rsp->context = ns;
/* Create the task associated to the resolver target handling conns */
if ((dss->task_idle = task_new(MAX_THREADS_MASK)) == NULL) {
ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
goto out;
}
/* Update task's parameters */
dss->task_idle->process = dns_process_idle_exp;
dss->task_idle->context = dss;
dss->task_idle->expire = tick_add(now_ms, 5000);
/* let start the task to free idle conns immediately */
task_queue(dss->task_idle);
LIST_INIT(&dss->free_sess);
LIST_INIT(&dss->idle_sess);
LIST_INIT(&dss->wait_sess);
HA_SPIN_INIT(&dss->lock);
ns->stream = dss;
return 0;
out:
if (dss && dss->task_rsp)
task_destroy(dss->task_rsp);
if (dss && dss->task_req)
task_destroy(dss->task_req);
if (dss && dss->ring_req)
ring_free(dss->ring_req);
free(dss);
return -1;
}
/* Allocates the calling thread's dns_msg_trash scratch buffer, sized for the
 * largest DNS message over TCP. Returns 1 on success, 0 if the allocation
 * failed.
 */
int init_dns_buffers()
{
	dns_msg_trash = malloc(DNS_TCP_MSG_MAX_SIZE);
	return dns_msg_trash != NULL;
}
/* Releases the calling thread's dns_msg_trash scratch buffer, which was
* allocated by init_dns_buffers().
*/
void deinit_dns_buffers()
{
ha_free(&dns_msg_trash);
}
/* register the functions above as the per-thread allocation and release
* callbacks, so that dns_msg_trash gets set up for each thread
*/
REGISTER_PER_THREAD_ALLOC(init_dns_buffers);
REGISTER_PER_THREAD_FREE(deinit_dns_buffers);