blob: 25573cb6f6c24e0b293312d356be1ff2cdb27a6b [file] [log] [blame]
Willy Tarreaubaaee002006-06-26 02:48:02 +02001/*
2 * Functions operating on SOCK_STREAM and buffers.
3 *
Willy Tarreaua8f55d52010-05-31 17:44:19 +02004 * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
Willy Tarreaubaaee002006-06-26 02:48:02 +02005 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
Willy Tarreau6b4aad42009-01-18 21:59:13 +010013#define _GNU_SOURCE
Willy Tarreaubaaee002006-06-26 02:48:02 +020014#include <errno.h>
15#include <fcntl.h>
16#include <stdio.h>
17#include <stdlib.h>
18
19#include <sys/socket.h>
20#include <sys/stat.h>
21#include <sys/types.h>
22
Dmitry Sivachenkocaf58982009-08-24 15:11:06 +040023#include <netinet/tcp.h>
24
Willy Tarreau2dd0d472006-06-29 17:53:05 +020025#include <common/compat.h>
Willy Tarreaue3ba5f02006-06-29 18:54:54 +020026#include <common/config.h>
Willy Tarreaud6f087e2008-01-18 17:20:13 +010027#include <common/debug.h>
Willy Tarreau83749182007-04-15 20:56:27 +020028#include <common/standard.h>
Willy Tarreau0c303ee2008-07-07 00:09:58 +020029#include <common/ticks.h>
Willy Tarreau2dd0d472006-06-29 17:53:05 +020030#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020031
Willy Tarreau2d212792008-08-27 21:41:35 +020032#include <proto/buffers.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020033#include <proto/fd.h>
Willy Tarreaueb472682010-05-28 18:46:57 +020034#include <proto/freq_ctr.h>
35#include <proto/log.h>
Willy Tarreau3eba98a2009-01-25 13:56:13 +010036#include <proto/pipe.h>
Willy Tarreaufe598a72010-09-21 21:48:23 +020037#include <proto/protocols.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020038#include <proto/stream_sock.h>
39#include <proto/task.h>
40
Willy Tarreau5bd8c372009-01-19 00:32:22 +010041#include <types/global.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020042
Willy Tarreau6b4aad42009-01-18 21:59:13 +010043/* On recent Linux kernels, the splice() syscall may be used for faster data copy.
44 * But it's not always defined on some OS versions, and it even happens that some
45 * definitions are wrong with some glibc due to an offset bug in syscall().
46 */
47
48#if defined(CONFIG_HAP_LINUX_SPLICE)
49#include <unistd.h>
50#include <sys/syscall.h>
51
/* Fallback values for the splice() flags when the libc headers do not
 * provide them.
 */
#if !defined(SPLICE_F_MOVE)
#  define SPLICE_F_MOVE           0x1
#endif

#if !defined(SPLICE_F_NONBLOCK)
#  define SPLICE_F_NONBLOCK       0x2
#endif

#if !defined(SPLICE_F_MORE)
#  define SPLICE_F_MORE           0x4
#endif

/* When the headers do not know the splice syscall number, pick it per
 * architecture and declare the splice() entry point ourselves.
 */
#if !defined(__NR_splice)
#  if defined(__powerpc__) || defined(__powerpc64__)
#    define __NR_splice             283
#  elif defined(__sparc__) || defined(__sparc64__)
#    define __NR_splice             232
#  elif defined(__x86_64__)
#    define __NR_splice             275
#  elif defined(__alpha__)
#    define __NR_splice             468
#  elif defined(__i386__)
#    define __NR_splice             313
#  else
#    warning unsupported architecture, guessing __NR_splice=313 like x86...
#    define __NR_splice             313
#  endif /* $arch */

#  if defined(CONFIG_HAP_LINUX_VSYSCALL) && defined(__linux__) && defined(__i386__)
/* the syscall is redefined somewhere else */
extern int splice(int fdin, loff_t *off_in, int fdout, loff_t *off_out, size_t len, unsigned long flags);
#  else
_syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)
#  endif
#endif /* __NR_splice */
Willy Tarreau5bd8c372009-01-19 00:32:22 +010087
/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes
 * because of timestamps. Use this as a hint for not looping on splice().
 * The expansion is parenthesized so that the macro behaves as a single operand
 * in any expression (eg: "x / SPLICE_FULL_HINT").
 */
#define SPLICE_FULL_HINT	(16*1448)

/* how much data we attempt to splice at once when the buffer is configured for
 * infinite forwarding */
#define MAX_SPLICE_AT_ONCE	(1<<30)
96
Willy Tarreau5bd8c372009-01-19 00:32:22 +010097/* Returns :
98 * -1 if splice is not possible or not possible anymore and we must switch to
99 * user-land copy (eg: to_forward reached)
100 * 0 when we know that polling is required to get more data (EAGAIN)
101 * 1 for all other cases (we can safely try again, or if an activity has been
102 * detected (DATA/NULL/ERR))
103 * Sets :
104 * BF_READ_NULL
105 * BF_READ_PARTIAL
106 * BF_WRITE_PARTIAL (during copy)
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200107 * BF_OUT_EMPTY (during copy)
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100108 * SI_FL_ERR
109 * SI_FL_WAIT_ROOM
110 * (SI_FL_WAIT_RECV)
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100111 *
112 * This function automatically allocates a pipe from the pipe pool. It also
113 * carefully ensures to clear b->pipe whenever it leaves the pipe empty.
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100114 */
115static int stream_sock_splice_in(struct buffer *b, struct stream_interface *si)
116{
117 int fd = si->fd;
Willy Tarreau31971e52009-09-20 12:07:52 +0200118 int ret;
119 unsigned long max;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100120 int retval = 1;
121
122 if (!b->to_forward)
123 return -1;
124
125 if (!(b->flags & BF_KERN_SPLICING))
126 return -1;
127
128 if (b->l) {
129 /* We're embarrassed, there are already data pending in
130 * the buffer and we don't want to have them at two
131 * locations at a time. Let's indicate we need some
132 * place and ask the consumer to hurry.
133 */
134 si->flags |= SI_FL_WAIT_ROOM;
135 EV_FD_CLR(fd, DIR_RD);
136 b->rex = TICK_ETERNITY;
137 b->cons->chk_snd(b->cons);
138 return 1;
139 }
140
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100141 if (unlikely(b->pipe == NULL)) {
142 if (pipes_used >= global.maxpipes || !(b->pipe = get_pipe())) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100143 b->flags &= ~BF_KERN_SPLICING;
144 return -1;
145 }
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100146 }
147
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100148 /* At this point, b->pipe is valid */
149
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100150 while (1) {
Willy Tarreaua9de3332009-11-28 07:47:10 +0100151 if (b->to_forward == BUF_INFINITE_FORWARD)
152 max = MAX_SPLICE_AT_ONCE;
153 else
154 max = b->to_forward;
155
Willy Tarreau31971e52009-09-20 12:07:52 +0200156 if (!max) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100157 /* It looks like the buffer + the pipe already contain
158 * the maximum amount of data to be transferred. Try to
159 * send those data immediately on the other side if it
160 * is currently waiting.
161 */
162 retval = -1; /* end of forwarding */
163 break;
164 }
165
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100166 ret = splice(fd, NULL, b->pipe->prod, NULL, max,
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100167 SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
168
169 if (ret <= 0) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100170 if (ret == 0) {
Willy Tarreau98b306b2009-01-25 11:11:32 +0100171 /* connection closed. This is only detected by
172 * recent kernels (>= 2.6.27.13).
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100173 */
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100174 b->flags |= BF_READ_NULL;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100175 retval = 1; /* no need for further polling */
176 break;
177 }
178
179 if (errno == EAGAIN) {
180 /* there are two reasons for EAGAIN :
181 * - nothing in the socket buffer (standard)
182 * - pipe is full
Willy Tarreau98b306b2009-01-25 11:11:32 +0100183 * - the connection is closed (kernel < 2.6.27.13)
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100184 * Since we don't know if pipe is full, we'll
185 * stop if the pipe is not empty. Anyway, we
186 * will almost always fill/empty the pipe.
187 */
188
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100189 if (b->pipe->data) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100190 si->flags |= SI_FL_WAIT_ROOM;
191 retval = 1;
192 break;
193 }
194
Willy Tarreau98b306b2009-01-25 11:11:32 +0100195 /* We don't know if the connection was closed.
196 * But if we're called upon POLLIN with an empty
197 * pipe and get EAGAIN, it is suspect enought to
198 * try to fall back to the normal recv scheme
199 * which will be able to deal with the situation.
200 */
201 retval = -1;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100202 break;
203 }
Willy Tarreaudc340a92009-06-28 23:10:19 +0200204
Willy Tarreaua9de3332009-11-28 07:47:10 +0100205 if (errno == ENOSYS || errno == EINVAL) {
Willy Tarreaudc340a92009-06-28 23:10:19 +0200206 /* splice not supported on this end, disable it */
207 b->flags &= ~BF_KERN_SPLICING;
208 si->flags &= ~SI_FL_CAP_SPLICE;
209 put_pipe(b->pipe);
210 b->pipe = NULL;
211 return -1;
212 }
213
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100214 /* here we have another error */
215 si->flags |= SI_FL_ERR;
216 retval = 1;
217 break;
218 } /* ret <= 0 */
219
Willy Tarreau31971e52009-09-20 12:07:52 +0200220 if (b->to_forward != BUF_INFINITE_FORWARD)
221 b->to_forward -= ret;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100222 b->total += ret;
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100223 b->pipe->data += ret;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100224 b->flags |= BF_READ_PARTIAL;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200225 b->flags &= ~BF_OUT_EMPTY;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100226
Willy Tarreau6f4a82c2009-03-21 20:43:57 +0100227 if (b->pipe->data >= SPLICE_FULL_HINT ||
228 ret >= global.tune.recv_enough) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100229 /* We've read enough of it for this time. */
230 retval = 1;
231 break;
232 }
233 } /* while */
234
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100235 if (unlikely(!b->pipe->data)) {
236 put_pipe(b->pipe);
237 b->pipe = NULL;
238 }
239
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100240 return retval;
241}
242
Willy Tarreau6b4aad42009-01-18 21:59:13 +0100243#endif /* CONFIG_HAP_LINUX_SPLICE */
244
245
Willy Tarreaubaaee002006-06-26 02:48:02 +0200246/*
Willy Tarreaud7971282006-07-29 18:36:34 +0200247 * this function is called on a read event from a stream socket.
Willy Tarreau83749182007-04-15 20:56:27 +0200248 * It returns 0 if we have a high confidence that we will not be
249 * able to read more data without polling first. Returns non-zero
250 * otherwise.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200251 */
Willy Tarreaud7971282006-07-29 18:36:34 +0200252int stream_sock_read(int fd) {
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200253 struct stream_interface *si = fdtab[fd].owner;
Willy Tarreau48adac52008-08-30 04:58:38 +0200254 struct buffer *b = si->ib;
Willy Tarreau8a7af602008-05-03 23:07:14 +0200255 int ret, max, retval, cur_read;
Willy Tarreaub8949f12007-03-23 22:39:59 +0100256 int read_poll = MAX_READ_POLL_LOOPS;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200257
258#ifdef DEBUG_FULL
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100259 fprintf(stderr,"stream_sock_read : fd=%d, ev=0x%02x, owner=%p\n", fd, fdtab[fd].ev, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200260#endif
261
Willy Tarreau83749182007-04-15 20:56:27 +0200262 retval = 1;
263
Willy Tarreau71543652009-07-14 19:55:05 +0200264 /* stop immediately on errors. Note that we DON'T want to stop on
265 * POLL_ERR, as the poller might report a write error while there
266 * are still data available in the recv buffer. This typically
267 * happens when we send too large a request to a backend server
268 * which rejects it before reading it all.
269 */
270 if (fdtab[fd].state == FD_STERROR)
Willy Tarreau6996e152007-04-30 14:37:43 +0200271 goto out_error;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100272
273 /* stop here if we reached the end of data */
274 if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)
275 goto out_shutdown_r;
Willy Tarreau83749182007-04-15 20:56:27 +0200276
Willy Tarreaud06e7112009-03-29 10:18:41 +0200277 /* maybe we were called immediately after an asynchronous shutr */
278 if (b->flags & BF_SHUTR)
279 goto out_wakeup;
280
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100281#if defined(CONFIG_HAP_LINUX_SPLICE)
282 if (b->to_forward && b->flags & BF_KERN_SPLICING) {
Willy Tarreau98b306b2009-01-25 11:11:32 +0100283
284 /* Under Linux, if FD_POLL_HUP is set, we have reached the end.
285 * Since older splice() implementations were buggy and returned
286 * EAGAIN on end of read, let's bypass the call to splice() now.
287 */
288 if (fdtab[fd].ev & FD_POLL_HUP)
289 goto out_shutdown_r;
290
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100291 retval = stream_sock_splice_in(b, si);
292
293 if (retval >= 0) {
294 if (si->flags & SI_FL_ERR)
295 goto out_error;
296 if (b->flags & BF_READ_NULL)
297 goto out_shutdown_r;
298 goto out_wakeup;
299 }
300 /* splice not possible (anymore), let's go on on standard copy */
301 }
302#endif
Willy Tarreau8a7af602008-05-03 23:07:14 +0200303 cur_read = 0;
Willy Tarreau6996e152007-04-30 14:37:43 +0200304 while (1) {
Willy Tarreau864e8252009-12-28 17:36:37 +0100305 max = buffer_max_len(b) - b->l;
306
307 if (max <= 0) {
308 b->flags |= BF_FULL;
309 si->flags |= SI_FL_WAIT_ROOM;
310 break;
311 }
312
Willy Tarreau6996e152007-04-30 14:37:43 +0200313 /*
314 * 1. compute the maximum block size we can read at once.
315 */
Willy Tarreau03d60bb2009-01-09 11:13:00 +0100316 if (b->l == 0) {
317 /* let's realign the buffer to optimize I/O */
318 b->r = b->w = b->lr = b->data;
Willy Tarreau83749182007-04-15 20:56:27 +0200319 }
320 else if (b->r > b->w) {
Willy Tarreau864e8252009-12-28 17:36:37 +0100321 /* remaining space wraps at the end, with a moving limit */
322 if (max > b->data + b->size - b->r)
323 max = b->data + b->size - b->r;
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100324 }
Willy Tarreau864e8252009-12-28 17:36:37 +0100325 /* else max is already OK */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200326
Willy Tarreau6996e152007-04-30 14:37:43 +0200327 /*
328 * 2. read the largest possible block
329 */
Willy Tarreaufc1daaf2010-01-15 10:26:13 +0100330 ret = recv(fd, b->r, max, 0);
Willy Tarreaud6d06902009-08-19 11:22:33 +0200331
Willy Tarreau83749182007-04-15 20:56:27 +0200332 if (ret > 0) {
333 b->r += ret;
334 b->l += ret;
Willy Tarreau8a7af602008-05-03 23:07:14 +0200335 cur_read += ret;
Willy Tarreaub38903c2008-11-23 21:33:29 +0100336
Willy Tarreau0abebcc2009-01-08 00:09:41 +0100337 /* if we're allowed to directly forward data, we must update send_max */
Willy Tarreau31971e52009-09-20 12:07:52 +0200338 if (b->to_forward && !(b->flags & (BF_SHUTW|BF_SHUTW_NOW))) {
339 unsigned long fwd = ret;
340 if (b->to_forward != BUF_INFINITE_FORWARD) {
341 if (fwd > b->to_forward)
342 fwd = b->to_forward;
343 b->to_forward -= fwd;
344 }
345 b->send_max += fwd;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200346 b->flags &= ~BF_OUT_EMPTY;
Willy Tarreau0abebcc2009-01-08 00:09:41 +0100347 }
Willy Tarreauf890dc92008-12-13 21:12:26 +0100348
Willy Tarreaub38903c2008-11-23 21:33:29 +0100349 if (fdtab[fd].state == FD_STCONN)
350 fdtab[fd].state = FD_STREADY;
351
Willy Tarreau3da77c52008-08-29 09:58:42 +0200352 b->flags |= BF_READ_PARTIAL;
Willy Tarreau74ab2ac2008-11-23 17:23:07 +0100353
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200354 if (b->r == b->data + b->size) {
Willy Tarreau83749182007-04-15 20:56:27 +0200355 b->r = b->data; /* wrap around the buffer */
356 }
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100357
Willy Tarreau83749182007-04-15 20:56:27 +0200358 b->total += ret;
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100359
Willy Tarreau7c3c5412009-12-13 15:53:05 +0100360 if (b->l >= buffer_max_len(b)) {
Willy Tarreau6996e152007-04-30 14:37:43 +0200361 /* The buffer is now full, there's no point in going through
362 * the loop again.
363 */
Willy Tarreau8a7af602008-05-03 23:07:14 +0200364 if (!(b->flags & BF_STREAMER_FAST) && (cur_read == b->l)) {
365 b->xfer_small = 0;
366 b->xfer_large++;
367 if (b->xfer_large >= 3) {
368 /* we call this buffer a fast streamer if it manages
369 * to be filled in one call 3 consecutive times.
370 */
371 b->flags |= (BF_STREAMER | BF_STREAMER_FAST);
372 //fputc('+', stderr);
373 }
374 }
375 else if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200376 (cur_read <= b->size / 2)) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200377 b->xfer_large = 0;
378 b->xfer_small++;
379 if (b->xfer_small >= 2) {
380 /* if the buffer has been at least half full twice,
381 * we receive faster than we send, so at least it
382 * is not a "fast streamer".
383 */
384 b->flags &= ~BF_STREAMER_FAST;
385 //fputc('-', stderr);
386 }
387 }
388 else {
389 b->xfer_small = 0;
390 b->xfer_large = 0;
391 }
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100392
393 b->flags |= BF_FULL;
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100394 si->flags |= SI_FL_WAIT_ROOM;
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100395 break;
Willy Tarreau6996e152007-04-30 14:37:43 +0200396 }
397
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200398 /* if too many bytes were missing from last read, it means that
399 * it's pointless trying to read again because the system does
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100400 * not have them in buffers. BTW, if FD_POLL_HUP was present,
401 * it means that we have reached the end and that the connection
402 * is closed.
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200403 */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100404 if (ret < max) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200405 if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200406 (cur_read <= b->size / 2)) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200407 b->xfer_large = 0;
408 b->xfer_small++;
409 if (b->xfer_small >= 3) {
410 /* we have read less than half of the buffer in
411 * one pass, and this happened at least 3 times.
412 * This is definitely not a streamer.
413 */
414 b->flags &= ~(BF_STREAMER | BF_STREAMER_FAST);
415 //fputc('!', stderr);
416 }
417 }
Willy Tarreau2bea3a12008-08-28 09:47:43 +0200418 /* unfortunately, on level-triggered events, POLL_HUP
419 * is generally delivered AFTER the system buffer is
420 * empty, so this one might never match.
421 */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100422 if (fdtab[fd].ev & FD_POLL_HUP)
423 goto out_shutdown_r;
Willy Tarreau2bea3a12008-08-28 09:47:43 +0200424
425 /* if a streamer has read few data, it may be because we
426 * have exhausted system buffers. It's not worth trying
427 * again.
428 */
429 if (b->flags & BF_STREAMER)
430 break;
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200431
Willy Tarreau6f4a82c2009-03-21 20:43:57 +0100432 /* generally if we read something smaller than 1 or 2 MSS,
433 * it means that either we have exhausted the system's
434 * buffers (streamer or question-response protocol) or
435 * that the connection will be closed. Streamers are
436 * easily detected so we return early. For other cases,
437 * it's still better to perform a last read to be sure,
438 * because it may save one complete poll/read/wakeup cycle
439 * in case of shutdown.
440 */
441 if (ret < MIN_RET_FOR_READ_LOOP && b->flags & BF_STREAMER)
442 break;
443
444 /* if we read a large block smaller than what we requested,
445 * it's almost certain we'll never get anything more.
446 */
447 if (ret >= global.tune.recv_enough)
448 break;
449 }
Willy Tarreau83749182007-04-15 20:56:27 +0200450
Willy Tarreau1b194fe2009-03-21 21:10:04 +0100451 if ((b->flags & BF_READ_DONTWAIT) || --read_poll <= 0)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200452 break;
Willy Tarreau83749182007-04-15 20:56:27 +0200453 }
454 else if (ret == 0) {
Willy Tarreau6996e152007-04-30 14:37:43 +0200455 /* connection closed */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100456 goto out_shutdown_r;
Willy Tarreau83749182007-04-15 20:56:27 +0200457 }
Willy Tarreau9f195292007-04-15 21:26:58 +0200458 else if (errno == EAGAIN) {
459 /* Ignore EAGAIN but inform the poller that there is
Willy Tarreauaf78d0f2009-01-08 10:09:08 +0100460 * nothing to read left if we did not read much, ie
461 * less than what we were still expecting to read.
462 * But we may have done some work justifying to notify
463 * the task.
Willy Tarreau9f195292007-04-15 21:26:58 +0200464 */
Willy Tarreauaf78d0f2009-01-08 10:09:08 +0100465 if (cur_read < MIN_RET_FOR_READ_LOOP)
466 retval = 0;
Willy Tarreau83749182007-04-15 20:56:27 +0200467 break;
468 }
469 else {
Willy Tarreau6996e152007-04-30 14:37:43 +0200470 goto out_error;
Willy Tarreau83749182007-04-15 20:56:27 +0200471 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200472 } /* while (1) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200473
Willy Tarreau6996e152007-04-30 14:37:43 +0200474 out_wakeup:
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100475 /* We might have some data the consumer is waiting for */
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200476 if (!(b->flags & BF_OUT_EMPTY) && (b->cons->flags & SI_FL_WAIT_DATA)) {
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100477 int last_len = b->pipe ? b->pipe->data : 0;
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100478
Willy Tarreau3ffeba12008-12-14 14:42:35 +0100479 b->cons->chk_snd(b->cons);
480
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100481 /* check if the consumer has freed some space */
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100482 if (!(b->flags & BF_FULL) &&
483 (!last_len || !b->pipe || b->pipe->data < last_len))
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100484 si->flags &= ~SI_FL_WAIT_ROOM;
485 }
486
487 if (si->flags & SI_FL_WAIT_ROOM) {
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100488 EV_FD_CLR(fd, DIR_RD);
489 b->rex = TICK_ETERNITY;
490 }
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200491 else if ((b->flags & (BF_SHUTR|BF_READ_PARTIAL|BF_FULL|BF_DONT_READ|BF_READ_NOEXP)) == BF_READ_PARTIAL)
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100492 b->rex = tick_add_ifset(now_ms, b->rto);
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100493
Willy Tarreau6b66f3e2008-12-14 17:31:54 +0100494 /* we have to wake up if there is a special event or if we don't have
495 * any more data to forward.
496 */
Willy Tarreau5af1fa12010-07-19 18:16:03 +0200497 if ((b->flags & (BF_READ_NULL|BF_READ_ERROR)) ||
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100498 si->state != SI_ST_EST ||
Willy Tarreau5af1fa12010-07-19 18:16:03 +0200499 (si->flags & SI_FL_ERR) ||
500 ((b->flags & BF_READ_PARTIAL) && (!b->to_forward || b->cons->state != SI_ST_EST)))
Willy Tarreau6b66f3e2008-12-14 17:31:54 +0100501 task_wakeup(si->owner, TASK_WOKEN_IO);
Willy Tarreau5af1fa12010-07-19 18:16:03 +0200502
503 if (b->flags & BF_READ_ACTIVITY)
504 b->flags &= ~BF_READ_DONTWAIT;
505
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100506 fdtab[fd].ev &= ~FD_POLL_IN;
Willy Tarreau83749182007-04-15 20:56:27 +0200507 return retval;
Willy Tarreau6996e152007-04-30 14:37:43 +0200508
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100509 out_shutdown_r:
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200510 /* we received a shutdown */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100511 fdtab[fd].ev &= ~FD_POLL_HUP;
512 b->flags |= BF_READ_NULL;
Willy Tarreau520d95e2009-09-19 21:04:57 +0200513 if (b->flags & BF_AUTO_CLOSE)
Willy Tarreau418fd472009-09-06 21:37:23 +0200514 buffer_shutw_now(b);
Willy Tarreau99126c32008-11-27 10:30:51 +0100515 stream_sock_shutr(si);
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200516 goto out_wakeup;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100517
Willy Tarreau6996e152007-04-30 14:37:43 +0200518 out_error:
Willy Tarreaucff64112008-11-03 06:26:53 +0100519 /* Read error on the file descriptor. We mark the FD as STERROR so
520 * that we don't use it anymore. The error is reported to the stream
521 * interface which will take proper action. We must not perturbate the
522 * buffer because the stream interface wants to ensure transparent
523 * connection retries.
Willy Tarreau6996e152007-04-30 14:37:43 +0200524 */
Willy Tarreaucff64112008-11-03 06:26:53 +0100525
Willy Tarreau6996e152007-04-30 14:37:43 +0200526 fdtab[fd].state = FD_STERROR;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100527 fdtab[fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau1714e0f2009-03-28 20:54:53 +0100528 EV_FD_REM(fd);
Willy Tarreaucff64112008-11-03 06:26:53 +0100529 si->flags |= SI_FL_ERR;
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100530 retval = 1;
531 goto out_wakeup;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200532}
533
534
535/*
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100536 * This function is called to send buffer data to a stream socket.
537 * It returns -1 in case of unrecoverable error, 0 if the caller needs to poll
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100538 * before calling it again, otherwise 1. If a pipe was associated with the
539 * buffer and it empties it, it releases it as well.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200540 */
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100541static int stream_sock_write_loop(struct stream_interface *si, struct buffer *b)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100542{
Willy Tarreau83749182007-04-15 20:56:27 +0200543 int write_poll = MAX_WRITE_POLL_LOOPS;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100544 int retval = 1;
545 int ret, max;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200546
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100547#if defined(CONFIG_HAP_LINUX_SPLICE)
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100548 while (b->pipe) {
549 ret = splice(b->pipe->cons, NULL, si->fd, NULL, b->pipe->data,
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100550 SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
551 if (ret <= 0) {
552 if (ret == 0 || errno == EAGAIN) {
553 retval = 0;
554 return retval;
555 }
556 /* here we have another error */
557 retval = -1;
558 return retval;
559 }
560
561 b->flags |= BF_WRITE_PARTIAL;
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100562 b->pipe->data -= ret;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100563
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100564 if (!b->pipe->data) {
565 put_pipe(b->pipe);
566 b->pipe = NULL;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100567 break;
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100568 }
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100569
570 if (--write_poll <= 0)
571 return retval;
572 }
573
574 /* At this point, the pipe is empty, but we may still have data pending
575 * in the normal buffer.
576 */
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100577#endif
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200578 if (!b->send_max) {
579 b->flags |= BF_OUT_EMPTY;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100580 return retval;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200581 }
Willy Tarreau83749182007-04-15 20:56:27 +0200582
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100583 /* when we're in this loop, we already know that there is no spliced
584 * data left, and that there are sendable buffered data.
585 */
Willy Tarreau6996e152007-04-30 14:37:43 +0200586 while (1) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100587 if (b->r > b->w)
Willy Tarreau83749182007-04-15 20:56:27 +0200588 max = b->r - b->w;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100589 else
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200590 max = b->data + b->size - b->w;
Willy Tarreau83749182007-04-15 20:56:27 +0200591
Willy Tarreauf890dc92008-12-13 21:12:26 +0100592 /* limit the amount of outgoing data if required */
593 if (max > b->send_max)
594 max = b->send_max;
595
Willy Tarreau6db06d32009-08-19 11:14:11 +0200596 /* check if we want to inform the kernel that we're interested in
597 * sending more data after this call. We want this if :
598 * - we're about to close after this last send and want to merge
599 * the ongoing FIN with the last segment.
600 * - we know we can't send everything at once and must get back
601 * here because of unaligned data
Willy Tarreaud38b53b2010-01-03 11:18:34 +0100602 * - there is still a finite amount of data to forward
Willy Tarreau6db06d32009-08-19 11:14:11 +0200603 * The test is arranged so that the most common case does only 2
604 * tests.
Willy Tarreaufb14edc2009-06-14 15:24:37 +0200605 */
Willy Tarreaufb14edc2009-06-14 15:24:37 +0200606
Willy Tarreauface8392010-01-03 11:37:54 +0100607 if (MSG_NOSIGNAL && MSG_MORE) {
Willy Tarreau6db06d32009-08-19 11:14:11 +0200608 unsigned int send_flag = MSG_DONTWAIT | MSG_NOSIGNAL;
609
Willy Tarreauface8392010-01-03 11:37:54 +0100610 if (((b->to_forward && b->to_forward != BUF_INFINITE_FORWARD) ||
Willy Tarreaud38b53b2010-01-03 11:18:34 +0100611 ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK)) == BF_SHUTW_NOW && (max == b->send_max)) ||
Willy Tarreau6db06d32009-08-19 11:14:11 +0200612 (max != b->l && max != b->send_max))
613 && (fdtab[si->fd].flags & FD_FL_TCP)) {
614 send_flag |= MSG_MORE;
615 }
Willy Tarreauface8392010-01-03 11:37:54 +0100616 else if (b->flags & BF_EXPECT_MORE) {
617 /* it was forced on the buffer, this flag is one-shoot */
618 b->flags &= ~BF_EXPECT_MORE;
619 send_flag |= MSG_MORE;
620 }
Willy Tarreau6db06d32009-08-19 11:14:11 +0200621
Willy Tarreau2be39392010-01-03 17:24:51 +0100622 /* this flag has precedence over the rest */
623 if (b->flags & BF_SEND_DONTWAIT)
624 send_flag &= ~MSG_MORE;
625
Willy Tarreau6db06d32009-08-19 11:14:11 +0200626 ret = send(si->fd, b->w, max, send_flag);
Willy Tarreau2be39392010-01-03 17:24:51 +0100627
628 /* disable it only once everything has been sent */
629 if (ret == max && (b->flags & BF_SEND_DONTWAIT))
630 b->flags &= ~BF_SEND_DONTWAIT;
Willy Tarreaud6d06902009-08-19 11:22:33 +0200631 } else {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200632 int skerr;
633 socklen_t lskerr = sizeof(skerr);
634
Willy Tarreau87bed622009-03-08 22:25:28 +0100635 ret = getsockopt(si->fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
Willy Tarreauc6423482006-10-15 14:59:03 +0200636 if (ret == -1 || skerr)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200637 ret = -1;
638 else
Willy Tarreau87bed622009-03-08 22:25:28 +0100639 ret = send(si->fd, b->w, max, MSG_DONTWAIT);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200640 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200641
642 if (ret > 0) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100643 if (fdtab[si->fd].state == FD_STCONN)
644 fdtab[si->fd].state = FD_STREADY;
Willy Tarreaub38903c2008-11-23 21:33:29 +0100645
Willy Tarreau3da77c52008-08-29 09:58:42 +0200646 b->flags |= BF_WRITE_PARTIAL;
Willy Tarreaue393fe22008-08-16 22:18:07 +0200647
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100648 b->w += ret;
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200649 if (b->w == b->data + b->size)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100650 b->w = b->data; /* wrap around the buffer */
651
652 b->l -= ret;
Willy Tarreau7c3c5412009-12-13 15:53:05 +0100653 if (likely(b->l < buffer_max_len(b)))
Willy Tarreaue393fe22008-08-16 22:18:07 +0200654 b->flags &= ~BF_FULL;
Willy Tarreau74ab2ac2008-11-23 17:23:07 +0100655
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200656 if (likely(!b->l))
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100657 /* optimize data alignment in the buffer */
658 b->r = b->w = b->lr = b->data;
Willy Tarreau83749182007-04-15 20:56:27 +0200659
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100660 b->send_max -= ret;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200661 if (!b->send_max) {
662 if (likely(!b->pipe))
663 b->flags |= BF_OUT_EMPTY;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100664 break;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200665 }
Willy Tarreau83749182007-04-15 20:56:27 +0200666
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200667 /* if the system buffer is full, don't insist */
668 if (ret < max)
669 break;
670
Willy Tarreau6996e152007-04-30 14:37:43 +0200671 if (--write_poll <= 0)
672 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200673 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200674 else if (ret == 0 || errno == EAGAIN) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100675 /* nothing written, we need to poll for write first */
Willy Tarreau83749182007-04-15 20:56:27 +0200676 retval = 0;
677 break;
678 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200679 else {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100680 /* bad, we got an error */
681 retval = -1;
682 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200683 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200684 } /* while (1) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200685
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100686 return retval;
687}
Willy Tarreau6996e152007-04-30 14:37:43 +0200688
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100689
690/*
691 * This function is called on a write event from a stream socket.
692 * It returns 0 if the caller needs to poll before calling it again, otherwise
693 * non-zero.
694 */
695int stream_sock_write(int fd)
696{
697 struct stream_interface *si = fdtab[fd].owner;
698 struct buffer *b = si->ob;
699 int retval = 1;
700
701#ifdef DEBUG_FULL
702 fprintf(stderr,"stream_sock_write : fd=%d, owner=%p\n", fd, fdtab[fd].owner);
703#endif
704
705 retval = 1;
Willy Tarreau71543652009-07-14 19:55:05 +0200706 if (fdtab[fd].state == FD_STERROR)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100707 goto out_error;
708
Willy Tarreaud06e7112009-03-29 10:18:41 +0200709 /* we might have been called just after an asynchronous shutw */
710 if (b->flags & BF_SHUTW)
711 goto out_wakeup;
712
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200713 if (likely(!(b->flags & BF_OUT_EMPTY))) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100714 /* OK there are data waiting to be sent */
715 retval = stream_sock_write_loop(si, b);
716 if (retval < 0)
717 goto out_error;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200718 }
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100719 else {
720 /* may be we have received a connection acknowledgement in TCP mode without data */
721 if (likely(fdtab[fd].state == FD_STCONN)) {
722 /* We have no data to send to check the connection, and
723 * getsockopt() will not inform us whether the connection
724 * is still pending. So we'll reuse connect() to check the
725 * state of the socket. This has the advantage of givig us
726 * the following info :
727 * - error
728 * - connecting (EALREADY, EINPROGRESS)
729 * - connected (EISCONN, 0)
730 */
Willy Tarreau8d5d77e2009-10-18 07:25:52 +0200731 if ((connect(fd, fdinfo[fd].peeraddr, fdinfo[fd].peerlen) == 0))
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100732 errno = 0;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200733
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100734 if (errno == EALREADY || errno == EINPROGRESS) {
735 retval = 0;
736 goto out_may_wakeup;
737 }
Willy Tarreau3ffeba12008-12-14 14:42:35 +0100738
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100739 if (errno && errno != EISCONN)
740 goto out_error;
Willy Tarreaufa7e1022008-10-19 07:30:41 +0200741
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100742 /* OK we just need to indicate that we got a connection
743 * and that we wrote nothing.
744 */
745 b->flags |= BF_WRITE_NULL;
746 fdtab[fd].state = FD_STREADY;
747 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200748
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100749 /* Funny, we were called to write something but there wasn't
750 * anything. We can get there, for example if we were woken up
751 * on a write event to finish the splice, but the send_max is 0
752 * so we cannot write anything from the buffer. Let's disable
753 * the write event and pretend we never came there.
754 */
755 }
756
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200757 if (b->flags & BF_OUT_EMPTY) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100758 /* the connection is established but we can't write. Either the
759 * buffer is empty, or we just refrain from sending because the
760 * send_max limit was reached. Maybe we just wrote the last
761 * chunk and need to close.
762 */
Willy Tarreau520d95e2009-09-19 21:04:57 +0200763 if (((b->flags & (BF_SHUTW|BF_HIJACK|BF_SHUTW_NOW)) == BF_SHUTW_NOW) &&
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100764 (si->state == SI_ST_EST)) {
765 stream_sock_shutw(si);
766 goto out_wakeup;
767 }
768
Willy Tarreau59454bf2009-09-20 11:13:40 +0200769 if ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_FULL|BF_HIJACK)) == 0)
Willy Tarreauac128fe2009-01-09 13:05:19 +0100770 si->flags |= SI_FL_WAIT_DATA;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100771
Willy Tarreauac128fe2009-01-09 13:05:19 +0100772 EV_FD_CLR(fd, DIR_WR);
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100773 b->wex = TICK_ETERNITY;
Willy Tarreauac128fe2009-01-09 13:05:19 +0100774 }
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100775
776 out_may_wakeup:
777 if (b->flags & BF_WRITE_ACTIVITY) {
778 /* update timeout if we have written something */
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200779 if ((b->flags & (BF_OUT_EMPTY|BF_SHUTW|BF_WRITE_PARTIAL)) == BF_WRITE_PARTIAL)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100780 b->wex = tick_add_ifset(now_ms, b->wto);
781
782 out_wakeup:
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200783 if (tick_isset(si->ib->rex) && !(si->flags & SI_FL_INDEP_STR)) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100784 /* Note: to prevent the client from expiring read timeouts
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200785 * during writes, we refresh it. We only do this if the
786 * interface is not configured for "independant streams",
787 * because for some applications it's better not to do this,
788 * for instance when continuously exchanging small amounts
789 * of data which can full the socket buffers long before a
790 * write timeout is detected.
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100791 */
792 si->ib->rex = tick_add_ifset(now_ms, si->ib->rto);
793 }
794
795 /* the producer might be waiting for more room to store data */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200796 if (likely((b->flags & (BF_SHUTW|BF_WRITE_PARTIAL|BF_FULL|BF_DONT_READ)) == BF_WRITE_PARTIAL &&
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100797 (b->prod->flags & SI_FL_WAIT_ROOM)))
798 b->prod->chk_rcv(b->prod);
799
800 /* we have to wake up if there is a special event or if we don't have
801 * any more data to forward and it's not planned to send any more.
802 */
803 if (likely((b->flags & (BF_WRITE_NULL|BF_WRITE_ERROR|BF_SHUTW)) ||
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200804 ((b->flags & BF_OUT_EMPTY) && !b->to_forward) ||
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100805 si->state != SI_ST_EST ||
806 b->prod->state != SI_ST_EST))
807 task_wakeup(si->owner, TASK_WOKEN_IO);
808 }
809
810 fdtab[fd].ev &= ~FD_POLL_OUT;
811 return retval;
Willy Tarreauac128fe2009-01-09 13:05:19 +0100812
Willy Tarreau6996e152007-04-30 14:37:43 +0200813 out_error:
Willy Tarreaucff64112008-11-03 06:26:53 +0100814 /* Write error on the file descriptor. We mark the FD as STERROR so
815 * that we don't use it anymore. The error is reported to the stream
816 * interface which will take proper action. We must not perturbate the
817 * buffer because the stream interface wants to ensure transparent
818 * connection retries.
Willy Tarreau6996e152007-04-30 14:37:43 +0200819 */
Willy Tarreaucff64112008-11-03 06:26:53 +0100820
Willy Tarreau6996e152007-04-30 14:37:43 +0200821 fdtab[fd].state = FD_STERROR;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100822 fdtab[fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau1714e0f2009-03-28 20:54:53 +0100823 EV_FD_REM(fd);
Willy Tarreaucff64112008-11-03 06:26:53 +0100824 si->flags |= SI_FL_ERR;
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200825 task_wakeup(si->owner, TASK_WOKEN_IO);
826 return 1;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200827}
828
Willy Tarreau48adac52008-08-30 04:58:38 +0200829/*
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200830 * This function performs a shutdown-write on a stream interface in a connected or
831 * init state (it does nothing for other states). It either shuts the write side
Willy Tarreau99126c32008-11-27 10:30:51 +0100832 * or closes the file descriptor and marks itself as closed. The buffer flags are
Willy Tarreau7340ca52010-01-16 10:03:45 +0100833 * updated to reflect the new state. It does also close everything is the SI was
834 * marked as being in error state.
Willy Tarreau48adac52008-08-30 04:58:38 +0200835 */
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100836void stream_sock_shutw(struct stream_interface *si)
Willy Tarreau48adac52008-08-30 04:58:38 +0200837{
Willy Tarreau418fd472009-09-06 21:37:23 +0200838 si->ob->flags &= ~BF_SHUTW_NOW;
Willy Tarreau99126c32008-11-27 10:30:51 +0100839 if (si->ob->flags & BF_SHUTW)
840 return;
841 si->ob->flags |= BF_SHUTW;
842 si->ob->wex = TICK_ETERNITY;
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100843 si->flags &= ~SI_FL_WAIT_DATA;
Willy Tarreau99126c32008-11-27 10:30:51 +0100844
Willy Tarreaub38903c2008-11-23 21:33:29 +0100845 switch (si->state) {
Willy Tarreaub38903c2008-11-23 21:33:29 +0100846 case SI_ST_EST:
Willy Tarreau720058c2009-07-14 19:21:50 +0200847 /* we have to shut before closing, otherwise some short messages
848 * may never leave the system, especially when there are remaining
849 * unread data in the socket input buffer, or when nolinger is set.
Willy Tarreau4c283dc2009-12-29 14:36:34 +0100850 * However, if SI_FL_NOLINGER is explicitly set, we know there is
851 * no risk so we close both sides immediately.
Willy Tarreau720058c2009-07-14 19:21:50 +0200852 */
Willy Tarreau7340ca52010-01-16 10:03:45 +0100853 if (si->flags & SI_FL_ERR) {
854 /* quick close, the socket is already shut. Remove pending flags. */
855 si->flags &= ~SI_FL_NOLINGER;
856 } else if (si->flags & SI_FL_NOLINGER) {
Willy Tarreau4c283dc2009-12-29 14:36:34 +0100857 si->flags &= ~SI_FL_NOLINGER;
858 setsockopt(si->fd, SOL_SOCKET, SO_LINGER,
859 (struct linger *) &nolinger, sizeof(struct linger));
860 } else {
861 EV_FD_CLR(si->fd, DIR_WR);
862 shutdown(si->fd, SHUT_WR);
Willy Tarreau720058c2009-07-14 19:21:50 +0200863
Willy Tarreau4c283dc2009-12-29 14:36:34 +0100864 if (!(si->ib->flags & (BF_SHUTR|BF_DONT_READ)))
865 return;
866 }
Willy Tarreau5d707e12009-06-28 11:09:07 +0200867
Willy Tarreaub38903c2008-11-23 21:33:29 +0100868 /* fall through */
869 case SI_ST_CON:
Willy Tarreau8bfa4262008-11-27 09:25:45 +0100870 /* we may have to close a pending connection, and mark the
871 * response buffer as shutr
872 */
Willy Tarreau48adac52008-08-30 04:58:38 +0200873 fd_delete(si->fd);
Willy Tarreaufe3718a2008-11-30 18:14:12 +0100874 /* fall through */
875 case SI_ST_CER:
Willy Tarreau32d3ee92010-12-29 14:03:02 +0100876 case SI_ST_QUE:
877 case SI_ST_TAR:
Willy Tarreau7f006512008-12-07 14:04:04 +0100878 si->state = SI_ST_DIS;
879 default:
Willy Tarreaud06e7112009-03-29 10:18:41 +0200880 si->flags &= ~SI_FL_WAIT_ROOM;
Willy Tarreau99126c32008-11-27 10:30:51 +0100881 si->ib->flags |= BF_SHUTR;
Willy Tarreaufe3718a2008-11-30 18:14:12 +0100882 si->ib->rex = TICK_ETERNITY;
Willy Tarreau127334e2009-03-28 10:47:26 +0100883 si->exp = TICK_ETERNITY;
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100884 return;
Willy Tarreau48adac52008-08-30 04:58:38 +0200885 }
Willy Tarreau0bd05ea2010-07-02 11:18:03 +0200886
887 if (si->release)
888 si->release(si);
Willy Tarreau48adac52008-08-30 04:58:38 +0200889}
Willy Tarreaubaaee002006-06-26 02:48:02 +0200890
Willy Tarreau2d212792008-08-27 21:41:35 +0200891/*
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200892 * This function performs a shutdown-read on a stream interface in a connected or
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100893 * init state (it does nothing for other states). It either shuts the read side
Willy Tarreau99126c32008-11-27 10:30:51 +0100894 * or closes the file descriptor and marks itself as closed. The buffer flags are
895 * updated to reflect the new state.
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200896 */
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100897void stream_sock_shutr(struct stream_interface *si)
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200898{
Willy Tarreau418fd472009-09-06 21:37:23 +0200899 si->ib->flags &= ~BF_SHUTR_NOW;
Willy Tarreau99126c32008-11-27 10:30:51 +0100900 if (si->ib->flags & BF_SHUTR)
901 return;
902 si->ib->flags |= BF_SHUTR;
903 si->ib->rex = TICK_ETERNITY;
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100904 si->flags &= ~SI_FL_WAIT_ROOM;
Willy Tarreau99126c32008-11-27 10:30:51 +0100905
Willy Tarreau8bfa4262008-11-27 09:25:45 +0100906 if (si->state != SI_ST_EST && si->state != SI_ST_CON)
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100907 return;
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200908
Willy Tarreaucff64112008-11-03 06:26:53 +0100909 if (si->ob->flags & BF_SHUTW) {
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200910 fd_delete(si->fd);
Willy Tarreau74ab2ac2008-11-23 17:23:07 +0100911 si->state = SI_ST_DIS;
Willy Tarreau127334e2009-03-28 10:47:26 +0100912 si->exp = TICK_ETERNITY;
Willy Tarreau0bd05ea2010-07-02 11:18:03 +0200913
914 if (si->release)
915 si->release(si);
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100916 return;
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200917 }
918 EV_FD_CLR(si->fd, DIR_RD);
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100919 return;
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200920}
921
922/*
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200923 * Updates a connected stream_sock file descriptor status and timeouts
924 * according to the buffers' flags. It should only be called once after the
925 * buffer flags have settled down, and before they are cleared. It doesn't
926 * harm to call it as often as desired (it just slightly hurts performance).
927 */
Willy Tarreaub0253252008-11-30 21:37:12 +0100928void stream_sock_data_finish(struct stream_interface *si)
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200929{
Willy Tarreaub0253252008-11-30 21:37:12 +0100930 struct buffer *ib = si->ib;
931 struct buffer *ob = si->ob;
932 int fd = si->fd;
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200933
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200934 DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200935 now_ms, __FUNCTION__,
936 fd, fdtab[fd].owner,
937 ib, ob,
938 ib->rex, ob->wex,
939 ib->flags, ob->flags,
Willy Tarreaub0253252008-11-30 21:37:12 +0100940 ib->l, ob->l, si->state);
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200941
942 /* Check if we need to close the read side */
943 if (!(ib->flags & BF_SHUTR)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200944 /* Read not closed, update FD status and timeout for reads */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200945 if (ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200946 /* stop reading */
Willy Tarreau11f49402010-11-11 23:08:17 +0100947 if (!(si->flags & SI_FL_WAIT_ROOM)) {
948 if ((ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) == BF_FULL)
949 si->flags |= SI_FL_WAIT_ROOM;
950 EV_FD_COND_C(fd, DIR_RD);
951 ib->rex = TICK_ETERNITY;
952 }
Willy Tarreau2d212792008-08-27 21:41:35 +0200953 }
954 else {
955 /* (re)start reading and update timeout. Note: we don't recompute the timeout
956 * everytime we get here, otherwise it would risk never to expire. We only
Willy Tarreaufe8903c2009-10-04 10:56:08 +0200957 * update it if is was not yet set. The stream socket handler will already
958 * have updated it if there has been a completed I/O.
Willy Tarreau2d212792008-08-27 21:41:35 +0200959 */
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100960 si->flags &= ~SI_FL_WAIT_ROOM;
Willy Tarreau2d212792008-08-27 21:41:35 +0200961 EV_FD_COND_S(fd, DIR_RD);
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200962 if (!(ib->flags & (BF_READ_NOEXP|BF_DONT_READ)) && !tick_isset(ib->rex))
Willy Tarreau2d212792008-08-27 21:41:35 +0200963 ib->rex = tick_add_ifset(now_ms, ib->rto);
964 }
965 }
966
967 /* Check if we need to close the write side */
968 if (!(ob->flags & BF_SHUTW)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200969 /* Write not closed, update FD status and timeout for writes */
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200970 if (ob->flags & BF_OUT_EMPTY) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200971 /* stop writing */
Willy Tarreau11f49402010-11-11 23:08:17 +0100972 if (!(si->flags & SI_FL_WAIT_DATA)) {
973 if ((ob->flags & (BF_FULL|BF_HIJACK|BF_SHUTW_NOW)) == 0)
974 si->flags |= SI_FL_WAIT_DATA;
975 EV_FD_COND_C(fd, DIR_WR);
976 ob->wex = TICK_ETERNITY;
977 }
Willy Tarreau2d212792008-08-27 21:41:35 +0200978 }
979 else {
980 /* (re)start writing and update timeout. Note: we don't recompute the timeout
981 * everytime we get here, otherwise it would risk never to expire. We only
Willy Tarreaufe8903c2009-10-04 10:56:08 +0200982 * update it if is was not yet set. The stream socket handler will already
983 * have updated it if there has been a completed I/O.
Willy Tarreau2d212792008-08-27 21:41:35 +0200984 */
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100985 si->flags &= ~SI_FL_WAIT_DATA;
Willy Tarreau2d212792008-08-27 21:41:35 +0200986 EV_FD_COND_S(fd, DIR_WR);
Willy Tarreaufe8903c2009-10-04 10:56:08 +0200987 if (!tick_isset(ob->wex)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200988 ob->wex = tick_add_ifset(now_ms, ob->wto);
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200989 if (tick_isset(ib->rex) && !(si->flags & SI_FL_INDEP_STR)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200990 /* Note: depending on the protocol, we don't know if we're waiting
991 * for incoming data or not. So in order to prevent the socket from
992 * expiring read timeouts during writes, we refresh the read timeout,
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200993 * except if it was already infinite or if we have explicitly setup
994 * independant streams.
Willy Tarreau2d212792008-08-27 21:41:35 +0200995 */
Willy Tarreaud06e7112009-03-29 10:18:41 +0200996 ib->rex = tick_add_ifset(now_ms, ib->rto);
Willy Tarreau2d212792008-08-27 21:41:35 +0200997 }
998 }
999 }
1000 }
Willy Tarreau2d212792008-08-27 21:41:35 +02001001}
1002
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001003/* This function is used for inter-stream-interface calls. It is called by the
1004 * consumer to inform the producer side that it may be interested in checking
1005 * for free space in the buffer. Note that it intentionally does not update
1006 * timeouts, so that we can still check them later at wake-up.
1007 */
1008void stream_sock_chk_rcv(struct stream_interface *si)
1009{
1010 struct buffer *ib = si->ib;
1011
1012 DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",
1013 now_ms, __FUNCTION__,
Vincenzo Farruggia9b97cff2009-01-30 16:49:10 +00001014 si->fd, fdtab[si->fd].owner,
1015 ib, si->ob,
1016 ib->rex, si->ob->wex,
1017 ib->flags, si->ob->flags,
1018 ib->l, si->ob->l, si->state);
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001019
1020 if (unlikely(si->state != SI_ST_EST || (ib->flags & BF_SHUTR)))
1021 return;
1022
Willy Tarreauf1ba4b32009-10-17 14:37:52 +02001023 if (ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) {
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001024 /* stop reading */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +02001025 if ((ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) == BF_FULL)
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001026 si->flags |= SI_FL_WAIT_ROOM;
1027 EV_FD_COND_C(si->fd, DIR_RD);
1028 }
1029 else {
1030 /* (re)start reading */
1031 si->flags &= ~SI_FL_WAIT_ROOM;
1032 EV_FD_COND_S(si->fd, DIR_RD);
1033 }
1034}
1035
1036
1037/* This function is used for inter-stream-interface calls. It is called by the
1038 * producer to inform the consumer side that it may be interested in checking
1039 * for data in the buffer. Note that it intentionally does not update timeouts,
1040 * so that we can still check them later at wake-up.
1041 */
1042void stream_sock_chk_snd(struct stream_interface *si)
1043{
1044 struct buffer *ob = si->ob;
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001045 int retval;
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001046
1047 DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",
1048 now_ms, __FUNCTION__,
Vincenzo Farruggia9b97cff2009-01-30 16:49:10 +00001049 si->fd, fdtab[si->fd].owner,
1050 si->ib, ob,
1051 si->ib->rex, ob->wex,
1052 si->ib->flags, ob->flags,
1053 si->ib->l, ob->l, si->state);
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001054
1055 if (unlikely(si->state != SI_ST_EST || (ob->flags & BF_SHUTW)))
1056 return;
1057
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001058 if (!(si->flags & SI_FL_WAIT_DATA) || /* not waiting for data */
1059 (fdtab[si->fd].ev & FD_POLL_OUT) || /* we'll be called anyway */
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001060 (ob->flags & BF_OUT_EMPTY)) /* called with nothing to send ! */
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001061 return;
1062
1063 retval = stream_sock_write_loop(si, ob);
Willy Tarreauc54aef32009-07-27 20:08:06 +02001064 /* here, we have :
1065 * retval < 0 if an error was encountered during write.
1066 * retval = 0 if we can't write anymore without polling
1067 * retval = 1 if we're invited to come back when desired
1068 */
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001069 if (retval < 0) {
1070 /* Write error on the file descriptor. We mark the FD as STERROR so
1071 * that we don't use it anymore and we notify the task.
1072 */
1073 fdtab[si->fd].state = FD_STERROR;
1074 fdtab[si->fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau1714e0f2009-03-28 20:54:53 +01001075 EV_FD_REM(si->fd);
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001076 si->flags |= SI_FL_ERR;
1077 goto out_wakeup;
1078 }
1079
Willy Tarreauc54aef32009-07-27 20:08:06 +02001080 /* OK, so now we know that retval >= 0 means that some data might have
1081 * been sent, and that we may have to poll first. We have to do that
1082 * too if the buffer is not empty.
1083 */
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001084 if (ob->flags & BF_OUT_EMPTY) {
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001085 /* the connection is established but we can't write. Either the
1086 * buffer is empty, or we just refrain from sending because the
1087 * send_max limit was reached. Maybe we just wrote the last
1088 * chunk and need to close.
1089 */
Willy Tarreau520d95e2009-09-19 21:04:57 +02001090 if (((ob->flags & (BF_SHUTW|BF_HIJACK|BF_AUTO_CLOSE|BF_SHUTW_NOW)) ==
1091 (BF_AUTO_CLOSE|BF_SHUTW_NOW)) &&
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001092 (si->state == SI_ST_EST)) {
1093 stream_sock_shutw(si);
1094 goto out_wakeup;
1095 }
Willy Tarreaud06e7112009-03-29 10:18:41 +02001096
Willy Tarreau59454bf2009-09-20 11:13:40 +02001097 if ((ob->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_FULL|BF_HIJACK)) == 0)
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001098 si->flags |= SI_FL_WAIT_DATA;
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001099 ob->wex = TICK_ETERNITY;
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001100 }
1101 else {
Willy Tarreauc54aef32009-07-27 20:08:06 +02001102 /* Otherwise there are remaining data to be sent in the buffer,
1103 * which means we have to poll before doing so.
1104 */
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001105 EV_FD_COND_S(si->fd, DIR_WR);
Willy Tarreauc54aef32009-07-27 20:08:06 +02001106 si->flags &= ~SI_FL_WAIT_DATA;
1107 if (!tick_isset(ob->wex))
1108 ob->wex = tick_add_ifset(now_ms, ob->wto);
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001109 }
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001110
Willy Tarreauc9619462009-03-09 22:40:57 +01001111 if (likely(ob->flags & BF_WRITE_ACTIVITY)) {
1112 /* update timeout if we have written something */
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001113 if ((ob->flags & (BF_OUT_EMPTY|BF_SHUTW|BF_WRITE_PARTIAL)) == BF_WRITE_PARTIAL)
Willy Tarreauc9619462009-03-09 22:40:57 +01001114 ob->wex = tick_add_ifset(now_ms, ob->wto);
1115
Willy Tarreauf27b5ea2009-10-03 22:01:18 +02001116 if (tick_isset(si->ib->rex) && !(si->flags & SI_FL_INDEP_STR)) {
Willy Tarreauc9619462009-03-09 22:40:57 +01001117 /* Note: to prevent the client from expiring read timeouts
Willy Tarreauf27b5ea2009-10-03 22:01:18 +02001118 * during writes, we refresh it. We only do this if the
1119 * interface is not configured for "independant streams",
1120 * because for some applications it's better not to do this,
1121 * for instance when continuously exchanging small amounts
1122 * of data which can full the socket buffers long before a
1123 * write timeout is detected.
Willy Tarreauc9619462009-03-09 22:40:57 +01001124 */
1125 si->ib->rex = tick_add_ifset(now_ms, si->ib->rto);
1126 }
1127 }
1128
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001129 /* in case of special condition (error, shutdown, end of write...), we
1130 * have to notify the task.
1131 */
1132 if (likely((ob->flags & (BF_WRITE_NULL|BF_WRITE_ERROR|BF_SHUTW)) ||
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001133 ((ob->flags & BF_OUT_EMPTY) && !ob->to_forward) ||
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001134 si->state != SI_ST_EST)) {
1135 out_wakeup:
Willy Tarreau89f7ef22009-09-05 20:57:35 +02001136 if (!(si->flags & SI_FL_DONT_WAKE) && si->owner)
1137 task_wakeup(si->owner, TASK_WOKEN_IO);
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001138 }
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001139}
1140
Willy Tarreaueb472682010-05-28 18:46:57 +02001141/* This function is called on a read event from a listening socket, corresponding
1142 * to an accept. It tries to accept as many connections as possible, and for each
1143 * calls the listener's accept handler (generally the frontend's accept handler).
1144 */
1145int stream_sock_accept(int fd)
1146{
1147 struct listener *l = fdtab[fd].owner;
1148 struct proxy *p = l->frontend;
1149 int max_accept = global.tune.maxaccept;
1150 int cfd;
1151 int ret;
1152
1153 if (unlikely(l->nbconn >= l->maxconn)) {
1154 EV_FD_CLR(l->fd, DIR_RD);
1155 l->state = LI_FULL;
1156 return 0;
1157 }
1158
1159 if (p && p->fe_sps_lim) {
1160 int max = freq_ctr_remain(&p->fe_sess_per_sec, p->fe_sps_lim, 0);
1161 if (max_accept > max)
1162 max_accept = max;
1163 }
1164
1165 while ((!p || p->feconn < p->maxconn) && actconn < global.maxconn && max_accept--) {
1166 struct sockaddr_storage addr;
1167 socklen_t laddr = sizeof(addr);
1168
1169 cfd = accept(fd, (struct sockaddr *)&addr, &laddr);
1170 if (unlikely(cfd == -1)) {
1171 switch (errno) {
1172 case EAGAIN:
1173 case EINTR:
1174 case ECONNABORTED:
1175 return 0; /* nothing more to accept */
1176 case ENFILE:
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001177 if (p)
1178 send_log(p, LOG_EMERG,
1179 "Proxy %s reached system FD limit at %d. Please check system tunables.\n",
1180 p->id, maxfd);
Willy Tarreaueb472682010-05-28 18:46:57 +02001181 return 0;
1182 case EMFILE:
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001183 if (p)
1184 send_log(p, LOG_EMERG,
1185 "Proxy %s reached process FD limit at %d. Please check 'ulimit-n' and restart.\n",
1186 p->id, maxfd);
Willy Tarreaueb472682010-05-28 18:46:57 +02001187 return 0;
1188 case ENOBUFS:
1189 case ENOMEM:
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001190 if (p)
1191 send_log(p, LOG_EMERG,
1192 "Proxy %s reached system memory limit at %d sockets. Please check system tunables.\n",
1193 p->id, maxfd);
Willy Tarreaueb472682010-05-28 18:46:57 +02001194 return 0;
1195 default:
1196 return 0;
1197 }
1198 }
1199
1200 if (unlikely(cfd >= global.maxsock)) {
Willy Tarreaufffe1322010-11-11 09:48:16 +01001201 send_log(p, LOG_EMERG,
1202 "Proxy %s reached the configured maximum connection limit. Please check the global 'maxconn' value.\n",
1203 p->id);
Willy Tarreauabe8ea52010-11-11 10:56:04 +01001204 close(cfd);
1205 return 0;
Willy Tarreaueb472682010-05-28 18:46:57 +02001206 }
1207
Willy Tarreauaf7ad002010-08-31 15:39:26 +02001208 jobs++;
Willy Tarreau24dcaf32010-06-05 10:49:41 +02001209 actconn++;
1210 totalconn++;
1211 l->nbconn++;
1212
1213 if (l->counters) {
1214 if (l->nbconn > l->counters->conn_max)
1215 l->counters->conn_max = l->nbconn;
1216 }
1217
Willy Tarreaueb472682010-05-28 18:46:57 +02001218 ret = l->accept(l, cfd, &addr);
Willy Tarreauabe8ea52010-11-11 10:56:04 +01001219 if (unlikely(ret <= 0)) {
1220 /* The connection was closed by session_accept(). Either
1221 * we just have to ignore it (ret == 0) or it's a critical
1222 * error due to a resource shortage, and we must stop the
1223 * listener (ret < 0).
1224 */
1225 jobs--;
1226 actconn--;
1227 l->nbconn--;
1228 if (ret == 0) /* successful termination */
1229 continue;
1230
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001231 if (p) {
Willy Tarreaue9f32db2010-09-21 21:14:29 +02001232 disable_listener(l);
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001233 p->state = PR_STIDLE;
1234 }
Willy Tarreauabe8ea52010-11-11 10:56:04 +01001235 return 0;
Willy Tarreaueb472682010-05-28 18:46:57 +02001236 }
1237
Willy Tarreaueb472682010-05-28 18:46:57 +02001238 if (l->nbconn >= l->maxconn) {
1239 EV_FD_CLR(l->fd, DIR_RD);
1240 l->state = LI_FULL;
1241 }
Willy Tarreaueb472682010-05-28 18:46:57 +02001242 } /* end of while (p->feconn < p->maxconn) */
1243 return 0;
Willy Tarreaueb472682010-05-28 18:46:57 +02001244}
1245
Willy Tarreauabe8ea52010-11-11 10:56:04 +01001246
Willy Tarreaua8f55d52010-05-31 17:44:19 +02001247/* Prepare a stream interface to be used in socket mode. */
1248void stream_sock_prepare_interface(struct stream_interface *si)
1249{
1250 si->update = stream_sock_data_finish;
1251 si->shutr = stream_sock_shutr;
1252 si->shutw = stream_sock_shutw;
1253 si->chk_rcv = stream_sock_chk_rcv;
1254 si->chk_snd = stream_sock_chk_snd;
1255 si->iohandler = NULL;
1256}
1257
Willy Tarreaubaaee002006-06-26 02:48:02 +02001258
1259/*
1260 * Local variables:
1261 * c-indent-level: 8
1262 * c-basic-offset: 8
1263 * End:
1264 */