blob: 362470216797d5b907e89781b8477abbe8c1e0ae [file] [log] [blame]
Willy Tarreaubaaee002006-06-26 02:48:02 +02001/*
2 * Functions operating on SOCK_STREAM and buffers.
3 *
Willy Tarreaua8f55d52010-05-31 17:44:19 +02004 * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
Willy Tarreaubaaee002006-06-26 02:48:02 +02005 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
Willy Tarreau6b4aad42009-01-18 21:59:13 +010013#define _GNU_SOURCE
Willy Tarreaubaaee002006-06-26 02:48:02 +020014#include <errno.h>
15#include <fcntl.h>
16#include <stdio.h>
17#include <stdlib.h>
18
19#include <sys/socket.h>
20#include <sys/stat.h>
21#include <sys/types.h>
22
Dmitry Sivachenkocaf58982009-08-24 15:11:06 +040023#include <netinet/tcp.h>
24
Willy Tarreau2dd0d472006-06-29 17:53:05 +020025#include <common/compat.h>
Willy Tarreaue3ba5f02006-06-29 18:54:54 +020026#include <common/config.h>
Willy Tarreaud6f087e2008-01-18 17:20:13 +010027#include <common/debug.h>
Willy Tarreau83749182007-04-15 20:56:27 +020028#include <common/standard.h>
Willy Tarreau0c303ee2008-07-07 00:09:58 +020029#include <common/ticks.h>
Willy Tarreau2dd0d472006-06-29 17:53:05 +020030#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020031
Willy Tarreau2d212792008-08-27 21:41:35 +020032#include <proto/buffers.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020033#include <proto/fd.h>
Willy Tarreaueb472682010-05-28 18:46:57 +020034#include <proto/freq_ctr.h>
35#include <proto/log.h>
Willy Tarreau3eba98a2009-01-25 13:56:13 +010036#include <proto/pipe.h>
Willy Tarreaufe598a72010-09-21 21:48:23 +020037#include <proto/protocols.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020038#include <proto/stream_sock.h>
39#include <proto/task.h>
40
Willy Tarreau5bd8c372009-01-19 00:32:22 +010041#include <types/global.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020042
Willy Tarreau6b4aad42009-01-18 21:59:13 +010043/* On recent Linux kernels, the splice() syscall may be used for faster data copy.
44 * But it's not always defined on some OS versions, and it even happens that some
45 * definitions are wrong with some glibc due to an offset bug in syscall().
46 */
47
48#if defined(CONFIG_HAP_LINUX_SPLICE)
49#include <unistd.h>
50#include <sys/syscall.h>
51
/* Fallback values for the splice() flags. Each one is only defined when the
 * system headers did not already provide it.
 */
#ifndef SPLICE_F_MOVE
#define SPLICE_F_MOVE 0x1
#endif

#ifndef SPLICE_F_NONBLOCK
#define SPLICE_F_NONBLOCK 0x2
#endif

#ifndef SPLICE_F_MORE
#define SPLICE_F_MORE 0x4
#endif

/* When the system headers do not expose the splice() syscall number, select
 * it per architecture, then emit a splice() stub with _syscall6().
 * NOTE(review): _syscall6 is not defined in this file; it is presumably
 * supplied by legacy kernel/libc headers - confirm on the target toolchain.
 * Unknown architectures only get a compile-time warning and the i386 number,
 * which is a guess and may be wrong at run time.
 */
#ifndef __NR_splice
#if defined(__powerpc__) || defined(__powerpc64__)
#define __NR_splice 283
#elif defined(__sparc__) || defined(__sparc64__)
#define __NR_splice 232
#elif defined(__x86_64__)
#define __NR_splice 275
#elif defined(__alpha__)
#define __NR_splice 468
#elif defined (__i386__)
#define __NR_splice 313
#else
#warning unsupported architecture, guessing __NR_splice=313 like x86...
#define __NR_splice 313
#endif /* $arch */

_syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags)

#endif /* __NR_splice */
Willy Tarreau5bd8c372009-01-19 00:32:22 +010083
/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes
 * because of timestamps. Use this as a hint for not looping on splice().
 * The expansion is parenthesized so that the macro behaves as a single operand
 * whatever operators surround it (e.g. "x % SPLICE_FULL_HINT").
 */
#define SPLICE_FULL_HINT (16*1448)

/* how much data we attempt to splice at once when the buffer is configured for
 * infinite forwarding */
#define MAX_SPLICE_AT_ONCE (1<<30)
92
Willy Tarreau5bd8c372009-01-19 00:32:22 +010093/* Returns :
94 * -1 if splice is not possible or not possible anymore and we must switch to
95 * user-land copy (eg: to_forward reached)
96 * 0 when we know that polling is required to get more data (EAGAIN)
97 * 1 for all other cases (we can safely try again, or if an activity has been
98 * detected (DATA/NULL/ERR))
99 * Sets :
100 * BF_READ_NULL
101 * BF_READ_PARTIAL
102 * BF_WRITE_PARTIAL (during copy)
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200103 * BF_OUT_EMPTY (during copy)
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100104 * SI_FL_ERR
105 * SI_FL_WAIT_ROOM
106 * (SI_FL_WAIT_RECV)
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100107 *
108 * This function automatically allocates a pipe from the pipe pool. It also
109 * carefully ensures to clear b->pipe whenever it leaves the pipe empty.
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100110 */
111static int stream_sock_splice_in(struct buffer *b, struct stream_interface *si)
112{
113 int fd = si->fd;
Willy Tarreau31971e52009-09-20 12:07:52 +0200114 int ret;
115 unsigned long max;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100116 int retval = 1;
117
118 if (!b->to_forward)
119 return -1;
120
121 if (!(b->flags & BF_KERN_SPLICING))
122 return -1;
123
124 if (b->l) {
125 /* We're embarrassed, there are already data pending in
126 * the buffer and we don't want to have them at two
127 * locations at a time. Let's indicate we need some
128 * place and ask the consumer to hurry.
129 */
130 si->flags |= SI_FL_WAIT_ROOM;
131 EV_FD_CLR(fd, DIR_RD);
132 b->rex = TICK_ETERNITY;
133 b->cons->chk_snd(b->cons);
134 return 1;
135 }
136
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100137 if (unlikely(b->pipe == NULL)) {
138 if (pipes_used >= global.maxpipes || !(b->pipe = get_pipe())) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100139 b->flags &= ~BF_KERN_SPLICING;
140 return -1;
141 }
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100142 }
143
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100144 /* At this point, b->pipe is valid */
145
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100146 while (1) {
Willy Tarreaua9de3332009-11-28 07:47:10 +0100147 if (b->to_forward == BUF_INFINITE_FORWARD)
148 max = MAX_SPLICE_AT_ONCE;
149 else
150 max = b->to_forward;
151
Willy Tarreau31971e52009-09-20 12:07:52 +0200152 if (!max) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100153 /* It looks like the buffer + the pipe already contain
154 * the maximum amount of data to be transferred. Try to
155 * send those data immediately on the other side if it
156 * is currently waiting.
157 */
158 retval = -1; /* end of forwarding */
159 break;
160 }
161
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100162 ret = splice(fd, NULL, b->pipe->prod, NULL, max,
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100163 SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
164
165 if (ret <= 0) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100166 if (ret == 0) {
Willy Tarreau98b306b2009-01-25 11:11:32 +0100167 /* connection closed. This is only detected by
168 * recent kernels (>= 2.6.27.13).
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100169 */
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100170 b->flags |= BF_READ_NULL;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100171 retval = 1; /* no need for further polling */
172 break;
173 }
174
175 if (errno == EAGAIN) {
176 /* there are two reasons for EAGAIN :
177 * - nothing in the socket buffer (standard)
178 * - pipe is full
Willy Tarreau98b306b2009-01-25 11:11:32 +0100179 * - the connection is closed (kernel < 2.6.27.13)
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100180 * Since we don't know if pipe is full, we'll
181 * stop if the pipe is not empty. Anyway, we
182 * will almost always fill/empty the pipe.
183 */
184
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100185 if (b->pipe->data) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100186 si->flags |= SI_FL_WAIT_ROOM;
187 retval = 1;
188 break;
189 }
190
Willy Tarreau98b306b2009-01-25 11:11:32 +0100191 /* We don't know if the connection was closed.
192 * But if we're called upon POLLIN with an empty
193 * pipe and get EAGAIN, it is suspect enought to
194 * try to fall back to the normal recv scheme
195 * which will be able to deal with the situation.
196 */
197 retval = -1;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100198 break;
199 }
Willy Tarreaudc340a92009-06-28 23:10:19 +0200200
Willy Tarreaua9de3332009-11-28 07:47:10 +0100201 if (errno == ENOSYS || errno == EINVAL) {
Willy Tarreaudc340a92009-06-28 23:10:19 +0200202 /* splice not supported on this end, disable it */
203 b->flags &= ~BF_KERN_SPLICING;
204 si->flags &= ~SI_FL_CAP_SPLICE;
205 put_pipe(b->pipe);
206 b->pipe = NULL;
207 return -1;
208 }
209
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100210 /* here we have another error */
211 si->flags |= SI_FL_ERR;
212 retval = 1;
213 break;
214 } /* ret <= 0 */
215
Willy Tarreau31971e52009-09-20 12:07:52 +0200216 if (b->to_forward != BUF_INFINITE_FORWARD)
217 b->to_forward -= ret;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100218 b->total += ret;
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100219 b->pipe->data += ret;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100220 b->flags |= BF_READ_PARTIAL;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200221 b->flags &= ~BF_OUT_EMPTY;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100222
Willy Tarreau6f4a82c2009-03-21 20:43:57 +0100223 if (b->pipe->data >= SPLICE_FULL_HINT ||
224 ret >= global.tune.recv_enough) {
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100225 /* We've read enough of it for this time. */
226 retval = 1;
227 break;
228 }
229 } /* while */
230
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100231 if (unlikely(!b->pipe->data)) {
232 put_pipe(b->pipe);
233 b->pipe = NULL;
234 }
235
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100236 return retval;
237}
238
Willy Tarreau6b4aad42009-01-18 21:59:13 +0100239#endif /* CONFIG_HAP_LINUX_SPLICE */
240
241
Willy Tarreaubaaee002006-06-26 02:48:02 +0200242/*
Willy Tarreaud7971282006-07-29 18:36:34 +0200243 * this function is called on a read event from a stream socket.
Willy Tarreau83749182007-04-15 20:56:27 +0200244 * It returns 0 if we have a high confidence that we will not be
245 * able to read more data without polling first. Returns non-zero
246 * otherwise.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200247 */
Willy Tarreaud7971282006-07-29 18:36:34 +0200248int stream_sock_read(int fd) {
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200249 struct stream_interface *si = fdtab[fd].owner;
Willy Tarreau48adac52008-08-30 04:58:38 +0200250 struct buffer *b = si->ib;
Willy Tarreau8a7af602008-05-03 23:07:14 +0200251 int ret, max, retval, cur_read;
Willy Tarreaub8949f12007-03-23 22:39:59 +0100252 int read_poll = MAX_READ_POLL_LOOPS;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200253
254#ifdef DEBUG_FULL
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100255 fprintf(stderr,"stream_sock_read : fd=%d, ev=0x%02x, owner=%p\n", fd, fdtab[fd].ev, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200256#endif
257
Willy Tarreau83749182007-04-15 20:56:27 +0200258 retval = 1;
259
Willy Tarreau71543652009-07-14 19:55:05 +0200260 /* stop immediately on errors. Note that we DON'T want to stop on
261 * POLL_ERR, as the poller might report a write error while there
262 * are still data available in the recv buffer. This typically
263 * happens when we send too large a request to a backend server
264 * which rejects it before reading it all.
265 */
266 if (fdtab[fd].state == FD_STERROR)
Willy Tarreau6996e152007-04-30 14:37:43 +0200267 goto out_error;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100268
269 /* stop here if we reached the end of data */
270 if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)
271 goto out_shutdown_r;
Willy Tarreau83749182007-04-15 20:56:27 +0200272
Willy Tarreaud06e7112009-03-29 10:18:41 +0200273 /* maybe we were called immediately after an asynchronous shutr */
274 if (b->flags & BF_SHUTR)
275 goto out_wakeup;
276
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100277#if defined(CONFIG_HAP_LINUX_SPLICE)
278 if (b->to_forward && b->flags & BF_KERN_SPLICING) {
Willy Tarreau98b306b2009-01-25 11:11:32 +0100279
280 /* Under Linux, if FD_POLL_HUP is set, we have reached the end.
281 * Since older splice() implementations were buggy and returned
282 * EAGAIN on end of read, let's bypass the call to splice() now.
283 */
284 if (fdtab[fd].ev & FD_POLL_HUP)
285 goto out_shutdown_r;
286
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100287 retval = stream_sock_splice_in(b, si);
288
289 if (retval >= 0) {
290 if (si->flags & SI_FL_ERR)
291 goto out_error;
292 if (b->flags & BF_READ_NULL)
293 goto out_shutdown_r;
294 goto out_wakeup;
295 }
296 /* splice not possible (anymore), let's go on on standard copy */
297 }
298#endif
Willy Tarreau8a7af602008-05-03 23:07:14 +0200299 cur_read = 0;
Willy Tarreau6996e152007-04-30 14:37:43 +0200300 while (1) {
Willy Tarreau864e8252009-12-28 17:36:37 +0100301 max = buffer_max_len(b) - b->l;
302
303 if (max <= 0) {
304 b->flags |= BF_FULL;
305 si->flags |= SI_FL_WAIT_ROOM;
306 break;
307 }
308
Willy Tarreau6996e152007-04-30 14:37:43 +0200309 /*
310 * 1. compute the maximum block size we can read at once.
311 */
Willy Tarreau03d60bb2009-01-09 11:13:00 +0100312 if (b->l == 0) {
313 /* let's realign the buffer to optimize I/O */
314 b->r = b->w = b->lr = b->data;
Willy Tarreau83749182007-04-15 20:56:27 +0200315 }
316 else if (b->r > b->w) {
Willy Tarreau864e8252009-12-28 17:36:37 +0100317 /* remaining space wraps at the end, with a moving limit */
318 if (max > b->data + b->size - b->r)
319 max = b->data + b->size - b->r;
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100320 }
Willy Tarreau864e8252009-12-28 17:36:37 +0100321 /* else max is already OK */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200322
Willy Tarreau6996e152007-04-30 14:37:43 +0200323 /*
324 * 2. read the largest possible block
325 */
Willy Tarreaufc1daaf2010-01-15 10:26:13 +0100326 ret = recv(fd, b->r, max, 0);
Willy Tarreaud6d06902009-08-19 11:22:33 +0200327
Willy Tarreau83749182007-04-15 20:56:27 +0200328 if (ret > 0) {
329 b->r += ret;
330 b->l += ret;
Willy Tarreau8a7af602008-05-03 23:07:14 +0200331 cur_read += ret;
Willy Tarreaub38903c2008-11-23 21:33:29 +0100332
Willy Tarreau0abebcc2009-01-08 00:09:41 +0100333 /* if we're allowed to directly forward data, we must update send_max */
Willy Tarreau31971e52009-09-20 12:07:52 +0200334 if (b->to_forward && !(b->flags & (BF_SHUTW|BF_SHUTW_NOW))) {
335 unsigned long fwd = ret;
336 if (b->to_forward != BUF_INFINITE_FORWARD) {
337 if (fwd > b->to_forward)
338 fwd = b->to_forward;
339 b->to_forward -= fwd;
340 }
341 b->send_max += fwd;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200342 b->flags &= ~BF_OUT_EMPTY;
Willy Tarreau0abebcc2009-01-08 00:09:41 +0100343 }
Willy Tarreauf890dc92008-12-13 21:12:26 +0100344
Willy Tarreaub38903c2008-11-23 21:33:29 +0100345 if (fdtab[fd].state == FD_STCONN)
346 fdtab[fd].state = FD_STREADY;
347
Willy Tarreau3da77c52008-08-29 09:58:42 +0200348 b->flags |= BF_READ_PARTIAL;
Willy Tarreau74ab2ac2008-11-23 17:23:07 +0100349
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200350 if (b->r == b->data + b->size) {
Willy Tarreau83749182007-04-15 20:56:27 +0200351 b->r = b->data; /* wrap around the buffer */
352 }
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100353
Willy Tarreau83749182007-04-15 20:56:27 +0200354 b->total += ret;
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100355
Willy Tarreau7c3c5412009-12-13 15:53:05 +0100356 if (b->l >= buffer_max_len(b)) {
Willy Tarreau6996e152007-04-30 14:37:43 +0200357 /* The buffer is now full, there's no point in going through
358 * the loop again.
359 */
Willy Tarreau8a7af602008-05-03 23:07:14 +0200360 if (!(b->flags & BF_STREAMER_FAST) && (cur_read == b->l)) {
361 b->xfer_small = 0;
362 b->xfer_large++;
363 if (b->xfer_large >= 3) {
364 /* we call this buffer a fast streamer if it manages
365 * to be filled in one call 3 consecutive times.
366 */
367 b->flags |= (BF_STREAMER | BF_STREAMER_FAST);
368 //fputc('+', stderr);
369 }
370 }
371 else if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200372 (cur_read <= b->size / 2)) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200373 b->xfer_large = 0;
374 b->xfer_small++;
375 if (b->xfer_small >= 2) {
376 /* if the buffer has been at least half full twice,
377 * we receive faster than we send, so at least it
378 * is not a "fast streamer".
379 */
380 b->flags &= ~BF_STREAMER_FAST;
381 //fputc('-', stderr);
382 }
383 }
384 else {
385 b->xfer_small = 0;
386 b->xfer_large = 0;
387 }
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100388
389 b->flags |= BF_FULL;
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100390 si->flags |= SI_FL_WAIT_ROOM;
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100391 break;
Willy Tarreau6996e152007-04-30 14:37:43 +0200392 }
393
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200394 /* if too many bytes were missing from last read, it means that
395 * it's pointless trying to read again because the system does
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100396 * not have them in buffers. BTW, if FD_POLL_HUP was present,
397 * it means that we have reached the end and that the connection
398 * is closed.
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200399 */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100400 if (ret < max) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200401 if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200402 (cur_read <= b->size / 2)) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200403 b->xfer_large = 0;
404 b->xfer_small++;
405 if (b->xfer_small >= 3) {
406 /* we have read less than half of the buffer in
407 * one pass, and this happened at least 3 times.
408 * This is definitely not a streamer.
409 */
410 b->flags &= ~(BF_STREAMER | BF_STREAMER_FAST);
411 //fputc('!', stderr);
412 }
413 }
Willy Tarreau2bea3a12008-08-28 09:47:43 +0200414 /* unfortunately, on level-triggered events, POLL_HUP
415 * is generally delivered AFTER the system buffer is
416 * empty, so this one might never match.
417 */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100418 if (fdtab[fd].ev & FD_POLL_HUP)
419 goto out_shutdown_r;
Willy Tarreau2bea3a12008-08-28 09:47:43 +0200420
421 /* if a streamer has read few data, it may be because we
422 * have exhausted system buffers. It's not worth trying
423 * again.
424 */
425 if (b->flags & BF_STREAMER)
426 break;
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200427
Willy Tarreau6f4a82c2009-03-21 20:43:57 +0100428 /* generally if we read something smaller than 1 or 2 MSS,
429 * it means that either we have exhausted the system's
430 * buffers (streamer or question-response protocol) or
431 * that the connection will be closed. Streamers are
432 * easily detected so we return early. For other cases,
433 * it's still better to perform a last read to be sure,
434 * because it may save one complete poll/read/wakeup cycle
435 * in case of shutdown.
436 */
437 if (ret < MIN_RET_FOR_READ_LOOP && b->flags & BF_STREAMER)
438 break;
439
440 /* if we read a large block smaller than what we requested,
441 * it's almost certain we'll never get anything more.
442 */
443 if (ret >= global.tune.recv_enough)
444 break;
445 }
Willy Tarreau83749182007-04-15 20:56:27 +0200446
Willy Tarreau1b194fe2009-03-21 21:10:04 +0100447 if ((b->flags & BF_READ_DONTWAIT) || --read_poll <= 0)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200448 break;
Willy Tarreau83749182007-04-15 20:56:27 +0200449 }
450 else if (ret == 0) {
Willy Tarreau6996e152007-04-30 14:37:43 +0200451 /* connection closed */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100452 goto out_shutdown_r;
Willy Tarreau83749182007-04-15 20:56:27 +0200453 }
Willy Tarreau9f195292007-04-15 21:26:58 +0200454 else if (errno == EAGAIN) {
455 /* Ignore EAGAIN but inform the poller that there is
Willy Tarreauaf78d0f2009-01-08 10:09:08 +0100456 * nothing to read left if we did not read much, ie
457 * less than what we were still expecting to read.
458 * But we may have done some work justifying to notify
459 * the task.
Willy Tarreau9f195292007-04-15 21:26:58 +0200460 */
Willy Tarreauaf78d0f2009-01-08 10:09:08 +0100461 if (cur_read < MIN_RET_FOR_READ_LOOP)
462 retval = 0;
Willy Tarreau83749182007-04-15 20:56:27 +0200463 break;
464 }
465 else {
Willy Tarreau6996e152007-04-30 14:37:43 +0200466 goto out_error;
Willy Tarreau83749182007-04-15 20:56:27 +0200467 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200468 } /* while (1) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200469
Willy Tarreau6996e152007-04-30 14:37:43 +0200470 out_wakeup:
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100471 /* We might have some data the consumer is waiting for */
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200472 if (!(b->flags & BF_OUT_EMPTY) && (b->cons->flags & SI_FL_WAIT_DATA)) {
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100473 int last_len = b->pipe ? b->pipe->data : 0;
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100474
Willy Tarreau3ffeba12008-12-14 14:42:35 +0100475 b->cons->chk_snd(b->cons);
476
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100477 /* check if the consumer has freed some space */
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100478 if (!(b->flags & BF_FULL) &&
479 (!last_len || !b->pipe || b->pipe->data < last_len))
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100480 si->flags &= ~SI_FL_WAIT_ROOM;
481 }
482
483 if (si->flags & SI_FL_WAIT_ROOM) {
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100484 EV_FD_CLR(fd, DIR_RD);
485 b->rex = TICK_ETERNITY;
486 }
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200487 else if ((b->flags & (BF_SHUTR|BF_READ_PARTIAL|BF_FULL|BF_DONT_READ|BF_READ_NOEXP)) == BF_READ_PARTIAL)
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100488 b->rex = tick_add_ifset(now_ms, b->rto);
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100489
Willy Tarreau6b66f3e2008-12-14 17:31:54 +0100490 /* we have to wake up if there is a special event or if we don't have
491 * any more data to forward.
492 */
Willy Tarreau5af1fa12010-07-19 18:16:03 +0200493 if ((b->flags & (BF_READ_NULL|BF_READ_ERROR)) ||
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100494 si->state != SI_ST_EST ||
Willy Tarreau5af1fa12010-07-19 18:16:03 +0200495 (si->flags & SI_FL_ERR) ||
496 ((b->flags & BF_READ_PARTIAL) && (!b->to_forward || b->cons->state != SI_ST_EST)))
Willy Tarreau6b66f3e2008-12-14 17:31:54 +0100497 task_wakeup(si->owner, TASK_WOKEN_IO);
Willy Tarreau5af1fa12010-07-19 18:16:03 +0200498
499 if (b->flags & BF_READ_ACTIVITY)
500 b->flags &= ~BF_READ_DONTWAIT;
501
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100502 fdtab[fd].ev &= ~FD_POLL_IN;
Willy Tarreau83749182007-04-15 20:56:27 +0200503 return retval;
Willy Tarreau6996e152007-04-30 14:37:43 +0200504
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100505 out_shutdown_r:
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200506 /* we received a shutdown */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100507 fdtab[fd].ev &= ~FD_POLL_HUP;
508 b->flags |= BF_READ_NULL;
Willy Tarreau520d95e2009-09-19 21:04:57 +0200509 if (b->flags & BF_AUTO_CLOSE)
Willy Tarreau418fd472009-09-06 21:37:23 +0200510 buffer_shutw_now(b);
Willy Tarreau99126c32008-11-27 10:30:51 +0100511 stream_sock_shutr(si);
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200512 goto out_wakeup;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100513
Willy Tarreau6996e152007-04-30 14:37:43 +0200514 out_error:
Willy Tarreaucff64112008-11-03 06:26:53 +0100515 /* Read error on the file descriptor. We mark the FD as STERROR so
516 * that we don't use it anymore. The error is reported to the stream
517 * interface which will take proper action. We must not perturbate the
518 * buffer because the stream interface wants to ensure transparent
519 * connection retries.
Willy Tarreau6996e152007-04-30 14:37:43 +0200520 */
Willy Tarreaucff64112008-11-03 06:26:53 +0100521
Willy Tarreau6996e152007-04-30 14:37:43 +0200522 fdtab[fd].state = FD_STERROR;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100523 fdtab[fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau1714e0f2009-03-28 20:54:53 +0100524 EV_FD_REM(fd);
Willy Tarreaucff64112008-11-03 06:26:53 +0100525 si->flags |= SI_FL_ERR;
Willy Tarreau9c0fe592009-01-18 16:25:31 +0100526 retval = 1;
527 goto out_wakeup;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200528}
529
530
531/*
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100532 * This function is called to send buffer data to a stream socket.
533 * It returns -1 in case of unrecoverable error, 0 if the caller needs to poll
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100534 * before calling it again, otherwise 1. If a pipe was associated with the
535 * buffer and it empties it, it releases it as well.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200536 */
Willy Tarreaua456f2a2009-01-18 17:38:44 +0100537static int stream_sock_write_loop(struct stream_interface *si, struct buffer *b)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100538{
Willy Tarreau83749182007-04-15 20:56:27 +0200539 int write_poll = MAX_WRITE_POLL_LOOPS;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100540 int retval = 1;
541 int ret, max;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200542
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100543#if defined(CONFIG_HAP_LINUX_SPLICE)
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100544 while (b->pipe) {
545 ret = splice(b->pipe->cons, NULL, si->fd, NULL, b->pipe->data,
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100546 SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
547 if (ret <= 0) {
548 if (ret == 0 || errno == EAGAIN) {
549 retval = 0;
550 return retval;
551 }
552 /* here we have another error */
553 retval = -1;
554 return retval;
555 }
556
557 b->flags |= BF_WRITE_PARTIAL;
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100558 b->pipe->data -= ret;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100559
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100560 if (!b->pipe->data) {
561 put_pipe(b->pipe);
562 b->pipe = NULL;
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100563 break;
Willy Tarreau3eba98a2009-01-25 13:56:13 +0100564 }
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100565
566 if (--write_poll <= 0)
567 return retval;
568 }
569
570 /* At this point, the pipe is empty, but we may still have data pending
571 * in the normal buffer.
572 */
Willy Tarreau5bd8c372009-01-19 00:32:22 +0100573#endif
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200574 if (!b->send_max) {
575 b->flags |= BF_OUT_EMPTY;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100576 return retval;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200577 }
Willy Tarreau83749182007-04-15 20:56:27 +0200578
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100579 /* when we're in this loop, we already know that there is no spliced
580 * data left, and that there are sendable buffered data.
581 */
Willy Tarreau6996e152007-04-30 14:37:43 +0200582 while (1) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100583 if (b->r > b->w)
Willy Tarreau83749182007-04-15 20:56:27 +0200584 max = b->r - b->w;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100585 else
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200586 max = b->data + b->size - b->w;
Willy Tarreau83749182007-04-15 20:56:27 +0200587
Willy Tarreauf890dc92008-12-13 21:12:26 +0100588 /* limit the amount of outgoing data if required */
589 if (max > b->send_max)
590 max = b->send_max;
591
Willy Tarreau6db06d32009-08-19 11:14:11 +0200592 /* check if we want to inform the kernel that we're interested in
593 * sending more data after this call. We want this if :
594 * - we're about to close after this last send and want to merge
595 * the ongoing FIN with the last segment.
596 * - we know we can't send everything at once and must get back
597 * here because of unaligned data
Willy Tarreaud38b53b2010-01-03 11:18:34 +0100598 * - there is still a finite amount of data to forward
Willy Tarreau6db06d32009-08-19 11:14:11 +0200599 * The test is arranged so that the most common case does only 2
600 * tests.
Willy Tarreaufb14edc2009-06-14 15:24:37 +0200601 */
Willy Tarreaufb14edc2009-06-14 15:24:37 +0200602
Willy Tarreauface8392010-01-03 11:37:54 +0100603 if (MSG_NOSIGNAL && MSG_MORE) {
Willy Tarreau6db06d32009-08-19 11:14:11 +0200604 unsigned int send_flag = MSG_DONTWAIT | MSG_NOSIGNAL;
605
Willy Tarreauface8392010-01-03 11:37:54 +0100606 if (((b->to_forward && b->to_forward != BUF_INFINITE_FORWARD) ||
Willy Tarreaud38b53b2010-01-03 11:18:34 +0100607 ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_HIJACK)) == BF_SHUTW_NOW && (max == b->send_max)) ||
Willy Tarreau6db06d32009-08-19 11:14:11 +0200608 (max != b->l && max != b->send_max))
609 && (fdtab[si->fd].flags & FD_FL_TCP)) {
610 send_flag |= MSG_MORE;
611 }
Willy Tarreauface8392010-01-03 11:37:54 +0100612 else if (b->flags & BF_EXPECT_MORE) {
613 /* it was forced on the buffer, this flag is one-shoot */
614 b->flags &= ~BF_EXPECT_MORE;
615 send_flag |= MSG_MORE;
616 }
Willy Tarreau6db06d32009-08-19 11:14:11 +0200617
Willy Tarreau2be39392010-01-03 17:24:51 +0100618 /* this flag has precedence over the rest */
619 if (b->flags & BF_SEND_DONTWAIT)
620 send_flag &= ~MSG_MORE;
621
Willy Tarreau6db06d32009-08-19 11:14:11 +0200622 ret = send(si->fd, b->w, max, send_flag);
Willy Tarreau2be39392010-01-03 17:24:51 +0100623
624 /* disable it only once everything has been sent */
625 if (ret == max && (b->flags & BF_SEND_DONTWAIT))
626 b->flags &= ~BF_SEND_DONTWAIT;
Willy Tarreaud6d06902009-08-19 11:22:33 +0200627 } else {
Willy Tarreaubaaee002006-06-26 02:48:02 +0200628 int skerr;
629 socklen_t lskerr = sizeof(skerr);
630
Willy Tarreau87bed622009-03-08 22:25:28 +0100631 ret = getsockopt(si->fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
Willy Tarreauc6423482006-10-15 14:59:03 +0200632 if (ret == -1 || skerr)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200633 ret = -1;
634 else
Willy Tarreau87bed622009-03-08 22:25:28 +0100635 ret = send(si->fd, b->w, max, MSG_DONTWAIT);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200636 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200637
638 if (ret > 0) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100639 if (fdtab[si->fd].state == FD_STCONN)
640 fdtab[si->fd].state = FD_STREADY;
Willy Tarreaub38903c2008-11-23 21:33:29 +0100641
Willy Tarreau3da77c52008-08-29 09:58:42 +0200642 b->flags |= BF_WRITE_PARTIAL;
Willy Tarreaue393fe22008-08-16 22:18:07 +0200643
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100644 b->w += ret;
Willy Tarreaua07a34e2009-08-16 23:27:46 +0200645 if (b->w == b->data + b->size)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100646 b->w = b->data; /* wrap around the buffer */
647
648 b->l -= ret;
Willy Tarreau7c3c5412009-12-13 15:53:05 +0100649 if (likely(b->l < buffer_max_len(b)))
Willy Tarreaue393fe22008-08-16 22:18:07 +0200650 b->flags &= ~BF_FULL;
Willy Tarreau74ab2ac2008-11-23 17:23:07 +0100651
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200652 if (likely(!b->l))
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100653 /* optimize data alignment in the buffer */
654 b->r = b->w = b->lr = b->data;
Willy Tarreau83749182007-04-15 20:56:27 +0200655
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100656 b->send_max -= ret;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200657 if (!b->send_max) {
658 if (likely(!b->pipe))
659 b->flags |= BF_OUT_EMPTY;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100660 break;
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200661 }
Willy Tarreau83749182007-04-15 20:56:27 +0200662
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200663 /* if the system buffer is full, don't insist */
664 if (ret < max)
665 break;
666
Willy Tarreau6996e152007-04-30 14:37:43 +0200667 if (--write_poll <= 0)
668 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200669 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200670 else if (ret == 0 || errno == EAGAIN) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100671 /* nothing written, we need to poll for write first */
Willy Tarreau83749182007-04-15 20:56:27 +0200672 retval = 0;
673 break;
674 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200675 else {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100676 /* bad, we got an error */
677 retval = -1;
678 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200679 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200680 } /* while (1) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200681
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100682 return retval;
683}
Willy Tarreau6996e152007-04-30 14:37:43 +0200684
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100685
686/*
687 * This function is called on a write event from a stream socket.
688 * It returns 0 if the caller needs to poll before calling it again, otherwise
689 * non-zero.
690 */
691int stream_sock_write(int fd)
692{
693 struct stream_interface *si = fdtab[fd].owner;
694 struct buffer *b = si->ob;
695 int retval = 1;
696
697#ifdef DEBUG_FULL
698 fprintf(stderr,"stream_sock_write : fd=%d, owner=%p\n", fd, fdtab[fd].owner);
699#endif
700
701 retval = 1;
Willy Tarreau71543652009-07-14 19:55:05 +0200702 if (fdtab[fd].state == FD_STERROR)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100703 goto out_error;
704
Willy Tarreaud06e7112009-03-29 10:18:41 +0200705 /* we might have been called just after an asynchronous shutw */
706 if (b->flags & BF_SHUTW)
707 goto out_wakeup;
708
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200709 if (likely(!(b->flags & BF_OUT_EMPTY))) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100710 /* OK there are data waiting to be sent */
711 retval = stream_sock_write_loop(si, b);
712 if (retval < 0)
713 goto out_error;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200714 }
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100715 else {
716 /* may be we have received a connection acknowledgement in TCP mode without data */
717 if (likely(fdtab[fd].state == FD_STCONN)) {
718 /* We have no data to send to check the connection, and
719 * getsockopt() will not inform us whether the connection
720 * is still pending. So we'll reuse connect() to check the
721 * state of the socket. This has the advantage of givig us
722 * the following info :
723 * - error
724 * - connecting (EALREADY, EINPROGRESS)
725 * - connected (EISCONN, 0)
726 */
Willy Tarreau8d5d77e2009-10-18 07:25:52 +0200727 if ((connect(fd, fdinfo[fd].peeraddr, fdinfo[fd].peerlen) == 0))
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100728 errno = 0;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200729
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100730 if (errno == EALREADY || errno == EINPROGRESS) {
731 retval = 0;
732 goto out_may_wakeup;
733 }
Willy Tarreau3ffeba12008-12-14 14:42:35 +0100734
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100735 if (errno && errno != EISCONN)
736 goto out_error;
Willy Tarreaufa7e1022008-10-19 07:30:41 +0200737
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100738 /* OK we just need to indicate that we got a connection
739 * and that we wrote nothing.
740 */
741 b->flags |= BF_WRITE_NULL;
742 fdtab[fd].state = FD_STREADY;
743 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200744
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100745 /* Funny, we were called to write something but there wasn't
746 * anything. We can get there, for example if we were woken up
747 * on a write event to finish the splice, but the send_max is 0
748 * so we cannot write anything from the buffer. Let's disable
749 * the write event and pretend we never came there.
750 */
751 }
752
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200753 if (b->flags & BF_OUT_EMPTY) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100754 /* the connection is established but we can't write. Either the
755 * buffer is empty, or we just refrain from sending because the
756 * send_max limit was reached. Maybe we just wrote the last
757 * chunk and need to close.
758 */
Willy Tarreau520d95e2009-09-19 21:04:57 +0200759 if (((b->flags & (BF_SHUTW|BF_HIJACK|BF_SHUTW_NOW)) == BF_SHUTW_NOW) &&
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100760 (si->state == SI_ST_EST)) {
761 stream_sock_shutw(si);
762 goto out_wakeup;
763 }
764
Willy Tarreau59454bf2009-09-20 11:13:40 +0200765 if ((b->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_FULL|BF_HIJACK)) == 0)
Willy Tarreauac128fe2009-01-09 13:05:19 +0100766 si->flags |= SI_FL_WAIT_DATA;
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100767
Willy Tarreauac128fe2009-01-09 13:05:19 +0100768 EV_FD_CLR(fd, DIR_WR);
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100769 b->wex = TICK_ETERNITY;
Willy Tarreauac128fe2009-01-09 13:05:19 +0100770 }
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100771
772 out_may_wakeup:
773 if (b->flags & BF_WRITE_ACTIVITY) {
774 /* update timeout if we have written something */
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200775 if ((b->flags & (BF_OUT_EMPTY|BF_SHUTW|BF_WRITE_PARTIAL)) == BF_WRITE_PARTIAL)
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100776 b->wex = tick_add_ifset(now_ms, b->wto);
777
778 out_wakeup:
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200779 if (tick_isset(si->ib->rex) && !(si->flags & SI_FL_INDEP_STR)) {
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100780 /* Note: to prevent the client from expiring read timeouts
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200781 * during writes, we refresh it. We only do this if the
782 * interface is not configured for "independant streams",
783 * because for some applications it's better not to do this,
784 * for instance when continuously exchanging small amounts
785 * of data which can full the socket buffers long before a
786 * write timeout is detected.
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100787 */
788 si->ib->rex = tick_add_ifset(now_ms, si->ib->rto);
789 }
790
791 /* the producer might be waiting for more room to store data */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200792 if (likely((b->flags & (BF_SHUTW|BF_WRITE_PARTIAL|BF_FULL|BF_DONT_READ)) == BF_WRITE_PARTIAL &&
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100793 (b->prod->flags & SI_FL_WAIT_ROOM)))
794 b->prod->chk_rcv(b->prod);
795
796 /* we have to wake up if there is a special event or if we don't have
797 * any more data to forward and it's not planned to send any more.
798 */
799 if (likely((b->flags & (BF_WRITE_NULL|BF_WRITE_ERROR|BF_SHUTW)) ||
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200800 ((b->flags & BF_OUT_EMPTY) && !b->to_forward) ||
Willy Tarreau0c2fc1f2009-01-18 15:30:37 +0100801 si->state != SI_ST_EST ||
802 b->prod->state != SI_ST_EST))
803 task_wakeup(si->owner, TASK_WOKEN_IO);
804 }
805
806 fdtab[fd].ev &= ~FD_POLL_OUT;
807 return retval;
Willy Tarreauac128fe2009-01-09 13:05:19 +0100808
Willy Tarreau6996e152007-04-30 14:37:43 +0200809 out_error:
Willy Tarreaucff64112008-11-03 06:26:53 +0100810 /* Write error on the file descriptor. We mark the FD as STERROR so
811 * that we don't use it anymore. The error is reported to the stream
812 * interface which will take proper action. We must not perturbate the
813 * buffer because the stream interface wants to ensure transparent
814 * connection retries.
Willy Tarreau6996e152007-04-30 14:37:43 +0200815 */
Willy Tarreaucff64112008-11-03 06:26:53 +0100816
Willy Tarreau6996e152007-04-30 14:37:43 +0200817 fdtab[fd].state = FD_STERROR;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100818 fdtab[fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau1714e0f2009-03-28 20:54:53 +0100819 EV_FD_REM(fd);
Willy Tarreaucff64112008-11-03 06:26:53 +0100820 si->flags |= SI_FL_ERR;
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200821 task_wakeup(si->owner, TASK_WOKEN_IO);
822 return 1;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200823}
824
Willy Tarreau48adac52008-08-30 04:58:38 +0200825/*
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200826 * This function performs a shutdown-write on a stream interface in a connected or
827 * init state (it does nothing for other states). It either shuts the write side
Willy Tarreau99126c32008-11-27 10:30:51 +0100828 * or closes the file descriptor and marks itself as closed. The buffer flags are
Willy Tarreau7340ca52010-01-16 10:03:45 +0100829 * updated to reflect the new state. It does also close everything is the SI was
830 * marked as being in error state.
Willy Tarreau48adac52008-08-30 04:58:38 +0200831 */
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100832void stream_sock_shutw(struct stream_interface *si)
Willy Tarreau48adac52008-08-30 04:58:38 +0200833{
Willy Tarreau418fd472009-09-06 21:37:23 +0200834 si->ob->flags &= ~BF_SHUTW_NOW;
Willy Tarreau99126c32008-11-27 10:30:51 +0100835 if (si->ob->flags & BF_SHUTW)
836 return;
837 si->ob->flags |= BF_SHUTW;
838 si->ob->wex = TICK_ETERNITY;
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100839 si->flags &= ~SI_FL_WAIT_DATA;
Willy Tarreau99126c32008-11-27 10:30:51 +0100840
Willy Tarreaub38903c2008-11-23 21:33:29 +0100841 switch (si->state) {
Willy Tarreaub38903c2008-11-23 21:33:29 +0100842 case SI_ST_EST:
Willy Tarreau720058c2009-07-14 19:21:50 +0200843 /* we have to shut before closing, otherwise some short messages
844 * may never leave the system, especially when there are remaining
845 * unread data in the socket input buffer, or when nolinger is set.
Willy Tarreau4c283dc2009-12-29 14:36:34 +0100846 * However, if SI_FL_NOLINGER is explicitly set, we know there is
847 * no risk so we close both sides immediately.
Willy Tarreau720058c2009-07-14 19:21:50 +0200848 */
Willy Tarreau7340ca52010-01-16 10:03:45 +0100849 if (si->flags & SI_FL_ERR) {
850 /* quick close, the socket is already shut. Remove pending flags. */
851 si->flags &= ~SI_FL_NOLINGER;
852 } else if (si->flags & SI_FL_NOLINGER) {
Willy Tarreau4c283dc2009-12-29 14:36:34 +0100853 si->flags &= ~SI_FL_NOLINGER;
854 setsockopt(si->fd, SOL_SOCKET, SO_LINGER,
855 (struct linger *) &nolinger, sizeof(struct linger));
856 } else {
857 EV_FD_CLR(si->fd, DIR_WR);
858 shutdown(si->fd, SHUT_WR);
Willy Tarreau720058c2009-07-14 19:21:50 +0200859
Willy Tarreau4c283dc2009-12-29 14:36:34 +0100860 if (!(si->ib->flags & (BF_SHUTR|BF_DONT_READ)))
861 return;
862 }
Willy Tarreau5d707e12009-06-28 11:09:07 +0200863
Willy Tarreaub38903c2008-11-23 21:33:29 +0100864 /* fall through */
865 case SI_ST_CON:
Willy Tarreau8bfa4262008-11-27 09:25:45 +0100866 /* we may have to close a pending connection, and mark the
867 * response buffer as shutr
868 */
Willy Tarreau48adac52008-08-30 04:58:38 +0200869 fd_delete(si->fd);
Willy Tarreaufe3718a2008-11-30 18:14:12 +0100870 /* fall through */
871 case SI_ST_CER:
Willy Tarreau7f006512008-12-07 14:04:04 +0100872 si->state = SI_ST_DIS;
873 default:
Willy Tarreaud06e7112009-03-29 10:18:41 +0200874 si->flags &= ~SI_FL_WAIT_ROOM;
Willy Tarreau99126c32008-11-27 10:30:51 +0100875 si->ib->flags |= BF_SHUTR;
Willy Tarreaufe3718a2008-11-30 18:14:12 +0100876 si->ib->rex = TICK_ETERNITY;
Willy Tarreau127334e2009-03-28 10:47:26 +0100877 si->exp = TICK_ETERNITY;
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100878 return;
Willy Tarreau48adac52008-08-30 04:58:38 +0200879 }
Willy Tarreau0bd05ea2010-07-02 11:18:03 +0200880
881 if (si->release)
882 si->release(si);
Willy Tarreau48adac52008-08-30 04:58:38 +0200883}
Willy Tarreaubaaee002006-06-26 02:48:02 +0200884
Willy Tarreau2d212792008-08-27 21:41:35 +0200885/*
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200886 * This function performs a shutdown-read on a stream interface in a connected or
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100887 * init state (it does nothing for other states). It either shuts the read side
Willy Tarreau99126c32008-11-27 10:30:51 +0100888 * or closes the file descriptor and marks itself as closed. The buffer flags are
889 * updated to reflect the new state.
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200890 */
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100891void stream_sock_shutr(struct stream_interface *si)
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200892{
Willy Tarreau418fd472009-09-06 21:37:23 +0200893 si->ib->flags &= ~BF_SHUTR_NOW;
Willy Tarreau99126c32008-11-27 10:30:51 +0100894 if (si->ib->flags & BF_SHUTR)
895 return;
896 si->ib->flags |= BF_SHUTR;
897 si->ib->rex = TICK_ETERNITY;
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100898 si->flags &= ~SI_FL_WAIT_ROOM;
Willy Tarreau99126c32008-11-27 10:30:51 +0100899
Willy Tarreau8bfa4262008-11-27 09:25:45 +0100900 if (si->state != SI_ST_EST && si->state != SI_ST_CON)
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100901 return;
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200902
Willy Tarreaucff64112008-11-03 06:26:53 +0100903 if (si->ob->flags & BF_SHUTW) {
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200904 fd_delete(si->fd);
Willy Tarreau74ab2ac2008-11-23 17:23:07 +0100905 si->state = SI_ST_DIS;
Willy Tarreau127334e2009-03-28 10:47:26 +0100906 si->exp = TICK_ETERNITY;
Willy Tarreau0bd05ea2010-07-02 11:18:03 +0200907
908 if (si->release)
909 si->release(si);
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100910 return;
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200911 }
912 EV_FD_CLR(si->fd, DIR_RD);
Willy Tarreau0a5d5dd2008-11-23 19:31:35 +0100913 return;
Willy Tarreau3c6ab2e2008-09-04 11:19:41 +0200914}
915
916/*
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200917 * Updates a connected stream_sock file descriptor status and timeouts
918 * according to the buffers' flags. It should only be called once after the
919 * buffer flags have settled down, and before they are cleared. It doesn't
920 * harm to call it as often as desired (it just slightly hurts performance).
921 */
Willy Tarreaub0253252008-11-30 21:37:12 +0100922void stream_sock_data_finish(struct stream_interface *si)
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200923{
Willy Tarreaub0253252008-11-30 21:37:12 +0100924 struct buffer *ib = si->ib;
925 struct buffer *ob = si->ob;
926 int fd = si->fd;
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200927
Willy Tarreaue5ed4062008-08-30 03:17:31 +0200928 DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200929 now_ms, __FUNCTION__,
930 fd, fdtab[fd].owner,
931 ib, ob,
932 ib->rex, ob->wex,
933 ib->flags, ob->flags,
Willy Tarreaub0253252008-11-30 21:37:12 +0100934 ib->l, ob->l, si->state);
Willy Tarreau3a16b2c2008-08-28 08:54:27 +0200935
936 /* Check if we need to close the read side */
937 if (!(ib->flags & BF_SHUTR)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200938 /* Read not closed, update FD status and timeout for reads */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200939 if (ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200940 /* stop reading */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200941 if ((ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) == BF_FULL)
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100942 si->flags |= SI_FL_WAIT_ROOM;
Willy Tarreau2d212792008-08-27 21:41:35 +0200943 EV_FD_COND_C(fd, DIR_RD);
944 ib->rex = TICK_ETERNITY;
945 }
946 else {
947 /* (re)start reading and update timeout. Note: we don't recompute the timeout
948 * everytime we get here, otherwise it would risk never to expire. We only
Willy Tarreaufe8903c2009-10-04 10:56:08 +0200949 * update it if is was not yet set. The stream socket handler will already
950 * have updated it if there has been a completed I/O.
Willy Tarreau2d212792008-08-27 21:41:35 +0200951 */
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100952 si->flags &= ~SI_FL_WAIT_ROOM;
Willy Tarreau2d212792008-08-27 21:41:35 +0200953 EV_FD_COND_S(fd, DIR_RD);
Willy Tarreauf1ba4b32009-10-17 14:37:52 +0200954 if (!(ib->flags & (BF_READ_NOEXP|BF_DONT_READ)) && !tick_isset(ib->rex))
Willy Tarreau2d212792008-08-27 21:41:35 +0200955 ib->rex = tick_add_ifset(now_ms, ib->rto);
956 }
957 }
958
959 /* Check if we need to close the write side */
960 if (!(ob->flags & BF_SHUTW)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200961 /* Write not closed, update FD status and timeout for writes */
Willy Tarreauba0b63d2009-09-20 08:09:44 +0200962 if (ob->flags & BF_OUT_EMPTY) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200963 /* stop writing */
Willy Tarreau59454bf2009-09-20 11:13:40 +0200964 if ((ob->flags & (BF_FULL|BF_HIJACK|BF_SHUTW_NOW)) == 0)
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100965 si->flags |= SI_FL_WAIT_DATA;
Willy Tarreau2d212792008-08-27 21:41:35 +0200966 EV_FD_COND_C(fd, DIR_WR);
967 ob->wex = TICK_ETERNITY;
968 }
969 else {
970 /* (re)start writing and update timeout. Note: we don't recompute the timeout
971 * everytime we get here, otherwise it would risk never to expire. We only
Willy Tarreaufe8903c2009-10-04 10:56:08 +0200972 * update it if is was not yet set. The stream socket handler will already
973 * have updated it if there has been a completed I/O.
Willy Tarreau2d212792008-08-27 21:41:35 +0200974 */
Willy Tarreaub0ef7352008-12-14 13:26:20 +0100975 si->flags &= ~SI_FL_WAIT_DATA;
Willy Tarreau2d212792008-08-27 21:41:35 +0200976 EV_FD_COND_S(fd, DIR_WR);
Willy Tarreaufe8903c2009-10-04 10:56:08 +0200977 if (!tick_isset(ob->wex)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200978 ob->wex = tick_add_ifset(now_ms, ob->wto);
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200979 if (tick_isset(ib->rex) && !(si->flags & SI_FL_INDEP_STR)) {
Willy Tarreau2d212792008-08-27 21:41:35 +0200980 /* Note: depending on the protocol, we don't know if we're waiting
981 * for incoming data or not. So in order to prevent the socket from
982 * expiring read timeouts during writes, we refresh the read timeout,
Willy Tarreauf27b5ea2009-10-03 22:01:18 +0200983 * except if it was already infinite or if we have explicitly setup
984 * independant streams.
Willy Tarreau2d212792008-08-27 21:41:35 +0200985 */
Willy Tarreaud06e7112009-03-29 10:18:41 +0200986 ib->rex = tick_add_ifset(now_ms, ib->rto);
Willy Tarreau2d212792008-08-27 21:41:35 +0200987 }
988 }
989 }
990 }
Willy Tarreau2d212792008-08-27 21:41:35 +0200991}
992
Willy Tarreau3ffeba12008-12-14 14:42:35 +0100993/* This function is used for inter-stream-interface calls. It is called by the
994 * consumer to inform the producer side that it may be interested in checking
995 * for free space in the buffer. Note that it intentionally does not update
996 * timeouts, so that we can still check them later at wake-up.
997 */
998void stream_sock_chk_rcv(struct stream_interface *si)
999{
1000 struct buffer *ib = si->ib;
1001
1002 DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",
1003 now_ms, __FUNCTION__,
Vincenzo Farruggia9b97cff2009-01-30 16:49:10 +00001004 si->fd, fdtab[si->fd].owner,
1005 ib, si->ob,
1006 ib->rex, si->ob->wex,
1007 ib->flags, si->ob->flags,
1008 ib->l, si->ob->l, si->state);
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001009
1010 if (unlikely(si->state != SI_ST_EST || (ib->flags & BF_SHUTR)))
1011 return;
1012
Willy Tarreauf1ba4b32009-10-17 14:37:52 +02001013 if (ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) {
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001014 /* stop reading */
Willy Tarreauf1ba4b32009-10-17 14:37:52 +02001015 if ((ib->flags & (BF_FULL|BF_HIJACK|BF_DONT_READ)) == BF_FULL)
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001016 si->flags |= SI_FL_WAIT_ROOM;
1017 EV_FD_COND_C(si->fd, DIR_RD);
1018 }
1019 else {
1020 /* (re)start reading */
1021 si->flags &= ~SI_FL_WAIT_ROOM;
1022 EV_FD_COND_S(si->fd, DIR_RD);
1023 }
1024}
1025
1026
1027/* This function is used for inter-stream-interface calls. It is called by the
1028 * producer to inform the consumer side that it may be interested in checking
1029 * for data in the buffer. Note that it intentionally does not update timeouts,
1030 * so that we can still check them later at wake-up.
1031 */
1032void stream_sock_chk_snd(struct stream_interface *si)
1033{
1034 struct buffer *ob = si->ob;
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001035 int retval;
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001036
1037 DPRINTF(stderr,"[%u] %s: fd=%d owner=%p ib=%p, ob=%p, exp(r,w)=%u,%u ibf=%08x obf=%08x ibl=%d obl=%d si=%d\n",
1038 now_ms, __FUNCTION__,
Vincenzo Farruggia9b97cff2009-01-30 16:49:10 +00001039 si->fd, fdtab[si->fd].owner,
1040 si->ib, ob,
1041 si->ib->rex, ob->wex,
1042 si->ib->flags, ob->flags,
1043 si->ib->l, ob->l, si->state);
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001044
1045 if (unlikely(si->state != SI_ST_EST || (ob->flags & BF_SHUTW)))
1046 return;
1047
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001048 if (!(si->flags & SI_FL_WAIT_DATA) || /* not waiting for data */
1049 (fdtab[si->fd].ev & FD_POLL_OUT) || /* we'll be called anyway */
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001050 (ob->flags & BF_OUT_EMPTY)) /* called with nothing to send ! */
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001051 return;
1052
1053 retval = stream_sock_write_loop(si, ob);
Willy Tarreauc54aef32009-07-27 20:08:06 +02001054 /* here, we have :
1055 * retval < 0 if an error was encountered during write.
1056 * retval = 0 if we can't write anymore without polling
1057 * retval = 1 if we're invited to come back when desired
1058 */
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001059 if (retval < 0) {
1060 /* Write error on the file descriptor. We mark the FD as STERROR so
1061 * that we don't use it anymore and we notify the task.
1062 */
1063 fdtab[si->fd].state = FD_STERROR;
1064 fdtab[si->fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau1714e0f2009-03-28 20:54:53 +01001065 EV_FD_REM(si->fd);
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001066 si->flags |= SI_FL_ERR;
1067 goto out_wakeup;
1068 }
1069
Willy Tarreauc54aef32009-07-27 20:08:06 +02001070 /* OK, so now we know that retval >= 0 means that some data might have
1071 * been sent, and that we may have to poll first. We have to do that
1072 * too if the buffer is not empty.
1073 */
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001074 if (ob->flags & BF_OUT_EMPTY) {
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001075 /* the connection is established but we can't write. Either the
1076 * buffer is empty, or we just refrain from sending because the
1077 * send_max limit was reached. Maybe we just wrote the last
1078 * chunk and need to close.
1079 */
Willy Tarreau520d95e2009-09-19 21:04:57 +02001080 if (((ob->flags & (BF_SHUTW|BF_HIJACK|BF_AUTO_CLOSE|BF_SHUTW_NOW)) ==
1081 (BF_AUTO_CLOSE|BF_SHUTW_NOW)) &&
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001082 (si->state == SI_ST_EST)) {
1083 stream_sock_shutw(si);
1084 goto out_wakeup;
1085 }
Willy Tarreaud06e7112009-03-29 10:18:41 +02001086
Willy Tarreau59454bf2009-09-20 11:13:40 +02001087 if ((ob->flags & (BF_SHUTW|BF_SHUTW_NOW|BF_FULL|BF_HIJACK)) == 0)
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001088 si->flags |= SI_FL_WAIT_DATA;
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001089 ob->wex = TICK_ETERNITY;
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001090 }
1091 else {
Willy Tarreauc54aef32009-07-27 20:08:06 +02001092 /* Otherwise there are remaining data to be sent in the buffer,
1093 * which means we have to poll before doing so.
1094 */
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001095 EV_FD_COND_S(si->fd, DIR_WR);
Willy Tarreauc54aef32009-07-27 20:08:06 +02001096 si->flags &= ~SI_FL_WAIT_DATA;
1097 if (!tick_isset(ob->wex))
1098 ob->wex = tick_add_ifset(now_ms, ob->wto);
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001099 }
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001100
Willy Tarreauc9619462009-03-09 22:40:57 +01001101 if (likely(ob->flags & BF_WRITE_ACTIVITY)) {
1102 /* update timeout if we have written something */
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001103 if ((ob->flags & (BF_OUT_EMPTY|BF_SHUTW|BF_WRITE_PARTIAL)) == BF_WRITE_PARTIAL)
Willy Tarreauc9619462009-03-09 22:40:57 +01001104 ob->wex = tick_add_ifset(now_ms, ob->wto);
1105
Willy Tarreauf27b5ea2009-10-03 22:01:18 +02001106 if (tick_isset(si->ib->rex) && !(si->flags & SI_FL_INDEP_STR)) {
Willy Tarreauc9619462009-03-09 22:40:57 +01001107 /* Note: to prevent the client from expiring read timeouts
Willy Tarreauf27b5ea2009-10-03 22:01:18 +02001108 * during writes, we refresh it. We only do this if the
1109 * interface is not configured for "independant streams",
1110 * because for some applications it's better not to do this,
1111 * for instance when continuously exchanging small amounts
1112 * of data which can full the socket buffers long before a
1113 * write timeout is detected.
Willy Tarreauc9619462009-03-09 22:40:57 +01001114 */
1115 si->ib->rex = tick_add_ifset(now_ms, si->ib->rto);
1116 }
1117 }
1118
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001119 /* in case of special condition (error, shutdown, end of write...), we
1120 * have to notify the task.
1121 */
1122 if (likely((ob->flags & (BF_WRITE_NULL|BF_WRITE_ERROR|BF_SHUTW)) ||
Willy Tarreauba0b63d2009-09-20 08:09:44 +02001123 ((ob->flags & BF_OUT_EMPTY) && !ob->to_forward) ||
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001124 si->state != SI_ST_EST)) {
1125 out_wakeup:
Willy Tarreau89f7ef22009-09-05 20:57:35 +02001126 if (!(si->flags & SI_FL_DONT_WAKE) && si->owner)
1127 task_wakeup(si->owner, TASK_WOKEN_IO);
Willy Tarreaua456f2a2009-01-18 17:38:44 +01001128 }
Willy Tarreau3ffeba12008-12-14 14:42:35 +01001129}
1130
Willy Tarreaueb472682010-05-28 18:46:57 +02001131/* This function is called on a read event from a listening socket, corresponding
1132 * to an accept. It tries to accept as many connections as possible, and for each
1133 * calls the listener's accept handler (generally the frontend's accept handler).
1134 */
1135int stream_sock_accept(int fd)
1136{
1137 struct listener *l = fdtab[fd].owner;
1138 struct proxy *p = l->frontend;
1139 int max_accept = global.tune.maxaccept;
1140 int cfd;
1141 int ret;
1142
1143 if (unlikely(l->nbconn >= l->maxconn)) {
1144 EV_FD_CLR(l->fd, DIR_RD);
1145 l->state = LI_FULL;
1146 return 0;
1147 }
1148
1149 if (p && p->fe_sps_lim) {
1150 int max = freq_ctr_remain(&p->fe_sess_per_sec, p->fe_sps_lim, 0);
1151 if (max_accept > max)
1152 max_accept = max;
1153 }
1154
1155 while ((!p || p->feconn < p->maxconn) && actconn < global.maxconn && max_accept--) {
1156 struct sockaddr_storage addr;
1157 socklen_t laddr = sizeof(addr);
1158
1159 cfd = accept(fd, (struct sockaddr *)&addr, &laddr);
1160 if (unlikely(cfd == -1)) {
1161 switch (errno) {
1162 case EAGAIN:
1163 case EINTR:
1164 case ECONNABORTED:
1165 return 0; /* nothing more to accept */
1166 case ENFILE:
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001167 if (p)
1168 send_log(p, LOG_EMERG,
1169 "Proxy %s reached system FD limit at %d. Please check system tunables.\n",
1170 p->id, maxfd);
Willy Tarreaueb472682010-05-28 18:46:57 +02001171 return 0;
1172 case EMFILE:
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001173 if (p)
1174 send_log(p, LOG_EMERG,
1175 "Proxy %s reached process FD limit at %d. Please check 'ulimit-n' and restart.\n",
1176 p->id, maxfd);
Willy Tarreaueb472682010-05-28 18:46:57 +02001177 return 0;
1178 case ENOBUFS:
1179 case ENOMEM:
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001180 if (p)
1181 send_log(p, LOG_EMERG,
1182 "Proxy %s reached system memory limit at %d sockets. Please check system tunables.\n",
1183 p->id, maxfd);
Willy Tarreaueb472682010-05-28 18:46:57 +02001184 return 0;
1185 default:
1186 return 0;
1187 }
1188 }
1189
1190 if (unlikely(cfd >= global.maxsock)) {
1191 Alert("accept(): not enough free sockets. Raise -n argument. Giving up.\n");
1192 goto out_close;
1193 }
1194
Willy Tarreauaf7ad002010-08-31 15:39:26 +02001195 jobs++;
Willy Tarreau24dcaf32010-06-05 10:49:41 +02001196 actconn++;
1197 totalconn++;
1198 l->nbconn++;
1199
1200 if (l->counters) {
1201 if (l->nbconn > l->counters->conn_max)
1202 l->counters->conn_max = l->nbconn;
1203 }
1204
Willy Tarreaueb472682010-05-28 18:46:57 +02001205 ret = l->accept(l, cfd, &addr);
1206 if (unlikely(ret < 0)) {
1207 /* critical error encountered, generally a resource shortage */
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001208 if (p) {
Willy Tarreaue9f32db2010-09-21 21:14:29 +02001209 disable_listener(l);
Willy Tarreau7999ddb2010-06-04 20:46:13 +02001210 p->state = PR_STIDLE;
1211 }
Willy Tarreauaf7ad002010-08-31 15:39:26 +02001212 jobs--;
Willy Tarreau24dcaf32010-06-05 10:49:41 +02001213 actconn--;
1214 l->nbconn--;
Willy Tarreaueb472682010-05-28 18:46:57 +02001215 goto out_close;
1216 }
1217 else if (unlikely(ret == 0)) {
1218 /* ignore this connection */
Willy Tarreauaf7ad002010-08-31 15:39:26 +02001219 jobs--;
Willy Tarreau24dcaf32010-06-05 10:49:41 +02001220 actconn--;
1221 l->nbconn--;
Willy Tarreaueb472682010-05-28 18:46:57 +02001222 close(cfd);
1223 continue;
1224 }
1225
Willy Tarreaueb472682010-05-28 18:46:57 +02001226 if (l->nbconn >= l->maxconn) {
1227 EV_FD_CLR(l->fd, DIR_RD);
1228 l->state = LI_FULL;
1229 }
Willy Tarreaueb472682010-05-28 18:46:57 +02001230 } /* end of while (p->feconn < p->maxconn) */
1231 return 0;
1232
1233 /* Error unrolling */
1234 out_close:
1235 close(cfd);
1236 return 0;
1237}
1238
Willy Tarreaua8f55d52010-05-31 17:44:19 +02001239/* Prepare a stream interface to be used in socket mode. */
1240void stream_sock_prepare_interface(struct stream_interface *si)
1241{
1242 si->update = stream_sock_data_finish;
1243 si->shutr = stream_sock_shutr;
1244 si->shutw = stream_sock_shutw;
1245 si->chk_rcv = stream_sock_chk_rcv;
1246 si->chk_snd = stream_sock_chk_snd;
1247 si->iohandler = NULL;
1248}
1249
Willy Tarreaubaaee002006-06-26 02:48:02 +02001250
1251/*
1252 * Local variables:
1253 * c-indent-level: 8
1254 * c-basic-offset: 8
1255 * End:
1256 */