blob: aa45bdd10300185d0738c3cb14039675b5dc771a [file] [log] [blame]
/*
 * Functions operating on SOCK_STREAM and buffers.
 *
 * Copyright 2000-2008 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <stdio.h>
16#include <stdlib.h>
17
18#include <sys/socket.h>
19#include <sys/stat.h>
20#include <sys/types.h>
21
Willy Tarreau2dd0d472006-06-29 17:53:05 +020022#include <common/compat.h>
Willy Tarreaue3ba5f02006-06-29 18:54:54 +020023#include <common/config.h>
Willy Tarreaud6f087e2008-01-18 17:20:13 +010024#include <common/debug.h>
Willy Tarreau83749182007-04-15 20:56:27 +020025#include <common/standard.h>
Willy Tarreau0c303ee2008-07-07 00:09:58 +020026#include <common/ticks.h>
Willy Tarreau2dd0d472006-06-29 17:53:05 +020027#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020028
Willy Tarreaubaaee002006-06-26 02:48:02 +020029#include <types/buffers.h>
30#include <types/global.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020031#include <types/polling.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020032
33#include <proto/client.h>
34#include <proto/fd.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020035#include <proto/stream_sock.h>
36#include <proto/task.h>
37
38
39/*
Willy Tarreaud7971282006-07-29 18:36:34 +020040 * this function is called on a read event from a stream socket.
Willy Tarreau83749182007-04-15 20:56:27 +020041 * It returns 0 if we have a high confidence that we will not be
42 * able to read more data without polling first. Returns non-zero
43 * otherwise.
Willy Tarreaubaaee002006-06-26 02:48:02 +020044 */
Willy Tarreaud7971282006-07-29 18:36:34 +020045int stream_sock_read(int fd) {
Willy Tarreau0c303ee2008-07-07 00:09:58 +020046 __label__ out_wakeup, out_shutdown_r, out_error;
Willy Tarreau54469402006-07-29 16:59:06 +020047 struct buffer *b = fdtab[fd].cb[DIR_RD].b;
Willy Tarreau8a7af602008-05-03 23:07:14 +020048 int ret, max, retval, cur_read;
Willy Tarreaub8949f12007-03-23 22:39:59 +010049 int read_poll = MAX_READ_POLL_LOOPS;
Willy Tarreaubaaee002006-06-26 02:48:02 +020050
51#ifdef DEBUG_FULL
Willy Tarreaud6f087e2008-01-18 17:20:13 +010052 fprintf(stderr,"stream_sock_read : fd=%d, ev=0x%02x, owner=%p\n", fd, fdtab[fd].ev, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +020053#endif
54
Willy Tarreau83749182007-04-15 20:56:27 +020055 retval = 1;
56
Willy Tarreaud6f087e2008-01-18 17:20:13 +010057 /* stop immediately on errors */
58 if (fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))
Willy Tarreau6996e152007-04-30 14:37:43 +020059 goto out_error;
Willy Tarreaud6f087e2008-01-18 17:20:13 +010060
61 /* stop here if we reached the end of data */
62 if ((fdtab[fd].ev & (FD_POLL_IN|FD_POLL_HUP)) == FD_POLL_HUP)
63 goto out_shutdown_r;
Willy Tarreau83749182007-04-15 20:56:27 +020064
Willy Tarreau8a7af602008-05-03 23:07:14 +020065 cur_read = 0;
Willy Tarreau6996e152007-04-30 14:37:43 +020066 while (1) {
67 /*
68 * 1. compute the maximum block size we can read at once.
69 */
Willy Tarreau83749182007-04-15 20:56:27 +020070 if (b->l == 0) { /* let's realign the buffer to optimize I/O */
71 b->r = b->w = b->lr = b->data;
72 max = b->rlim - b->data;
73 }
74 else if (b->r > b->w) {
75 max = b->rlim - b->r;
76 }
77 else {
78 max = b->w - b->r;
79 /* FIXME: theorically, if w>0, we shouldn't have rlim < data+size anymore
80 * since it means that the rewrite protection has been removed. This
81 * implies that the if statement can be removed.
82 */
83 if (max > b->rlim - b->data)
Willy Tarreaubaaee002006-06-26 02:48:02 +020084 max = b->rlim - b->data;
Willy Tarreau83749182007-04-15 20:56:27 +020085 }
Willy Tarreaubaaee002006-06-26 02:48:02 +020086
Willy Tarreau6996e152007-04-30 14:37:43 +020087 if (unlikely(max == 0)) {
88 /* Not anymore room to store data. This should theorically
89 * never happen, but better safe than sorry !
90 */
Willy Tarreau83749182007-04-15 20:56:27 +020091 EV_FD_CLR(fd, DIR_RD);
Willy Tarreau0c303ee2008-07-07 00:09:58 +020092 b->rex = TICK_ETERNITY;
93 goto out_wakeup;
Willy Tarreau83749182007-04-15 20:56:27 +020094 }
Willy Tarreaubaaee002006-06-26 02:48:02 +020095
Willy Tarreau6996e152007-04-30 14:37:43 +020096 /*
97 * 2. read the largest possible block
98 */
Willy Tarreaubaaee002006-06-26 02:48:02 +020099#ifndef MSG_NOSIGNAL
Willy Tarreau83749182007-04-15 20:56:27 +0200100 {
101 int skerr;
102 socklen_t lskerr = sizeof(skerr);
103
104 ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
105 if (ret == -1 || skerr)
106 ret = -1;
107 else
108 ret = recv(fd, b->r, max, 0);
109 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200110#else
Willy Tarreau83749182007-04-15 20:56:27 +0200111 ret = recv(fd, b->r, max, MSG_NOSIGNAL);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200112#endif
Willy Tarreau83749182007-04-15 20:56:27 +0200113 if (ret > 0) {
114 b->r += ret;
115 b->l += ret;
Willy Tarreau8a7af602008-05-03 23:07:14 +0200116 cur_read += ret;
Willy Tarreau83749182007-04-15 20:56:27 +0200117 b->flags |= BF_PARTIAL_READ;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200118
Willy Tarreau83749182007-04-15 20:56:27 +0200119 if (b->r == b->data + BUFSIZE) {
120 b->r = b->data; /* wrap around the buffer */
121 }
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100122
Willy Tarreau83749182007-04-15 20:56:27 +0200123 b->total += ret;
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100124
Willy Tarreau6996e152007-04-30 14:37:43 +0200125 if (b->l == b->rlim - b->data) {
126 /* The buffer is now full, there's no point in going through
127 * the loop again.
128 */
Willy Tarreau8a7af602008-05-03 23:07:14 +0200129 if (!(b->flags & BF_STREAMER_FAST) && (cur_read == b->l)) {
130 b->xfer_small = 0;
131 b->xfer_large++;
132 if (b->xfer_large >= 3) {
133 /* we call this buffer a fast streamer if it manages
134 * to be filled in one call 3 consecutive times.
135 */
136 b->flags |= (BF_STREAMER | BF_STREAMER_FAST);
137 //fputc('+', stderr);
138 }
139 }
140 else if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&
141 (cur_read <= BUFSIZE / 2)) {
142 b->xfer_large = 0;
143 b->xfer_small++;
144 if (b->xfer_small >= 2) {
145 /* if the buffer has been at least half full twice,
146 * we receive faster than we send, so at least it
147 * is not a "fast streamer".
148 */
149 b->flags &= ~BF_STREAMER_FAST;
150 //fputc('-', stderr);
151 }
152 }
153 else {
154 b->xfer_small = 0;
155 b->xfer_large = 0;
156 }
157
Willy Tarreau6996e152007-04-30 14:37:43 +0200158 EV_FD_CLR(fd, DIR_RD);
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200159 b->rex = TICK_ETERNITY;
160 goto out_wakeup;
Willy Tarreau6996e152007-04-30 14:37:43 +0200161 }
162
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200163 /* if too many bytes were missing from last read, it means that
164 * it's pointless trying to read again because the system does
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100165 * not have them in buffers. BTW, if FD_POLL_HUP was present,
166 * it means that we have reached the end and that the connection
167 * is closed.
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200168 */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100169 if (ret < max) {
Willy Tarreau8a7af602008-05-03 23:07:14 +0200170 if ((b->flags & (BF_STREAMER | BF_STREAMER_FAST)) &&
171 (cur_read <= BUFSIZE / 2)) {
172 b->xfer_large = 0;
173 b->xfer_small++;
174 if (b->xfer_small >= 3) {
175 /* we have read less than half of the buffer in
176 * one pass, and this happened at least 3 times.
177 * This is definitely not a streamer.
178 */
179 b->flags &= ~(BF_STREAMER | BF_STREAMER_FAST);
180 //fputc('!', stderr);
181 }
182 }
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100183 if (fdtab[fd].ev & FD_POLL_HUP)
184 goto out_shutdown_r;
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200185 break;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100186 }
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200187
188 /* generally if we read something smaller than 1 or 2 MSS,
Willy Tarreau83749182007-04-15 20:56:27 +0200189 * it means that it's not worth trying to read again. It may
190 * also happen on headers, but the application then can stop
191 * reading before we start polling.
192 */
193 if (ret < MIN_RET_FOR_READ_LOOP)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200194 break;
Willy Tarreau83749182007-04-15 20:56:27 +0200195
Willy Tarreau6996e152007-04-30 14:37:43 +0200196 if (--read_poll <= 0)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200197 break;
Willy Tarreau83749182007-04-15 20:56:27 +0200198
Willy Tarreau83749182007-04-15 20:56:27 +0200199 }
200 else if (ret == 0) {
Willy Tarreau6996e152007-04-30 14:37:43 +0200201 /* connection closed */
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100202 goto out_shutdown_r;
Willy Tarreau83749182007-04-15 20:56:27 +0200203 }
Willy Tarreau9f195292007-04-15 21:26:58 +0200204 else if (errno == EAGAIN) {
205 /* Ignore EAGAIN but inform the poller that there is
Willy Tarreau6996e152007-04-30 14:37:43 +0200206 * nothing to read left. But we may have done some work
207 * justifying to notify the task.
Willy Tarreau9f195292007-04-15 21:26:58 +0200208 */
Willy Tarreau83749182007-04-15 20:56:27 +0200209 retval = 0;
210 break;
211 }
212 else {
Willy Tarreau6996e152007-04-30 14:37:43 +0200213 goto out_error;
Willy Tarreau83749182007-04-15 20:56:27 +0200214 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200215 } /* while (1) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200216
Willy Tarreau6996e152007-04-30 14:37:43 +0200217 /*
218 * The only way to get out of this loop is to have stopped reading
219 * without any error nor close, either by limiting the number of
220 * loops, or because of an EAGAIN. We only rearm the timer if we
221 * have at least read something.
222 */
223
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200224 if (b->flags & BF_PARTIAL_READ)
225 b->rex = tick_add_ifset(now_ms, b->rto);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200226
Willy Tarreau6996e152007-04-30 14:37:43 +0200227 out_wakeup:
228 if (b->flags & BF_READ_STATUS)
229 task_wakeup(fdtab[fd].owner);
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100230 fdtab[fd].ev &= ~FD_POLL_IN;
Willy Tarreau83749182007-04-15 20:56:27 +0200231 return retval;
Willy Tarreau6996e152007-04-30 14:37:43 +0200232
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100233 out_shutdown_r:
234 fdtab[fd].ev &= ~FD_POLL_HUP;
235 b->flags |= BF_READ_NULL;
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200236 b->rex = TICK_ETERNITY;
237 goto out_wakeup;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100238
Willy Tarreau6996e152007-04-30 14:37:43 +0200239 out_error:
240 /* There was an error. we must wakeup the task. No need to clear
241 * the events, the task will do it.
242 */
243 fdtab[fd].state = FD_STERROR;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100244 fdtab[fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau6996e152007-04-30 14:37:43 +0200245 b->flags |= BF_READ_ERROR;
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200246 b->rex = TICK_ETERNITY;
247 goto out_wakeup;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200248}
249
250
251/*
Willy Tarreauf8306d52006-07-29 19:01:31 +0200252 * this function is called on a write event from a stream socket.
Willy Tarreau83749182007-04-15 20:56:27 +0200253 * It returns 0 if we have a high confidence that we will not be
254 * able to write more data without polling first. Returns non-zero
255 * otherwise.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200256 */
Willy Tarreauf8306d52006-07-29 19:01:31 +0200257int stream_sock_write(int fd) {
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200258 __label__ out_wakeup, out_error;
Willy Tarreau54469402006-07-29 16:59:06 +0200259 struct buffer *b = fdtab[fd].cb[DIR_WR].b;
Willy Tarreau83749182007-04-15 20:56:27 +0200260 int ret, max, retval;
261 int write_poll = MAX_WRITE_POLL_LOOPS;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200262
263#ifdef DEBUG_FULL
Willy Tarreauf8306d52006-07-29 19:01:31 +0200264 fprintf(stderr,"stream_sock_write : fd=%d, owner=%p\n", fd, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200265#endif
266
Willy Tarreau83749182007-04-15 20:56:27 +0200267 retval = 1;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100268 if (fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))
Willy Tarreau6996e152007-04-30 14:37:43 +0200269 goto out_error;
Willy Tarreau83749182007-04-15 20:56:27 +0200270
Willy Tarreau6996e152007-04-30 14:37:43 +0200271 while (1) {
Willy Tarreau83749182007-04-15 20:56:27 +0200272 if (b->l == 0) { /* let's realign the buffer to optimize I/O */
273 b->r = b->w = b->lr = b->data;
274 max = 0;
275 }
276 else if (b->r > b->w) {
277 max = b->r - b->w;
278 }
279 else {
280 max = b->data + BUFSIZE - b->w;
281 }
282
Willy Tarreaubaaee002006-06-26 02:48:02 +0200283 if (max == 0) {
Willy Tarreauf8306d52006-07-29 19:01:31 +0200284 /* may be we have received a connection acknowledgement in TCP mode without data */
Willy Tarreau6996e152007-04-30 14:37:43 +0200285 if (likely(fdtab[fd].state == FD_STCONN)) {
Willy Tarreau6996e152007-04-30 14:37:43 +0200286 /* We have no data to send to check the connection, and
287 * getsockopt() will not inform us whether the connection
288 * is still pending. So we'll reuse connect() to check the
289 * state of the socket. This has the advantage of givig us
290 * the following info :
291 * - error
292 * - connecting (EALREADY, EINPROGRESS)
293 * - connected (EISCONN, 0)
294 */
Willy Tarreaue94ebd02007-10-09 17:14:37 +0200295 if ((connect(fd, fdtab[fd].peeraddr, fdtab[fd].peerlen) == 0))
Willy Tarreau6996e152007-04-30 14:37:43 +0200296 errno = 0;
297
298 if (errno == EALREADY || errno == EINPROGRESS) {
299 retval = 0;
300 goto out_wakeup;
Willy Tarreauf8306d52006-07-29 19:01:31 +0200301 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200302
303 if (errno && errno != EISCONN)
304 goto out_error;
305
306 /* OK we just need to indicate that we got a connection
307 * and that we wrote nothing.
308 */
309 b->flags |= BF_WRITE_NULL;
310 fdtab[fd].state = FD_STREADY;
Willy Tarreauf8306d52006-07-29 19:01:31 +0200311 }
312
Willy Tarreau6996e152007-04-30 14:37:43 +0200313 /* Funny, we were called to write something but there wasn't
314 * anything. Theorically we cannot get there, but just in case,
315 * let's disable the write event and pretend we never came there.
316 */
Willy Tarreauf161a342007-04-08 16:59:42 +0200317 EV_FD_CLR(fd, DIR_WR);
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200318 b->wex = TICK_ETERNITY;
319 goto out_wakeup;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200320 }
321
322#ifndef MSG_NOSIGNAL
323 {
324 int skerr;
325 socklen_t lskerr = sizeof(skerr);
326
Willy Tarreauc6423482006-10-15 14:59:03 +0200327 ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
328 if (ret == -1 || skerr)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200329 ret = -1;
330 else
331 ret = send(fd, b->w, max, MSG_DONTWAIT);
332 }
333#else
334 ret = send(fd, b->w, max, MSG_DONTWAIT | MSG_NOSIGNAL);
335#endif
336
337 if (ret > 0) {
338 b->l -= ret;
339 b->w += ret;
340
Willy Tarreau0f9f5052006-07-29 17:39:25 +0200341 b->flags |= BF_PARTIAL_WRITE;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200342
343 if (b->w == b->data + BUFSIZE) {
344 b->w = b->data; /* wrap around the buffer */
345 }
Willy Tarreau83749182007-04-15 20:56:27 +0200346
Willy Tarreau6996e152007-04-30 14:37:43 +0200347 if (!b->l) {
348 EV_FD_CLR(fd, DIR_WR);
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200349 b->wex = TICK_ETERNITY;
350 goto out_wakeup;
Willy Tarreau6996e152007-04-30 14:37:43 +0200351 }
Willy Tarreau83749182007-04-15 20:56:27 +0200352
Willy Tarreauab3e1d32007-06-03 14:10:36 +0200353 /* if the system buffer is full, don't insist */
354 if (ret < max)
355 break;
356
Willy Tarreau6996e152007-04-30 14:37:43 +0200357 if (--write_poll <= 0)
358 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200359 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200360 else if (ret == 0 || errno == EAGAIN) {
361 /* nothing written, just pretend we were never called
362 * and wait for the socket to be ready. But we may have
363 * done some work justifying to notify the task.
364 */
Willy Tarreau83749182007-04-15 20:56:27 +0200365 retval = 0;
366 break;
367 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200368 else {
Willy Tarreau6996e152007-04-30 14:37:43 +0200369 goto out_error;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200370 }
Willy Tarreau6996e152007-04-30 14:37:43 +0200371 } /* while (1) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200372
Willy Tarreau6996e152007-04-30 14:37:43 +0200373 /*
374 * The only way to get out of this loop is to have stopped writing
375 * without any error, either by limiting the number of loops, or
376 * because of an EAGAIN. We only rearm the timer if we have at least
377 * written something.
378 */
379
380 if (b->flags & BF_PARTIAL_WRITE) {
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200381 b->wex = tick_add_ifset(now_ms, b->wto);
382 if (b->wex) {
Willy Tarreau83749182007-04-15 20:56:27 +0200383 /* FIXME: to prevent the client from expiring read timeouts during writes,
384 * we refresh it. A solution would be to merge read+write timeouts into a
385 * unique one, although that needs some study particularly on full-duplex
386 * TCP connections. */
Willy Tarreaufa645582007-06-03 15:59:52 +0200387 if (!(b->flags & BF_SHUTR_STATUS))
388 b->rex = b->wex;
Willy Tarreau83749182007-04-15 20:56:27 +0200389 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200390 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200391
Willy Tarreau6996e152007-04-30 14:37:43 +0200392 out_wakeup:
393 if (b->flags & BF_WRITE_STATUS)
394 task_wakeup(fdtab[fd].owner);
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100395 fdtab[fd].ev &= ~FD_POLL_OUT;
Willy Tarreau83749182007-04-15 20:56:27 +0200396 return retval;
Willy Tarreau6996e152007-04-30 14:37:43 +0200397
398 out_error:
399 /* There was an error. we must wakeup the task. No need to clear
400 * the events, the task will do it.
401 */
402 fdtab[fd].state = FD_STERROR;
Willy Tarreaud6f087e2008-01-18 17:20:13 +0100403 fdtab[fd].ev &= ~FD_POLL_STICKY;
Willy Tarreau6996e152007-04-30 14:37:43 +0200404 b->flags |= BF_WRITE_ERROR;
Willy Tarreau0c303ee2008-07-07 00:09:58 +0200405 b->wex = TICK_ETERNITY;
406 goto out_wakeup;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200407}
408
Willy Tarreaubaaee002006-06-26 02:48:02 +0200409
410
/*
 * Local variables:
 *  c-indent-level: 8
 *  c-basic-offset: 8
 * End:
 */