blob: 2a7275baa50b6d8ca8c8b2852698827903126d51 [file] [log] [blame]
/*
 * Functions operating on SOCK_STREAM and buffers.
 *
 * Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
12
13#include <errno.h>
14#include <fcntl.h>
15#include <stdio.h>
16#include <stdlib.h>
17
18#include <sys/socket.h>
19#include <sys/stat.h>
20#include <sys/types.h>
21
Willy Tarreau2dd0d472006-06-29 17:53:05 +020022#include <common/compat.h>
Willy Tarreaue3ba5f02006-06-29 18:54:54 +020023#include <common/config.h>
Willy Tarreau83749182007-04-15 20:56:27 +020024#include <common/standard.h>
Willy Tarreau2dd0d472006-06-29 17:53:05 +020025#include <common/time.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020026
Willy Tarreaubaaee002006-06-26 02:48:02 +020027#include <types/buffers.h>
28#include <types/global.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020029#include <types/polling.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020030
31#include <proto/client.h>
32#include <proto/fd.h>
Willy Tarreaubaaee002006-06-26 02:48:02 +020033#include <proto/stream_sock.h>
34#include <proto/task.h>
35
36
37/*
Willy Tarreaud7971282006-07-29 18:36:34 +020038 * this function is called on a read event from a stream socket.
Willy Tarreau83749182007-04-15 20:56:27 +020039 * It returns 0 if we have a high confidence that we will not be
40 * able to read more data without polling first. Returns non-zero
41 * otherwise.
Willy Tarreaubaaee002006-06-26 02:48:02 +020042 */
Willy Tarreaud7971282006-07-29 18:36:34 +020043int stream_sock_read(int fd) {
Willy Tarreau83749182007-04-15 20:56:27 +020044 __label__ out_wakeup;
Willy Tarreau54469402006-07-29 16:59:06 +020045 struct buffer *b = fdtab[fd].cb[DIR_RD].b;
Willy Tarreau83749182007-04-15 20:56:27 +020046 int ret, max, retval;
Willy Tarreaub8949f12007-03-23 22:39:59 +010047 int read_poll = MAX_READ_POLL_LOOPS;
Willy Tarreaubaaee002006-06-26 02:48:02 +020048
49#ifdef DEBUG_FULL
Willy Tarreaud7971282006-07-29 18:36:34 +020050 fprintf(stderr,"stream_sock_read : fd=%d, owner=%p\n", fd, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +020051#endif
52
Willy Tarreau83749182007-04-15 20:56:27 +020053 retval = 1;
54
55 if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) {
56 /* read/write error */
57 b->flags |= BF_READ_ERROR;
58 fdtab[fd].state = FD_STERROR;
59 goto out_wakeup;
60 }
61
62 if (unlikely(fdtab[fd].ev & FD_POLL_HUP)) {
63 /* connection closed */
64 b->flags |= BF_READ_NULL;
65 goto out_wakeup;
66 }
67
68 retval = 0;
69 while (read_poll-- > 0) {
70 if (b->l == 0) { /* let's realign the buffer to optimize I/O */
71 b->r = b->w = b->lr = b->data;
72 max = b->rlim - b->data;
73 }
74 else if (b->r > b->w) {
75 max = b->rlim - b->r;
76 }
77 else {
78 max = b->w - b->r;
79 /* FIXME: theorically, if w>0, we shouldn't have rlim < data+size anymore
80 * since it means that the rewrite protection has been removed. This
81 * implies that the if statement can be removed.
82 */
83 if (max > b->rlim - b->data)
Willy Tarreaubaaee002006-06-26 02:48:02 +020084 max = b->rlim - b->data;
Willy Tarreau83749182007-04-15 20:56:27 +020085 }
Willy Tarreaubaaee002006-06-26 02:48:02 +020086
Willy Tarreau83749182007-04-15 20:56:27 +020087 if (max == 0) { /* not anymore room to store data */
88 EV_FD_CLR(fd, DIR_RD);
89 break;
90 }
Willy Tarreaubaaee002006-06-26 02:48:02 +020091
92#ifndef MSG_NOSIGNAL
Willy Tarreau83749182007-04-15 20:56:27 +020093 {
94 int skerr;
95 socklen_t lskerr = sizeof(skerr);
96
97 ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
98 if (ret == -1 || skerr)
99 ret = -1;
100 else
101 ret = recv(fd, b->r, max, 0);
102 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200103#else
Willy Tarreau83749182007-04-15 20:56:27 +0200104 ret = recv(fd, b->r, max, MSG_NOSIGNAL);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200105#endif
Willy Tarreau83749182007-04-15 20:56:27 +0200106 if (ret > 0) {
107 b->r += ret;
108 b->l += ret;
109 b->flags |= BF_PARTIAL_READ;
110 retval = 1;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200111
Willy Tarreau83749182007-04-15 20:56:27 +0200112 if (b->r == b->data + BUFSIZE) {
113 b->r = b->data; /* wrap around the buffer */
114 }
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100115
Willy Tarreau83749182007-04-15 20:56:27 +0200116 b->total += ret;
Willy Tarreau9641e8f2007-03-23 23:02:09 +0100117
Willy Tarreau83749182007-04-15 20:56:27 +0200118 /* generally if we read something smaller than the 1 or 2 MSS,
119 * it means that it's not worth trying to read again. It may
120 * also happen on headers, but the application then can stop
121 * reading before we start polling.
122 */
123 if (ret < MIN_RET_FOR_READ_LOOP)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200124 break;
Willy Tarreau83749182007-04-15 20:56:27 +0200125
126 if (!read_poll)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200127 break;
Willy Tarreau83749182007-04-15 20:56:27 +0200128
129 /* we hope to read more data or to get a close on next round */
130 continue;
131 }
132 else if (ret == 0) {
133 b->flags |= BF_READ_NULL;
134 retval = 1; // connection closed
135 break;
136 }
Willy Tarreau9f195292007-04-15 21:26:58 +0200137 else if (errno == EAGAIN) {
138 /* Ignore EAGAIN but inform the poller that there is
139 * nothing to read left.
140 */
Willy Tarreau83749182007-04-15 20:56:27 +0200141 retval = 0;
142 break;
143 }
144 else {
145 retval = 1;
146 b->flags |= BF_READ_ERROR;
147 fdtab[fd].state = FD_STERROR;
148 break;
149 }
150 } /* while (read_poll) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200151
Willy Tarreau0f9f5052006-07-29 17:39:25 +0200152 if (b->flags & BF_READ_STATUS) {
Willy Tarreau83749182007-04-15 20:56:27 +0200153 out_wakeup:
Willy Tarreauf161a342007-04-08 16:59:42 +0200154 if (b->rto && EV_FD_ISSET(fd, DIR_RD))
Willy Tarreaud7971282006-07-29 18:36:34 +0200155 tv_delayfrom(&b->rex, &now, b->rto);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200156 else
Willy Tarreaud7971282006-07-29 18:36:34 +0200157 tv_eternity(&b->rex);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200158
Willy Tarreaud7971282006-07-29 18:36:34 +0200159 task_wakeup(&rq, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200160 }
161
Willy Tarreau83749182007-04-15 20:56:27 +0200162 fdtab[fd].ev &= ~FD_POLL_RD;
163 return retval;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200164}
165
166
167/*
Willy Tarreauf8306d52006-07-29 19:01:31 +0200168 * this function is called on a write event from a stream socket.
Willy Tarreau83749182007-04-15 20:56:27 +0200169 * It returns 0 if we have a high confidence that we will not be
170 * able to write more data without polling first. Returns non-zero
171 * otherwise.
Willy Tarreaubaaee002006-06-26 02:48:02 +0200172 */
Willy Tarreauf8306d52006-07-29 19:01:31 +0200173int stream_sock_write(int fd) {
Willy Tarreau83749182007-04-15 20:56:27 +0200174 __label__ out_eternity;
Willy Tarreau54469402006-07-29 16:59:06 +0200175 struct buffer *b = fdtab[fd].cb[DIR_WR].b;
Willy Tarreau83749182007-04-15 20:56:27 +0200176 int ret, max, retval;
177 int write_poll = MAX_WRITE_POLL_LOOPS;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200178
179#ifdef DEBUG_FULL
Willy Tarreauf8306d52006-07-29 19:01:31 +0200180 fprintf(stderr,"stream_sock_write : fd=%d, owner=%p\n", fd, fdtab[fd].owner);
Willy Tarreaubaaee002006-06-26 02:48:02 +0200181#endif
182
Willy Tarreau83749182007-04-15 20:56:27 +0200183 retval = 1;
184
185 if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) {
186 /* read/write error */
187 b->flags |= BF_WRITE_ERROR;
188 fdtab[fd].state = FD_STERROR;
189 EV_FD_CLR(fd, DIR_WR);
190 goto out_eternity;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200191 }
Willy Tarreau83749182007-04-15 20:56:27 +0200192
193 retval = 0;
194 while (write_poll-- > 0) {
195 if (b->l == 0) { /* let's realign the buffer to optimize I/O */
196 b->r = b->w = b->lr = b->data;
197 max = 0;
198 }
199 else if (b->r > b->w) {
200 max = b->r - b->w;
201 }
202 else {
203 max = b->data + BUFSIZE - b->w;
204 }
205
Willy Tarreaubaaee002006-06-26 02:48:02 +0200206 if (max == 0) {
Willy Tarreauf8306d52006-07-29 19:01:31 +0200207 /* may be we have received a connection acknowledgement in TCP mode without data */
Willy Tarreau83749182007-04-15 20:56:27 +0200208 if (!(b->flags & BF_PARTIAL_WRITE)
209 && fdtab[fd].state == FD_STCONN) {
Willy Tarreauf8306d52006-07-29 19:01:31 +0200210 int skerr;
211 socklen_t lskerr = sizeof(skerr);
Willy Tarreauc6423482006-10-15 14:59:03 +0200212 ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
213 if (ret == -1 || skerr) {
Willy Tarreauf8306d52006-07-29 19:01:31 +0200214 b->flags |= BF_WRITE_ERROR;
215 fdtab[fd].state = FD_STERROR;
Willy Tarreauf161a342007-04-08 16:59:42 +0200216 EV_FD_CLR(fd, DIR_WR);
Willy Tarreau83749182007-04-15 20:56:27 +0200217 retval = 1;
218 goto out_eternity;
Willy Tarreauf8306d52006-07-29 19:01:31 +0200219 }
220 }
221
Willy Tarreau0f9f5052006-07-29 17:39:25 +0200222 b->flags |= BF_WRITE_NULL;
Willy Tarreauf8306d52006-07-29 19:01:31 +0200223 fdtab[fd].state = FD_STREADY;
Willy Tarreauf161a342007-04-08 16:59:42 +0200224 EV_FD_CLR(fd, DIR_WR);
Willy Tarreau83749182007-04-15 20:56:27 +0200225 retval = 1;
226 goto out_eternity;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200227 }
228
229#ifndef MSG_NOSIGNAL
230 {
231 int skerr;
232 socklen_t lskerr = sizeof(skerr);
233
Willy Tarreauc6423482006-10-15 14:59:03 +0200234 ret = getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
235 if (ret == -1 || skerr)
Willy Tarreaubaaee002006-06-26 02:48:02 +0200236 ret = -1;
237 else
238 ret = send(fd, b->w, max, MSG_DONTWAIT);
239 }
240#else
241 ret = send(fd, b->w, max, MSG_DONTWAIT | MSG_NOSIGNAL);
242#endif
243
244 if (ret > 0) {
245 b->l -= ret;
246 b->w += ret;
247
Willy Tarreau0f9f5052006-07-29 17:39:25 +0200248 b->flags |= BF_PARTIAL_WRITE;
Willy Tarreau83749182007-04-15 20:56:27 +0200249 retval = 1;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200250
251 if (b->w == b->data + BUFSIZE) {
252 b->w = b->data; /* wrap around the buffer */
253 }
Willy Tarreau83749182007-04-15 20:56:27 +0200254
255 if (!write_poll)
256 break;
257
258 /* we hope to be able to write more data */
259 continue;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200260 }
261 else if (ret == 0) {
Willy Tarreauf8306d52006-07-29 19:01:31 +0200262 /* nothing written, just pretend we were never called */
Willy Tarreau83749182007-04-15 20:56:27 +0200263 retval = 0;
264 break;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200265 }
Willy Tarreau83749182007-04-15 20:56:27 +0200266 else if (errno == EAGAIN) {/* ignore EAGAIN */
267 retval = 0;
268 break;
269 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200270 else {
Willy Tarreau0f9f5052006-07-29 17:39:25 +0200271 b->flags |= BF_WRITE_ERROR;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200272 fdtab[fd].state = FD_STERROR;
Willy Tarreau83749182007-04-15 20:56:27 +0200273 EV_FD_CLR(fd, DIR_WR);
274 retval = 1;
275 goto out_eternity;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200276 }
Willy Tarreau83749182007-04-15 20:56:27 +0200277 } /* while (write_poll) */
Willy Tarreaubaaee002006-06-26 02:48:02 +0200278
Willy Tarreau83749182007-04-15 20:56:27 +0200279 if (b->flags & BF_WRITE_STATUS) {
280 if (b->wto) {
281 tv_delayfrom(&b->wex, &now, b->wto);
282 /* FIXME: to prevent the client from expiring read timeouts during writes,
283 * we refresh it. A solution would be to merge read+write timeouts into a
284 * unique one, although that needs some study particularly on full-duplex
285 * TCP connections. */
286 b->rex = b->wex;
287 }
288 else {
289 out_eternity:
290 tv_eternity(&b->wex);
291 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200292 }
Willy Tarreaubaaee002006-06-26 02:48:02 +0200293
Willy Tarreauf8306d52006-07-29 19:01:31 +0200294 task_wakeup(&rq, fdtab[fd].owner);
Willy Tarreau83749182007-04-15 20:56:27 +0200295 fdtab[fd].ev &= ~FD_POLL_WR;
296 return retval;
Willy Tarreaubaaee002006-06-26 02:48:02 +0200297}
298
Willy Tarreaubaaee002006-06-26 02:48:02 +0200299
300
/*
 * Local variables:
 *  c-indent-level: 8
 *  c-basic-offset: 8
 * End:
 */