Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 1 | /* |
| 2 | * File descriptors management functions. |
| 3 | * |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 4 | * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or |
| 7 | * modify it under the terms of the GNU General Public License |
| 8 | * as published by the Free Software Foundation; either version |
| 9 | * 2 of the License, or (at your option) any later version. |
| 10 | * |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 11 | * This code implements "speculative I/O". The principle is to try to perform |
| 12 | * expected I/O before registering the events in the poller. Each time this |
| 13 | * succeeds, it saves a possibly expensive system call to set the event. It |
| 14 | * generally succeeds for all reads after an accept(), and for writes after a |
| 15 | * connect(). It also improves performance for streaming connections because |
| 16 | * even if only one side is polled, the other one may react accordingly |
| 17 | * depending on the fill level of the buffer. This behaviour is also the only |
| 18 | * one compatible with event-based pollers (eg: EPOLL_ET). |
| 19 | * |
| 20 | * More importantly, it enables I/O operations that are backed by invisible |
| 21 | * buffers. For example, SSL is able to read a whole socket buffer and not |
| 22 | * deliver it to the application buffer because it's full. Unfortunately, it |
| 23 | * won't be reported by a poller anymore until some new activity happens. The |
| 24 | * only way to call it again thus is to perform speculative I/O as soon as |
| 25 | * reading on the FD is enabled again. |
| 26 | * |
| 27 | * The speculative I/O uses a list of expected events and a list of updates. |
| 28 | * Expected events are events that are expected to come and that we must report |
| 29 | * to the application until it asks to stop or to poll. Updates are new requests |
| 30 | * for changing an FD state. Updates are the only way to create new events. This |
| 31 | * is important because it means that the number of speculative events cannot |
| 32 | * increase between updates and will only grow one at a time while processing |
| 33 | * updates. All updates must always be processed, though events might be |
| 34 | * processed by small batches if required. |
| 35 | * |
| 36 | * There is no direct link between the FD and the updates list. There is only a |
| 37 | * bit in the fdtab[] to indicate that a file descriptor is already present in |
| 38 | * the updates list. Once an fd is present in the updates list, it will have to |
| 39 | * be considered even if its changes are reverted in the middle or if the fd is |
| 40 | * replaced. |
| 41 | * |
| 42 | * It is important to understand that as long as all expected events are |
| 43 | * processed, they might starve the polled events, especially because polled |
| 44 | * I/O starvation quickly induces more speculative I/O. One solution to this |
| 45 | * consists in only processing a part of the events at once, but one drawback |
| 46 | * is that unhandled events will still wake the poller up. Using an event-driven |
| 47 | * poller such as EPOLL_ET will solve this issue though. |
| 48 | * |
| 49 | * A file descriptor has a distinct state for each direction. This state is a |
| 50 | * combination of two bits : |
| 51 | * bit 0 = active Y/N : is set if the FD is active, which means that its |
| 52 | * handler will be called without prior polling ; |
| 53 | * bit 1 = polled Y/N : is set if the FD was subscribed to polling |
| 54 | * |
| 55 | * It is perfectly valid to have both bits set at a time, which generally means |
| 56 | * that the FD was reported by polling, was marked active and not yet unpolled. |
| 57 | * Such a state must not last long to avoid unneeded wakeups. |
| 58 | * |
| 59 | * The state of the FD as of last change is preserved in two other bits. These |
| 60 | * ones are useful to save a significant amount of system calls during state |
| 61 | * changes, because there is no need to update the FD status in the system until |
| 62 | * we're about to call the poller. |
| 63 | * |
| 64 | * Since we do not want to scan all the FD list to find speculative I/O events, |
| 65 | * we store them in a list consisting in a linear array holding only the FD |
| 66 | * indexes right now. Note that a closed FD cannot exist in the spec list, |
| 67 | * because it is closed by fd_delete() which in turn calls release_spec_entry() which |
| 68 | * always removes it from the list. |
| 69 | * |
| 70 | * For efficiency reasons, we will store the Read and Write bits interlaced to |
| 71 | * form a 4-bit field, so that we can simply shift the value right by 0/1 and |
| 72 | * get what we want : |
| 73 | * 3 2 1 0 |
| 74 | * Wp Rp Wa Ra |
| 75 | * |
| 76 | * The FD array has to hold a back reference to the speculative list. This |
| 77 | * reference is always valid unless the FD is currently being polled and not |
| 78 | * updated (in which case the reference points to index 0). |
| 79 | * |
| 80 | * We store the FD state in the 4 lower bits of fdtab[fd].spec_e, and save the |
| 81 | * previous state upon changes in the 4 higher bits, so that changes are easy |
| 82 | * to spot. |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 83 | */ |
| 84 | |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 85 | #include <stdio.h> |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 86 | #include <string.h> |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 87 | #include <unistd.h> |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 88 | #include <sys/types.h> |
| 89 | |
Willy Tarreau | 2dd0d47 | 2006-06-29 17:53:05 +0200 | [diff] [blame] | 90 | #include <common/compat.h> |
| 91 | #include <common/config.h> |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 92 | |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 93 | #include <types/global.h> |
| 94 | |
Willy Tarreau | 2a42950 | 2006-10-15 14:52:29 +0200 | [diff] [blame] | 95 | #include <proto/fd.h> |
Willy Tarreau | c6f4ce8 | 2009-06-10 11:09:37 +0200 | [diff] [blame] | 96 | #include <proto/port_range.h> |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 97 | |
| 98 | struct fdtab *fdtab = NULL; /* array of all the file descriptors */ |
Willy Tarreau | 8d5d77e | 2009-10-18 07:25:52 +0200 | [diff] [blame] | 99 | struct fdinfo *fdinfo = NULL; /* less-often used infos for file descriptors */ |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 100 | int maxfd; /* # of the highest fd + 1 */ |
| 101 | int totalconn; /* total # of terminated sessions */ |
| 102 | int actconn; /* # of active sessions */ |
| 103 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 104 | struct poller pollers[MAX_POLLERS]; |
| 105 | struct poller cur_poller; |
| 106 | int nbpollers = 0; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 107 | |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 108 | /* FD status is defined by the poller's status and by the speculative I/O list */ |
| 109 | int fd_nbspec = 0; // number of speculative events in the list |
| 110 | int fd_nbupdt = 0; // number of updates in the list |
| 111 | unsigned int *fd_spec = NULL; // speculative I/O list |
| 112 | unsigned int *fd_updt = NULL; // FD updates list |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 113 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 114 | /* Deletes an FD from the fdsets, and recomputes the maxfd limit. |
| 115 | * The file descriptor is also closed. |
| 116 | */ |
| 117 | void fd_delete(int fd) |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 118 | { |
Willy Tarreau | ad38ace | 2013-12-15 14:19:38 +0100 | [diff] [blame] | 119 | if (fdtab[fd].linger_risk) { |
| 120 | /* this is generally set when connecting to servers */ |
| 121 | setsockopt(fd, SOL_SOCKET, SO_LINGER, |
| 122 | (struct linger *) &nolinger, sizeof(struct linger)); |
| 123 | } |
Willy Tarreau | 6ea20b1 | 2012-11-11 16:05:19 +0100 | [diff] [blame] | 124 | if (cur_poller.clo) |
| 125 | cur_poller.clo(fd); |
| 126 | |
| 127 | release_spec_entry(fd); |
| 128 | fdtab[fd].spec_e &= ~(FD_EV_CURR_MASK | FD_EV_PREV_MASK); |
| 129 | |
Willy Tarreau | 8d5d77e | 2009-10-18 07:25:52 +0200 | [diff] [blame] | 130 | port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); |
| 131 | fdinfo[fd].port_range = NULL; |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 132 | close(fd); |
Willy Tarreau | db3b326 | 2012-07-05 23:19:22 +0200 | [diff] [blame] | 133 | fdtab[fd].owner = NULL; |
Willy Tarreau | 1720abd | 2012-11-11 17:08:32 +0100 | [diff] [blame] | 134 | fdtab[fd].new = 0; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 135 | |
Willy Tarreau | db3b326 | 2012-07-05 23:19:22 +0200 | [diff] [blame] | 136 | while ((maxfd-1 >= 0) && !fdtab[maxfd-1].owner) |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 137 | maxfd--; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 138 | } |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 139 | |
Willy Tarreau | 09f2456 | 2012-11-11 16:43:45 +0100 | [diff] [blame] | 140 | /* Scan and process the speculative events. This should be called right after |
| 141 | * the poller. |
| 142 | */ |
| 143 | void fd_process_spec_events() |
| 144 | { |
| 145 | int fd, spec_idx, e; |
| 146 | |
| 147 | /* now process speculative events if any */ |
| 148 | |
| 149 | for (spec_idx = 0; spec_idx < fd_nbspec; ) { |
| 150 | fd = fd_spec[spec_idx]; |
| 151 | e = fdtab[fd].spec_e; |
| 152 | |
| 153 | /* |
| 154 | * Process the speculative events. |
| 155 | * |
| 156 | * Principle: events which are marked FD_EV_ACTIVE are processed |
| 157 | * with their usual I/O callback. The callback may remove the |
| 158 | * events from the list or tag them for polling. Changes will be |
| 159 | * applied on next round. |
| 160 | */ |
| 161 | |
| 162 | fdtab[fd].ev &= FD_POLL_STICKY; |
| 163 | |
Willy Tarreau | 70d0ad5 | 2012-11-12 01:57:14 +0100 | [diff] [blame] | 164 | if (e & FD_EV_ACTIVE_R) |
Willy Tarreau | 09f2456 | 2012-11-11 16:43:45 +0100 | [diff] [blame] | 165 | fdtab[fd].ev |= FD_POLL_IN; |
| 166 | |
Willy Tarreau | 70d0ad5 | 2012-11-12 01:57:14 +0100 | [diff] [blame] | 167 | if (e & FD_EV_ACTIVE_W) |
Willy Tarreau | 09f2456 | 2012-11-11 16:43:45 +0100 | [diff] [blame] | 168 | fdtab[fd].ev |= FD_POLL_OUT; |
| 169 | |
| 170 | if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev) |
| 171 | fdtab[fd].iocb(fd); |
| 172 | |
| 173 | /* if the fd was removed from the spec list, it has been |
| 174 | * replaced by the next one that we don't want to skip ! |
| 175 | */ |
| 176 | if (spec_idx < fd_nbspec && fd_spec[spec_idx] != fd) |
| 177 | continue; |
| 178 | |
| 179 | spec_idx++; |
| 180 | } |
| 181 | } |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 182 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 183 | /* disable the specified poller */ |
| 184 | void disable_poller(const char *poller_name) |
| 185 | { |
| 186 | int p; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 187 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 188 | for (p = 0; p < nbpollers; p++) |
| 189 | if (strcmp(pollers[p].name, poller_name) == 0) |
| 190 | pollers[p].pref = 0; |
| 191 | } |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 192 | |
| 193 | /* |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 194 | * Initialize the pollers till the best one is found. |
| 195 | * If none works, returns 0, otherwise 1. |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 196 | */ |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 197 | int init_pollers() |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 198 | { |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 199 | int p; |
| 200 | struct poller *bp; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 201 | |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 202 | if ((fd_spec = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL) |
| 203 | goto fail_spec; |
| 204 | |
| 205 | if ((fd_updt = (uint32_t *)calloc(1, sizeof(uint32_t) * global.maxsock)) == NULL) |
| 206 | goto fail_updt; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 207 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 208 | do { |
| 209 | bp = NULL; |
| 210 | for (p = 0; p < nbpollers; p++) |
| 211 | if (!bp || (pollers[p].pref > bp->pref)) |
| 212 | bp = &pollers[p]; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 213 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 214 | if (!bp || bp->pref == 0) |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 215 | break; |
| 216 | |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 217 | if (bp->init(bp)) { |
| 218 | memcpy(&cur_poller, bp, sizeof(*bp)); |
| 219 | return 1; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 220 | } |
Willy Tarreau | 4f60f16 | 2007-04-08 16:39:58 +0200 | [diff] [blame] | 221 | } while (!bp || bp->pref == 0); |
| 222 | return 0; |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 223 | |
| 224 | fail_updt: |
| 225 | free(fd_spec); |
| 226 | fail_spec: |
| 227 | return 0; |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 228 | } |
| 229 | |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 230 | /* |
Krzysztof Piotr Oledzki | a643baf | 2008-05-29 23:53:44 +0200 | [diff] [blame] | 231 | * Deinitialize the pollers. |
| 232 | */ |
| 233 | void deinit_pollers() { |
| 234 | |
| 235 | struct poller *bp; |
| 236 | int p; |
| 237 | |
| 238 | for (p = 0; p < nbpollers; p++) { |
| 239 | bp = &pollers[p]; |
| 240 | |
| 241 | if (bp && bp->pref) |
| 242 | bp->term(bp); |
| 243 | } |
Willy Tarreau | 7be79a4 | 2012-11-11 15:02:54 +0100 | [diff] [blame] | 244 | |
| 245 | free(fd_updt); |
| 246 | free(fd_spec); |
| 247 | fd_updt = NULL; |
| 248 | fd_spec = NULL; |
Krzysztof Piotr Oledzki | a643baf | 2008-05-29 23:53:44 +0200 | [diff] [blame] | 249 | } |
| 250 | |
| 251 | /* |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 252 | * Lists the known pollers on <out>. |
| 253 | * Should be performed only before initialization. |
| 254 | */ |
| 255 | int list_pollers(FILE *out) |
| 256 | { |
| 257 | int p; |
| 258 | int last, next; |
| 259 | int usable; |
| 260 | struct poller *bp; |
| 261 | |
| 262 | fprintf(out, "Available polling systems :\n"); |
| 263 | |
| 264 | usable = 0; |
| 265 | bp = NULL; |
| 266 | last = next = -1; |
| 267 | while (1) { |
| 268 | for (p = 0; p < nbpollers; p++) { |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 269 | if ((next < 0 || pollers[p].pref > next) |
Willy Tarreau | e79c3b2 | 2010-11-19 10:20:36 +0100 | [diff] [blame] | 270 | && (last < 0 || pollers[p].pref < last)) { |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 271 | next = pollers[p].pref; |
Willy Tarreau | e79c3b2 | 2010-11-19 10:20:36 +0100 | [diff] [blame] | 272 | if (!bp || (pollers[p].pref > bp->pref)) |
| 273 | bp = &pollers[p]; |
| 274 | } |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 275 | } |
| 276 | |
| 277 | if (next == -1) |
| 278 | break; |
| 279 | |
| 280 | for (p = 0; p < nbpollers; p++) { |
| 281 | if (pollers[p].pref == next) { |
| 282 | fprintf(out, " %10s : ", pollers[p].name); |
| 283 | if (pollers[p].pref == 0) |
| 284 | fprintf(out, "disabled, "); |
| 285 | else |
| 286 | fprintf(out, "pref=%3d, ", pollers[p].pref); |
| 287 | if (pollers[p].test(&pollers[p])) { |
| 288 | fprintf(out, " test result OK"); |
| 289 | if (next > 0) |
| 290 | usable++; |
Willy Tarreau | e79c3b2 | 2010-11-19 10:20:36 +0100 | [diff] [blame] | 291 | } else { |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 292 | fprintf(out, " test result FAILED"); |
Willy Tarreau | e79c3b2 | 2010-11-19 10:20:36 +0100 | [diff] [blame] | 293 | if (bp == &pollers[p]) |
| 294 | bp = NULL; |
| 295 | } |
Willy Tarreau | 2ff7622 | 2007-04-09 19:29:56 +0200 | [diff] [blame] | 296 | fprintf(out, "\n"); |
| 297 | } |
| 298 | } |
| 299 | last = next; |
| 300 | next = -1; |
| 301 | }; |
| 302 | fprintf(out, "Total: %d (%d usable), will use %s.\n", nbpollers, usable, bp ? bp->name : "none"); |
| 303 | return 0; |
| 304 | } |
| 305 | |
| 306 | /* |
| 307 | * Some pollers may lose their connection after a fork(). It may be necessary |
| 308 | * to create initialize part of them again. Returns 0 in case of failure, |
| 309 | * otherwise 1. The fork() function may be NULL if unused. In case of error, |
| 310 | * the the current poller is destroyed and the caller is responsible for trying |
| 311 | * another one by calling init_pollers() again. |
| 312 | */ |
| 313 | int fork_poller() |
| 314 | { |
| 315 | if (cur_poller.fork) { |
| 316 | if (cur_poller.fork(&cur_poller)) |
| 317 | return 1; |
| 318 | cur_poller.term(&cur_poller); |
| 319 | return 0; |
| 320 | } |
| 321 | return 1; |
| 322 | } |
| 323 | |
| 324 | /* |
Willy Tarreau | baaee00 | 2006-06-26 02:48:02 +0200 | [diff] [blame] | 325 | * Local variables: |
| 326 | * c-indent-level: 8 |
| 327 | * c-basic-offset: 8 |
| 328 | * End: |
| 329 | */ |