blob: 34cb0bb76db410df9eed54b8488c6813582cb3d6 [file] [log] [blame]
/*
* HA-Proxy : High Availability-enabled HTTP/TCP proxy
* 2000-2006 - Willy Tarreau - willy AT meta-x DOT org.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Please refer to RFC2068 or RFC2616 for informations about HTTP protocol, and
* RFC2965 for informations about cookies usage. More generally, the IETF HTTP
* Working Group's web site should be consulted for protocol related changes :
*
* http://ftp.ics.uci.edu/pub/ietf/http/
*
* Pending bugs (may be not fixed because never reproduced) :
* - solaris only : sometimes, an HTTP proxy with only a dispatch address causes
* the proxy to terminate (no core) if the client breaks the connection during
* the response. Seen on 1.1.8pre4, but never reproduced. May not be related to
* the snprintf() bug since requests were simple (GET / HTTP/1.0), but may be
* related to missing setsid() (fixed in 1.1.15)
* - a proxy with an invalid config will prevent the startup even if disabled.
*
* ChangeLog has moved to the CHANGELOG file.
*
* TODO:
* - handle properly intermediate incomplete server headers. Done ?
* - handle hot-reconfiguration
* - fix client/server state transition when server is in connect or headers state
* and client suddenly disconnects. The server *should* switch to SHUT_WR, but
* still handle HTTP headers.
* - remove MAX_NEWHDR
* - cut this huge file into several ones
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <stdarg.h>
#include <sys/resource.h>
#include <time.h>
#include <syslog.h>
#ifdef USE_PCRE
#include <pcre.h>
#include <pcreposix.h>
#else
#include <regex.h>
#endif
#if defined(TPROXY) && defined(NETFILTER)
#include <linux/netfilter_ipv4.h>
#endif
#if defined(__dietlibc__)
#include <strings.h>
#endif
#if defined(ENABLE_POLL)
#include <sys/poll.h>
#endif
#if defined(ENABLE_EPOLL)
#if !defined(USE_MY_EPOLL)
#include <sys/epoll.h>
#else
#include "include/epoll.h"
#endif
#endif
#ifdef DEBUG_FULL
#include <assert.h>
#endif
#include <include/base64.h>
#include <include/uri_auth.h>
#include "include/appsession.h"
#include "include/mini-clist.h"
#ifndef HAPROXY_VERSION
#define HAPROXY_VERSION "1.2.14"
#endif
#ifndef HAPROXY_DATE
#define HAPROXY_DATE "2006/05/21"
#endif
/* this is for libc5 for example */
#ifndef TCP_NODELAY
#define TCP_NODELAY 1
#endif
#ifndef SHUT_RD
#define SHUT_RD 0
#endif
#ifndef SHUT_WR
#define SHUT_WR 1
#endif
/*
* BUFSIZE defines the size of a read and write buffer. It is the maximum
* amount of bytes which can be stored by the proxy for each session. However,
* when reading HTTP headers, the proxy needs some spare space to add or rewrite
* headers if needed. The size of this spare is defined with MAXREWRITE. So it
* is not possible to process headers longer than BUFSIZE-MAXREWRITE bytes. By
* default, BUFSIZE=16384 bytes and MAXREWRITE=BUFSIZE/2, so the maximum length
* of headers accepted is 8192 bytes, which is in line with Apache's limits.
*/
#ifndef BUFSIZE
#define BUFSIZE 16384
#endif
// reserved buffer space for header rewriting
#ifndef MAXREWRITE
#define MAXREWRITE (BUFSIZE / 2)
#endif
#define REQURI_LEN 1024
#define CAPTURE_LEN 64
// max # args on a configuration line
#define MAX_LINE_ARGS 40
// max # of added headers per request
#define MAX_NEWHDR 10
// max # of matches per regexp
#define MAX_MATCH 10
// cookie delimitor in "prefix" mode. This character is inserted between the
// persistence cookie and the original value. The '~' is allowed by RFC2965,
// and should not be too common in server names.
#ifndef COOKIE_DELIM
#define COOKIE_DELIM '~'
#endif
#define CONN_RETRIES 3
#define CHK_CONNTIME 2000
#define DEF_CHKINTR 2000
#define DEF_FALLTIME 3
#define DEF_RISETIME 2
#define DEF_CHECK_REQ "OPTIONS / HTTP/1.0\r\n\r\n"
/* Default connections limit.
*
* A system limit can be enforced at build time in order to avoid using haproxy
* beyond reasonable system limits. For this, just define SYSTEM_MAXCONN to the
* absolute limit accepted by the system. If the configuration specifies a
* higher value, it will be capped to SYSTEM_MAXCONN and a warning will be
* emitted. The only way to override this limit will be to set it via the
* command-line '-n' argument.
*/
#ifndef SYSTEM_MAXCONN
#define DEFAULT_MAXCONN 2000
#else
#define DEFAULT_MAXCONN SYSTEM_MAXCONN
#endif
#ifdef CONFIG_PRODUCT_NAME
#define PRODUCT_NAME CONFIG_PRODUCT_NAME
#else
#define PRODUCT_NAME "HAProxy"
#endif
/* how many bits are needed to code the size of an int (eg: 32bits -> 5) */
#define INTBITS 5
/* show stats this every millisecond, 0 to disable */
#ifndef STATTIME
#define STATTIME 2000
#endif
/* this reduces the number of calls to select() by choosing appropriate
* sheduler precision in milliseconds. It should be near the minimum
* time that is needed by select() to collect all events. All timeouts
* are rounded up by adding this value prior to pass it to select().
*/
#define SCHEDULER_RESOLUTION 9
#define TIME_ETERNITY -1
/* returns the lowest delay amongst <old> and <new>, and respects TIME_ETERNITY */
#define MINTIME(old, new) (((new)<0)?(old):(((old)<0||(new)<(old))?(new):(old)))
#define SETNOW(a) (*a=now)
/****** string-specific macros and functions ******/
/* if a > max, then bound <a> to <max>. The macro returns the new <a> */
#define UBOUND(a, max) ({ typeof(a) b = (max); if ((a) > b) (a) = b; (a); })
/* if a < min, then bound <a> to <min>. The macro returns the new <a> */
#define LBOUND(a, min) ({ typeof(a) b = (min); if ((a) < b) (a) = b; (a); })
/* returns 1 only if only zero or one bit is set in X, which means that X is a
* power of 2, and 0 otherwise */
#define POWEROF2(x) (((x) & ((x)-1)) == 0)
/*
* copies at most <size-1> chars from <src> to <dst>. Last char is always
* set to 0, unless <size> is 0. The number of chars copied is returned
* (excluding the terminating zero).
* This code has been optimized for size and speed : on x86, it's 45 bytes
* long, uses only registers, and consumes only 4 cycles per char.
*/
int strlcpy2(char *dst, const char *src, int size) {
char *orig = dst;
if (size) {
while (--size && (*dst = *src)) {
src++; dst++;
}
*dst = 0;
}
return dst - orig;
}
/*
* This function simply returns a statically allocated string containing
* the ascii representation for number 'n' in decimal.
*/
char *ultoa(unsigned long n) {
/* enough to store 2^63=18446744073709551615 */
static char itoa_str[21];
char *pos;
pos = itoa_str + sizeof(itoa_str) - 1;
*pos-- = '\0';
do {
*pos-- = '0' + n % 10;
n /= 10;
} while (n && pos >= itoa_str);
return pos + 1;
}
/*
* Returns a pointer to an area of <__len> bytes taken from the pool <pool> or
* dynamically allocated. In the first case, <__pool> is updated to point to
* the next element in the list.
*/
#define pool_alloc_from(__pool, __len) ({ \
void *__p; \
if ((__p = (__pool)) == NULL) \
__p = malloc(((__len) >= sizeof (void *)) ? (__len) : sizeof(void *)); \
else { \
__pool = *(void **)(__pool); \
} \
__p; \
})
/*
* Puts a memory area back to the corresponding pool.
* Items are chained directly through a pointer that
* is written in the beginning of the memory area, so
* there's no need for any carrier cell. This implies
* that each memory area is at least as big as one
* pointer.
*/
#define pool_free_to(__pool, __ptr) ({ \
*(void **)(__ptr) = (void *)(__pool); \
__pool = (void *)(__ptr); \
})
#define MEM_OPTIM
#ifdef MEM_OPTIM
/*
* Returns a pointer to type <type> taken from the
* pool <pool_type> or dynamically allocated. In the
* first case, <pool_type> is updated to point to the
* next element in the list.
*/
#define pool_alloc(type) ({ \
void *__p; \
if ((__p = pool_##type) == NULL) \
__p = malloc(sizeof_##type); \
else { \
pool_##type = *(void **)pool_##type; \
} \
__p; \
})
/*
* Puts a memory area back to the corresponding pool.
* Items are chained directly through a pointer that
* is written in the beginning of the memory area, so
* there's no need for any carrier cell. This implies
* that each memory area is at least as big as one
* pointer.
*/
#define pool_free(type, ptr) ({ \
*(void **)ptr = (void *)pool_##type; \
pool_##type = (void *)ptr; \
})
#else
#define pool_alloc(type) (calloc(1,sizeof_##type));
#define pool_free(type, ptr) (free(ptr));
#endif /* MEM_OPTIM */
#define sizeof_task sizeof(struct task)
#define sizeof_session sizeof(struct session)
#define sizeof_pendconn sizeof(struct pendconn)
#define sizeof_buffer sizeof(struct buffer)
#define sizeof_fdtab sizeof(struct fdtab)
#define sizeof_requri REQURI_LEN
#define sizeof_capture CAPTURE_LEN
#define sizeof_curappsession CAPTURE_LEN /* current_session pool */
#define sizeof_appsess sizeof(struct appsessions)
/* different possible states for the sockets */
#define FD_STCLOSE 0
#define FD_STLISTEN 1
#define FD_STCONN 2
#define FD_STREADY 3
#define FD_STERROR 4
/* values for task->state */
#define TASK_IDLE 0
#define TASK_RUNNING 1
/* values for proxy->state */
#define PR_STNEW 0
#define PR_STIDLE 1
#define PR_STRUN 2
#define PR_STSTOPPED 3
#define PR_STPAUSED 4
#define PR_STERROR 5
/* values for proxy->mode */
#define PR_MODE_TCP 0
#define PR_MODE_HTTP 1
#define PR_MODE_HEALTH 2
/* possible actions for the *poll() loops */
#define POLL_LOOP_ACTION_INIT 0
#define POLL_LOOP_ACTION_RUN 1
#define POLL_LOOP_ACTION_CLEAN 2
/* poll mechanisms available */
#define POLL_USE_SELECT (1<<0)
#define POLL_USE_POLL (1<<1)
#define POLL_USE_EPOLL (1<<2)
/* bits for proxy->options */
#define PR_O_REDISP 0x00000001 /* allow reconnection to dispatch in case of errors */
#define PR_O_TRANSP 0x00000002 /* transparent mode : use original DEST as dispatch */
#define PR_O_COOK_RW 0x00000004 /* rewrite all direct cookies with the right serverid */
#define PR_O_COOK_IND 0x00000008 /* keep only indirect cookies */
#define PR_O_COOK_INS 0x00000010 /* insert cookies when not accessing a server directly */
#define PR_O_COOK_PFX 0x00000020 /* rewrite all cookies by prefixing the right serverid */
#define PR_O_COOK_ANY (PR_O_COOK_RW | PR_O_COOK_IND | PR_O_COOK_INS | PR_O_COOK_PFX)
#define PR_O_BALANCE_RR 0x00000040 /* balance in round-robin mode */
#define PR_O_KEEPALIVE 0x00000080 /* follow keep-alive sessions */
#define PR_O_FWDFOR 0x00000100 /* insert x-forwarded-for with client address */
#define PR_O_BIND_SRC 0x00000200 /* bind to a specific source address when connect()ing */
#define PR_O_NULLNOLOG 0x00000400 /* a connect without request will not be logged */
#define PR_O_COOK_NOC 0x00000800 /* add a 'Cache-control' header with the cookie */
#define PR_O_COOK_POST 0x00001000 /* don't insert cookies for requests other than a POST */
#define PR_O_HTTP_CHK 0x00002000 /* use HTTP 'OPTIONS' method to check server health */
#define PR_O_PERSIST 0x00004000 /* server persistence stays effective even when server is down */
#define PR_O_LOGASAP 0x00008000 /* log as soon as possible, without waiting for the session to complete */
#define PR_O_HTTP_CLOSE 0x00010000 /* force 'connection: close' in both directions */
#define PR_O_CHK_CACHE 0x00020000 /* require examination of cacheability of the 'set-cookie' field */
#define PR_O_TCP_CLI_KA 0x00040000 /* enable TCP keep-alive on client-side sessions */
#define PR_O_TCP_SRV_KA 0x00080000 /* enable TCP keep-alive on server-side sessions */
#define PR_O_USE_ALL_BK 0x00100000 /* load-balance between backup servers */
#define PR_O_FORCE_CLO 0x00200000 /* enforce the connection close immediately after server response */
#define PR_O_BALANCE_SH 0x00400000 /* balance on source IP hash */
#define PR_O_BALANCE (PR_O_BALANCE_RR | PR_O_BALANCE_SH)
#define PR_O_ABRT_CLOSE 0x00800000 /* immediately abort request when client closes */
/* various session flags, bits values 0x01 to 0x20 (shift 0) */
#define SN_DIRECT 0x00000001 /* connection made on the server matching the client cookie */
#define SN_CLDENY 0x00000002 /* a client header matches a deny regex */
#define SN_CLALLOW 0x00000004 /* a client header matches an allow regex */
#define SN_SVDENY 0x00000008 /* a server header matches a deny regex */
#define SN_SVALLOW 0x00000010 /* a server header matches an allow regex */
#define SN_POST 0x00000020 /* the request was an HTTP POST */
/* session flags dedicated to cookies : bits values 0x40, 0x80 (0-3 shift 6) */
#define SN_CK_NONE 0x00000000 /* this session had no cookie */
#define SN_CK_INVALID 0x00000040 /* this session had a cookie which matches no server */
#define SN_CK_DOWN 0x00000080 /* this session had cookie matching a down server */
#define SN_CK_VALID 0x000000C0 /* this session had cookie matching a valid server */
#define SN_CK_MASK 0x000000C0 /* mask to get this session's cookie flags */
#define SN_CK_SHIFT 6 /* bit shift */
/* session termination conditions, bits values 0x100 to 0x700 (0-7 shift 8) */
#define SN_ERR_NONE 0x00000000
#define SN_ERR_CLITO 0x00000100 /* client time-out */
#define SN_ERR_CLICL 0x00000200 /* client closed (read/write error) */
#define SN_ERR_SRVTO 0x00000300 /* server time-out, connect time-out */
#define SN_ERR_SRVCL 0x00000400 /* server closed (connect/read/write error) */
#define SN_ERR_PRXCOND 0x00000500 /* the proxy decided to close (deny...) */
#define SN_ERR_RESOURCE 0x00000600 /* the proxy encountered a lack of a local resources (fd, mem, ...) */
#define SN_ERR_INTERNAL 0x00000700 /* the proxy encountered an internal error */
#define SN_ERR_MASK 0x00000700 /* mask to get only session error flags */
#define SN_ERR_SHIFT 8 /* bit shift */
/* session state at termination, bits values 0x1000 to 0x7000 (0-7 shift 12) */
#define SN_FINST_R 0x00001000 /* session ended during client request */
#define SN_FINST_C 0x00002000 /* session ended during server connect */
#define SN_FINST_H 0x00003000 /* session ended during server headers */
#define SN_FINST_D 0x00004000 /* session ended during data phase */
#define SN_FINST_L 0x00005000 /* session ended while pushing last data to client */
#define SN_FINST_Q 0x00006000 /* session ended while waiting in queue for a server slot */
#define SN_FINST_MASK 0x00007000 /* mask to get only final session state flags */
#define SN_FINST_SHIFT 12 /* bit shift */
/* cookie information, bits values 0x10000 to 0x80000 (0-8 shift 16) */
#define SN_SCK_NONE 0x00000000 /* no set-cookie seen for the server cookie */
#define SN_SCK_DELETED 0x00010000 /* existing set-cookie deleted or changed */
#define SN_SCK_INSERTED 0x00020000 /* new set-cookie inserted or changed existing one */
#define SN_SCK_SEEN 0x00040000 /* set-cookie seen for the server cookie */
#define SN_SCK_MASK 0x00070000 /* mask to get the set-cookie field */
#define SN_SCK_ANY 0x00080000 /* at least one set-cookie seen (not to be counted) */
#define SN_SCK_SHIFT 16 /* bit shift */
/* cacheability management, bits values 0x100000 to 0x300000 (0-3 shift 20) */
#define SN_CACHEABLE 0x00100000 /* at least part of the response is cacheable */
#define SN_CACHE_COOK 0x00200000 /* a cookie in the response is cacheable */
#define SN_CACHE_SHIFT 20 /* bit shift */
/* various other session flags, bits values 0x400000 and above */
#define SN_MONITOR 0x00400000 /* this session comes from a monitoring system */
#define SN_ASSIGNED 0x00800000 /* no need to assign a server to this session */
#define SN_ADDR_SET 0x01000000 /* this session's server address has been set */
#define SN_SELF_GEN 0x02000000 /* the proxy generates data for the client (eg: stats) */
/* various data sources for the responses */
#define DATA_SRC_NONE 0
#define DATA_SRC_STATS 1
/* data transmission states for the responses */
#define DATA_ST_INIT 0
#define DATA_ST_DATA 1
/* different possible states for the client side */
#define CL_STHEADERS 0
#define CL_STDATA 1
#define CL_STSHUTR 2
#define CL_STSHUTW 3
#define CL_STCLOSE 4
/* different possible states for the server side */
#define SV_STIDLE 0
#define SV_STCONN 1
#define SV_STHEADERS 2
#define SV_STDATA 3
#define SV_STSHUTR 4
#define SV_STSHUTW 5
#define SV_STCLOSE 6
/* result of an I/O event */
#define RES_SILENT 0 /* didn't happen */
#define RES_DATA 1 /* data were sent or received */
#define RES_NULL 2 /* result is 0 (read == 0), or connect without need for writing */
#define RES_ERROR 3 /* result -1 or error on the socket (eg: connect()) */
/* modes of operation (global.mode) */
#define MODE_DEBUG 1
#define MODE_STATS 2
#define MODE_LOG 4
#define MODE_DAEMON 8
#define MODE_QUIET 16
#define MODE_CHECK 32
#define MODE_VERBOSE 64
#define MODE_STARTING 128
#define MODE_FOREGROUND 256
/* server flags */
#define SRV_RUNNING 1 /* the server is UP */
#define SRV_BACKUP 2 /* this server is a backup server */
#define SRV_MAPPORTS 4 /* this server uses mapped ports */
#define SRV_BIND_SRC 8 /* this server uses a specific source address */
#define SRV_CHECKED 16 /* this server needs to be checked */
/* function which act on servers need to return various errors */
#define SRV_STATUS_OK 0 /* everything is OK. */
#define SRV_STATUS_INTERNAL 1 /* other unrecoverable errors. */
#define SRV_STATUS_NOSRV 2 /* no server is available */
#define SRV_STATUS_FULL 3 /* the/all server(s) are saturated */
#define SRV_STATUS_QUEUED 4 /* the/all server(s) are saturated but the connection was queued */
/* what to do when a header matches a regex */
#define ACT_ALLOW 0 /* allow the request */
#define ACT_REPLACE 1 /* replace the matching header */
#define ACT_REMOVE 2 /* remove the matching header */
#define ACT_DENY 3 /* deny the request */
#define ACT_PASS 4 /* pass this header without allowing or denying the request */
/* configuration sections */
#define CFG_NONE 0
#define CFG_GLOBAL 1
#define CFG_LISTEN 2
/* fields that need to be logged. They appear as flags in session->logs.logwait */
#define LW_DATE 1 /* date */
#define LW_CLIP 2 /* CLient IP */
#define LW_SVIP 4 /* SerVer IP */
#define LW_SVID 8 /* server ID */
#define LW_REQ 16 /* http REQuest */
#define LW_RESP 32 /* http RESPonse */
#define LW_PXIP 64 /* proxy IP */
#define LW_PXID 128 /* proxy ID */
#define LW_BYTES 256 /* bytes read from server */
#define LW_COOKIE 512 /* captured cookie */
#define LW_REQHDR 1024 /* request header(s) */
#define LW_RSPHDR 2048 /* response header(s) */
#define ERR_NONE 0 /* no error */
#define ERR_RETRYABLE 1 /* retryable error, may be cumulated */
#define ERR_FATAL 2 /* fatal error, may be cumulated */
/*********************************************************************/
#define LIST_HEAD(a) ((void *)(&(a)))
/*********************************************************************/
/* describes a chunk of string */
struct chunk {
char *str; /* beginning of the string itself. Might not be 0-terminated */
int len; /* size of the string from first to last char. <0 = uninit. */
};
struct cap_hdr {
struct cap_hdr *next;
char *name; /* header name, case insensitive */
int namelen; /* length of the header name, to speed-up lookups */
int len; /* capture length, not including terminal zero */
int index; /* index in the output array */
void *pool; /* pool of pre-allocated memory area of (len+1) bytes */
};
struct hdr_exp {
struct hdr_exp *next;
regex_t *preg; /* expression to look for */
int action; /* ACT_ALLOW, ACT_REPLACE, ACT_REMOVE, ACT_DENY */
char *replace; /* expression to set instead */
};
struct buffer {
unsigned int l; /* data length */
char *r, *w, *h, *lr; /* read ptr, write ptr, last header ptr, last read */
char *rlim; /* read limit, used for header rewriting */
unsigned long long total; /* total data read */
char data[BUFSIZE];
};
struct pendconn {
struct list list; /* chaining ... */
struct session *sess; /* the session waiting for a connection */
struct server *srv; /* the server we are waiting for */
};
struct server {
struct server *next;
int state; /* server state (SRV_*) */
int cklen; /* the len of the cookie, to speed up checks */
char *cookie; /* the id set in the cookie */
char *id; /* just for identification */
struct list pendconns; /* pending connections */
int nbpend, nbpend_max; /* number of pending connections */
struct task *queue_mgt; /* the task associated to the queue processing */
struct sockaddr_in addr; /* the address to connect to */
struct sockaddr_in source_addr; /* the address to which we want to bind for connect() */
short check_port; /* the port to use for the health checks */
int health; /* 0->rise-1 = bad; rise->rise+fall-1 = good */
int rise, fall; /* time in iterations */
int inter; /* time in milliseconds */
int result; /* 0 = connect OK, -1 = connect KO */
int curfd; /* file desc used for current test, or -1 if not in test */
unsigned char uweight, eweight; /* user-specified weight-1, and effective weight-1 */
unsigned int wscore; /* weight score, used during srv map computation */
int cur_sess, cur_sess_max; /* number of currently active sessions (including syn_sent) */
unsigned int cum_sess; /* cumulated number of sessions really sent to this server */
unsigned int maxconn, minconn; /* max # of active sessions (0 = unlimited), min# for dynamic limit. */
unsigned failed_checks, down_trans; /* failed checks and up-down transitions */
unsigned failed_conns, failed_resp; /* failed connect() and responses */
unsigned failed_secu; /* blocked responses because of security concerns */
struct proxy *proxy; /* the proxy this server belongs to */
};
/* The base for all tasks */
struct task {
struct task *next, *prev; /* chaining ... */
struct task *rqnext; /* chaining in run queue ... */
struct task *wq; /* the wait queue this task is in */
int state; /* task state : IDLE or RUNNING */
struct timeval expire; /* next expiration time for this task, use only for fast sorting */
int (*process)(struct task *t); /* the function which processes the task */
void *context; /* the task's context */
};
/* WARNING: if new fields are added, they must be initialized in event_accept() */
struct session {
struct task *task; /* the task associated with this session */
/* application specific below */
struct timeval crexpire; /* expiration date for a client read */
struct timeval cwexpire; /* expiration date for a client write */
struct timeval srexpire; /* expiration date for a server read */
struct timeval swexpire; /* expiration date for a server write */
struct timeval cnexpire; /* expiration date for a connect */
char res_cr, res_cw, res_sr, res_sw;/* results of some events */
struct proxy *proxy; /* the proxy this socket belongs to */
int cli_fd; /* the client side fd */
int srv_fd; /* the server side fd */
int cli_state; /* state of the client side */
int srv_state; /* state of the server side */
int conn_retries; /* number of connect retries left */
int flags; /* some flags describing the session */
struct buffer *req; /* request buffer */
struct buffer *rep; /* response buffer */
struct sockaddr_storage cli_addr; /* the client address */
struct sockaddr_in srv_addr; /* the address to connect to */
struct server *srv; /* the server being used */
struct pendconn *pend_pos; /* if not NULL, points to the position in the pending queue */
char **req_cap; /* array of captured request headers (may be NULL) */
char **rsp_cap; /* array of captured response headers (may be NULL) */
struct chunk req_line; /* points to first line */
struct chunk auth_hdr; /* points to 'Authorization:' header */
struct {
int logwait; /* log fields waiting to be collected : LW_* */
struct timeval tv_accept; /* date of the accept() (beginning of the session) */
long t_request; /* delay before the end of the request arrives, -1 if never occurs */
long t_queue; /* delay before the session gets out of the connect queue, -1 if never occurs */
long t_connect; /* delay before the connect() to the server succeeds, -1 if never occurs */
long t_data; /* delay before the first data byte from the server ... */
unsigned long t_close; /* total session duration */
unsigned long srv_queue_size; /* number of sessions waiting for a connect slot on this server at accept() time (in direct assignment) */
unsigned long prx_queue_size; /* overall number of sessions waiting for a connect slot on this instance at accept() time */
char *uri; /* first line if log needed, NULL otherwise */
char *cli_cookie; /* cookie presented by the client, in capture mode */
char *srv_cookie; /* cookie presented by the server, in capture mode */
int status; /* HTTP status from the server, negative if from proxy */
long long bytes; /* number of bytes transferred from the server */
} logs;
short int data_source; /* where to get the data we generate ourselves */
short int data_state; /* where to get the data we generate ourselves */
union {
struct {
struct proxy *px;
struct server *sv;
short px_st, sv_st; /* DATA_ST_INIT or DATA_ST_DATA */
} stats;
} data_ctx;
unsigned int uniq_id; /* unique ID used for the traces */
};
struct listener {
int fd; /* the listen socket */
struct sockaddr_storage addr; /* the address we listen to */
struct listener *next; /* next address or NULL */
};
struct proxy {
struct listener *listen; /* the listen addresses and sockets */
struct in_addr mon_net, mon_mask; /* don't forward connections from this net (network order) FIXME: should support IPv6 */
int state; /* proxy state */
struct sockaddr_in dispatch_addr; /* the default address to connect to */
struct server *srv; /* known servers */
int srv_act, srv_bck; /* # of running servers */
int tot_wact, tot_wbck; /* total weights of active and backup servers */
struct server **srv_map; /* the server map used to apply weights */
int srv_map_sz; /* the size of the effective server map */
int srv_rr_idx; /* next server to be elected in round robin mode */
char *cookie_name; /* name of the cookie to look for */
int cookie_len; /* strlen(cookie_name), computed only once */
char *appsession_name; /* name of the cookie to look for */
int appsession_name_len; /* strlen(appsession_name), computed only once */
int appsession_len; /* length of the appsession cookie value to be used */
int appsession_timeout;
CHTbl htbl_proxy; /* Per Proxy hashtable */
char *capture_name; /* beginning of the name of the cookie to capture */
int capture_namelen; /* length of the cookie name to match */
int capture_len; /* length of the string to be captured */
struct uri_auth *uri_auth; /* if non-NULL, the (list of) per-URI authentications */
int clitimeout; /* client I/O timeout (in milliseconds) */
int srvtimeout; /* server I/O timeout (in milliseconds) */
int contimeout; /* connect timeout (in milliseconds) */
char *id; /* proxy id */
struct list pendconns; /* pending connections with no server assigned yet */
int nbpend, nbpend_max; /* number of pending connections with no server assigned yet */
int totpend; /* total number of pending connections on this instance (for stats) */
unsigned int nbconn, nbconn_max; /* # of active sessions */
unsigned int cum_conn; /* cumulated number of processed sessions */
unsigned int maxconn; /* max # of active sessions */
unsigned failed_conns, failed_resp; /* failed connect() and responses */
unsigned failed_secu; /* blocked responses because of security concerns */
int conn_retries; /* maximum number of connect retries */
int options; /* PR_O_REDISP, PR_O_TRANSP, ... */
int mode; /* mode = PR_MODE_TCP, PR_MODE_HTTP or PR_MODE_HEALTH */
struct sockaddr_in source_addr; /* the address to which we want to bind for connect() */
struct proxy *next;
struct sockaddr_in logsrv1, logsrv2; /* 2 syslog servers */
signed char logfac1, logfac2; /* log facility for both servers. -1 = disabled */
int loglev1, loglev2; /* log level for each server, 7 by default */
int to_log; /* things to be logged (LW_*) */
struct timeval stop_time; /* date to stop listening, when stopping != 0 */
int nb_reqadd, nb_rspadd;
struct hdr_exp *req_exp; /* regular expressions for request headers */
struct hdr_exp *rsp_exp; /* regular expressions for response headers */
int nb_req_cap, nb_rsp_cap; /* # of headers to be captured */
struct cap_hdr *req_cap; /* chained list of request headers to be captured */
struct cap_hdr *rsp_cap; /* chained list of response headers to be captured */
void *req_cap_pool, *rsp_cap_pool; /* pools of pre-allocated char ** used to build the sessions */
char *req_add[MAX_NEWHDR], *rsp_add[MAX_NEWHDR]; /* headers to be added */
int grace; /* grace time after stop request */
char *check_req; /* HTTP request to use if PR_O_HTTP_CHK is set, else NULL */
int check_len; /* Length of the HTTP request */
struct {
char *msg400; /* message for error 400 */
int len400; /* message length for error 400 */
char *msg403; /* message for error 403 */
int len403; /* message length for error 403 */
char *msg408; /* message for error 408 */
int len408; /* message length for error 408 */
char *msg500; /* message for error 500 */
int len500; /* message length for error 500 */
char *msg502; /* message for error 502 */
int len502; /* message length for error 502 */
char *msg503; /* message for error 503 */
int len503; /* message length for error 503 */
char *msg504; /* message for error 504 */
int len504; /* message length for error 504 */
} errmsg;
};
/* info about one given fd */
struct fdtab {
int (*read)(int fd); /* read function */
int (*write)(int fd); /* write function */
struct task *owner; /* the session (or proxy) associated with this fd */
int state; /* the state of this fd */
};
/*********************************************************************/
int cfg_maxpconn = DEFAULT_MAXCONN; /* # of simultaneous connections per proxy (-N) */
int cfg_maxconn = 0; /* # of simultaneous connections, (-n) */
char *cfg_cfgfile = NULL; /* configuration file */
char *progname = NULL; /* program name */
int pid; /* current process id */
/* global options */
static struct {
int uid;
int gid;
int nbproc;
int maxconn;
int maxsock; /* max # of sockets */
int rlimit_nofile; /* default ulimit-n value : 0=unset */
int rlimit_memmax; /* default ulimit-d in megs value : 0=unset */
int mode;
char *chroot;
char *pidfile;
int logfac1, logfac2;
int loglev1, loglev2;
struct sockaddr_in logsrv1, logsrv2;
} global = {
logfac1 : -1,
logfac2 : -1,
loglev1 : 7, /* max syslog level : debug */
loglev2 : 7,
/* others NULL OK */
};
/*********************************************************************/
fd_set *StaticReadEvent,
*StaticWriteEvent;
int cfg_polling_mechanism = 0; /* POLL_USE_{SELECT|POLL|EPOLL} */
void **pool_session = NULL,
**pool_pendconn = NULL,
**pool_buffer = NULL,
**pool_fdtab = NULL,
**pool_requri = NULL,
**pool_task = NULL,
**pool_capture = NULL,
**pool_appsess = NULL;
struct proxy *proxy = NULL; /* list of all existing proxies */
struct fdtab *fdtab = NULL; /* array of all the file descriptors */
struct task *rq = NULL; /* global run queue */
struct task wait_queue[2] = { /* global wait queue */
{
prev:LIST_HEAD(wait_queue[0]), /* expirable tasks */
next:LIST_HEAD(wait_queue[0]),
},
{
prev:LIST_HEAD(wait_queue[1]), /* non-expirable tasks */
next:LIST_HEAD(wait_queue[1]),
},
};
static int totalconn = 0; /* total # of terminated sessions */
static int actconn = 0; /* # of active sessions */
static int maxfd = 0; /* # of the highest fd + 1 */
static int listeners = 0; /* # of listeners */
static int stopping = 0; /* non zero means stopping in progress */
static struct timeval now = {0,0}; /* the current date at any moment */
static struct timeval start_date; /* the process's start date */
static struct proxy defproxy; /* fake proxy used to assign default values on all instances */
/* Here we store informations about the pids of the processes we may pause
* or kill. We will send them a signal every 10 ms until we can bind to all
* our ports. With 200 retries, that's about 2 seconds.
*/
#define MAX_START_RETRIES 200
static int nb_oldpids = 0;
static int *oldpids = NULL;
static int oldpids_sig; /* use USR1 or TERM */
#if defined(ENABLE_EPOLL)
/* FIXME: this is dirty, but at the moment, there's no other solution to remove
* the old FDs from outside the loop. Perhaps we should export a global 'poll'
* structure with pointers to functions such as init_fd() and close_fd(), plus
* a private structure with several pointers to places such as below.
*/
static fd_set *PrevReadEvent = NULL, *PrevWriteEvent = NULL;
#endif
static regmatch_t pmatch[MAX_MATCH]; /* rm_so, rm_eo for regular expressions */
/* this is used to drain data, and as a temporary buffer for sprintf()... */
static char trash[BUFSIZE];
const int zero = 0;
const int one = 1;
/*
* Syslog facilities and levels. Conforming to RFC3164.
*/
#define MAX_SYSLOG_LEN 1024
#define NB_LOG_FACILITIES 24
const char *log_facilities[NB_LOG_FACILITIES] = {
"kern", "user", "mail", "daemon",
"auth", "syslog", "lpr", "news",
"uucp", "cron", "auth2", "ftp",
"ntp", "audit", "alert", "cron2",
"local0", "local1", "local2", "local3",
"local4", "local5", "local6", "local7"
};
#define NB_LOG_LEVELS 8
const char *log_levels[NB_LOG_LEVELS] = {
"emerg", "alert", "crit", "err",
"warning", "notice", "info", "debug"
};
#define SYSLOG_PORT 514
const char *monthname[12] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
const char sess_term_cond[8] = "-cCsSPRI"; /* normal, CliTo, CliErr, SrvTo, SrvErr, PxErr, Resource, Internal */
const char sess_fin_state[8] = "-RCHDLQ7"; /* cliRequest, srvConnect, srvHeader, Data, Last, Queue, unknown */
const char sess_cookie[4] = "NIDV"; /* No cookie, Invalid cookie, cookie for a Down server, Valid cookie */
const char sess_set_cookie[8] = "N1I3PD5R"; /* No set-cookie, unknown, Set-Cookie Inserted, unknown,
Set-cookie seen and left unchanged (passive), Set-cookie Deleted,
unknown, Set-cookie Rewritten */
#define MAX_HOSTNAME_LEN 32
static char hostname[MAX_HOSTNAME_LEN] = "";
const char *HTTP_302 =
"HTTP/1.0 302 Found\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"Location: "; /* not terminated since it will be concatenated with the URL */
/* same as 302 except that the browser MUST retry with the GET method */
const char *HTTP_303 =
"HTTP/1.0 303 See Other\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"Location: "; /* not terminated since it will be concatenated with the URL */
const char *HTTP_400 =
"HTTP/1.0 400 Bad request\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>400 Bad request</h1>\nYour browser sent an invalid request.\n</body></html>\n";
/* Warning: this one is an sprintf() fmt string, with <realm> as its only argument */
const char *HTTP_401_fmt =
"HTTP/1.0 401 Unauthorized\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"WWW-Authenticate: Basic realm=\"%s\"\r\n"
"\r\n"
"<html><body><h1>401 Unauthorized</h1>\nYou need a valid user and password to access this content.\n</body></html>\n";
const char *HTTP_403 =
"HTTP/1.0 403 Forbidden\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>403 Forbidden</h1>\nRequest forbidden by administrative rules.\n</body></html>\n";
const char *HTTP_408 =
"HTTP/1.0 408 Request Time-out\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>408 Request Time-out</h1>\nYour browser didn't send a complete request in time.\n</body></html>\n";
const char *HTTP_500 =
"HTTP/1.0 500 Server Error\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>500 Server Error</h1>\nAn internal server error occured.\n</body></html>\n";
const char *HTTP_502 =
"HTTP/1.0 502 Bad Gateway\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>502 Bad Gateway</h1>\nThe server returned an invalid or incomplete response.\n</body></html>\n";
const char *HTTP_503 =
"HTTP/1.0 503 Service Unavailable\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>503 Service Unavailable</h1>\nNo server is available to handle this request.\n</body></html>\n";
const char *HTTP_504 =
"HTTP/1.0 504 Gateway Time-out\r\n"
"Cache-Control: no-cache\r\n"
"Connection: close\r\n"
"\r\n"
"<html><body><h1>504 Gateway Time-out</h1>\nThe server didn't respond in time.\n</body></html>\n";
/*********************************************************************/
/* statistics ******************************************************/
/*********************************************************************/
#if STATTIME > 0
static int stats_tsk_lsrch, stats_tsk_rsrch,
stats_tsk_good, stats_tsk_right, stats_tsk_left,
stats_tsk_new, stats_tsk_nsrch;
#endif
/*********************************************************************/
/* debugging *******************************************************/
/*********************************************************************/
#ifdef DEBUG_FULL
static char *cli_stnames[5] = {"HDR", "DAT", "SHR", "SHW", "CLS" };
static char *srv_stnames[7] = {"IDL", "CON", "HDR", "DAT", "SHR", "SHW", "CLS" };
#endif
/*********************************************************************/
/* function prototypes *********************************************/
/*********************************************************************/
int event_accept(int fd);
int event_cli_read(int fd);
int event_cli_write(int fd);
int event_srv_read(int fd);
int event_srv_write(int fd);
int process_session(struct task *t);
static int appsession_task_init(void);
static int appsession_init(void);
static int appsession_refresh(struct task *t);
/*********************************************************************/
/* general purpose functions ***************************************/
/*********************************************************************/
void display_version() {
printf("HA-Proxy version " HAPROXY_VERSION " " HAPROXY_DATE"\n");
printf("Copyright 2000-2006 Willy Tarreau <w@w.ods.org>\n\n");
}
/*
* This function prints the command line usage and exits
*/
void usage(char *name) {
display_version();
fprintf(stderr,
"Usage : %s -f <cfgfile> [ -vdV"
#if STATTIME > 0
"sl"
#endif
"D ] [ -n <maxconn> ] [ -N <maxpconn> ]\n"
" [ -p <pidfile> ] [ -m <max megs> ]\n"
" -v displays version\n"
" -d enters debug mode ; -db only disables background mode.\n"
" -V enters verbose mode (disables quiet mode)\n"
#if STATTIME > 0
" -s enables statistics output\n"
" -l enables long statistics format\n"
#endif
" -D goes daemon ; implies -q\n"
" -q quiet mode : don't display messages\n"
" -c check mode : only check config file and exit\n"
" -n sets the maximum total # of connections (%d)\n"
" -m limits the usable amount of memory (in MB)\n"
" -N sets the default, per-proxy maximum # of connections (%d)\n"
" -p writes pids of all children to this file\n"
#if defined(ENABLE_EPOLL)
" -de disables epoll() usage even when available\n"
#endif
#if defined(ENABLE_POLL)
" -dp disables poll() usage even when available\n"
#endif
" -sf/-st [pid ]* finishes/terminates old pids. Must be last arguments.\n"
"\n",
name, DEFAULT_MAXCONN, cfg_maxpconn);
exit(1);
}
/*
* Displays the message on stderr with the date and pid. Overrides the quiet
* mode during startup.
*/
void Alert(char *fmt, ...) {
va_list argp;
struct timeval tv;
struct tm *tm;
if (!(global.mode & MODE_QUIET) || (global.mode & (MODE_VERBOSE | MODE_STARTING))) {
va_start(argp, fmt);
gettimeofday(&tv, NULL);
tm=localtime(&tv.tv_sec);
fprintf(stderr, "[ALERT] %03d/%02d%02d%02d (%d) : ",
tm->tm_yday, tm->tm_hour, tm->tm_min, tm->tm_sec, (int)getpid());
vfprintf(stderr, fmt, argp);
fflush(stderr);
va_end(argp);
}
}
/*
* Displays the message on stderr with the date and pid.
*/
void Warning(char *fmt, ...) {
va_list argp;
struct timeval tv;
struct tm *tm;
if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) {
va_start(argp, fmt);
gettimeofday(&tv, NULL);
tm=localtime(&tv.tv_sec);
fprintf(stderr, "[WARNING] %03d/%02d%02d%02d (%d) : ",
tm->tm_yday, tm->tm_hour, tm->tm_min, tm->tm_sec, (int)getpid());
vfprintf(stderr, fmt, argp);
fflush(stderr);
va_end(argp);
}
}
/*
* Displays the message on <out> only if quiet mode is not set.
*/
void qfprintf(FILE *out, char *fmt, ...) {
va_list argp;
if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) {
va_start(argp, fmt);
vfprintf(out, fmt, argp);
fflush(out);
va_end(argp);
}
}
/*
* converts <str> to a struct sockaddr_in* which is locally allocated.
* The format is "addr:port", where "addr" can be empty or "*" to indicate
* INADDR_ANY.
*/
struct sockaddr_in *str2sa(char *str) {
static struct sockaddr_in sa;
char *c;
int port;
memset(&sa, 0, sizeof(sa));
str=strdup(str);
if ((c=strrchr(str,':')) != NULL) {
*c++=0;
port=atol(c);
}
else
port=0;
if (*str == '*' || *str == '\0') { /* INADDR_ANY */
sa.sin_addr.s_addr = INADDR_ANY;
}
else if (!inet_pton(AF_INET, str, &sa.sin_addr)) {
struct hostent *he;
if ((he = gethostbyname(str)) == NULL) {
Alert("Invalid server name: '%s'\n", str);
}
else
sa.sin_addr = *(struct in_addr *) *(he->h_addr_list);
}
sa.sin_port=htons(port);
sa.sin_family=AF_INET;
free(str);
return &sa;
}
/*
* converts <str> to a two struct in_addr* which are locally allocated.
* The format is "addr[/mask]", where "addr" cannot be empty, and mask
* is optionnal and either in the dotted or CIDR notation.
* Note: "addr" can also be a hostname. Returns 1 if OK, 0 if error.
*/
int str2net(char *str, struct in_addr *addr, struct in_addr *mask) {
char *c;
unsigned long len;
memset(mask, 0, sizeof(*mask));
memset(addr, 0, sizeof(*addr));
str=strdup(str);
if ((c = strrchr(str, '/')) != NULL) {
*c++ = 0;
/* c points to the mask */
if (strchr(c, '.') != NULL) { /* dotted notation */
if (!inet_pton(AF_INET, c, mask))
return 0;
}
else { /* mask length */
char *err;
len = strtol(c, &err, 10);
if (!*c || (err && *err) || (unsigned)len > 32)
return 0;
if (len)
mask->s_addr = htonl(0xFFFFFFFFUL << (32 - len));
else
mask->s_addr = 0;
}
}
else {
mask->s_addr = 0xFFFFFFFF;
}
if (!inet_pton(AF_INET, str, addr)) {
struct hostent *he;
if ((he = gethostbyname(str)) == NULL) {
return 0;
}
else
*addr = *(struct in_addr *) *(he->h_addr_list);
}
free(str);
return 1;
}
/*
* converts <str> to a list of listeners which are dynamically allocated.
* The format is "{addr|'*'}:port[-end][,{addr|'*'}:port[-end]]*", where :
* - <addr> can be empty or "*" to indicate INADDR_ANY ;
* - <port> is a numerical port from 1 to 65535 ;
* - <end> indicates to use the range from <port> to <end> instead (inclusive).
* This can be repeated as many times as necessary, separated by a coma.
* The <tail> argument is a pointer to a current list which should be appended
* to the tail of the new list. The pointer to the new list is returned.
*/
struct listener *str2listener(char *str, struct listener *tail) {
struct listener *l;
char *c, *next, *range, *dupstr;
int port, end;
next = dupstr = strdup(str);
while (next && *next) {
struct sockaddr_storage ss;
str = next;
/* 1) look for the end of the first address */
if ((next = strrchr(str, ',')) != NULL) {
*next++ = 0;
}
/* 2) look for the addr/port delimiter, it's the last colon. */
if ((range = strrchr(str, ':')) == NULL) {
Alert("Missing port number: '%s'\n", str);
goto fail;
}
*range++ = 0;
if (strrchr(str, ':') != NULL) {
/* IPv6 address contains ':' */
memset(&ss, 0, sizeof(ss));
ss.ss_family = AF_INET6;
if (!inet_pton(ss.ss_family, str, &((struct sockaddr_in6 *)&ss)->sin6_addr)) {
Alert("Invalid server address: '%s'\n", str);
goto fail;
}
}
else {
memset(&ss, 0, sizeof(ss));
ss.ss_family = AF_INET;
if (*str == '*' || *str == '\0') { /* INADDR_ANY */
((struct sockaddr_in *)&ss)->sin_addr.s_addr = INADDR_ANY;
}
else if (!inet_pton(ss.ss_family, str, &((struct sockaddr_in *)&ss)->sin_addr)) {
struct hostent *he;
if ((he = gethostbyname(str)) == NULL) {
Alert("Invalid server name: '%s'\n", str);
goto fail;
}
else
((struct sockaddr_in *)&ss)->sin_addr =
*(struct in_addr *) *(he->h_addr_list);
}
}
/* 3) look for the port-end delimiter */
if ((c = strchr(range, '-')) != NULL) {
*c++ = 0;
end = atol(c);
}
else {
end = atol(range);
}
port = atol(range);
if (port < 1 || port > 65535) {
Alert("Invalid port '%d' specified for address '%s'.\n", port, str);
goto fail;
}
if (end < 1 || end > 65535) {
Alert("Invalid port '%d' specified for address '%s'.\n", end, str);
goto fail;
}
for (; port <= end; port++) {
l = (struct listener *)calloc(1, sizeof(struct listener));
l->next = tail;
tail = l;
l->fd = -1;
l->addr = ss;
if (ss.ss_family == AF_INET6)
((struct sockaddr_in6 *)(&l->addr))->sin6_port = htons(port);
else
((struct sockaddr_in *)(&l->addr))->sin_port = htons(port);
} /* end for(port) */
} /* end while(next) */
free(dupstr);
return tail;
fail:
free(dupstr);
return NULL;
}
#define FD_SETS_ARE_BITFIELDS
#ifdef FD_SETS_ARE_BITFIELDS
/*
* This map is used with all the FD_* macros to check whether a particular bit
* is set or not. Each bit represents an ACSII code. FD_SET() sets those bytes
* which should be encoded. When FD_ISSET() returns non-zero, it means that the
* byte should be encoded. Be careful to always pass bytes from 0 to 255
* exclusively to the macros.
*/
fd_set hdr_encode_map[(sizeof(fd_set) > (256/8)) ? 1 : ((256/8) / sizeof(fd_set))];
fd_set url_encode_map[(sizeof(fd_set) > (256/8)) ? 1 : ((256/8) / sizeof(fd_set))];
#else
#error "Check if your OS uses bitfields for fd_sets"
#endif
/* will try to encode the string <string> replacing all characters tagged in
* <map> with the hexadecimal representation of their ASCII-code (2 digits)
* prefixed by <escape>, and will store the result between <start> (included
*) and <stop> (excluded), and will always terminate the string with a '\0'
* before <stop>. The position of the '\0' is returned if the conversion
* completes. If bytes are missing between <start> and <stop>, then the
* conversion will be incomplete and truncated. If <stop> <= <start>, the '\0'
* cannot even be stored so we return <start> without writing the 0.
* The input string must also be zero-terminated.
*/
char hextab[16] = "0123456789ABCDEF";
char *encode_string(char *start, char *stop,
const char escape, const fd_set *map,
const char *string)
{
if (start < stop) {
stop--; /* reserve one byte for the final '\0' */
while (start < stop && *string != 0) {
if (!FD_ISSET((unsigned char)(*string), map))
*start++ = *string;
else {
if (start + 3 >= stop)
break;
*start++ = escape;
*start++ = hextab[(*string >> 4) & 15];
*start++ = hextab[*string & 15];
}
string++;
}
*start = '\0';
}
return start;
}
/*
* This function sends a syslog message to both log servers of a proxy,
* or to global log servers if the proxy is NULL.
* It also tries not to waste too much time computing the message header.
* It doesn't care about errors nor does it report them.
*/
void send_log(struct proxy *p, int level, char *message, ...) {
static int logfd = -1; /* syslog UDP socket */
static long tvsec = -1; /* to force the string to be initialized */
struct timeval tv;
va_list argp;
static char logmsg[MAX_SYSLOG_LEN];
static char *dataptr = NULL;
int fac_level;
int hdr_len, data_len;
struct sockaddr_in *sa[2];
int facilities[2], loglevel[2];
int nbloggers = 0;
char *log_ptr;
if (logfd < 0) {
if ((logfd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP)) < 0)
return;
}
if (level < 0 || progname == NULL || message == NULL)
return;
gettimeofday(&tv, NULL);
if (tv.tv_sec != tvsec || dataptr == NULL) {
/* this string is rebuild only once a second */
struct tm *tm = localtime(&tv.tv_sec);
tvsec = tv.tv_sec;
hdr_len = snprintf(logmsg, sizeof(logmsg),
"<<<<>%s %2d %02d:%02d:%02d %s[%d]: ",
monthname[tm->tm_mon],
tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec,
progname, pid);
/* WARNING: depending upon implementations, snprintf may return
* either -1 or the number of bytes that would be needed to store
* the total message. In both cases, we must adjust it.
*/
if (hdr_len < 0 || hdr_len > sizeof(logmsg))
hdr_len = sizeof(logmsg);
dataptr = logmsg + hdr_len;
}
va_start(argp, message);
data_len = vsnprintf(dataptr, logmsg + sizeof(logmsg) - dataptr, message, argp);
if (data_len < 0 || data_len > (logmsg + sizeof(logmsg) - dataptr))
data_len = logmsg + sizeof(logmsg) - dataptr;
va_end(argp);
dataptr[data_len - 1] = '\n'; /* force a break on ultra-long lines */
if (p == NULL) {
if (global.logfac1 >= 0) {
sa[nbloggers] = &global.logsrv1;
facilities[nbloggers] = global.logfac1;
loglevel[nbloggers] = global.loglev1;
nbloggers++;
}
if (global.logfac2 >= 0) {
sa[nbloggers] = &global.logsrv2;
facilities[nbloggers] = global.logfac2;
loglevel[nbloggers] = global.loglev2;
nbloggers++;
}
} else {
if (p->logfac1 >= 0) {
sa[nbloggers] = &p->logsrv1;
facilities[nbloggers] = p->logfac1;
loglevel[nbloggers] = p->loglev1;
nbloggers++;
}
if (p->logfac2 >= 0) {
sa[nbloggers] = &p->logsrv2;
facilities[nbloggers] = p->logfac2;
loglevel[nbloggers] = p->loglev2;
nbloggers++;
}
}
while (nbloggers-- > 0) {
/* we can filter the level of the messages that are sent to each logger */
if (level > loglevel[nbloggers])
continue;
/* For each target, we may have a different facility.
* We can also have a different log level for each message.
* This induces variations in the message header length.
* Since we don't want to recompute it each time, nor copy it every
* time, we only change the facility in the pre-computed header,
* and we change the pointer to the header accordingly.
*/
fac_level = (facilities[nbloggers] << 3) + level;
log_ptr = logmsg + 3; /* last digit of the log level */
do {
*log_ptr = '0' + fac_level % 10;
fac_level /= 10;
log_ptr--;
} while (fac_level && log_ptr > logmsg);
*log_ptr = '<';
/* the total syslog message now starts at logptr, for dataptr+data_len-logptr */
#ifndef MSG_NOSIGNAL
sendto(logfd, log_ptr, dataptr + data_len - log_ptr, MSG_DONTWAIT,
(struct sockaddr *)sa[nbloggers], sizeof(**sa));
#else
sendto(logfd, log_ptr, dataptr + data_len - log_ptr, MSG_DONTWAIT | MSG_NOSIGNAL,
(struct sockaddr *)sa[nbloggers], sizeof(**sa));
#endif
}
}
/* sets <tv> to the current time */
static inline struct timeval *tv_now(struct timeval *tv) {
if (tv)
gettimeofday(tv, NULL);
return tv;
}
/*
* adds <ms> ms to <from>, set the result to <tv> and returns a pointer <tv>
*/
static struct timeval *tv_delayfrom(struct timeval *tv, struct timeval *from, int ms) {
if (!tv || !from)
return NULL;
tv->tv_usec = from->tv_usec + (ms%1000)*1000;
tv->tv_sec = from->tv_sec + (ms/1000);
while (tv->tv_usec >= 1000000) {
tv->tv_usec -= 1000000;
tv->tv_sec++;
}
return tv;
}
/*
* compares <tv1> and <tv2> : returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2
* Must not be used when either argument is eternity. Use tv_cmp2() for that.
*/
static inline int tv_cmp(struct timeval *tv1, struct timeval *tv2) {
if (tv1->tv_sec < tv2->tv_sec)
return -1;
else if (tv1->tv_sec > tv2->tv_sec)
return 1;
else if (tv1->tv_usec < tv2->tv_usec)
return -1;
else if (tv1->tv_usec > tv2->tv_usec)
return 1;
else
return 0;
}
/*
* returns the absolute difference, in ms, between tv1 and tv2
* Must not be used when either argument is eternity.
*/
unsigned long tv_delta(struct timeval *tv1, struct timeval *tv2) {
int cmp;
unsigned long ret;
cmp = tv_cmp(tv1, tv2);
if (!cmp)
return 0; /* same dates, null diff */
else if (cmp < 0) {
struct timeval *tmp = tv1;
tv1 = tv2;
tv2 = tmp;
}
ret = (tv1->tv_sec - tv2->tv_sec) * 1000;
if (tv1->tv_usec > tv2->tv_usec)
ret += (tv1->tv_usec - tv2->tv_usec) / 1000;
else
ret -= (tv2->tv_usec - tv1->tv_usec) / 1000;
return (unsigned long) ret;
}
/*
* returns the difference, in ms, between tv1 and tv2
* Must not be used when either argument is eternity.
*/
static inline unsigned long tv_diff(struct timeval *tv1, struct timeval *tv2) {
unsigned long ret;
ret = (tv2->tv_sec - tv1->tv_sec) * 1000;
if (tv2->tv_usec > tv1->tv_usec)
ret += (tv2->tv_usec - tv1->tv_usec) / 1000;
else
ret -= (tv1->tv_usec - tv2->tv_usec) / 1000;
return (unsigned long) ret;
}
/*
* compares <tv1> and <tv2> modulo 1ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2
* Must not be used when either argument is eternity. Use tv_cmp2_ms() for that.
*/
static int tv_cmp_ms(struct timeval *tv1, struct timeval *tv2) {
if (tv1->tv_sec == tv2->tv_sec) {
if (tv2->tv_usec >= tv1->tv_usec + 1000)
return -1;
else if (tv1->tv_usec >= tv2->tv_usec + 1000)
return 1;
else
return 0;
}
else if ((tv2->tv_sec > tv1->tv_sec + 1) ||
((tv2->tv_sec == tv1->tv_sec + 1) && (tv2->tv_usec + 1000000 >= tv1->tv_usec + 1000)))
return -1;
else if ((tv1->tv_sec > tv2->tv_sec + 1) ||
((tv1->tv_sec == tv2->tv_sec + 1) && (tv1->tv_usec + 1000000 >= tv2->tv_usec + 1000)))
return 1;
else
return 0;
}
/*
* returns the remaining time between tv1=now and event=tv2
* if tv2 is passed, 0 is returned.
* Must not be used when either argument is eternity.
*/
static inline unsigned long tv_remain(struct timeval *tv1, struct timeval *tv2) {
unsigned long ret;
if (tv_cmp_ms(tv1, tv2) >= 0)
return 0; /* event elapsed */
ret = (tv2->tv_sec - tv1->tv_sec) * 1000;
if (tv2->tv_usec > tv1->tv_usec)
ret += (tv2->tv_usec - tv1->tv_usec) / 1000;
else
ret -= (tv1->tv_usec - tv2->tv_usec) / 1000;
return (unsigned long) ret;
}
/*
* zeroes a struct timeval
*/
static inline struct timeval *tv_eternity(struct timeval *tv) {
tv->tv_sec = tv->tv_usec = 0;
return tv;
}
/*
* returns 1 if tv is null, else 0
*/
static inline int tv_iseternity(struct timeval *tv) {
if (tv->tv_sec == 0 && tv->tv_usec == 0)
return 1;
else
return 0;
}
/*
* compares <tv1> and <tv2> : returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2,
* considering that 0 is the eternity.
*/
static int tv_cmp2(struct timeval *tv1, struct timeval *tv2) {
if (tv_iseternity(tv1))
if (tv_iseternity(tv2))
return 0; /* same */
else
return 1; /* tv1 later than tv2 */
else if (tv_iseternity(tv2))
return -1; /* tv2 later than tv1 */
if (tv1->tv_sec > tv2->tv_sec)
return 1;
else if (tv1->tv_sec < tv2->tv_sec)
return -1;
else if (tv1->tv_usec > tv2->tv_usec)
return 1;
else if (tv1->tv_usec < tv2->tv_usec)
return -1;
else
return 0;
}
/*
* compares <tv1> and <tv2> modulo 1 ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2,
* considering that 0 is the eternity.
*/
static int tv_cmp2_ms(struct timeval *tv1, struct timeval *tv2) {
if (tv_iseternity(tv1))
if (tv_iseternity(tv2))
return 0; /* same */
else
return 1; /* tv1 later than tv2 */
else if (tv_iseternity(tv2))
return -1; /* tv2 later than tv1 */
if (tv1->tv_sec == tv2->tv_sec) {
if (tv1->tv_usec >= tv2->tv_usec + 1000)
return 1;
else if (tv2->tv_usec >= tv1->tv_usec + 1000)
return -1;
else
return 0;
}
else if ((tv1->tv_sec > tv2->tv_sec + 1) ||
((tv1->tv_sec == tv2->tv_sec + 1) && (tv1->tv_usec + 1000000 >= tv2->tv_usec + 1000)))
return 1;
else if ((tv2->tv_sec > tv1->tv_sec + 1) ||
((tv2->tv_sec == tv1->tv_sec + 1) && (tv2->tv_usec + 1000000 >= tv1->tv_usec + 1000)))
return -1;
else
return 0;
}
/*
* returns the remaining time between tv1=now and event=tv2
* if tv2 is passed, 0 is returned.
* Returns TIME_ETERNITY if tv2 is eternity.
*/
static unsigned long tv_remain2(struct timeval *tv1, struct timeval *tv2) {
unsigned long ret;
if (tv_iseternity(tv2))
return TIME_ETERNITY;
if (tv_cmp_ms(tv1, tv2) >= 0)
return 0; /* event elapsed */
ret = (tv2->tv_sec - tv1->tv_sec) * 1000;
if (tv2->tv_usec > tv1->tv_usec)
ret += (tv2->tv_usec - tv1->tv_usec) / 1000;
else
ret -= (tv1->tv_usec - tv2->tv_usec) / 1000;
return (unsigned long) ret;
}
/*
* returns the first event between tv1 and tv2 into tvmin.
* a zero tv is ignored. tvmin is returned.
*/
static inline struct timeval *tv_min(struct timeval *tvmin,
struct timeval *tv1, struct timeval *tv2) {
if (tv_cmp2(tv1, tv2) <= 0)
*tvmin = *tv1;
else
*tvmin = *tv2;
return tvmin;
}
/***********************************************************/
/* fd management ***************************************/
/***********************************************************/
/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
* The file descriptor is also closed.
*/
static void fd_delete(int fd) {
FD_CLR(fd, StaticReadEvent);
FD_CLR(fd, StaticWriteEvent);
#if defined(ENABLE_EPOLL)
if (PrevReadEvent) {
FD_CLR(fd, PrevReadEvent);
FD_CLR(fd, PrevWriteEvent);
}
#endif
close(fd);
fdtab[fd].state = FD_STCLOSE;
while ((maxfd-1 >= 0) && (fdtab[maxfd-1].state == FD_STCLOSE))
maxfd--;
}
/* recomputes the maxfd limit from the fd */
static inline void fd_insert(int fd) {
if (fd+1 > maxfd)
maxfd = fd+1;
}
/*************************************************************/
/* task management ***************************************/
/*************************************************************/
/* puts the task <t> in run queue <q>, and returns <t> */
static inline struct task *task_wakeup(struct task **q, struct task *t) {
if (t->state == TASK_RUNNING)
return t;
else {
t->rqnext = *q;
t->state = TASK_RUNNING;
return *q = t;
}
}
/* removes the task <t> from the queue <q>
* <s> MUST be <q>'s first task.
* set the run queue to point to the next one, and return it
*/
static inline struct task *task_sleep(struct task **q, struct task *t) {
if (t->state == TASK_RUNNING) {
*q = t->rqnext;
t->state = TASK_IDLE; /* tell that s has left the run queue */
}
return *q; /* return next running task */
}
/*
* removes the task <t> from its wait queue. It must have already been removed
* from the run queue. A pointer to the task itself is returned.
*/
static inline struct task *task_delete(struct task *t) {
t->prev->next = t->next;
t->next->prev = t->prev;
return t;
}
/*
* frees a task. Its context must have been freed since it will be lost.
*/
static inline void task_free(struct task *t) {
pool_free(task, t);
}
/* inserts <task> into its assigned wait queue, where it may already be. In this case, it
* may be only moved or left where it was, depending on its timing requirements.
* <task> is returned.
*/
struct task *task_queue(struct task *task) {
struct task *list = task->wq;
struct task *start_from;
/* This is a very dirty hack to queue non-expirable tasks in another queue
* in order to avoid pulluting the tail of the standard queue. This will go
* away with the new O(log(n)) scheduler anyway.
*/
if (tv_iseternity(&task->expire)) {
/* if the task was queued in the standard wait queue, we must dequeue it */
if (task->prev) {
if (task->wq == LIST_HEAD(wait_queue[1]))
return task;
else {
task_delete(task);
task->prev = NULL;
}
}
list = task->wq = LIST_HEAD(wait_queue[1]);
} else {
/* if the task was queued in the eternity queue, we must dequeue it */
if (task->prev && (task->wq == LIST_HEAD(wait_queue[1]))) {
task_delete(task);
task->prev = NULL;
list = task->wq = LIST_HEAD(wait_queue[0]);
}
}
/* next, test if the task was already in a list */
if (task->prev == NULL) {
// start_from = list;
start_from = list->prev;
#if STATTIME > 0
stats_tsk_new++;
#endif
/* insert the unlinked <task> into the list, searching back from the last entry */
while (start_from != list && tv_cmp2(&task->expire, &start_from->expire) < 0) {
start_from = start_from->prev;
#if STATTIME > 0
stats_tsk_nsrch++;
#endif
}
// while (start_from->next != list && tv_cmp2(&task->expire, &start_from->next->expire) > 0) {
// start_from = start_from->next;
// stats_tsk_nsrch++;
// }
}
else if (task->prev == list ||
tv_cmp2(&task->expire, &task->prev->expire) >= 0) { /* walk right */
start_from = task->next;
if (start_from == list || tv_cmp2(&task->expire, &start_from->expire) <= 0) {
#if STATTIME > 0
stats_tsk_good++;
#endif
return task; /* it's already in the right place */
}
#if STATTIME > 0
stats_tsk_right++;
#endif
/* if the task is not at the right place, there's little chance that
* it has only shifted a bit, and it will nearly always be queued
* at the end of the list because of constant timeouts
* (observed in real case).
*/
#ifndef WE_REALLY_THINK_THAT_THIS_TASK_MAY_HAVE_SHIFTED
start_from = list->prev; /* assume we'll queue to the end of the list */
while (start_from != list && tv_cmp2(&task->expire, &start_from->expire) < 0) {
start_from = start_from->prev;
#if STATTIME > 0
stats_tsk_lsrch++;
#endif
}
#else /* WE_REALLY_... */
/* insert the unlinked <task> into the list, searching after position <start_from> */
while (start_from->next != list && tv_cmp2(&task->expire, &start_from->next->expire) > 0) {
start_from = start_from->next;
#if STATTIME > 0
stats_tsk_rsrch++;
#endif
}
#endif /* WE_REALLY_... */
/* we need to unlink it now */
task_delete(task);
}
else { /* walk left. */
#if STATTIME > 0
stats_tsk_left++;
#endif
#ifdef LEFT_TO_TOP /* not very good */
start_from = list;
while (start_from->next != list && tv_cmp2(&task->expire, &start_from->next->expire) > 0) {
start_from = start_from->next;
#if STATTIME > 0
stats_tsk_lsrch++;
#endif
}
#else
start_from = task->prev->prev; /* valid because of the previous test above */
while (start_from != list && tv_cmp2(&task->expire, &start_from->expire) < 0) {
start_from = start_from->prev;
#if STATTIME > 0
stats_tsk_lsrch++;
#endif
}
#endif
/* we need to unlink it now */
task_delete(task);
}
task->prev = start_from;
task->next = start_from->next;
task->next->prev = task;
start_from->next = task;
return task;
}
/*********************************************************************/
/* pending connections queues **************************************/
/*********************************************************************/
/*
* Detaches pending connection <p>, decreases the pending count, and frees
* the pending connection. The connection might have been queued to a specific
* server as well as to the proxy. The session also gets marked unqueued.
*/
static void pendconn_free(struct pendconn *p) {
LIST_DEL(&p->list);
p->sess->pend_pos = NULL;
if (p->srv)
p->srv->nbpend--;
else
p->sess->proxy->nbpend--;
p->sess->proxy->totpend--;
pool_free(pendconn, p);
}
/* Returns the first pending connection for server <s>, which may be NULL if
* nothing is pending.
*/
static inline struct pendconn *pendconn_from_srv(struct server *s) {
if (!s->nbpend)
return NULL;
return LIST_ELEM(s->pendconns.n, struct pendconn *, list);
}
/* Returns the first pending connection for proxy <px>, which may be NULL if
* nothing is pending.
*/
static inline struct pendconn *pendconn_from_px(struct proxy *px) {
if (!px->nbpend)
return NULL;
return LIST_ELEM(px->pendconns.n, struct pendconn *, list);
}
/* Detaches the next pending connection from either a server or a proxy, and
* returns its associated session. If no pending connection is found, NULL is
* returned. Note that neither <srv> nor <px> can be NULL.
*/
static struct session *pendconn_get_next_sess(struct server *srv, struct proxy *px) {
struct pendconn *p;
struct session *sess;
p = pendconn_from_srv(srv);
if (!p) {
p = pendconn_from_px(px);
if (!p)
return NULL;
p->sess->srv = srv;
}
sess = p->sess;
pendconn_free(p);
return sess;
}
/* Adds the session <sess> to the pending connection list of server <sess>->srv
* or to the one of <sess>->proxy if srv is NULL. All counters and back pointers
* are updated accordingly. Returns NULL if no memory is available, otherwise the
* pendconn itself.
*/
static struct pendconn *pendconn_add(struct session *sess) {
struct pendconn *p;
p = pool_alloc(pendconn);
if (!p)
return NULL;
sess->pend_pos = p;
p->sess = sess;
p->srv = sess->srv;
if (sess->srv) {
LIST_ADDQ(&sess->srv->pendconns, &p->list);
sess->logs.srv_queue_size += sess->srv->nbpend;
sess->srv->nbpend++;
if (sess->srv->nbpend > sess->srv->nbpend_max)
sess->srv->nbpend_max = sess->srv->nbpend;
} else {
LIST_ADDQ(&sess->proxy->pendconns, &p->list);
sess->logs.prx_queue_size += sess->proxy->nbpend;
sess->proxy->nbpend++;
if (sess->proxy->nbpend > sess->proxy->nbpend_max)
sess->proxy->nbpend_max = sess->proxy->nbpend;
}
sess->proxy->totpend++;
return p;
}
/* returns the effective dynamic maxconn for a server, considering the minconn
* and the proxy's usage relative to its saturation.
*/
static unsigned int srv_dynamic_maxconn(struct server *s) {
return s->minconn ?
((s->maxconn * s->proxy->nbconn / s->proxy->maxconn) < s->minconn) ? s->minconn :
(s->maxconn * s->proxy->nbconn / s->proxy->maxconn) : s->maxconn;
}
/* returns 0 if nothing has to be done for server <s> regarding queued connections,
* and non-zero otherwise. Suited for and if/else usage.
*/
static inline int may_dequeue_tasks(struct server *s, struct proxy *p) {
return (s && (s->nbpend || p->nbpend) &&
(!s->maxconn || s->cur_sess < srv_dynamic_maxconn(s)) &&
s->queue_mgt);
}
/*********************************************************************/
/* more specific functions ***************************************/
/*********************************************************************/
/* some prototypes */
static int maintain_proxies(void);
/* This either returns the sockname or the original destination address. Code
* inspired from Patrick Schaaf's example of nf_getsockname() implementation.
*/
static int get_original_dst(int fd, struct sockaddr_in *sa, socklen_t *salen) {
#if defined(TPROXY) && defined(SO_ORIGINAL_DST)
return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, (void *)sa, salen);
#else
#if defined(TPROXY) && defined(USE_GETSOCKNAME)
return getsockname(fd, (struct sockaddr *)sa, salen);
#else
return -1;
#endif
#endif
}
/*
* frees the context associated to a session. It must have been removed first.
*/
static void session_free(struct session *s) {
if (s->pend_pos)
pendconn_free(s->pend_pos);
if (s->req)
pool_free(buffer, s->req);
if (s->rep)
pool_free(buffer, s->rep);
if (s->rsp_cap != NULL) {
struct cap_hdr *h;
for (h = s->proxy->rsp_cap; h; h = h->next) {
if (s->rsp_cap[h->index] != NULL)
pool_free_to(h->pool, s->rsp_cap[h->index]);
}
pool_free_to(s->proxy->rsp_cap_pool, s->rsp_cap);
}
if (s->req_cap != NULL) {
struct cap_hdr *h;
for (h = s->proxy->req_cap; h; h = h->next) {
if (s->req_cap[h->index] != NULL)
pool_free_to(h->pool, s->req_cap[h->index]);
}
pool_free_to(s->proxy->req_cap_pool, s->req_cap);
}
if (s->logs.uri)
pool_free(requri, s->logs.uri);
if (s->logs.cli_cookie)
pool_free(capture, s->logs.cli_cookie);
if (s->logs.srv_cookie)
pool_free(capture, s->logs.srv_cookie);
pool_free(session, s);
}
/*
* This function recounts the number of usable active and backup servers for
* proxy <p>. These numbers are returned into the p->srv_act and p->srv_bck.
* This function also recomputes the total active and backup weights.
*/
static void recount_servers(struct proxy *px) {
struct server *srv;
px->srv_act = 0; px->srv_bck = px->tot_wact = px->tot_wbck = 0;
for (srv = px->srv; srv != NULL; srv = srv->next) {
if (srv->state & SRV_RUNNING) {
if (srv->state & SRV_BACKUP) {
px->srv_bck++;
px->tot_wbck += srv->eweight + 1;
} else {
px->srv_act++;
px->tot_wact += srv->eweight + 1;
}
}
}
}
/* This function recomputes the server map for proxy px. It
* relies on px->tot_wact and px->tot_wbck, so it must be
* called after recount_servers(). It also expects px->srv_map
* to be initialized to the largest value needed.
*/
static void recalc_server_map(struct proxy *px) {
int o, tot, flag;
struct server *cur, *best;
if (px->srv_act) {
flag = SRV_RUNNING;
tot = px->tot_wact;
} else if (px->srv_bck) {
flag = SRV_RUNNING | SRV_BACKUP;
if (px->options & PR_O_USE_ALL_BK)
tot = px->tot_wbck;
else
tot = 1; /* the first server is enough */
} else {
px->srv_map_sz = 0;
return;
}
/* this algorithm gives priority to the first server, which means that
* it will respect the declaration order for equivalent weights, and
* that whatever the weights, the first server called will always be
* the first declard. This is an important asumption for the backup
* case, where we want the first server only.
*/
for (cur = px->srv; cur; cur = cur->next)
cur->wscore = 0;
for (o = 0; o < tot; o++) {
int max = 0;
best = NULL;
for (cur = px->srv; cur; cur = cur->next) {
if ((cur->state & (SRV_RUNNING | SRV_BACKUP)) == flag) {
int v;
/* If we are forced to return only one server, we don't want to
* go further, because we would return the wrong one due to
* divide overflow.
*/
if (tot == 1) {
best = cur;
break;
}
cur->wscore += cur->eweight + 1;
v = (cur->wscore + tot) / tot; /* result between 0 and 3 */
if (best == NULL || v > max) {
max = v;
best = cur;
}
}
}
px->srv_map[o] = best;
best->wscore -= tot;
}
px->srv_map_sz = tot;
}
/*
* This function tries to find a running server with free connection slots for
* the proxy <px> following the round-robin method.
* If any server is found, it will be returned and px->srv_rr_idx will be updated
* to point to the next server. If no valid server is found, NULL is returned.
*/
static inline struct server *get_server_rr_with_conns(struct proxy *px) {
int newidx;
struct server *srv;
if (px->srv_map_sz == 0)
return NULL;
if (px->srv_rr_idx < 0 || px->srv_rr_idx >= px->srv_map_sz)
px->srv_rr_idx = 0;
newidx = px->srv_rr_idx;
do {
srv = px->srv_map[newidx++];
if (!srv->maxconn || srv->cur_sess < srv_dynamic_maxconn(srv)) {
px->srv_rr_idx = newidx;
return srv;
}
if (newidx == px->srv_map_sz)
newidx = 0;
} while (newidx != px->srv_rr_idx);
return NULL;
}
/*
* This function tries to find a running server for the proxy <px> following
* the round-robin method.
* If any server is found, it will be returned and px->srv_rr_idx will be updated
* to point to the next server. If no valid server is found, NULL is returned.
*/
static inline struct server *get_server_rr(struct proxy *px) {
if (px->srv_map_sz == 0)
return NULL;
if (px->srv_rr_idx < 0 || px->srv_rr_idx >= px->srv_map_sz)
px->srv_rr_idx = 0;
return px->srv_map[px->srv_rr_idx++];
}
/*
* This function tries to find a running server for the proxy <px> following
* the source hash method. Depending on the number of active/backup servers,
* it will either look for active servers, or for backup servers.
* If any server is found, it will be returned. If no valid server is found,
* NULL is returned.
*/
static inline struct server *get_server_sh(struct proxy *px, char *addr, int len) {
unsigned int h, l;
if (px->srv_map_sz == 0)
return NULL;
l = h = 0;
if (px->srv_act > 1 || (px->srv_act == 0 && px->srv_bck > 1)) {
while ((l + sizeof (int)) <= len) {
h ^= ntohl(*(unsigned int *)(&addr[l]));
l += sizeof (int);
}
h %= px->srv_map_sz;
}
return px->srv_map[h];
}
/*
* This function marks the session as 'assigned' in direct or dispatch modes,
* or tries to assign one in balance mode, according to the algorithm. It does
* nothing if the session had already been assigned a server.
*
* It may return :
* SRV_STATUS_OK if everything is OK. s->srv will be valid.
* SRV_STATUS_NOSRV if no server is available. s->srv = NULL.
* SRV_STATUS_FULL if all servers are saturated. s->srv = NULL.
* SRV_STATUS_INTERNAL for other unrecoverable errors.
*
* Upon successful return, the session flag SN_ASSIGNED to indicate that it does
* not need to be called anymore. This usually means that s->srv can be trusted
* in balance and direct modes. This flag is not cleared, so it's to the caller
* to clear it if required (eg: redispatch).
*
*/
int assign_server(struct session *s) {
#ifdef DEBUG_FULL
fprintf(stderr,"assign_server : s=%p\n",s);
#endif
if (s->pend_pos)
return SRV_STATUS_INTERNAL;
if (!(s->flags & SN_ASSIGNED)) {
if ((s->proxy->options & PR_O_BALANCE) && !(s->flags & SN_DIRECT)) {
if (!s->proxy->srv_act && !s->proxy->srv_bck)
return SRV_STATUS_NOSRV;
if (s->proxy->options & PR_O_BALANCE_RR) {
s->srv = get_server_rr_with_conns(s->proxy);
if (!s->srv)
return SRV_STATUS_FULL;
}
else if (s->proxy->options & PR_O_BALANCE_SH) {
int len;
if (s->cli_addr.ss_family == AF_INET)
len = 4;
else if (s->cli_addr.ss_family == AF_INET6)
len = 16;
else /* unknown IP family */
return SRV_STATUS_INTERNAL;
s->srv = get_server_sh(s->proxy,
(void *)&((struct sockaddr_in *)&s->cli_addr)->sin_addr,
len);
}
else /* unknown balancing algorithm */
return SRV_STATUS_INTERNAL;
}
s->flags |= SN_ASSIGNED;
}
return SRV_STATUS_OK;
}
/*
* This function assigns a server address to a session, and sets SN_ADDR_SET.
* The address is taken from the currently assigned server, or from the
* dispatch or transparent address.
*
* It may return :
* SRV_STATUS_OK if everything is OK.
* SRV_STATUS_INTERNAL for other unrecoverable errors.
*
* Upon successful return, the session flag SN_ADDR_SET is set. This flag is
* not cleared, so it's to the caller to clear it if required.
*
*/
int assign_server_address(struct session *s) {
#ifdef DEBUG_FULL
fprintf(stderr,"assign_server_address : s=%p\n",s);
#endif
if (s->flags & SN_DIRECT || s->proxy->options & PR_O_BALANCE) {
/* A server is necessarily known for this session */
if (!(s->flags & SN_ASSIGNED))
return SRV_STATUS_INTERNAL;
s->srv_addr = s->srv->addr;
/* if this server remaps proxied ports, we'll use
* the port the client connected to with an offset. */
if (s->srv->state & SRV_MAPPORTS) {
struct sockaddr_in sockname;
socklen_t namelen = sizeof(sockname);
if (!(s->proxy->options & PR_O_TRANSP) ||
get_original_dst(s->cli_fd, (struct sockaddr_in *)&sockname, &namelen) == -1)
getsockname(s->cli_fd, (struct sockaddr *)&sockname, &namelen);
s->srv_addr.sin_port = htons(ntohs(s->srv_addr.sin_port) + ntohs(sockname.sin_port));
}
}
else if (*(int *)&s->proxy->dispatch_addr.sin_addr) {
/* connect to the defined dispatch addr */
s->srv_addr = s->proxy->dispatch_addr;
}
else if (s->proxy->options & PR_O_TRANSP) {
/* in transparent mode, use the original dest addr if no dispatch specified */
socklen_t salen = sizeof(s->srv_addr);
if (get_original_dst(s->cli_fd, &s->srv_addr, &salen) == -1) {
qfprintf(stderr, "Cannot get original server address.\n");
return SRV_STATUS_INTERNAL;
}
}
s->flags |= SN_ADDR_SET;
return SRV_STATUS_OK;
}
/* This function assigns a server to session <s> if required, and can add the
* connection to either the assigned server's queue or to the proxy's queue.
*
* Returns :
*
* SRV_STATUS_OK if everything is OK.
* SRV_STATUS_NOSRV if no server is available. s->srv = NULL.
* SRV_STATUS_QUEUED if the connection has been queued.
* SRV_STATUS_FULL if the server(s) is/are saturated and the
* connection could not be queued.
* SRV_STATUS_INTERNAL for other unrecoverable errors.
*
*/
int assign_server_and_queue(struct session *s) {
struct pendconn *p;
int err;
if (s->pend_pos)
return SRV_STATUS_INTERNAL;
if (s->flags & SN_ASSIGNED) {
/* a server does not need to be assigned, perhaps because we're in
* direct mode, or in dispatch or transparent modes where the server
* is not needed.
*/
if (s->srv &&
s->srv->maxconn && s->srv->cur_sess >= srv_dynamic_maxconn(s->srv)) {
p = pendconn_add(s);
if (p)
return SRV_STATUS_QUEUED;
else
return SRV_STATUS_FULL;
}
return SRV_STATUS_OK;
}
/* a server needs to be assigned */
err = assign_server(s);
switch (err) {
case SRV_STATUS_OK:
/* in balance mode, we might have servers with connection limits */
if (s->srv &&
s->srv->maxconn && s->srv->cur_sess >= srv_dynamic_maxconn(s->srv)) {
p = pendconn_add(s);
if (p)
return SRV_STATUS_QUEUED;
else
return SRV_STATUS_FULL;
}
return SRV_STATUS_OK;
case SRV_STATUS_FULL:
/* queue this session into the proxy's queue */
p = pendconn_add(s);
if (p)
return SRV_STATUS_QUEUED;
else
return SRV_STATUS_FULL;
case SRV_STATUS_NOSRV:
case SRV_STATUS_INTERNAL:
return err;
default:
return SRV_STATUS_INTERNAL;
}
}
/*
* This function initiates a connection to the server assigned to this session
* (s->srv, s->srv_addr). It will assign a server if none is assigned yet.
* It can return one of :
* - SN_ERR_NONE if everything's OK
* - SN_ERR_SRVTO if there are no more servers
* - SN_ERR_SRVCL if the connection was refused by the server
* - SN_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
* - SN_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
* - SN_ERR_INTERNAL for any other purely internal errors
* Additionnally, in the case of SN_ERR_RESOURCE, an emergency log will be emitted.
*/
int connect_server(struct session *s) {
int fd, err;
if (!(s->flags & SN_ADDR_SET)) {
err = assign_server_address(s);
if (err != SRV_STATUS_OK)
return SN_ERR_INTERNAL;
}
if ((fd = s->srv_fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) {
qfprintf(stderr, "Cannot get a server socket.\n");
if (errno == ENFILE)
send_log(s->proxy, LOG_EMERG,
"Proxy %s reached system FD limit at %d. Please check system tunables.\n",
s->proxy->id, maxfd);
else if (errno == EMFILE)
send_log(s->proxy, LOG_EMERG,
"Proxy %s reached process FD limit at %d. Please check 'ulimit-n' and restart.\n",
s->proxy->id, maxfd);
else if (errno == ENOBUFS || errno == ENOMEM)
send_log(s->proxy, LOG_EMERG,
"Proxy %s reached system memory limit at %d sockets. Please check system tunables.\n",
s->proxy->id, maxfd);
/* this is a resource error */
return SN_ERR_RESOURCE;
}
if (fd >= global.maxsock) {
/* do not log anything there, it's a normal condition when this option
* is used to serialize connections to a server !
*/
Alert("socket(): not enough free sockets. Raise -n argument. Giving up.\n");
close(fd);
return SN_ERR_PRXCOND; /* it is a configuration limit */
}
if ((fcntl(fd, F_SETFL, O_NONBLOCK)==-1) ||
(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)) == -1)) {
qfprintf(stderr,"Cannot set client socket to non blocking mode.\n");
close(fd);
return SN_ERR_INTERNAL;
}
if (s->proxy->options & PR_O_TCP_SRV_KA)
setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one));
/* allow specific binding :
* - server-specific at first
* - proxy-specific next
*/
if (s->srv != NULL && s->srv->state & SRV_BIND_SRC) {
setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one));
if (bind(fd, (struct sockaddr *)&s->srv->source_addr, sizeof(s->srv->source_addr)) == -1) {
Alert("Cannot bind to source address before connect() for server %s/%s. Aborting.\n",
s->proxy->id, s->srv->id);
close(fd);
send_log(s->proxy, LOG_EMERG,
"Cannot bind to source address before connect() for server %s/%s.\n",
s->proxy->id, s->srv->id);
return SN_ERR_RESOURCE;
}
}
else if (s->proxy->options & PR_O_BIND_SRC) {
setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one, sizeof(one));
if (bind(fd, (struct sockaddr *)&s->proxy->source_addr, sizeof(s->proxy->source_addr)) == -1) {
Alert("Cannot bind to source address before connect() for proxy %s. Aborting.\n", s->proxy->id);
close(fd);
send_log(s->proxy, LOG_EMERG,
"Cannot bind to source address before connect() for server %s/%s.\n",
s->proxy->id, s->srv->id);
return SN_ERR_RESOURCE;
}
}
if ((connect(fd, (struct sockaddr *)&s->srv_addr, sizeof(s->srv_addr)) == -1) &&
(errno != EINPROGRESS) && (errno != EALREADY) && (errno != EISCONN)) {
if (errno == EAGAIN || errno == EADDRINUSE) {
char *msg;
if (errno == EAGAIN) /* no free ports left, try again later */
msg = "no free ports";
else
msg = "local address already in use";
qfprintf(stderr,"Cannot connect: %s.\n",msg);
close(fd);
send_log(s->proxy, LOG_EMERG,
"Connect() failed for server %s/%s: %s.\n",
s->proxy->id, s->srv->id, msg);
return SN_ERR_RESOURCE;
} else if (errno == ETIMEDOUT) {
//qfprintf(stderr,"Connect(): ETIMEDOUT");
close(fd);
return SN_ERR_SRVTO;
} else {
// (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM)
//qfprintf(stderr,"Connect(): %d", errno);
close(fd);
return SN_ERR_SRVCL;
}
}
fdtab[fd].owner = s->task;
fdtab[fd].read = &event_srv_read;
fdtab[fd].write = &event_srv_write;
fdtab[fd].state = FD_STCONN; /* connection in progress */
FD_SET(fd, StaticWriteEvent); /* for connect status */
#if defined(DEBUG_FULL) && defined(ENABLE_EPOLL)
if (PrevReadEvent) {
assert(!(FD_ISSET(fd, PrevReadEvent)));
assert(!(FD_ISSET(fd, PrevWriteEvent)));
}
#endif
fd_insert(fd);
if (s->srv) {
s->srv->cur_sess++;
if (s->srv->cur_sess > s->srv->cur_sess_max)
s->srv->cur_sess_max = s->srv->cur_sess;
}
if (s->proxy->contimeout)
tv_delayfrom(&s->cnexpire, &now, s->proxy->contimeout);
else
tv_eternity(&s->cnexpire);
return SN_ERR_NONE; /* connection is OK */
}
/*
* this function is called on a read event from a client socket.
* It returns 0.
*/
int event_cli_read(int fd) {
struct task *t = fdtab[fd].owner;
struct session *s = t->context;
struct buffer *b = s->req;
int ret, max;
#ifdef DEBUG_FULL
fprintf(stderr,"event_cli_read : fd=%d, s=%p\n", fd, s);
#endif
if (fdtab[fd].state != FD_STERROR) {
#ifdef FILL_BUFFERS
while (1)
#else
do
#endif
{
if (b->l == 0) { /* let's realign the buffer to optimize I/O */
b->r = b->w = b->h = b->lr = b->data;
max = b->rlim - b->data;
}
else if (b->r > b->w) {
max = b->rlim - b->r;
}
else {
max = b->w - b->r;
/* FIXME: theorically, if w>0, we shouldn't have rlim < data+size anymore
* since it means that the rewrite protection has been removed. This
* implies that the if statement can be removed.
*/
if (max > b->rlim - b->data)
max = b->rlim - b->data;
}
if (max == 0) { /* not anymore room to store data */
FD_CLR(fd, StaticReadEvent);
break;
}
#ifndef MSG_NOSIGNAL
{
int skerr;
socklen_t lskerr = sizeof(skerr);
getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
if (skerr)
ret = -1;
else
ret = recv(fd, b->r, max, 0);
}
#else
ret = recv(fd, b->r, max, MSG_NOSIGNAL);
#endif
if (ret > 0) {
b->r += ret;
b->l += ret;
s->res_cr = RES_DATA;
if (b->r == b->data + BUFSIZE) {
b->r = b->data; /* wrap around the buffer */
}
b->total += ret;
/* we hope to read more data or to get a close on next round */
continue;
}
else if (ret == 0) {
s->res_cr = RES_NULL;
break;
}
else if (errno == EAGAIN) {/* ignore EAGAIN */
break;
}
else {
s->res_cr = RES_ERROR;
fdtab[fd].state = FD_STERROR;
break;
}
} /* while(1) */
#ifndef FILL_BUFFERS
while (0);
#endif
}
else {
s->res_cr = RES_ERROR;
fdtab[fd].state = FD_STERROR;
}
if (s->res_cr != RES_SILENT) {
if (s->proxy->clitimeout && FD_ISSET(fd, StaticReadEvent))
tv_delayfrom(&s->crexpire, &now, s->proxy->clitimeout);
else
tv_eternity(&s->crexpire);
task_wakeup(&rq, t);
}
return 0;
}
/*
* this function is called on a read event from a server socket.
* It returns 0.
*/
int event_srv_read(int fd) {
struct task *t = fdtab[fd].owner;
struct session *s = t->context;
struct buffer *b = s->rep;
int ret, max;
#ifdef DEBUG_FULL
fprintf(stderr,"event_srv_read : fd=%d, s=%p\n", fd, s);
#endif
if (fdtab[fd].state != FD_STERROR) {
#ifdef FILL_BUFFERS
while (1)
#else
do
#endif
{
if (b->l == 0) { /* let's realign the buffer to optimize I/O */
b->r = b->w = b->h = b->lr = b->data;
max = b->rlim - b->data;
}
else if (b->r > b->w) {
max = b->rlim - b->r;
}
else {
max = b->w - b->r;
/* FIXME: theorically, if w>0, we shouldn't have rlim < data+size anymore
* since it means that the rewrite protection has been removed. This
* implies that the if statement can be removed.
*/
if (max > b->rlim - b->data)
max = b->rlim - b->data;
}
if (max == 0) { /* not anymore room to store data */
FD_CLR(fd, StaticReadEvent);
break;
}
#ifndef MSG_NOSIGNAL
{
int skerr;
socklen_t lskerr = sizeof(skerr);
getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
if (skerr)
ret = -1;
else
ret = recv(fd, b->r, max, 0);
}
#else
ret = recv(fd, b->r, max, MSG_NOSIGNAL);
#endif
if (ret > 0) {
b->r += ret;
b->l += ret;
s->res_sr = RES_DATA;
if (b->r == b->data + BUFSIZE) {
b->r = b->data; /* wrap around the buffer */
}
b->total += ret;
/* we hope to read more data or to get a close on next round */
continue;
}
else if (ret == 0) {
s->res_sr = RES_NULL;
break;
}
else if (errno == EAGAIN) {/* ignore EAGAIN */
break;
}
else {
s->res_sr = RES_ERROR;
fdtab[fd].state = FD_STERROR;
break;
}
} /* while(1) */
#ifndef FILL_BUFFERS
while (0);
#endif
}
else {
s->res_sr = RES_ERROR;
fdtab[fd].state = FD_STERROR;
}
if (s->res_sr != RES_SILENT) {
if (s->proxy->srvtimeout && FD_ISSET(fd, StaticReadEvent))
tv_delayfrom(&s->srexpire, &now, s->proxy->srvtimeout);
else
tv_eternity(&s->srexpire);
task_wakeup(&rq, t);
}
return 0;
}
/*
* this function is called on a write event from a client socket.
* It returns 0.
*/
int event_cli_write(int fd) {
struct task *t = fdtab[fd].owner;
struct session *s = t->context;
struct buffer *b = s->rep;
int ret, max;
#ifdef DEBUG_FULL
fprintf(stderr,"event_cli_write : fd=%d, s=%p\n", fd, s);
#endif
if (b->l == 0) { /* let's realign the buffer to optimize I/O */
b->r = b->w = b->h = b->lr = b->data;
// max = BUFSIZE; BUG !!!!
max = 0;
}
else if (b->r > b->w) {
max = b->r - b->w;
}
else
max = b->data + BUFSIZE - b->w;
if (fdtab[fd].state != FD_STERROR) {
if (max == 0) {
s->res_cw = RES_NULL;
task_wakeup(&rq, t);
tv_eternity(&s->cwexpire);
FD_CLR(fd, StaticWriteEvent);
return 0;
}
#ifndef MSG_NOSIGNAL
{
int skerr;
socklen_t lskerr = sizeof(skerr);
getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
if (skerr)
ret = -1;
else
ret = send(fd, b->w, max, MSG_DONTWAIT);
}
#else
ret = send(fd, b->w, max, MSG_DONTWAIT | MSG_NOSIGNAL);
#endif
if (ret > 0) {
b->l -= ret;
b->w += ret;
s->res_cw = RES_DATA;
if (b->w == b->data + BUFSIZE) {
b->w = b->data; /* wrap around the buffer */
}
}
else if (ret == 0) {
/* nothing written, just make as if we were never called */
// s->res_cw = RES_NULL;
return 0;
}
else if (errno == EAGAIN) /* ignore EAGAIN */
return 0;
else {
s->res_cw = RES_ERROR;
fdtab[fd].state = FD_STERROR;
}
}
else {
s->res_cw = RES_ERROR;
fdtab[fd].state = FD_STERROR;
}
if (s->proxy->clitimeout) {
tv_delayfrom(&s->cwexpire, &now, s->proxy->clitimeout);
/* FIXME: to prevent the client from expiring read timeouts during writes,
* we refresh it. A solution would be to merge read+write timeouts into a
* unique one, although that needs some study particularly on full-duplex
* TCP connections. */
s->crexpire = s->cwexpire;
}
else
tv_eternity(&s->cwexpire);
task_wakeup(&rq, t);
return 0;
}
/*
* this function is called on a write event from a server socket.
* It returns 0.
*/
int event_srv_write(int fd) {
struct task *t = fdtab[fd].owner;
struct session *s = t->context;
struct buffer *b = s->req;
int ret, max;
#ifdef DEBUG_FULL
fprintf(stderr,"event_srv_write : fd=%d, s=%p\n", fd, s);
#endif
if (b->l == 0) { /* let's realign the buffer to optimize I/O */
b->r = b->w = b->h = b->lr = b->data;
// max = BUFSIZE; BUG !!!!
max = 0;
}
else if (b->r > b->w) {
max = b->r - b->w;
}
else
max = b->data + BUFSIZE - b->w;
if (fdtab[fd].state != FD_STERROR) {
if (max == 0) {
/* may be we have received a connection acknowledgement in TCP mode without data */
if (s->srv_state == SV_STCONN) {
int skerr;
socklen_t lskerr = sizeof(skerr);
getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
if (skerr) {
s->res_sw = RES_ERROR;
fdtab[fd].state = FD_STERROR;
task_wakeup(&rq, t);
tv_eternity(&s->swexpire);
FD_CLR(fd, StaticWriteEvent);
return 0;
}
}
s->res_sw = RES_NULL;
task_wakeup(&rq, t);
fdtab[fd].state = FD_STREADY;
tv_eternity(&s->swexpire);
FD_CLR(fd, StaticWriteEvent);
return 0;
}
#ifndef MSG_NOSIGNAL
{
int skerr;
socklen_t lskerr = sizeof(skerr);
getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr);
if (skerr)
ret = -1;
else
ret = send(fd, b->w, max, MSG_DONTWAIT);
}
#else
ret = send(fd, b->w, max, MSG_DONTWAIT | MSG_NOSIGNAL);
#endif
fdtab[fd].state = FD_STREADY;
if (ret > 0) {
b->l -= ret;
b->w += ret;
s->res_sw = RES_DATA;
if (b->w == b->data + BUFSIZE) {
b->w = b->data; /* wrap around the buffer */
}
}
else if (ret == 0) {
/* nothing written, just make as if we were never called */
// s->res_sw = RES_NULL;
return 0;
}
else if (errno == EAGAIN) /* ignore EAGAIN */
return 0;
else {
s->res_sw = RES_ERROR;
fdtab[fd].state = FD_STERROR;
}
}
else {
s->res_sw = RES_ERROR;
fdtab[fd].state = FD_STERROR;
}
/* We don't want to re-arm read/write timeouts if we're trying to connect,
* otherwise it could loop indefinitely !
*/
if (s->srv_state != SV_STCONN) {
if (s->proxy->srvtimeout) {
tv_delayfrom(&s->swexpire, &now, s->proxy->srvtimeout);
/* FIXME: to prevent the server from expiring read timeouts during writes,
* we refresh it. A solution would be to merge read+write+connect timeouts
* into a unique one since we don't mind expiring on read or write, and none
* of them is enabled while waiting for connect(), although that needs some
* study particularly on full-duplex TCP connections. */
s->srexpire = s->swexpire;
}
else
tv_eternity(&s->swexpire);
}
task_wakeup(&rq, t);
return 0;
}
/* returns 1 if the buffer is empty, 0 otherwise */
static inline int buffer_isempty(struct buffer *buf) {
return buf->l == 0;
}
/* returns 1 if the buffer is full, 0 otherwise */
static inline int buffer_isfull(struct buffer *buf) {
return buf->l == BUFSIZE;
}
/* flushes any content from buffer <buf> */
void buffer_flush(struct buffer *buf) {
buf->r = buf->h = buf->lr = buf->w = buf->data;
buf->l = 0;
}
/* returns the maximum number of bytes writable at once in this buffer */
int buffer_max(struct buffer *buf) {
if (buf->l == BUFSIZE)
return 0;
else if (buf->r >= buf->w)
return buf->data + BUFSIZE - buf->r;
else
return buf->w - buf->r;
}
/*
* Tries to realign the given buffer, and returns how many bytes can be written
* there at once without overwriting anything.
*/
int buffer_realign(struct buffer *buf) {
if (buf->l == 0) {
/* let's realign the buffer to optimize I/O */
buf->r = buf->w = buf->h = buf->lr = buf->data;
}
return buffer_max(buf);
}
/* writes <len> bytes from message <msg> to buffer <buf>. Returns 0 in case of
* success, or the number of bytes available otherwise.
* FIXME-20060521: handle unaligned data.
*/
int buffer_write(struct buffer *buf, const char *msg,