Blame - src/ev_sepoll.c - haproxy

blob: b7821fb67bb35e96c4534a75ca8726f13b104456 [file] [log] [blame]

Willy Tarreau	de99e99	2007-04-16 00:53:59 +0200	[diff] [blame^]	1	/*
				2	* FD polling functions for Speculative I/O combined with Linux epoll()
				3	*
				4	* Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*
				11	*/
				12
				13	#include <unistd.h>
				14	#include <sys/time.h>
				15	#include <sys/types.h>
				16
				17	#include <common/compat.h>
				18	#include <common/config.h>
				19	#include <common/standard.h>
				20	#include <common/time.h>
				21
				22	#include <types/fd.h>
				23	#include <types/global.h>
				24
				25	#include <proto/fd.h>
				26	#include <proto/task.h>
				27
				28	#if defined(USE_MY_EPOLL)
				29	#include <common/epoll.h>
				30	#include <errno.h>
				31	#include <sys/syscall.h>
				32	static _syscall1 (int, epoll_create, int, size);
				33	static _syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event);
				34	static _syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);
				35	#else
				36	#include <sys/epoll.h>
				37	#endif
				38
				39	/*
				40	* We define 4 states for each direction of a file descriptor, which we store
				41	* as 2 bits :
				42	*
				43	* 00 = IDLE : we're not interested in this event
				44	* 01 = SPEC : perform speculative I/O on this FD
				45	* 10 = WAIT : really wait for an availability event on this FD (poll)
				46	* 11 = STOP : was marked WAIT, but disabled. It can switch back to WAIT if
				47	* the application changes its mind, otherwise disable FD polling
				48	* and switch back to IDLE.
				49	*
				50	* Since we do not want to scan all the FD list to find speculative I/O events,
				51	* we store them in a list consisting in a linear array holding only the FD
				52	* indexes right now.
				53	*
				54	* The STOP state requires the event to be present in the spec list so that
				55	* it can be detected and flushed upon next scan without having to scan the
				56	* whole FD list.
				57	*
				58	* This translates like this :
				59	*
				60	* EVENT_IN_SPEC_LIST = 01
				61	* EVENT_IN_POLL_LIST = 10
				62	*
				63	* IDLE = 0
				64	* SPEC = (EVENT_IN_SPEC_LIST)
				65	* WAIT = (EVENT_IN_POLL_LIST)
				66	* STOP = (EVENT_IN_SPEC_LIST\|EVENT_IN_POLL_LIST)
				67	*
				68	* fd_is_set() just consists in checking that the status is 01 or 10.
				69	*
				70	* For efficiency reasons, we will store the Read and Write bits interlaced to
				71	* form a 4-bit field, so that we can simply shift the value right by 0/1 and
				72	* get what we want :
				73	* 3 2 1 0
				74	* Wp Rp Ws Rs
				75	*
				76	* The FD array has to hold a back reference to the speculative list. This
				77	* reference is only valid if at least one of the directions is marked SPEC.
				78	*
				79	*/
				80
				81	#define FD_EV_IN_SL 1
				82	#define FD_EV_IN_PL 4
				83
				84	#define FD_EV_IDLE 0
				85	#define FD_EV_SPEC (FD_EV_IN_SL)
				86	#define FD_EV_WAIT (FD_EV_IN_PL)
				87	#define FD_EV_STOP (FD_EV_IN_SL\|FD_EV_IN_PL)
				88
				89	/* Those match any of R or W for Spec list or Poll list */
				90	#define FD_EV_RW_SL (FD_EV_IN_SL \| (FD_EV_IN_SL << 1))
				91	#define FD_EV_RW_PL (FD_EV_IN_PL \| (FD_EV_IN_PL << 1))
				92	#define FD_EV_MASK_DIR (FD_EV_IN_SL\|FD_EV_IN_PL)
				93
				94	#define FD_EV_IDLE_R 0
				95	#define FD_EV_SPEC_R (FD_EV_IN_SL)
				96	#define FD_EV_WAIT_R (FD_EV_IN_PL)
				97	#define FD_EV_STOP_R (FD_EV_IN_SL\|FD_EV_IN_PL)
				98	#define FD_EV_MASK_R (FD_EV_IN_SL\|FD_EV_IN_PL)
				99
				100	#define FD_EV_IDLE_W (FD_EV_IDLE_R << 1)
				101	#define FD_EV_SPEC_W (FD_EV_SPEC_R << 1)
				102	#define FD_EV_WAIT_W (FD_EV_WAIT_R << 1)
				103	#define FD_EV_STOP_W (FD_EV_STOP_R << 1)
				104	#define FD_EV_MASK_W (FD_EV_MASK_R << 1)
				105
				106	#define FD_EV_MASK (FD_EV_MASK_W \| FD_EV_MASK_R)
				107
				108
				109	/* descriptor of one FD.
				110	* FIXME: should be a bit field */
				111	struct fd_status {
				112	unsigned int e:4; // read and write events status.
				113	unsigned int s:28; // Position in spec list. Should be last.
				114	};
				115
				116	static int nbspec = 0; // current size of the spec list
				117
				118	static struct fd_status *fd_list = NULL; // list of FDs
				119	static unsigned int *spec_list = NULL; // speculative I/O list
				120
				121	/* private data */
				122	static struct epoll_event *epoll_events;
				123	static int epoll_fd;
				124
				125	/* This structure may be used for any purpose. Warning! do not use it in
				126	* recursive functions !
				127	*/
				128	static struct epoll_event ev;
				129
				130
				131	REGPRM1 static void alloc_spec_entry(const int fd)
				132	{
				133	if (fd_list[fd].e & FD_EV_RW_SL)
				134	return;
				135	fd_list[fd].s = nbspec;
				136	spec_list[nbspec++] = fd;
				137	}
				138
				139	/* removes entry <pos> from the spec list and replaces it with the last one.
				140	* The fd_list is adjusted to match the back reference if needed.
				141	*/
				142	REGPRM1 static void delete_spec_entry(const int pos)
				143	{
				144	int fd;
				145
				146	nbspec--;
				147	if (pos == nbspec)
				148	return;
				149
				150	/* we replace current FD by the highest one */
				151	fd = spec_list[nbspec];
				152	spec_list[pos] = fd;
				153	fd_list[fd].s = pos;
				154	}
				155
				156	/*
				157	* Returns non-zero if <fd> is already monitored for events in direction <dir>.
				158	*/
				159	REGPRM2 static int __fd_is_set(const int fd, int dir)
				160	{
				161	int ret;
				162
				163	ret = ((unsigned)fd_list[fd].e >> dir) & FD_EV_MASK_DIR;
				164	return (ret == FD_EV_SPEC \|\| ret == FD_EV_WAIT);
				165	}
				166
				167	/*
				168	* Don't worry about the strange constructs in __fd_set/__fd_clr, they are
				169	* designed like this in order to reduce the number of jumps (verified).
				170	*/
				171	REGPRM2 static int __fd_set(const int fd, int dir)
				172	{
				173	__label__ switch_state;
				174	unsigned int i;
				175
				176	i = ((unsigned)fd_list[fd].e >> dir) & FD_EV_MASK_DIR;
				177
				178	if (i == FD_EV_IDLE) {
				179	// switch to SPEC state and allocate a SPEC entry.
				180	alloc_spec_entry(fd);
				181	switch_state:
				182	fd_list[fd].e ^= (unsigned int)(FD_EV_IN_SL << dir);
				183	return 1;
				184	}
				185	else if (i == FD_EV_STOP) {
				186	// switch to WAIT state
				187	goto switch_state;
				188	}
				189	else
				190	return 0;
				191	}
				192
				193	REGPRM2 static int __fd_clr(const int fd, int dir)
				194	{
				195	__label__ switch_state;
				196	unsigned int i;
				197
				198	i = ((unsigned)fd_list[fd].e >> dir) & FD_EV_MASK_DIR;
				199
				200	if (i == FD_EV_SPEC) {
				201	// switch to IDLE state
				202	goto switch_state;
				203	}
				204	else if (likely(i == FD_EV_WAIT)) {
				205	// switch to STOP state
				206	/* We will create a queue entry for this one because we want to
				207	* process it later in order to merge it with other events on
				208	* the same FD.
				209	*/
				210	alloc_spec_entry(fd);
				211	switch_state:
				212	fd_list[fd].e ^= (unsigned int)(FD_EV_IN_SL << dir);
				213	return 1;
				214	}
				215	return 0;
				216	}
				217
				218	REGPRM1 static void __fd_rem(int fd)
				219	{
				220	__fd_clr(fd, DIR_RD);
				221	__fd_clr(fd, DIR_WR);
				222	}
				223
				224	/*
				225	* On valid epoll() implementations, a call to close() automatically removes
				226	* the fds. This means that the FD will appear as previously unset.
				227	*/
				228	REGPRM1 static void __fd_clo(int fd)
				229	{
				230	if (fd_list[fd].e & FD_EV_RW_SL)
				231	delete_spec_entry(fd_list[fd].s);
				232	fd_list[fd].e &= ~(FD_EV_MASK);
				233	}
				234
				235	static struct ev_to_epoll {
				236	char op; // epoll opcode to switch from spec to wait, 0 if none
				237	char m; // inverted mask for existing events
				238	char ev; // remainint epoll events after change
				239	char pad;
				240	} ev_to_epoll[16] = {
				241	[FD_EV_IDLE_W \| FD_EV_STOP_R] = { .op=EPOLL_CTL_DEL, .m=FD_EV_MASK_R },
				242	[FD_EV_SPEC_W \| FD_EV_STOP_R] = { .op=EPOLL_CTL_DEL, .m=FD_EV_MASK_R },
				243	[FD_EV_STOP_W \| FD_EV_IDLE_R] = { .op=EPOLL_CTL_DEL, .m=FD_EV_MASK_W },
				244	[FD_EV_STOP_W \| FD_EV_SPEC_R] = { .op=EPOLL_CTL_DEL, .m=FD_EV_MASK_W },
				245	[FD_EV_WAIT_W \| FD_EV_STOP_R] = { .op=EPOLL_CTL_MOD, .m=FD_EV_MASK_R, .ev=EPOLLOUT },
				246	[FD_EV_STOP_W \| FD_EV_WAIT_R] = { .op=EPOLL_CTL_MOD, .m=FD_EV_MASK_W, .ev=EPOLLIN },
				247	[FD_EV_STOP_W \| FD_EV_STOP_R] = { .op=EPOLL_CTL_DEL, .m=FD_EV_MASK_R\|FD_EV_MASK_W },
				248	[FD_EV_WAIT_W \| FD_EV_WAIT_R] = { .ev=EPOLLIN\|EPOLLOUT },
				249	};
				250
				251	/*
				252	* speculative epoll() poller
				253	*/
				254	REGPRM2 static void _do_poll(struct poller *p, int wait_time)
				255	{
				256	static unsigned int last_skipped;
				257	int status;
				258	int fd, opcode;
				259	int count;
				260	int spec_idx;
				261
				262
				263	/* Here we have two options :
				264	* - either walk the list forwards and hope to atch more events
				265	* - or walk it backwards to minimize the number of changes and
				266	* to make better use of the cache.
				267	* Tests have shown that walking backwards improves perf by 0.2%.
				268	*/
				269
				270	spec_idx = nbspec;
				271	while (likely(spec_idx > 0)) {
				272	spec_idx--;
				273	fd = spec_list[spec_idx];
				274
				275	opcode = ev_to_epoll[fd_list[fd].e].op;
				276	if (opcode) {
				277	ev.events = ev_to_epoll[fd_list[fd].e].ev;
				278	ev.data.fd = fd;
				279	epoll_ctl(epoll_fd, opcode, fd, &ev);
				280	fd_list[fd].e &= ~(unsigned int)ev_to_epoll[fd_list[fd].e].m;
				281	}
				282
				283	if (!(fd_list[fd].e & FD_EV_RW_SL)) {
				284	// This one must be removed. Let's clear it now.
				285	delete_spec_entry(spec_idx);
				286	continue;
				287	}
				288
				289	/* OK so now we do not have any event marked STOP anymore in
				290	* the list. We can simply try to execute functions for the
				291	* events we have found, and requeue them in case of EAGAIN.
				292	*/
				293
				294	status = 0;
				295	fdtab[fd].ev = 0;
				296
				297	if ((fd_list[fd].e & FD_EV_MASK_R) == FD_EV_SPEC_R) {
				298	if (fdtab[fd].state != FD_STCLOSE) {
				299	fdtab[fd].ev \|= FD_POLL_IN;
				300	if (fdtab[fd].cb[DIR_RD].f(fd) == 0)
				301	status \|= EPOLLIN;
				302	}
				303	}
				304
				305	if ((fd_list[fd].e & FD_EV_MASK_W) == FD_EV_SPEC_W) {
				306	if (fdtab[fd].state != FD_STCLOSE) {
				307	fdtab[fd].ev \|= FD_POLL_OUT;
				308	if (fdtab[fd].cb[DIR_WR].f(fd) == 0)
				309	status \|= EPOLLOUT;
				310	}
				311	}
				312
				313	if (status) {
				314	/* Some speculative accesses have failed, we must
				315	* switch to the WAIT state.
				316	*/
				317	ev.events = status;
				318	ev.data.fd = fd;
				319	if (fd_list[fd].e & FD_EV_RW_PL) {
				320	// Event already in poll list
				321	ev.events \|= ev_to_epoll[fd_list[fd].e].ev;
				322	opcode = EPOLL_CTL_MOD;
				323	} else {
				324	// Event not in poll list yet
				325	opcode = EPOLL_CTL_ADD;
				326	}
				327	epoll_ctl(epoll_fd, opcode, fd, &ev);
				328
				329	/* We don't want epoll_wait() to wait for certain events
				330	* which might never come.
				331	*/
				332	wait_time = 0;
				333
				334	if (status & EPOLLIN) {
				335	fd_list[fd].e &= ~FD_EV_MASK_R;
				336	fd_list[fd].e \|= FD_EV_WAIT_R;
				337	}
				338	if (status & EPOLLOUT) {
				339	fd_list[fd].e &= ~FD_EV_MASK_W;
				340	fd_list[fd].e \|= FD_EV_WAIT_W;
				341	}
				342
				343	if ((fd_list[fd].e & FD_EV_MASK_R) != FD_EV_SPEC_R &&
				344	(fd_list[fd].e & FD_EV_MASK_W) != FD_EV_SPEC_W) {
				345	delete_spec_entry(spec_idx);
				346	continue;
				347	}
				348	}
				349	}
				350
				351	/* If some speculative events remain, we must not set the timeout in
				352	* epoll_wait(). Also, if some speculative events remain, it means
				353	* that some have been immediately processed, otherwise they would
				354	* have been disabled.
				355	*/
				356	if (nbspec) {
				357	if (!last_skipped++) {
				358	/* Measures have shown a great performance increase if
				359	* we call the epoll_wait() only the second time after
				360	* speculative accesses have succeeded. This reduces
				361	* the number of unsucessful calls to epoll_wait() by
				362	* a factor of about 3, and the total number of calls
				363	* by about 2.
				364	*/
				365	tv_now(&now);
				366	return;
				367	}
				368	wait_time = 0;
				369	}
				370	last_skipped = 0;
				371
				372	/* now let's wait for events */
				373	status = epoll_wait(epoll_fd, epoll_events, maxfd, wait_time);
				374	tv_now(&now);
				375
				376	for (count = 0; count < status; count++) {
				377	int e = epoll_events[count].events;
				378	fd = epoll_events[count].data.fd;
				379
				380	/* it looks complicated but gcc can optimize it away when constants
				381	* have same values.
				382	*/
				383	fdtab[fd].ev =
				384	((e & EPOLLIN ) ? FD_POLL_IN : 0) \|
				385	((e & EPOLLPRI) ? FD_POLL_PRI : 0) \|
				386	((e & EPOLLOUT) ? FD_POLL_OUT : 0) \|
				387	((e & EPOLLERR) ? FD_POLL_ERR : 0) \|
				388	((e & EPOLLHUP) ? FD_POLL_HUP : 0);
				389
				390	if ((fd_list[fd].e & FD_EV_MASK_R) == FD_EV_WAIT_R) {
				391	if (fdtab[fd].state == FD_STCLOSE)
				392	continue;
				393	if (fdtab[fd].ev & FD_POLL_RD)
				394	fdtab[fd].cb[DIR_RD].f(fd);
				395	}
				396
				397	if ((fd_list[fd].e & FD_EV_MASK_W) == FD_EV_WAIT_W) {
				398	if (fdtab[fd].state == FD_STCLOSE)
				399	continue;
				400	if (fdtab[fd].ev & FD_POLL_WR)
				401	fdtab[fd].cb[DIR_WR].f(fd);
				402	}
				403	}
				404	}
				405
				406	/*
				407	* Initialization of the speculative epoll() poller.
				408	* Returns 0 in case of failure, non-zero in case of success. If it fails, it
				409	* disables the poller by setting its pref to 0.
				410	*/
				411	REGPRM1 static int _do_init(struct poller *p)
				412	{
				413	__label__ fail_fd_list, fail_spec, fail_ee, fail_fd;
				414
				415	p->private = NULL;
				416
				417	epoll_fd = epoll_create(global.maxsock + 1);
				418	if (epoll_fd < 0)
				419	goto fail_fd;
				420
				421	epoll_events = (struct epoll_event*)
				422	calloc(1, sizeof(struct epoll_event) * global.maxsock);
				423
				424	if (epoll_events == NULL)
				425	goto fail_ee;
				426
				427	if ((spec_list = (uint32_t )calloc(1, sizeof(uint32_t) global.maxsock)) == NULL)
				428	goto fail_spec;
				429
				430	fd_list = (struct fd_status )calloc(1, sizeof(struct fd_status) global.maxsock);
				431	if (fd_list == NULL)
				432	goto fail_fd_list;
				433
				434	return 1;
				435
				436	fail_fd_list:
				437	free(spec_list);
				438	fail_spec:
				439	free(epoll_events);
				440	fail_ee:
				441	close(epoll_fd);
				442	epoll_fd = 0;
				443	fail_fd:
				444	p->pref = 0;
				445	return 0;
				446	}
				447
				448	/*
				449	* Termination of the speculative epoll() poller.
				450	* Memory is released and the poller is marked as unselectable.
				451	*/
				452	REGPRM1 static void _do_term(struct poller *p)
				453	{
				454	if (fd_list)
				455	free(fd_list);
				456	if (spec_list)
				457	free(spec_list);
				458	if (epoll_events)
				459	free(epoll_events);
				460
				461	close(epoll_fd);
				462	epoll_fd = 0;
				463
				464	fd_list = NULL;
				465	spec_list = NULL;
				466	epoll_events = NULL;
				467
				468	p->private = NULL;
				469	p->pref = 0;
				470	}
				471
				472	/*
				473	* Check that the poller works.
				474	* Returns 1 if OK, otherwise 0.
				475	*/
				476	REGPRM1 static int _do_test(struct poller *p)
				477	{
				478	int fd;
				479
				480	fd = epoll_create(global.maxsock + 1);
				481	if (fd < 0)
				482	return 0;
				483	close(fd);
				484	return 1;
				485	}
				486
				487	/*
				488	* It is a constructor, which means that it will automatically be called before
				489	* main(). This is GCC-specific but it works at least since 2.95.
				490	* Special care must be taken so that it does not need any uninitialized data.
				491	*/
				492	__attribute__((constructor))
				493	static void _do_register(void)
				494	{
				495	struct poller *p;
				496
				497	if (nbpollers >= MAX_POLLERS)
				498	return;
				499	p = &pollers[nbpollers++];
				500
				501	p->name = "sepoll";
				502	p->pref = 400;
				503	p->private = NULL;
				504
				505	p->test = _do_test;
				506	p->init = _do_init;
				507	p->term = _do_term;
				508	p->poll = _do_poll;
				509
				510	p->is_set = __fd_is_set;
				511	p->cond_s = p->set = __fd_set;
				512	p->cond_c = p->clr = __fd_clr;
				513	p->rem = __fd_rem;
				514	p->clo = __fd_clo;
				515	}
				516
				517
				518	/*
				519	* Local variables:
				520	* c-indent-level: 8
				521	* c-basic-offset: 8
				522	* End:
				523	*/