MEDIUM: queue: move the queue lock manipulation to pendconn_process_next_strm()

By taking the queue locks there, it becomes possible to lock the proxy's
queue later and to unlock it earlier. The server's queue is also unlocked
slightly earlier.

The performance on roundrobin increases from 481k to 524k req/s on 16
threads. Leastconn shows about 513k req/s (the difference being the
take_conn() call).

The performance profile changes from this:
   9.32%  hap-pxok            [.] process_srv_queue
   7.56%  hap-pxok            [.] pendconn_dequeue
   6.90%  hap-pxok            [.] pendconn_add

to this:
   7.42%  haproxy             [.] process_srv_queue
   5.61%  haproxy             [.] pendconn_dequeue
   4.95%  haproxy             [.] pendconn_add
diff --git a/src/queue.c b/src/queue.c
index 5c9db17..8db2da2 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -255,8 +255,8 @@
  *
  * The proxy's queue will be consulted only if px_ok is non-zero.
  *
- * This function must only be called if the server queue _AND_ the proxy queue
- * are locked (if pk_ok is set). Today it is only called by process_srv_queue.
+ * This function takes and releases the server and proxy queue locks itself, so
+ * the caller must not hold them. Today it is only called by process_srv_queue.
  *
  * The function returns the dequeued pendconn on success or NULL if none is
  * available. It's up to the caller to add the corresponding stream to the
@@ -270,15 +270,20 @@
 	u32 pkey, ppkey;
 
 	p = NULL;
+	HA_SPIN_LOCK(QUEUE_LOCK, &srv->queue.lock);
 	if (srv->queue.length)
 		p = pendconn_first(&srv->queue.head);
 
 	pp = NULL;
+	HA_SPIN_LOCK(QUEUE_LOCK, &px->queue.lock);
 	if (px_ok && px->queue.length)
 		pp = pendconn_first(&px->queue.head);
 
-	if (!p && !pp)
+	if (!p && !pp) {
+		HA_SPIN_UNLOCK(QUEUE_LOCK, &px->queue.lock);
+		HA_SPIN_UNLOCK(QUEUE_LOCK, &srv->queue.lock);
 		return NULL;
+	}
 	else if (!pp)
 		goto use_p; /*  p != NULL */
 	else if (!p)
@@ -307,13 +312,17 @@
  use_pp:
 	/* Let's switch from the server pendconn to the proxy pendconn */
 	__pendconn_unlink_prx(pp);
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &px->queue.lock);
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &srv->queue.lock);
 	_HA_ATOMIC_INC(&px->queue.idx);
 	_HA_ATOMIC_DEC(&px->queue.length);
 	_HA_ATOMIC_DEC(&px->totpend);
 	p = pp;
 	goto unlinked;
  use_p:
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &px->queue.lock);
 	__pendconn_unlink_srv(p);
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &srv->queue.lock);
 	_HA_ATOMIC_INC(&srv->queue.idx);
 	_HA_ATOMIC_DEC(&srv->queue.length);
 	_HA_ATOMIC_DEC(&px->totpend);
@@ -347,14 +356,8 @@
 	while (s->served < maxconn) {
 		struct pendconn *pc;
 
-		HA_SPIN_LOCK(QUEUE_LOCK, &s->queue.lock);
-		HA_SPIN_LOCK(QUEUE_LOCK, &p->queue.lock);
-
 		pc = pendconn_process_next_strm(s, p, px_ok);
 
-		HA_SPIN_UNLOCK(QUEUE_LOCK, &p->queue.lock);
-		HA_SPIN_UNLOCK(QUEUE_LOCK, &s->queue.lock);
-
 		if (!pc)
 			break;