[MEDIUM] Decrease server health based on http responses / events, version 3
Implement decreasing health based on observing communication between
HAProxy and servers.
Changes in this version 2:
- documentation
- close race between a started check and health analysis event
- don't force fastinter if it is not set
- better names for options
- layer4 support
Changes in this version 3:
- add stats
- port to the current 1.4 tree
diff --git a/include/common/defaults.h b/include/common/defaults.h
index b0aee86..cada729 100644
--- a/include/common/defaults.h
+++ b/include/common/defaults.h
@@ -120,6 +120,9 @@
#define DEF_CHECK_REQ "OPTIONS / HTTP/1.0\r\n\r\n"
#define DEF_SMTP_CHECK_REQ "HELO localhost\r\n"
+#define DEF_HANA_ONERR HANA_ONERR_FAILCHK
+#define DEF_HANA_ERRLIMIT 10
+
// X-Forwarded-For header default
#define DEF_XFORWARDFOR_HDR "X-Forwarded-For"
diff --git a/include/proto/checks.h b/include/proto/checks.h
index bd70164..e3fe7ac 100644
--- a/include/proto/checks.h
+++ b/include/proto/checks.h
@@ -29,6 +29,7 @@
const char *get_check_status_info(short check_status);
struct task *process_chk(struct task *t);
int start_checks();
+void health_adjust(struct server *s, short status);
#endif /* _PROTO_CHECKS_H */
diff --git a/include/types/checks.h b/include/types/checks.h
index 1b04608..75e32b6 100644
--- a/include/types/checks.h
+++ b/include/types/checks.h
@@ -18,6 +18,9 @@
/* Below we have finished checks */
HCHK_STATUS_CHECKED, /* DUMMY STATUS */
+
+ HCHK_STATUS_HANA, /* Healt analyze detected enough consecutive errors */
+
HCHK_STATUS_SOCKERR, /* Socket error */
HCHK_STATUS_L4OK, /* L4 check passed, for example tcp connect */
@@ -41,8 +44,51 @@
HCHK_STATUS_SIZE
};
+
+/* health status for response tracking */
+enum {
+ HANA_STATUS_UNKNOWN = 0,
+
+ HANA_STATUS_L4_OK, /* L4 successful connection */
+ HANA_STATUS_L4_ERR, /* L4 unsuccessful connection */
+
+ HANA_STATUS_HTTP_OK, /* Correct http response */
+ HANA_STATUS_HTTP_STS, /* Wrong http response, for example HTTP 5xx */
+ HANA_STATUS_HTTP_HDRRSP, /* Invalid http response (headers) */
+ HANA_STATUS_HTTP_RSP, /* Invalid http response */
+
+ HANA_STATUS_HTTP_READ_ERROR, /* Read error */
+ HANA_STATUS_HTTP_READ_TIMEOUT, /* Read timeout */
+ HANA_STATUS_HTTP_BROKEN_PIPE, /* Unexpected close from server */
+
+ HANA_STATUS_SIZE
+};
+
+enum {
+ HANA_ONERR_UNKNOWN = 0,
+
+ HANA_ONERR_FASTINTER, /* Force fastinter*/
+ HANA_ONERR_FAILCHK, /* Simulate a failed check */
+ HANA_ONERR_SUDDTH, /* Enters sudden death - one more failed check will mark this server down */
+ HANA_ONERR_MARKDWN, /* Mark this server down, now! */
+};
+
+enum {
+ HANA_OBS_NONE = 0,
+
+ HANA_OBS_LAYER4, /* Observe L4 - for example tcp */
+ HANA_OBS_LAYER7, /* Observe L7 - for example http */
+
+ HANA_OBS_SIZE
+};
+
struct check_status {
short result; /* one of SRV_CHK_* */
char *info; /* human readable short info */
char *desc; /* long description */
};
+
+struct analyze_status {
+ char *desc; /* description */
+ unsigned char lr[HANA_OBS_SIZE]; /* result for l4/l7: 0 = ignore, 1 - error, 2 - OK */
+};
diff --git a/include/types/counters.h b/include/types/counters.h
index f551bf0..fa648a3 100644
--- a/include/types/counters.h
+++ b/include/types/counters.h
@@ -81,7 +81,8 @@
} http;
} p;
- long long failed_checks, down_trans; /* failed checks and up->down transitions */
+ long long failed_checks, failed_hana; /* failed health checks and health analyses */
+ long long down_trans; /* up->down transitions */
};
#endif /* _TYPES_COUNTERS_H */
diff --git a/include/types/server.h b/include/types/server.h
index 935992d..745a94f 100644
--- a/include/types/server.h
+++ b/include/types/server.h
@@ -115,7 +115,10 @@
struct sockaddr_in check_addr; /* the address to check, if different from <addr> */
short check_port; /* the port to use for the health checks */
int health; /* 0->rise-1 = bad; rise->rise+fall-1 = good */
+ int consecutive_errors; /* current number of consecutive errors */
int rise, fall; /* time in iterations */
+ int consecutive_errors_limit; /* number of consecutive errors that triggers an event */
+ short observe, onerror; /* observing mode: one of HANA_OBS_*; what to do on error: on of ANA_ONERR_* */
int inter, fastinter, downinter; /* checks: time in milliseconds */
int slowstart; /* slowstart time in seconds (ms in the conf) */
int result; /* health-check result : SRV_CHK_* */
@@ -137,9 +140,9 @@
unsigned down_time; /* total time the server was down */
time_t last_change; /* last time, when the state was changed */
struct timeval check_start; /* last health check start time */
- unsigned long check_duration; /* time in ms took to finish last health check */
+ long check_duration; /* time in ms took to finish last health check */
short check_status, check_code; /* check result, check code */
- char check_desc[HCHK_DESC_LEN]; /* healt check descritpion */
+ char check_desc[HCHK_DESC_LEN]; /* health check descritpion */
struct freq_ctr sess_per_sec; /* sessions per second on this server */
int puid; /* proxy-unique server ID, used for SNMP */