summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPablo Neira Ayuso <pablo@netfilter.org>2009-01-15 23:19:58 +0100
committerPablo Neira Ayuso <pablo@netfilter.org>2009-01-15 23:19:58 +0100
commit2cacd3a802510bde43e23cf4c7d39f51a2eaf460 (patch)
tree0f14343829df1fee20549a22544060b21587a841
parent8dce3504fde7da933dc6e7ecfeb99b4b45125f32 (diff)
run: relax resynchronization algorithm when netlink overruns
This patch relaxes the current approach when netlink reports overruns. There are two situations that can trigger a resynchronization with the kernel conntrack table: a) Netlink overruns because the receiver buffer is too small: we increase the netlink buffer size and schedule a resync with the kernel conntrack table to resolve the inconsistency. The sysadmin would notice this in the logs and can set a bigger buffer in the configuration file. b) The system is under heavy workload (CPU is too busy): we should avoid resyncing with the kernel table since this is an expensive operation. We do our best here and keep replicating as many states as possible. If CPU consumption lowers at some point, then we will try to resync ourselves. This patch reduces the chances of resynchronizing with the kernel conntrack table: a resync is only performed when two overruns do not happen within an interval of 30 seconds. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
-rw-r--r--src/run.c38
1 files changed, 29 insertions, 9 deletions
diff --git a/src/run.c b/src/run.c
index caf0b38..2e373ce 100644
--- a/src/run.c
+++ b/src/run.c
@@ -207,7 +207,7 @@ void local_handler(int fd, void *data)
static void do_overrun_alarm(struct alarm_block *a, void *data)
{
nl_overrun_request_resync(STATE(overrun));
- add_alarm(&STATE(overrun_alarm), 2, 0);
+ STATE(stats).nl_kernel_table_resync++;
}
static int event_handler(enum nf_conntrack_msg_type type,
@@ -378,6 +378,9 @@ init(void)
return 0;
}
+/* interval of 30 seconds between two overruns */
+#define OVRUN_INT 30
+
static void __run(struct timeval *next_alarm)
{
int ret;
@@ -406,15 +409,33 @@ static void __run(struct timeval *next_alarm)
if (ret == -1) {
switch(errno) {
case ENOBUFS:
- /*
- * It seems that ctnetlink can't back off,
- * it's likely that we're losing events.
- * Solution: duplicate the socket buffer
- * size and resync with master conntrack table.
+ /* We have hit ENOBUFS, it's likely that we are
+ * losing events. Two possible situations may
+ * trigger this error:
+ *
+ * 1) The netlink receiver buffer is too small:
+ * increasing the netlink buffer size should
+ * be enough. However, some event messages
+ * got lost. We have to resync ourselves
+ * with the kernel conntrack table to
+ * resolve the inconsistency.
+ *
+ * 2) The receiver is too slow to process the
+ * netlink messages so that the queue gets
+ * full quickly. This generally happens
+ * if the system is under heavy workload
+ * (busy CPU). In this case, increasing the
+ * size of the netlink receiver buffer
+ * would not help anymore since we would
+ * be delaying the overrun. Moreover, we
+ * should avoid resynchronizations. We
+ * should do our best here and keep
+ * replicating as many states as possible.
+ * If workload lowers at some point,
+ * we resync ourselves.
*/
nl_resize_socket_buffer(STATE(event));
- nl_overrun_request_resync(STATE(overrun));
- add_alarm(&STATE(overrun_alarm), 2, 0);
+ add_alarm(&STATE(overrun_alarm), OVRUN_INT, 0);
STATE(stats).nl_catch_event_failed++;
STATE(stats).nl_overrun++;
break;
@@ -435,7 +456,6 @@ static void __run(struct timeval *next_alarm)
}
if (FD_ISSET(nfct_fd(STATE(overrun)), &readfds)) {
- del_alarm(&STATE(overrun_alarm));
nfct_catch(STATE(overrun));
if (STATE(mode)->purge)
STATE(mode)->purge();