From 5f5ed5102c5a36ff16aeddb2aab01b51c75d5dc5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 21 Sep 2021 02:19:46 +0200
Subject: doc: add cluster match script

This patch adds a script (from 2010!) to set up an active-active
setup with the cluster match.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 doc/misc/README       | 187 +++++++++++++++++++++++++++++++++++++
 doc/misc/clusterip.sh | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 441 insertions(+)
 create mode 100644 doc/misc/README
 create mode 100644 doc/misc/clusterip.sh

diff --git a/doc/misc/README b/doc/misc/README
new file mode 100644
index 0000000..7d0a1ae
--- /dev/null
+++ b/doc/misc/README
@@ -0,0 +1,187 @@
+= Setting up active-active load-sharing hash-based stateful firewall =
+	by Pablo Neira Ayuso <pablo@netfilter.org> in 2010
+
+If you want to know more about this configuration and other firewall
+architectures, please read:
+
+* Demystifying cluster-based fault-tolerant firewalls.
+  IEEE Internet Computing, 13(6):31-38, December 2009.
+  Available at: https://perso.ens-lyon.fr/laurent.lefevre/pdf/IC2009_Neira_Gasca_Lefevre.pdf
+
+== 0x0 intro ==
+
+Under this directory you can find a script that allows you to set up a simple
+active-active hash-based load-sharing firewall cluster based on the iptables
+cluster match.
+
+== 0x1 testbed ==
+
+My testbed looks like the following:
+
+                ---------- eth1        eth2  ----------
+ client A ------|        |--- firewall 1 ----|        |
+ (192.168.0.2)  | switch | (.0.5)    (.1.5)  | switch |--- server
+                |        |                   |        |   (192.168.1.2)
+ client B ------|        |--- firewall 2 ----|        |
+ (192.168.0.11) ---------- (.0.5)    (.1.5)  ----------
+                            eth1      eth2
+
+The firewalls perform SNAT to masquerade clients. Note that both cluster
+firewalls have the same IP addresses. For administrative purposes, it is
+a good idea that each firewall has its own IP address to SSH into it; make
+sure you add the appropriate rule to skip the cluster match rule-set!
+More comments: although the picture shows two switches, I'm actually
+using one and I separated the clients and the server in two different
+VLANs.
+
+The script also sets a multicast MAC address that is the same for both
+firewalls so that the switch floods the same packets to both firewalls.
+Using a multicast MAC address is an RFC violation [1], since network nodes
+must not include multicast MAC addresses in ARP replies, but:
+
+ a) it is the only way I found so far to obtain the behaviour from my
+    HP procurve switches.
+
+ b) the VRRP MAC address range is not supported appropriately by switch
+    vendors, at least by my HP procurve switches. If switch vendors
+    support this MAC address range appropriately, they will handle them
+    as multicast MAC addresses. As of 2011 I did not find any switch handling
+    VRRP MAC address range as multicast ports (they still handle them as
+    normal unicast MAC addresses, therefore my solution does not work with
+    two nodes with the same VRRP MAC address).
+
+The cluster match relies upon the Connection Tracking System (conntrack).
+Thus, traffic coming in the reply direction which does not belong to this node
+is labeled as INVALID for TCP and ICMP protocols. The scripts add a rule to
+drop this traffic to avoid possible packet duplication. For UDP traffic,
+you will have to add a rule to drop NEW traffic in the reply direction
+because conntrack considers it valid. If you don't do this, both nodes
+may accept reply traffic, thus, sending duplicated packets to the client,
+which is not what you want.
+
+During my last experiments, I was using the Linux kernel 2.6.37 in the
+firewalls and the server. Everything you need to set up this configuration
+is available in stock Linux kernels. No external patches with new features
+are required.
+
+== 0x2 running scripts ==
+
+Copy the script to each node, then adjust the script variables to your
+configuration.
+
+On firewall 1:
+firewall1# ./clusterip-node1.sh start
+
+On firewall 2:
+firewall2# ./clusterip-node2.sh start
+
+== 0x3 trouble-shooting ==
+
+Some troubleshooting may help to understand how this setup works. Check
+the following if you experience problems:
+
+1) Check that multicast MAC addresses are assigned to the NICs:
+
+firewall1$ ip maddr
+[...]
+2:      eth1
+[...]
+        link  01:00:5e:00:01:01 static
+3:      eth2
+[...]
+        link  01:00:5e:00:01:02 static
+
+The scripts add the multicast MAC addresses to the NICs, if this
+is not done the traffic will be discarded by the firewalls'
+networking stack.
+
+2) ICMP ping the server from one of the clients:
+
+client$ ping -c 1 192.168.1.2
+PING 192.168.1.2 (192.168.1.2) 56(84) bytes of data.
+64 bytes from 192.168.1.2: icmp_seq=1 ttl=63 time=0.220 ms
+
+--- 192.168.1.2 ping statistics ---
+1 packets transmitted, 1 received, 0% packet loss, time 0ms
+rtt min/avg/max/mdev = 0.220/0.220/0.220/0.000 ms
+
+If this does not work, make sure the firewalls are including the
+multicast MAC address in their ARP replies, you can check this
+by looking at the neighbour cache:
+
+client$ ip neighbour
+[...]
+192.168.0.5 dev eth1 lladdr 01:00:5e:00:01:01 REACHABLE
+
+server$ ip neighbour
+[...]
+192.168.1.5 dev eth1 lladdr 01:00:5e:00:01:02 REACHABLE
+
+firewall$ ip neighbour
+[...]
+192.168.0.5 dev eth1 lladdr 01:00:5e:00:01:01 REACHABLE
+192.168.1.5 dev eth2 lladdr 01:00:5e:00:01:02 REACHABLE
+
+3) Test TCP connections: you can use netcat to start simple connections
+between the client and the server.
+
+You can also use intensive HTTP traffic generation to test performance
+like injectX.c and httpterm from Willy Tarreau:
+
+http://1wt.eu/tools/inject/
+http://1wt.eu/tools/httpterm/
+
+clientA:~/http-client-benchmark# ./client -t 60 -u 200 -G 192.168.1.2:8000
+#  hits  hits/s  ^h/s  ^bytes   kB/s  errs   rst  tout  mhtime
+ 266926 26692 26766   3881270   3779     0     0     0   0.237
+ 294067 26733 27141   3935621   3785     0     0     0   0.176
+
+clientB~/http-client-benchmark# ./client -t 30 -u 40 -G 192.168.1.2:8020
+#  hits  hits/s  ^h/s  ^bytes   kB/s  errs   rst  tout  mhtime
+  53250 17750 17368   2518448   2513     0     0     0   0.240
+  70766 17691 17516   2539907   2505     0     0     0   0.297
+
+^h/s is the current number of HTTP requests per second. This means
+that you get ~45000 HTTP requests per second. In my setup, with only
+one firewall active I get ~27000 HTTP requests per second. We obtain
+extra performance of ~66%, not that bad 8-).
+
+I have configured httpterm to send objects of 0 bytes over HTTP
+to obtain the maximum number of HTTP flows. This is the worst case
+scenario in firewall load.
+
+I forgot to mention that I set CPU affinity for NICs IRQs. I've got
+two cores, one for each firewall NIC.
+
+== 0x4 report successful setups ==
+
+My testbed is composed of low-cost basic five years old HP proliant
+systems, you can see that the numbers are not great. I like knowing
+about numbers, I'd appreciate if you drop me a line to tell me the
+numbers that you get and your experience.
+
+== 0x5 conclusions and future works ==
+
+The cluster match allows you to set up load-sharing hash-based stateful
+firewalls, which is a way to avoid having a spare backup firewall as
+it happens in classical Primary-Backup setups.
+
+Still, there is some pending work to fully integrate conntrackd and HA
+managers with it (in case that you want high availability, of course).
+
+-o-
+
+[1] More specifically, it's an RFC 1812 (section 3.3.2) violation.
+It's been reported that this is a problem for CISCO routers:
+http://marc.info/?l=netfilter&m=128810399113170&w=2
+
+Michele Codutti: "The problem is the multicast MAC address that these
+routers doesn't "like". They discard any incoming packet with MAC
+multicast address to be compliant with RFC1812. The only documented
+(by Cisco) workaround is to put a fixed arp entry with the multicast
+address that maps the clustered IP in the router."
+
+If you keep reading the mailing thread, the reported problem affected
+Cisco 7200 VXR.
+
+--02/02/2010
diff --git a/doc/misc/clusterip.sh b/doc/misc/clusterip.sh
new file mode 100644
index 0000000..911f676
--- /dev/null
+++ b/doc/misc/clusterip.sh
@@ -0,0 +1,254 @@
+#!/bin/sh
+
+#
+# (C) 2009-2011 by Pablo Neira Ayuso <pneira@us.es>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+
+#
+# Here, you can find the variables that you have to change.
+#
+
+# enable this for debugging
+LOG_DEBUG=0
+
+# number of this cluster node (must be unique, from 1 to N cluster nodes)
+NODE=1
+
+# this is the real MAC address of eth1
+REAL_HWADDR1=00:18:71:68:f2:34
+
+# this is the real MAC address of eth2
+REAL_HWADDR2=00:11:0a:60:e7:32
+
+#
+# These variables MUST have the same values in both cluster nodes
+#
+
+# number of nodes that belong to this cluster
+TOTAL_NODES=2
+
+# this is the cluster multicast MAC address of eth1
+MC_HWADDR1=01:00:5e:00:01:01
+
+# this is the cluster multicast MAC address of eth2
+MC_HWADDR2=01:00:5e:00:01:02
+
+# cluster IP address of eth1
+ADDR1=192.168.0.5/24
+
+# cluster IP address of eth2
+ADDR2=192.168.1.5/24
+
+# random seed for hashing
+SEED=0xdeadbeef
+
+start_cluster_address()
+{
+	# assign the cluster IP addresses to both NICs (ip a a = ip addr add)
+	ip a a $ADDR1 dev eth1
+	ip a a $ADDR2 dev eth2
+	# make both NICs accept frames sent to the cluster multicast MACs
+	ip maddr add $MC_HWADDR1 dev eth1
+	ip maddr add $MC_HWADDR2 dev eth2
+	# mangle outgoing ARP (no opcode filter) to carry the cluster multicast MAC
+	arptables -I OUTPUT -o eth1 --h-length 6 \
+		-j mangle --mangle-mac-s $MC_HWADDR1
+	# mangle incoming ARP sent to the multicast MAC to use the real MAC
+	# (otherwise the stack drops it); the eth2 rules below mirror eth1.
+	arptables -I INPUT -i eth1 --h-length 6 --destination-mac \
+		$MC_HWADDR1 -j mangle --mangle-mac-d $REAL_HWADDR1
+	arptables -I OUTPUT -o eth2 --h-length 6 \
+		-j mangle --mangle-mac-s $MC_HWADDR2
+	arptables -I INPUT -i eth2 --h-length 6 --destination-mac \
+		$MC_HWADDR2 -j mangle --mangle-mac-d $REAL_HWADDR2
+}
+
+stop_cluster_address()
+{
+	# remove the cluster IP addresses from both NICs (ip a d = ip addr del)
+	ip a d $ADDR1 dev eth1
+	ip a d $ADDR2 dev eth2
+	# stop accepting frames sent to the cluster multicast MAC addresses
+	ip maddr del $MC_HWADDR1 dev eth1
+	ip maddr del $MC_HWADDR2 dev eth2
+	# delete the outgoing ARP mangling rule (eth1), same spec as on start
+	arptables -D OUTPUT -o eth1 --h-length 6 \
+		-j mangle --mangle-mac-s $MC_HWADDR1
+	# delete the incoming ARP mangling rule (eth1), then both eth2 rules
+	arptables -D INPUT -i eth1 --h-length 6 --destination-mac \
+		$MC_HWADDR1 -j mangle --mangle-mac-d $REAL_HWADDR1
+	arptables -D OUTPUT -o eth2 --h-length 6 \
+		-j mangle --mangle-mac-s $MC_HWADDR2
+	arptables -D INPUT -i eth2 --h-length 6 --destination-mac \
+		$MC_HWADDR2 -j mangle --mangle-mac-d $REAL_HWADDR2
+}
+
+start_nat()
+{
+	iptables -A POSTROUTING -t nat -s 192.168.0.11 \
+		-j SNAT --to-source 192.168.1.5
+	iptables -A POSTROUTING -t nat -s 192.168.0.2 \
+		-j SNAT --to-source 192.168.1.5
+}
+
+stop_nat()
+{
+	iptables -D POSTROUTING -t nat -s 192.168.0.11 \
+		-j SNAT --to-source 192.168.1.5
+	iptables -D POSTROUTING -t nat -s 192.168.0.2 \
+		-j SNAT --to-source 192.168.1.5
+}
+
+iptables_start_cluster_rules()
+{
+	# $1 = cluster node id to match; mark packets hashed to it (go direction)
+	iptables -A CLUSTER-RULES -t mangle -i eth1 -m cluster \
+		--cluster-total-nodes $TOTAL_NODES --cluster-local-node $1 \
+		--cluster-hash-seed $SEED -j MARK --set-mark 0xffff
+
+	# mark packets hashed to node $1 as well (reply direction)
+	# note: we *do* need this to change the packet type to PACKET_HOST,
+	# otherwise the stack silently drops the packet.
+	iptables -A CLUSTER-RULES -t mangle -i eth2 -m cluster \
+		--cluster-total-nodes $TOTAL_NODES --cluster-local-node $1 \
+		--cluster-hash-seed $SEED -j MARK --set-mark 0xffff
+}
+
+iptables_stop_cluster_rules()
+{
+	iptables -D CLUSTER-RULES -t mangle -i eth1 -m cluster \
+		--cluster-total-nodes $TOTAL_NODES --cluster-local-node $1 \
+		--cluster-hash-seed $SEED -j MARK --set-mark 0xffff
+	# above: go direction (eth1); below: reply direction (eth2); $1 = node id
+	iptables -D CLUSTER-RULES -t mangle -i eth2 -m cluster \
+		--cluster-total-nodes $TOTAL_NODES --cluster-local-node $1 \
+		--cluster-hash-seed $SEED -j MARK --set-mark 0xffff
+}
+
+start_cluster_ruleset() {
+	iptables -N CLUSTER-RULES -t mangle
+	# populate the new chain with the mark rules for this node ($NODE)
+	iptables_start_cluster_rules $NODE
+	# send every packet through the chain from mangle/PREROUTING
+	iptables -A PREROUTING -t mangle -j CLUSTER-RULES
+	# with LOG_DEBUG=1, log marked/unmarked packets ahead of the drop rules
+	if [ $LOG_DEBUG -eq 1 ]
+	then
+		iptables -A PREROUTING -t mangle -i eth1 -m mark \
+			--mark 0xffff -j LOG --log-prefix "cluster-accept: "
+		iptables -A PREROUTING -t mangle -i eth1 -m mark \
+			! --mark 0xffff -j LOG --log-prefix "cluster-drop: "
+		iptables -A PREROUTING -t mangle -i eth2 -m mark \
+			--mark 0xffff \
+			-j LOG --log-prefix "cluster-reply-accept: "
+		iptables -A PREROUTING -t mangle -i eth2 -m mark \
+			! --mark 0xffff \
+			-j LOG --log-prefix "cluster-reply-drop: "
+	fi
+
+	# drop unmarked packets, i.e. those not hashed to us (go direction)
+	iptables -A PREROUTING -t mangle -i eth1 -m mark \
+		! --mark 0xffff -j DROP
+
+	# drop unmarked packets, i.e. those not hashed to us (reply direction)
+	iptables -A PREROUTING -t mangle -i eth2 -m mark \
+		! --mark 0xffff -j DROP
+}
+
+stop_cluster_ruleset() {
+	iptables -D PREROUTING -t mangle -j CLUSTER-RULES
+	# the LOG rules were only added if LOG_DEBUG was 1 when start ran
+	if [ $LOG_DEBUG -eq 1 ]
+	then
+		iptables -D PREROUTING -t mangle -i eth1 -m mark \
+			--mark 0xffff -j LOG --log-prefix "cluster-accept: "
+		iptables -D PREROUTING -t mangle -i eth1 -m mark \
+			! --mark 0xffff -j LOG --log-prefix "cluster-drop: "
+		iptables -D PREROUTING -t mangle -i eth2 -m mark \
+			--mark 0xffff \
+			-j LOG --log-prefix "cluster-reply-accept: "
+		iptables -D PREROUTING -t mangle -i eth2 -m mark \
+			! --mark 0xffff \
+			-j LOG --log-prefix "cluster-reply-drop: "
+	fi
+	# remove the drop rules for unmarked packets (eth1, then eth2)
+	iptables -D PREROUTING -t mangle -i eth1 -m mark \
+		! --mark 0xffff -j DROP
+
+	iptables -D PREROUTING -t mangle -i eth2 -m mark \
+		! --mark 0xffff -j DROP
+	# remove this node's mark rules, then flush and delete the chain
+	iptables_stop_cluster_rules $NODE
+
+	iptables -F CLUSTER-RULES -t mangle
+	iptables -X CLUSTER-RULES -t mangle
+}
+
+# Action dispatch. $CONNTRACKD_SCRIPT is optional: it is quoted below so that
+# an unset value fails the -x test instead of degenerating to a true "[ -x ]".
+case "$1" in
+start)
+	echo "starting cluster configuration for node $NODE."
+	# just in case that you forget it
+	echo 1 > /proc/sys/net/ipv4/ip_forward
+
+	# disable TCP pickup
+	echo 0 > /proc/sys/net/ipv4/netfilter/ip_conntrack_tcp_be_liberal
+	echo 0 > /proc/sys/net/ipv4/netfilter/ip_conntrack_tcp_loose
+
+	start_cluster_address
+	start_nat
+
+	# drop invalid flows from eth2 (not allowed). This is mandatory
+	# because traffic which does not belong to this node is always
+	# labeled as INVALID by TCP and ICMP state tracking. For protocols like
+	# UDP, you will have to drop NEW traffic from eth2, otherwise reply
+	# traffic may be accepted by both nodes, thus duplicating the traffic.
+	iptables -A PREROUTING -t mangle -i eth2 \
+		-m state --state INVALID -j DROP
+
+	start_cluster_ruleset
+	;;
+stop)
+	echo "stopping cluster configuration for node $NODE."
+	stop_cluster_address
+	stop_nat
+
+	iptables -D PREROUTING -t mangle -i eth2 \
+		-m state --state INVALID -j DROP
+
+	stop_cluster_ruleset
+	;;
+primary)
+	# $2: id of the node whose flows this node starts marking as its own
+	logger "cluster-match-script: entering MASTER state for node $2"
+	if [ -x "$CONNTRACKD_SCRIPT" ]; then
+		sh "$CONNTRACKD_SCRIPT" primary $NODE $2
+	fi
+	iptables_start_cluster_rules $2
+	;;
+backup)
+	# $2: id of the node whose flows this node stops marking
+	logger "cluster-match-script: entering BACKUP state for node $2"
+	if [ -x "$CONNTRACKD_SCRIPT" ]; then
+		sh "$CONNTRACKD_SCRIPT" backup $NODE $2
+	fi
+	iptables_stop_cluster_rules $2
+	;;
+fault)
+	# same rule removal as backup, only logged/reported as a fault
+	logger "cluster-match-script: entering FAULT state for node $2"
+	if [ -x "$CONNTRACKD_SCRIPT" ]; then
+		sh "$CONNTRACKD_SCRIPT" fault $NODE $2
+	fi
+	iptables_stop_cluster_rules $2
+	;;
+*)
+	echo "$0 start|stop|primary|backup|fault [nodeid]"
+	;;
+esac
-- 
cgit v1.2.3