From 5f5ed5102c5a36ff16aeddb2aab01b51c75d5dc5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 21 Sep 2021 02:19:46 +0200 Subject: doc: add cluster match script This patch adds a script (from 2010!) to set up an active-active setup with the cluster match. Signed-off-by: Pablo Neira Ayuso --- doc/misc/README | 187 +++++++++++++++++++++++++++++++++++++ doc/misc/clusterip.sh | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 441 insertions(+) create mode 100644 doc/misc/README create mode 100644 doc/misc/clusterip.sh diff --git a/doc/misc/README b/doc/misc/README new file mode 100644 index 0000000..7d0a1ae --- /dev/null +++ b/doc/misc/README @@ -0,0 +1,187 @@ += Setting up active-active load-sharing hash-based stateful firewall = + by Pablo Neira Ayuso in 2010 + +If you want to know more about this configuration and other firewall +architectures, please read: + +* Demystifying cluster-based fault-tolerant firewalls. + IEEE Internet Computing, 13(6):31-38, December 2009. + Available at: https://perso.ens-lyon.fr/laurent.lefevre/pdf/IC2009_Neira_Gasca_Lefevre.pdf + +== 0x0 intro == + +Under this directory you can find a script that allows you to setup a simple +active-active hash-based load-sharing firewall cluster based on the iptables' +cluster match. + +== 0x1 testbed == + +My testbed looks like the following: + + ---------- eth1 eth2 ---------- + client A ------| |--- firewall 1 ----| | + (192.168.0.2) | switch | (.0.5) (.1.5) | switch |--- server + | | | | (192.168.1.2) + client B ------| |--- firewall 2 ----| | + (192.168.0.11) ---------- (.0.5) (.1.5) ---------- + eth1 eth2 + +The firewalls perform SNAT to masquerade clients. Note that both cluster +firewalls have the same IP addresses. For administrative purposes, it is +a good idea that each firewall has its own IP address to SSH into them, make +sure you add the appropriate rule to skip the cluster match rule-set! 
+More comments: although the picture shows two switches, I'm actually +using one and I separated the clients and the server in two different +VLANs. + +The script also sets a multicast MAC address that is the same for both +firewalls so that the switch floods the same packets to both firewalls. +Using a multicast MAC address is an RFC violation [1], since a network node +must not include a multicast MAC address in ARP replies, but: + + a) it is the only way I found so far to obtain the desired behaviour from my + HP procurve switches. + + b) the VRRP MAC address range is not supported appropriately by switch + vendors, at least by my HP procurve switches. If switch vendors + support this MAC address range appropriately, they will handle them + as multicast MAC address. As of 2011 I did not find any switch handling + VRRP MAC address range as multicast ports (they still handle them as + normal unicast MAC addresses, therefore my solution does not work with + two nodes with the same VRRP MAC address). + +The cluster match relies upon the Connection Tracking System (conntrack). +Thus, traffic coming in the reply direction which does not belong to this node +is labeled as INVALID for TCP and ICMP protocols. The scripts add a rule to +drop this traffic to avoid possible packet duplication. For UDP traffic, +you will have to add a rule to drop NEW traffic in the reply direction +because conntrack considers it valid. If you don't do this, both nodes +may accept reply traffic, thus, sending duplicated packets to the client, +which is not what you want. + +During my last experiments, I was using the Linux kernel 2.6.37 in the +firewalls and the server. Everything you need to setup this configuration +is available in stock Linux kernels. No external patches with new features +are required. + +== 0x2 running scripts == + +Copy the script to each node, then adjust the script variables to your +configuration. 
+ +On firewall 1: +firewall1# ./clusterip-node1.sh start + +On firewall 2: +firewall2# ./clusterip-node2.sh start + +== 0x3 trouble-shooting == + +Some troubleshooting may help to understand how this setup works. Check +the following if you experience problems: + +1) Check that Multicast MAC addresses are assigned to the NICs: + +firewall1$ ip maddr +[...] +2: eth1 +[...] + link 01:00:5e:00:01:01 static +3: eth2 +[...] + link 01:00:5e:00:01:02 static + +The scripts add the multicast MAC addresses to the NICs, if this +is not done the traffic will be discarded by the firewalls' +networking stack. + +2) ICMP ping the server from one of the clients: + +client$ ping -c 1 192.168.1.2 +PING 192.168.1.2 (192.168.1.2) 56(84) bytes of data. +64 bytes from 192.168.1.2: icmp_seq=1 ttl=63 time=0.220 ms + +--- 192.168.1.2 ping statistics --- +1 packets transmitted, 1 received, 0% packet loss, time 0ms +rtt min/avg/max/mdev = 0.220/0.220/0.220/0.000 ms + +If this does not work, make sure the firewalls are including the +multicast MAC address in their ARP replies, you can check this +by looking at the neighbour cache: + +client$ ip neighbour +[...] +192.168.0.5 dev eth1 lladdr 01:00:5e:00:01:01 REACHABLE + +server$ ip neighbour +[...] +192.168.1.5 dev eth1 lladdr 01:00:5e:00:01:02 REACHABLE + +firewall$ ip neighbour +[...] +192.168.0.5 dev eth1 lladdr 01:00:5e:00:01:01 REACHABLE +192.168.1.5 dev eth2 lladdr 01:00:5e:00:01:02 REACHABLE + +3) Test TCP connections: you can use netcat to start simple connections +between the client and the server. 
+ +You can also use intensive HTTP traffic generation to test performance +like injectX.c and httpterm from Willy Tarreau: + +http://1wt.eu/tools/inject/ +http://1wt.eu/tools/httpterm/ + +clientA:~/http-client-benchmark# ./client -t 60 -u 200 -G 192.168.1.2:8000 +# hits hits/s ^h/s ^bytes kB/s errs rst tout mhtime + 266926 26692 26766 3881270 3779 0 0 0 0.237 + 294067 26733 27141 3935621 3785 0 0 0 0.176 + +clientB~/http-client-benchmark# ./client -t 30 -u 40 -G 192.168.1.2:8020 +# hits hits/s ^h/s ^bytes kB/s errs rst tout mhtime + 53250 17750 17368 2518448 2513 0 0 0 0.240 + 70766 17691 17516 2539907 2505 0 0 0 0.297 + +^h/s is the current number of HTTP petitions per second. This means +that you get ~45000 HTTP petitions per second. In my setup, with only +one firewall active I get ~27000 HTTP petitions per second. We obtain +extra performance of ~66%, not that bad 8-). + +I have configured httpterm to send objects of 0 bytes over HTTP +to obtain the maximum number of HTTP flows. This is the worst case +scenario in firewall load. + +I forgot to mention that I set CPU affinity for NICs IRQs. I've got +two cores, one for each firewall NIC. + +== 0x4 report successful setups == + +My testbed is composed of low-cost basic five years old HP proliant +systems, you can see that the numbers are not great. I like knowing +about numbers, I'd appreciate if you drop me a line to tell me the +numbers that you get and your experience. + +== 0x5 conclusions and future works == + +The cluster match allows to setup load-sharing hash-based stateful +firewalls that is a way to avoid having a spare backup firewall as +it happens in classical Primary-Backup setups. + +Still, there is some pending work to fully integrate conntrackd and HA +managers with it (in case that you want high availability, of course). + +-o- + +[1] More specifically, it's an RFC 1812 (section 3.3.2) violation. 
+It's been reported that this is a problem for CISCO routers: +http://marc.info/?l=netfilter&m=128810399113170&w=2 + +Michele Codutti: "The problem is the multicast MAC address that these +routers doesn't "like". They discard any incoming packet with MAC +multicast address to be compliant with RFC1812. The only documented +(by Cisco) workaround is to put a fixed arp entry with the multicast +address that maps the clustered IP in the router." + +If you keep reading the mailing thread, the reported problem affected +Cisco 7200 VXR. + +--02/02/2010 diff --git a/doc/misc/clusterip.sh b/doc/misc/clusterip.sh new file mode 100644 index 0000000..911f676 --- /dev/null +++ b/doc/misc/clusterip.sh @@ -0,0 +1,254 @@ +#!/bin/sh + +# +# (C) 2009-2011 by Pablo Neira Ayuso +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# + +# +# Here, you can find the variables that you have to change. 
#

# enable this for debugging
LOG_DEBUG=0

# number of cluster node (must be unique, from 1 to N cluster nodes)
NODE=1

# this is the real MAC address of eth1
REAL_HWADDR1=00:18:71:68:f2:34

# this is the real MAC address of eth2
REAL_HWADDR2=00:11:0a:60:e7:32

# optional: path of a helper script that is run as
#   sh $CONNTRACKD_SCRIPT <primary|backup|fault> $NODE <nodeid>
# on the primary/backup/fault state changes. Leave it empty to skip the
# helper. A value already present in the environment is honoured.
# NOTE(review): judging by the name this is meant to be a conntrackd
# state-change script -- confirm against your HA manager setup.
CONNTRACKD_SCRIPT=${CONNTRACKD_SCRIPT:-}

#
# These variables MUST have the same values in both cluster nodes
#

# number of nodes that belong to this cluster
TOTAL_NODES=2

# this is the cluster multicast MAC address of eth1
MC_HWADDR1=01:00:5e:00:01:01

# this is the cluster multicast MAC address of eth2
MC_HWADDR2=01:00:5e:00:01:02

# cluster IP address of eth1
ADDR1=192.168.0.5/24

# cluster IP address of eth2
ADDR2=192.168.1.5/24

# random seed for hashing
SEED=0xdeadbeef

# whitespace-separated list of client addresses that are source-NATed to the
# cluster IP address of eth2
NAT_CLIENTS="192.168.0.11 192.168.0.2"

#
# Helpers. Every rule is emitted by exactly one helper that takes the
# arptables/iptables action as first argument, so that the rules deleted on
# "stop" are always the same ones that were added on "start".
#

# Print the list of supported sub-commands to stderr.
usage()
{
  echo "$0 start|stop|primary|backup|fault [nodeid]" >&2
}

# Write a conntrack TCP sysctl.
#   $1: sysctl name without the nf_conntrack_/ip_conntrack_ prefix
#   $2: value to write
# Tries /proc/sys/net/netfilter/nf_conntrack_<name> first and falls back to
# the legacy /proc/sys/net/ipv4/netfilter/ip_conntrack_<name> location.
# Returns 1 (after printing a warning) if neither file is writable.
set_conntrack_sysctl()
{
  for _sysctl_path in \
    "/proc/sys/net/netfilter/nf_conntrack_$1" \
    "/proc/sys/net/ipv4/netfilter/ip_conntrack_$1"
  do
    if [ -w "$_sysctl_path" ]; then
      echo "$2" > "$_sysctl_path"
      return
    fi
  done
  echo "$0: warning: no writable conntrack sysctl found for $1" >&2
  return 1
}

# Run the optional helper script, if one is configured and executable.
#   $1: state (primary, backup or fault)
#   $2: node id the state change refers to
run_conntrackd_script()
{
  # The -n check matters: with an empty value a bare "[ -x ]" is true.
  if [ -n "$CONNTRACKD_SCRIPT" ] && [ -x "$CONNTRACKD_SCRIPT" ]; then
    sh "$CONNTRACKD_SCRIPT" "$1" "$NODE" "$2"
  fi
}

# Insert or delete the ARP mangling rules of both interfaces.
#   $1: arptables action (-I or -D)
cluster_arp_rules()
{
  # mangle ARP replies to include the cluster multicast MAC addresses
  arptables "$1" OUTPUT -o eth1 --h-length 6 \
    -j mangle --mangle-mac-s "$MC_HWADDR1"
  # mangle ARP request to use the original MAC address (otherwise the
  # stack drops this packet).
  arptables "$1" INPUT -i eth1 --h-length 6 --destination-mac \
    "$MC_HWADDR1" -j mangle --mangle-mac-d "$REAL_HWADDR1"
  arptables "$1" OUTPUT -o eth2 --h-length 6 \
    -j mangle --mangle-mac-s "$MC_HWADDR2"
  arptables "$1" INPUT -i eth2 --h-length 6 --destination-mac \
    "$MC_HWADDR2" -j mangle --mangle-mac-d "$REAL_HWADDR2"
}

# Add or delete one SNAT rule per client listed in NAT_CLIENTS.
#   $1: iptables action (-A or -D)
nat_rules()
{
  # NAT_CLIENTS is deliberately unquoted: it is a whitespace-separated list.
  # ${ADDR2%/*} is the eth2 cluster address without its prefix length.
  for _nat_client in $NAT_CLIENTS; do
    iptables "$1" POSTROUTING -t nat -s "$_nat_client" \
      -j SNAT --to-source "${ADDR2%/*}"
  done
}

# Add or delete the cluster match rules in the CLUSTER-RULES chain.
#   $1: iptables action (-A or -D)
#   $2: cluster node id to match
cluster_mark_rules()
{
  # eth1: mark packets that belong to this node (go direction)
  # eth2: mark packets that belong to this node (reply direction)
  # note: we *do* need the eth2 rule to change the packet type to
  # PACKET_HOST, otherwise the stack silently drops the packet.
  for _cluster_dev in eth1 eth2; do
    iptables "$1" CLUSTER-RULES -t mangle -i "$_cluster_dev" -m cluster \
      --cluster-total-nodes "$TOTAL_NODES" --cluster-local-node "$2" \
      --cluster-hash-seed "$SEED" -j MARK --set-mark 0xffff
  done
}

# Add or delete the PREROUTING rules that log (only if LOG_DEBUG is 1) and
# drop the packets that were not marked by the cluster match rules.
#   $1: iptables action (-A or -D)
cluster_prerouting_rules()
{
  if [ "$LOG_DEBUG" -eq 1 ]; then
    iptables "$1" PREROUTING -t mangle -i eth1 -m mark \
      --mark 0xffff -j LOG --log-prefix "cluster-accept: "
    iptables "$1" PREROUTING -t mangle -i eth1 -m mark \
      ! --mark 0xffff -j LOG --log-prefix "cluster-drop: "
    iptables "$1" PREROUTING -t mangle -i eth2 -m mark \
      --mark 0xffff \
      -j LOG --log-prefix "cluster-reply-accept: "
    iptables "$1" PREROUTING -t mangle -i eth2 -m mark \
      ! --mark 0xffff \
      -j LOG --log-prefix "cluster-reply-drop: "
  fi

  # drop packets that don't belong to us (go direction)
  iptables "$1" PREROUTING -t mangle -i eth1 -m mark \
    ! --mark 0xffff -j DROP

  # drop packets that don't belong to us (reply direction)
  iptables "$1" PREROUTING -t mangle -i eth2 -m mark \
    ! --mark 0xffff -j DROP
}

#
# Entry points (names and arguments unchanged).
#

# Configure the cluster IP and multicast MAC addresses on eth1/eth2 and
# install the ARP mangling rules.
start_cluster_address()
{
  # set cluster IP addresses
  ip a a "$ADDR1" dev eth1
  ip a a "$ADDR2" dev eth2
  # set cluster multicast MAC addresses
  ip maddr add "$MC_HWADDR1" dev eth1
  ip maddr add "$MC_HWADDR2" dev eth2
  cluster_arp_rules -I
}

# Undo start_cluster_address.
stop_cluster_address()
{
  # delete cluster IP addresses
  ip a d "$ADDR1" dev eth1
  ip a d "$ADDR2" dev eth2
  # delete cluster multicast MAC addresses
  ip maddr del "$MC_HWADDR1" dev eth1
  ip maddr del "$MC_HWADDR2" dev eth2
  cluster_arp_rules -D
}

# Add the SNAT rules for the clients.
start_nat()
{
  nat_rules -A
}

# Delete the SNAT rules for the clients.
stop_nat()
{
  nat_rules -D
}

# Add the cluster match rules for node id $1.
iptables_start_cluster_rules()
{
  cluster_mark_rules -A "$1"
}

# Delete the cluster match rules for node id $1.
iptables_stop_cluster_rules()
{
  cluster_mark_rules -D "$1"
}

# Create the CLUSTER-RULES chain for this node, hook it into PREROUTING and
# add the log/drop rules behind it.
start_cluster_ruleset()
{
  iptables -N CLUSTER-RULES -t mangle

  iptables_start_cluster_rules "$NODE"

  iptables -A PREROUTING -t mangle -j CLUSTER-RULES

  cluster_prerouting_rules -A
}

# Undo start_cluster_ruleset and remove the CLUSTER-RULES chain.
stop_cluster_ruleset()
{
  iptables -D PREROUTING -t mangle -j CLUSTER-RULES

  cluster_prerouting_rules -D

  iptables_stop_cluster_rules "$NODE"

  iptables -F CLUSTER-RULES -t mangle
  iptables -X CLUSTER-RULES -t mangle
}

# Dispatch on the sub-command.
#   $1: start | stop | primary | backup | fault
#   $2: node id (required for primary, backup and fault)
# Returns 1 after printing the usage on an unknown sub-command or a missing
# node id.
main()
{
  case "$1" in
  start)
    echo "starting cluster configuration for node $NODE."

    # just in case that you forget it
    echo 1 > /proc/sys/net/ipv4/ip_forward

    # disable TCP pickup
    set_conntrack_sysctl tcp_be_liberal 0
    set_conntrack_sysctl tcp_loose 0

    start_cluster_address
    start_nat

    # drop invalid flows from eth2 (not allowed). This is mandatory
    # because traffic which does not belong to this node is always
    # labeled as INVALID by TCP and ICMP state tracking. For protocols
    # like UDP, you will have to drop NEW traffic from eth2, otherwise
    # reply traffic may be accepted by both nodes, thus duplicating the
    # traffic.
    iptables -A PREROUTING -t mangle -i eth2 \
      -m state --state INVALID -j DROP

    start_cluster_ruleset
    ;;
  stop)
    echo "stopping cluster configuration for node $NODE."

    stop_cluster_address
    stop_nat

    iptables -D PREROUTING -t mangle -i eth2 \
      -m state --state INVALID -j DROP

    stop_cluster_ruleset
    ;;
  primary)
    # without a node id the cluster match rule would be malformed
    [ -n "$2" ] || { usage; return 1; }
    logger "cluster-match-script: entering MASTER state for node $2"
    run_conntrackd_script primary "$2"
    iptables_start_cluster_rules "$2"
    ;;
  backup)
    [ -n "$2" ] || { usage; return 1; }
    logger "cluster-match-script: entering BACKUP state for node $2"
    run_conntrackd_script backup "$2"
    iptables_stop_cluster_rules "$2"
    ;;
  fault)
    [ -n "$2" ] || { usage; return 1; }
    logger "cluster-match-script: entering FAULT state for node $2"
    run_conntrackd_script fault "$2"
    iptables_stop_cluster_rules "$2"
    ;;
  *)
    usage
    return 1
    ;;
  esac
}

main "$@"