summaryrefslogtreecommitdiffstats
path: root/doc
diff options
context:
space:
mode:
Diffstat (limited to 'doc')
-rw-r--r--doc/Makefile.am30
-rw-r--r--doc/additional-commands.txt116
-rw-r--r--doc/data-types.txt102
-rw-r--r--doc/libnftables-json.adoc67
-rw-r--r--doc/libnftables.adoc30
-rw-r--r--doc/nft.txt143
-rw-r--r--doc/payload-expression.txt85
-rw-r--r--doc/primary-expression.txt98
-rw-r--r--doc/stateful-objects.txt4
-rw-r--r--doc/statements.txt202
10 files changed, 552 insertions, 325 deletions
diff --git a/doc/Makefile.am b/doc/Makefile.am
deleted file mode 100644
index 21482320..00000000
--- a/doc/Makefile.am
+++ /dev/null
@@ -1,30 +0,0 @@
-if BUILD_MAN
-man_MANS = nft.8 libnftables-json.5 libnftables.3
-
-A2X_OPTS_MANPAGE = -L --doctype manpage --format manpage -D ${builddir}
-
-ASCIIDOC_MAIN = nft.txt
-ASCIIDOC_INCLUDES = \
- data-types.txt \
- payload-expression.txt \
- primary-expression.txt \
- stateful-objects.txt \
- statements.txt
-ASCIIDOCS = ${ASCIIDOC_MAIN} ${ASCIIDOC_INCLUDES}
-
-EXTRA_DIST = ${ASCIIDOCS} ${man_MANS} libnftables-json.adoc libnftables.adoc
-
-CLEANFILES = \
- *~
-
-nft.8: ${ASCIIDOCS}
- ${AM_V_GEN}${A2X} ${A2X_OPTS_MANPAGE} $<
-
-.adoc.3:
- ${AM_V_GEN}${A2X} ${A2X_OPTS_MANPAGE} $<
-
-.adoc.5:
- ${AM_V_GEN}${A2X} ${A2X_OPTS_MANPAGE} $<
-
-CLEANFILES += ${man_MANS}
-endif
diff --git a/doc/additional-commands.txt b/doc/additional-commands.txt
new file mode 100644
index 00000000..2ebc2993
--- /dev/null
+++ b/doc/additional-commands.txt
@@ -0,0 +1,116 @@
+LIST HOOKS
+~~~~~~~~~~
+
+This shows the list of functions that have been registered for the
+given protocol family, including functions that have been
+registered implicitly by kernel modules such as nf_conntrack. +
+
+[verse]
+____
+*list hooks* ['family']
+*list hooks netdev* [ *device* 'DEVICE_NAME' ]
+____
+
+*list hooks* is enough to display everything that is active
+on the system. Hooks in the netdev family are tied to a network
+device. If no device name is given, nft will query all network
+devices in the current network namespace.
+Example Usage:
+
+.List all active netfilter hooks in either the ip or ip6 stack
+--------------------------------------------------------------
+% nft list hooks inet
+family ip {
+ hook prerouting {
+ -0000000400 ipv4_conntrack_defrag [nf_defrag_ipv4]
+ -0000000200 ipv4_conntrack_in [nf_conntrack]
+ -0000000100 nf_nat_ipv4_pre_routing [nf_nat]
+ }
+ hook input {
+ 0000000000 chain inet filter input [nf_tables]
+ +0000000100 nf_nat_ipv4_local_in [nf_nat]
+[..]
+--------------------------------------------------------------
+
+The above shows a host that has nat, conntrack and ipv4 packet
+defragmentation enabled.
+For each hook location for the queried family a list of active hooks
+using the format +
+
+*priority* *identifier* [*module_name*]
+
+will be shown.
+
+The *priority* value dictates the order in which the hooks are called.
+The list is sorted, the lowest number is run first.
+
+The priority value of hooks registered by the kernel cannot be changed.
+For basechains registered by nftables, this value corresponds to the
+*priority* value specified in the base chain definition.
+
+After the numerical value, information about the hook is shown.
+For basechains defined in nftables this includes the table family,
+the table name and the basechain's name.
+For hooks coming from kernel modules, the function name is used
+instead.
+
+If a *module name* is given, the hook was registered by the kernel
+module with this name. You can use 'modinfo *module name*' to
+obtain more information about the module.
+
+This functionality requires a kernel built with the option +
+CONFIG_NETFILTER_NETLINK_HOOK
+enabled, either as a module or builtin. The module is named
+*nfnetlink_hook*.
+
+MONITOR
+~~~~~~~
+The monitor command allows you to listen to Netlink events produced by the
+nf_tables subsystem. These are either related to creation and deletion of
+objects or to packets for which *meta nftrace* was enabled. When they
+occur, nft will print to stdout the monitored events in either JSON or
+native nft format. +
+
+[verse]
+____
+*monitor* [*new* | *destroy*] 'MONITOR_OBJECT'
+*monitor* *trace*
+
+'MONITOR_OBJECT' := *tables* | *chains* | *sets* | *rules* | *elements* | *ruleset*
+____
+
+To filter events related to a concrete object, use one of the keywords in
+'MONITOR_OBJECT'.
+
+To filter events related to a concrete action, use keyword *new* or *destroy*.
+
+The second form of invocation takes no further options and exclusively prints
+events generated for packets with *nftrace* enabled.
+
+Hit ^C to finish the monitor operation.
+
+.Listen to all events, report in native nft format
+--------------------------------------------------
+% nft monitor
+--------------------------------------------------
+
+.Listen to deleted rules, report in JSON format
+-----------------------------------------------
+% nft -j monitor destroy rules
+-----------------------------------------------
+
+.Listen to both new and destroyed chains, in native nft format
+--------------------------------------------------
+% nft monitor chains
+--------------------------------------------------
+
+.Listen to ruleset events such as table, chain, rule, set, counters and quotas, in native nft format
+--------------------------------------------------
+% nft monitor ruleset
+--------------------------------------------------
+
+.Trace incoming packets from host 10.0.0.1
+------------------------------------------
+% nft add rule filter input ip saddr 10.0.0.1 meta nftrace set 1
+% nft monitor trace
+------------------------------------------
diff --git a/doc/data-types.txt b/doc/data-types.txt
index 961fc624..18af266a 100644
--- a/doc/data-types.txt
+++ b/doc/data-types.txt
@@ -166,7 +166,7 @@ Check TCP option header existence.
.Boolean specification
----------------------
# match if route exists
-filter input fib daddr . iif oif exists
+filter input fib daddr . iif check exists
# match only non-fragmented packets in IPv6 traffic
filter input exthdr frag missing
@@ -242,35 +242,13 @@ integer
The ICMP Code type is used to conveniently specify the ICMP header's code field.
-.Keywords may be used when specifying the ICMP code
-[options="header"]
-|==================
-|Keyword | Value
-|net-unreachable |
-0
-|host-unreachable |
-1
-|prot-unreachable|
-2
-|port-unreachable|
-3
-|frag-needed|
-4
-|net-prohibited|
-9
-|host-prohibited|
-10
-|admin-prohibited|
-13
-|===================
-
ICMPV6 TYPE TYPE
~~~~~~~~~~~~~~~~
[options="header"]
|==================
|Name | Keyword | Size | Base type
|ICMPv6 Type |
-icmpx_code |
+icmpv6_type |
8 bit |
integer
|===================
@@ -340,52 +318,6 @@ integer
The ICMPv6 Code type is used to conveniently specify the ICMPv6 header's code field.
-.keywords may be used when specifying the ICMPv6 code
-[options="header"]
-|==================
-|Keyword |Value
-|no-route|
-0
-|admin-prohibited|
-1
-|addr-unreachable|
-3
-|port-unreachable|
-4
-|policy-fail|
-5
-|reject-route|
-6
-|==================
-
-ICMPVX CODE TYPE
-~~~~~~~~~~~~~~~~
-[options="header"]
-|==================
-|Name | Keyword | Size | Base type
-|ICMPvX Code |
-icmpv6_type |
-8 bit |
-integer
-|===================
-
-The ICMPvX Code type abstraction is a set of values which overlap between ICMP
-and ICMPv6 Code types to be used from the inet family.
-
-.keywords may be used when specifying the ICMPvX code
-[options="header"]
-|==================
-|Keyword |Value
-|no-route|
-0
-|port-unreachable|
-1
-|host-unreachable|
-2
-|admin-prohibited|
-3
-|=================
-
CONNTRACK TYPES
~~~~~~~~~~~~~~~
@@ -446,21 +378,21 @@ For each of the types above, keywords are available for convenience:
.conntrack status (ct_status)
[options="header"]
|==================
-|Keyword| Value
-|expected|
-1
-|seen-reply|
-2
-|assured|
-4
-|confirmed|
-8
-|snat|
-16
-|dnat|
-32
-|dying|
-512
+|Keyword| Value | Description
+|expected|1| Expected connection; conntrack helper set it up
+|seen-reply|2| Conntrack has seen packets in both directions
+|assured| 4 |Conntrack entry will not be removed if hash table is full
+|confirmed | 8 | Initial packet processed
+|snat| 16 | Original source address differs from reply destination
+|dnat| 32 | Original destination differs from reply source
+|seq-adjust| 64 | tcp sequence number rewrite due to conntrack helper or synproxy
+|snat-done| 128 | tried to find matching snat/masquerade rule
+|dnat-done| 256 | tried to find matching dnat/redirect rule
+|dying| 512 | Connection about to be deleted
+|fixed-timeout | 1024 | entry expires even if traffic is active
+|helper | 8192 | connection is monitored by conntrack helper
+|offload | 16384 | connection is offloaded to a flow table
+|hw-offload | 32768 | connection is offloaded to hardware
|================
.conntrack event bits (ct_event)
diff --git a/doc/libnftables-json.adoc b/doc/libnftables-json.adoc
index f4aea36e..643884d5 100644
--- a/doc/libnftables-json.adoc
+++ b/doc/libnftables-json.adoc
@@ -175,7 +175,7 @@ kind, optionally filtered by *family* and for some, also *table*.
____
*{ "reset":* 'RESET_OBJECT' *}*
-'RESET_OBJECT' := 'COUNTER' | 'COUNTERS' | 'QUOTA' | 'QUOTAS' | 'RULE' | 'RULES'
+'RESET_OBJECT' := 'COUNTER' | 'COUNTERS' | 'QUOTA' | 'QUOTAS' | 'RULE' | 'RULES' | 'SET' | 'MAP' | 'ELEMENT'
____
Reset state in suitable objects, i.e. zero their internal counter.
@@ -202,12 +202,19 @@ Rename a chain. The new name is expected in a dedicated property named
=== TABLE
[verse]
+____
*{ "table": {
"family":* 'STRING'*,
"name":* 'STRING'*,
- "handle":* 'NUMBER'
+ "handle":* 'NUMBER'*,
+ "flags":* 'TABLE_FLAGS'
*}}*
+'TABLE_FLAGS' := 'TABLE_FLAG' | *[* 'TABLE_FLAG_LIST' *]*
+'TABLE_FLAG_LIST' := 'TABLE_FLAG' [*,* 'TABLE_FLAG_LIST' ]
+'TABLE_FLAG' := *"dormant"* | *"owner"* | *"persist"*
+____
+
This object describes a table.
*family*::
@@ -217,6 +224,8 @@ This object describes a table.
*handle*::
The table's handle. In input, it is used only in *delete* command as
alternative to *name*.
+*flags*::
+ The table's flags.
=== CHAIN
[verse]
@@ -308,11 +317,12 @@ ____
"handle":* 'NUMBER'*,
"type":* 'SET_TYPE'*,
"policy":* 'SET_POLICY'*,
- "flags": [* 'SET_FLAG_LIST' *],
+ "flags":* 'SET_FLAGS'*,
"elem":* 'SET_ELEMENTS'*,
"timeout":* 'NUMBER'*,
"gc-interval":* 'NUMBER'*,
- "size":* 'NUMBER'
+ "size":* 'NUMBER'*,
+ "auto-merge":* 'BOOLEAN'
*}}*
*{ "map": {
@@ -323,16 +333,18 @@ ____
"type":* 'SET_TYPE'*,
"map":* 'STRING'*,
"policy":* 'SET_POLICY'*,
- "flags": [* 'SET_FLAG_LIST' *],
+ "flags":* 'SET_FLAGS'*,
"elem":* 'SET_ELEMENTS'*,
"timeout":* 'NUMBER'*,
"gc-interval":* 'NUMBER'*,
- "size":* 'NUMBER'
+ "size":* 'NUMBER'*,
+ "auto-merge":* 'BOOLEAN'
*}}*
-'SET_TYPE' := 'STRING' | *[* 'SET_TYPE_LIST' *]*
+'SET_TYPE' := 'STRING' | *[* 'SET_TYPE_LIST' *]* | *{ "typeof":* 'EXPRESSION' *}*
'SET_TYPE_LIST' := 'STRING' [*,* 'SET_TYPE_LIST' ]
'SET_POLICY' := *"performance"* | *"memory"*
+'SET_FLAGS' := 'SET_FLAG' | *[* 'SET_FLAG_LIST' *]*
'SET_FLAG_LIST' := 'SET_FLAG' [*,* 'SET_FLAG_LIST' ]
'SET_FLAG' := *"constant"* | *"interval"* | *"timeout"*
'SET_ELEMENTS' := 'EXPRESSION' | *[* 'EXPRESSION_LIST' *]*
@@ -366,10 +378,13 @@ that they translate a unique key to a value.
Garbage collector interval in seconds.
*size*::
Maximum number of elements supported.
+*auto-merge*::
+ Automatic merging of adjacent/overlapping set elements in interval sets.
==== TYPE
-The set type might be a string, such as *"ipv4_addr"* or an array
-consisting of strings (for concatenated types).
+The set type might be a string, such as *"ipv4_addr"*, an array
+consisting of strings (for concatenated types) or a *typeof* object containing
+an expression to extract the type from.
==== ELEM
A single set element might be given as string, integer or boolean value for
@@ -682,11 +697,6 @@ processing continues with the next rule in the same chain.
==== OPERATORS
[horizontal]
-*&*:: Binary AND
-*|*:: Binary OR
-*^*:: Binary XOR
-*<<*:: Left shift
-*>>*:: Right shift
*==*:: Equal
*!=*:: Not equal
*<*:: Less than
@@ -1174,7 +1184,7 @@ ____
Construct a payload expression, i.e. a reference to a certain part of packet
data. The first form creates a raw payload expression to point at a random
-number (*len*) of bytes at a certain offset (*offset*) from a given reference
+number (*len*) of bits at a certain offset (*offset*) from a given reference
point (*base*). The following *base* values are accepted:
*"ll"*::
@@ -1226,6 +1236,17 @@ If the *field* property is not given, the expression is to be used as an SCTP
chunk existence check in a *match* statement with a boolean on the right hand
side.
+=== DCCP OPTION
+[verse]
+*{ "dccp option": {
+ "type":* 'NUMBER'
+*}}*
+
+Create a reference to a DCCP option (*type*).
+
+The expression is to be used as a DCCP option existence check in a *match*
+statement with a boolean on the right hand side.
+
=== META
[verse]
____
@@ -1333,15 +1354,17 @@ Perform kernel Forwarding Information Base lookups.
=== BINARY OPERATION
[verse]
-*{ "|": [* 'EXPRESSION'*,* 'EXPRESSION' *] }*
-*{ "^": [* 'EXPRESSION'*,* 'EXPRESSION' *] }*
-*{ "&": [* 'EXPRESSION'*,* 'EXPRESSION' *] }*
-*{ "+<<+": [* 'EXPRESSION'*,* 'EXPRESSION' *] }*
-*{ ">>": [* 'EXPRESSION'*,* 'EXPRESSION' *] }*
+*{ "|": [* 'EXPRESSION'*,* 'EXPRESSIONS' *] }*
+*{ "^": [* 'EXPRESSION'*,* 'EXPRESSIONS' *] }*
+*{ "&": [* 'EXPRESSION'*,* 'EXPRESSIONS' *] }*
+*{ "+<<+": [* 'EXPRESSION'*,* 'EXPRESSIONS' *] }*
+*{ ">>": [* 'EXPRESSION'*,* 'EXPRESSIONS' *] }*
+'EXPRESSIONS' := 'EXPRESSION' | 'EXPRESSION'*,* 'EXPRESSIONS'
-All binary operations expect an array of exactly two expressions, of which the
+All binary operations expect an array of at least two expressions, of which the
first element denotes the left hand side and the second one the right hand
-side.
+side. Extra elements are accepted in the given array and appended to the term
+accordingly.
=== VERDICT
[verse]
diff --git a/doc/libnftables.adoc b/doc/libnftables.adoc
index 7ea0d56e..2cf78d7a 100644
--- a/doc/libnftables.adoc
+++ b/doc/libnftables.adoc
@@ -18,6 +18,9 @@ void nft_ctx_free(struct nft_ctx* '\*ctx'*);
bool nft_ctx_get_dry_run(struct nft_ctx* '\*ctx'*);
void nft_ctx_set_dry_run(struct nft_ctx* '\*ctx'*, bool* 'dry'*);
+unsigned int nft_ctx_input_get_flags(struct nft_ctx* '\*ctx'*);
+unsigned int nft_ctx_input_set_flags(struct nft_ctx* '\*ctx'*, unsigned int* 'flags'*);
+
unsigned int nft_ctx_output_get_flags(struct nft_ctx* '\*ctx'*);
void nft_ctx_output_set_flags(struct nft_ctx* '\*ctx'*, unsigned int* 'flags'*);
@@ -78,6 +81,30 @@ The *nft_ctx_get_dry_run*() function returns the dry-run setting's value contain
The *nft_ctx_set_dry_run*() function sets the dry-run setting in 'ctx' to the value of 'dry'.
+=== nft_ctx_input_get_flags() and nft_ctx_input_set_flags()
+The flags setting controls the input format.
+
+----
+enum {
+ NFT_CTX_INPUT_NO_DNS = (1 << 0),
+ NFT_CTX_INPUT_JSON = (1 << 1),
+};
+----
+
+NFT_CTX_INPUT_NO_DNS::
+ Avoid resolving IP addresses with blocking getaddrinfo(). In that case,
+ only plain IP addresses are accepted.
+
+NFT_CTX_INPUT_JSON::
+ When parsing the input, first try to interpret the input as JSON before
+ falling back to the nftables format. This behavior is implied when setting
+ the NFT_CTX_OUTPUT_JSON flag.
+
+The *nft_ctx_input_get_flags*() function returns the input flags setting's value in 'ctx'.
+
+The *nft_ctx_input_set_flags*() function sets the input flags setting in 'ctx' to the value of 'flags'
+and returns the previous flags.
+
=== nft_ctx_output_get_flags() and nft_ctx_output_set_flags()
The flags setting controls the output format.
@@ -118,7 +145,8 @@ NFT_CTX_OUTPUT_HANDLE::
NFT_CTX_OUTPUT_JSON::
If enabled at compile-time, libnftables accepts input in JSON format and is able to print output in JSON format as well.
See *libnftables-json*(5) for a description of the supported schema.
- This flag controls JSON output format, input is auto-detected.
+ This flag enables JSON output format. If the flag is set, the input will first be tried as JSON format,
+ before falling back to nftables format. This flag implies NFT_CTX_INPUT_JSON.
NFT_CTX_OUTPUT_ECHO::
The echo setting makes libnftables print the changes once they are committed to the kernel, just like a running instance of *nft monitor* would.
Amongst other things, this allows one to retrieve an added rule's handle atomically.
diff --git a/doc/nft.txt b/doc/nft.txt
index 83f0f8bb..1be2fbac 100644
--- a/doc/nft.txt
+++ b/doc/nft.txt
@@ -43,6 +43,8 @@ understanding of their meaning. You can get information about options by running
*-f*::
*--file 'filename'*::
Read input from 'filename'. If 'filename' is -, read from stdin.
+ The directory path to this file is inserted at the beginning of the list of
+ directories to be searched for included files (see *-I/--includepath*).
*-D*::
*--define 'name=value'*::
@@ -321,7 +323,7 @@ Effectively, this is the nft-equivalent of *iptables-save* and
TABLES
------
[verse]
-{*add* | *create*} *table* ['family'] 'table' [ {*comment* 'comment' *;*'} *{ flags* 'flags' *; }*]
+{*add* | *create*} *table* ['family'] 'table' [*{* [*comment* 'comment' *;*] [*flags* 'flags' *;*] *}*]
{*delete* | *destroy* | *list* | *flush*} *table* ['family'] 'table'
*list tables* ['family']
*delete table* ['family'] *handle* 'handle'
@@ -343,8 +345,17 @@ return an error.
|Flag | Description
|dormant |
table is not evaluated any more (base chains are unregistered).
+|owner |
+table is owned by the creating process.
+|persist |
+table shall outlive the owning process.
|=================
+Creating a table with flag *owner* excludes other processes from manipulating
+it or its contents. By default, it will be removed when the process exits.
+Setting flag *persist* will prevent this and the resulting orphaned table will
+accept a new owner, e.g. a restarting daemon maintaining the table.
+
.*Add, change, delete a table*
---------------------------------------
# start nft in interactive mode
@@ -376,7 +387,7 @@ add table inet mytable
CHAINS
------
[verse]
-{*add* | *create*} *chain* ['family'] 'table' 'chain' [*{ type* 'type' *hook* 'hook' [*device* 'device'] *priority* 'priority' *;* [*policy* 'policy' *;*] [*comment* 'comment' *;*'] *}*]
+{*add* | *create*} *chain* ['family'] 'table' 'chain' [*{ type* 'type' *hook* 'hook' [*device* 'device'] *priority* 'priority' *;* [*policy* 'policy' *;*] [*comment* 'comment' *;*] *}*]
{*delete* | *destroy* | *list* | *flush*} *chain* ['family'] 'table' 'chain'
*list chains* ['family']
*delete chain* ['family'] 'table' *handle* 'handle'
@@ -386,7 +397,8 @@ CHAINS
Chains are containers for rules. They exist in two kinds, base chains and
regular chains. A base chain is an entry point for packets from the networking
stack, a regular chain may be used as jump target and is used for better rule
-organization.
+organization. Regular chains can be anonymous, see *VERDICT STATEMENT* examples
+for details.
[horizontal]
*add*:: Add a new chain in the specified table. When a hook and priority value
@@ -412,7 +424,7 @@ Chains of this type perform Native Address Translation based on conntrack
entries. Only the first packet of a connection actually traverses this chain -
its rules usually define details of the created conntrack entry (NAT
statements for instance).
-|route | ip, ip6 | output |
+|route | ip, ip6, inet | output |
If a packet has traversed a chain of this type and is about to be accepted, a
new route lookup is performed if relevant parts of the IP header have changed.
This allows one to e.g. implement policy routing selectors in nftables.
@@ -434,6 +446,11 @@ further quirks worth noticing:
*prerouting*, *input*, *forward*, *output*, *postrouting* and this *ingress*
hook.
+The *device* parameter accepts a network interface name as a string, and is
+required when adding a base chain that filters traffic on the ingress or
+egress hooks. Any ingress or egress chains will only filter traffic from the
+interface specified in the *device* parameter.
+
The *priority* parameter accepts a signed integer value or a standard priority
name which specifies the order in which chains with the same *hook* value are
traversed. The ordering is ascending, i.e. lower priority values have precedence
@@ -519,7 +536,7 @@ beginning of the chain or before the specified rule.
*replace*:: Similar to *add*, but the rule replaces the specified rule.
*delete*:: Delete the specified rule.
*destroy*:: Delete the specified rule, it does not fail if it does not exist.
-*reset*:: Reset rule-contained state, i.e. counter and quota statement values.
+*reset*:: Reset rule-contained state, e.g. counter and quota statement values.
.*add a rule to ip table output chain*
-------------
@@ -570,8 +587,8 @@ section describes nft set syntax in more detail.
[verse]
*add set* ['family'] 'table' 'set' *{ type* 'type' | *typeof* 'expression' *;* [*flags* 'flags' *;*] [*timeout* 'timeout' *;*] [*gc-interval* 'gc-interval' *;*] [*elements = {* 'element'[*,* ...] *} ;*] [*size* 'size' *;*] [*comment* 'comment' *;*'] [*policy* 'policy' *;*] [*auto-merge ;*] *}*
-{*delete* | *destroy* | *list* | *flush*} *set* ['family'] 'table' 'set'
-*list sets* ['family']
+{*delete* | *destroy* | *list* | *flush* | *reset* } *set* ['family'] 'table' 'set'
+*list sets* ['family'] ['table']
*delete set* ['family'] 'table' *handle* 'handle'
{*add* | *delete* | *destroy* } *element* ['family'] 'table' 'set' *{* 'element'[*,* ...] *}*
@@ -585,6 +602,7 @@ be tuned with the flags that can be specified at set creation time.
*destroy*:: Delete the specified set, it does not fail if it does not exist.
*list*:: Display the elements in the specified set.
*flush*:: Remove all elements from the specified set.
+*reset*:: Reset state in all contained elements, e.g. counter and quota statement values.
.Set specifications
[options="header"]
@@ -597,8 +615,7 @@ string: ipv4_addr, ipv6_addr, ether_addr, inet_proto, inet_service, mark
data type of set element |
expression to derive the data type from
|flags |
-set flags |
-string: constant, dynamic, interval, timeout
+set flags | string: constant, dynamic, interval, timeout. Used to describe the set's properties.
|timeout |
time an element stays in the set, mandatory if set is added to from the packet path (ruleset)|
string, decimal followed by unit. Units are: d, h, m, s
@@ -624,8 +641,8 @@ MAPS
-----
[verse]
*add map* ['family'] 'table' 'map' *{ type* 'type' | *typeof* 'expression' [*flags* 'flags' *;*] [*elements = {* 'element'[*,* ...] *} ;*] [*size* 'size' *;*] [*comment* 'comment' *;*'] [*policy* 'policy' *;*] *}*
-{*delete* | *destroy* | *list* | *flush*} *map* ['family'] 'table' 'map'
-*list maps* ['family']
+{*delete* | *destroy* | *list* | *flush* | *reset* } *map* ['family'] 'table' 'map'
+*list maps* ['family'] ['table']
Maps store data based on some specific key used as input. They are uniquely identified by a user-defined name and attached to tables.
@@ -635,8 +652,7 @@ Maps store data based on some specific key used as input. They are uniquely iden
*destroy*:: Delete the specified map, it does not fail if it does not exist.
*list*:: Display the elements in the specified map.
*flush*:: Remove all elements from the specified map.
-*add element*:: Comma-separated list of elements to add into the specified map.
-*delete element*:: Comma-separated list of element keys to delete from the specified map.
+*reset*:: Reset state in all contained elements, e.g. counter and quota statement values.
.Map specifications
[options="header"]
@@ -650,7 +666,7 @@ data type of set element |
expression to derive the data type from
|flags |
map flags |
-string: constant, interval
+string, same as set flags
|elements |
elements contained by the map |
map data type
@@ -662,12 +678,28 @@ map policy |
string: performance [default], memory
|=================
+Users can specify the properties/features that the set/map must support.
+This allows the kernel to pick an optimal internal representation.
+If a required flag is missing, the ruleset might still work, as
+nftables will auto-enable features if it can infer this from the ruleset.
+This may not work for all cases, however, so it is recommended to
+specify all required features in the set/map definition manually.
+
+.Set and Map flags
+[options="header"]
+|=================
+|Flag | Description
+|constant | Set contents will never change after creation
+|dynamic | Set must support updates from the packet path with the *add*, *update* or *delete* keywords.
+|interval | Set must be able to store intervals (ranges)
+|timeout | Set must support element timeouts (auto-removal of elements once they expire).
+|=================
ELEMENTS
--------
[verse]
____
-{*add* | *create* | *delete* | *destroy* | *get* } *element* ['family'] 'table' 'set' *{* 'ELEMENT'[*,* ...] *}*
+{*add* | *create* | *delete* | *destroy* | *get* | *reset* } *element* ['family'] 'table' 'set' *{* 'ELEMENT'[*,* ...] *}*
'ELEMENT' := 'key_expression' 'OPTIONS' [*:* 'value_expression']
'OPTIONS' := [*timeout* 'TIMESPEC'] [*expires* 'TIMESPEC'] [*comment* 'string']
@@ -687,6 +719,9 @@ listed elements may already exist.
be non-trivial in very large and/or interval sets. In the latter case, the
containing interval is returned instead of just the element itself.
+*reset* command resets state attached to the given element(s), e.g. counter and
+quota statement values.
+
.Element options
[options="header"]
|=================
@@ -704,7 +739,7 @@ FLOWTABLES
-----------
[verse]
{*add* | *create*} *flowtable* ['family'] 'table' 'flowtable' *{ hook* 'hook' *priority* 'priority' *; devices = {* 'device'[*,* ...] *} ; }*
-*list flowtables* ['family']
+*list flowtables* ['family'] ['table']
{*delete* | *destroy* | *list*} *flowtable* ['family'] 'table' 'flowtable'
*delete* *flowtable* ['family'] 'table' *handle* 'handle'
@@ -715,8 +750,8 @@ protocols. Each entry also caches the destination interface and the gateway
address - to update the destination link-layer address - to forward packets.
The ttl and hoplimit fields are also decremented. Hence, flowtables provides an
alternative path that allow packets to bypass the classic forwarding path.
-Flowtables reside in the ingress hook that is located before the prerouting
-hook. You can select which flows you want to offload through the flow
+Flowtables reside in the ingress *hook* that is located before the prerouting
+*hook*. You can select which flows you want to offload through the flow
expression from the forward chain. Flowtables are identified by their address
family and their name. The address family must be one of ip, ip6, or inet. The inet
address family is a dummy family which is used to create hybrid IPv4/IPv6
@@ -732,17 +767,6 @@ and subtraction can be used to set relative priority, e.g. filter + 5 equals to
*destroy*:: Delete the specified flowtable, it does not fail if it does not exist.
*list*:: List all flowtables.
-LISTING
--------
-[verse]
-*list { secmarks | synproxys | flow tables | meters | hooks }* ['family']
-*list { secmarks | synproxys | flow tables | meters | hooks } table* ['family'] 'table'
-*list ct { timeout | expectation | helper | helpers } table* ['family'] 'table'
-
-Inspect configured objects.
-*list hooks* shows the full hook pipeline, including those registered by
-kernel modules, such as nf_conntrack.
-
STATEFUL OBJECTS
----------------
[verse]
@@ -755,13 +779,8 @@ STATEFUL OBJECTS
*destroy* 'counter' ['family'] 'table' *handle* 'handle'
*destroy* 'quota' ['family'] 'table' *handle* 'handle'
*destroy* 'limit' ['family'] 'table' *handle* 'handle'
-*list counters* ['family']
-*list quotas* ['family']
-*list limits* ['family']
-*reset counters* ['family']
-*reset quotas* ['family']
-*reset counters* ['family'] 'table'
-*reset quotas* ['family'] 'table'
+*list* { *counters* | *limits* | *quotas* } ['family'] ['table']
+*reset* { *counters* | *quotas* } ['family'] ['table']
Stateful objects are attached to tables and are identified by a unique name.
They group stateful information from rules, to reference them in rules the
@@ -874,57 +893,7 @@ ADDITIONAL COMMANDS
-------------------
These are some additional commands included in nft.
-MONITOR
-~~~~~~~~
-The monitor command allows you to listen to Netlink events produced by the
-nf_tables subsystem. These are either related to creation and deletion of
-objects or to packets for which *meta nftrace* was enabled. When they
-occur, nft will print to stdout the monitored events in either JSON or
-native nft format. +
-
-[verse]
-____
-*monitor* [*new* | *destroy*] 'MONITOR_OBJECT'
-*monitor* *trace*
-
-'MONITOR_OBJECT' := *tables* | *chains* | *sets* | *rules* | *elements* | *ruleset*
-____
-
-To filter events related to a concrete object, use one of the keywords in
-'MONITOR_OBJECT'.
-
-To filter events related to a concrete action, use keyword *new* or *destroy*.
-
-The second form of invocation takes no further options and exclusively prints
-events generated for packets with *nftrace* enabled.
-
-Hit ^C to finish the monitor operation.
-
-.Listen to all events, report in native nft format
---------------------------------------------------
-% nft monitor
---------------------------------------------------
-
-.Listen to deleted rules, report in JSON format
------------------------------------------------
-% nft -j monitor destroy rules
------------------------------------------------
-
-.Listen to both new and destroyed chains, in native nft format
------------------------------------------------------------------
-% nft monitor chains
--------------------------------
-
-.Listen to ruleset events such as table, chain, rule, set, counters and quotas, in native nft format
-----------------------------------------------------------------------------------------------------
-% nft monitor ruleset
----------------------
-
-.Trace incoming packets from host 10.0.0.1
-------------------------------------------
-% nft add rule filter input ip saddr 10.0.0.1 meta nftrace set 1
-% nft monitor trace
-------------------------------------------
+include::additional-commands.txt[]
ERROR REPORTING
---------------
diff --git a/doc/payload-expression.txt b/doc/payload-expression.txt
index f1de3447..ce0c6a23 100644
--- a/doc/payload-expression.txt
+++ b/doc/payload-expression.txt
@@ -134,6 +134,14 @@ Destination address |
ipv4_addr
|======================
+Careful with matching on *ip length*: If GRO/GSO is enabled, then the Linux
+kernel might aggregate several packets into one big packet that is larger than
+MTU. Moreover, if GRO/GSO maximum size is larger than 65535 (see man ip-link(8),
+specifically gro_ipv4_max_size and gso_ipv4_max_size), then *ip length* might
+be 0 for such jumbo packets. *meta length* allows you to match on the packet
+length including the IP header size. If you want to perform heuristics on the
+*ip length* field, then disable GRO/GSO.
+
ICMP HEADER EXPRESSION
~~~~~~~~~~~~~~~~~~~~~~
[verse]
@@ -244,6 +252,14 @@ Destination address |
ipv6_addr
|=======================
+Careful with matching on *ip6 length*: If GRO/GSO is enabled, then the Linux
+kernel might aggregate several packets into one big packet that is larger than
+MTU. Moreover, if GRO/GSO maximum size is larger than 65535 (see man ip-link(8),
+specifically gro_ipv6_max_size and gso_ipv6_max_size), then *ip6 length* might
+be 0 for such jumbo packets. *meta length* allows you to match on the packet
+length including the IP header size. If you want to perform heuristics on the
+*ip6 length* field, then disable GRO/GSO.
+
.Using ip6 header expressions
-----------------------------
# matching if first extension header indicates a fragment
@@ -253,7 +269,7 @@ ip6 nexthdr ipv6-frag
ICMPV6 HEADER EXPRESSION
~~~~~~~~~~~~~~~~~~~~~~~~
[verse]
-*icmpv6* {*type* | *code* | *checksum* | *parameter-problem* | *packet-too-big* | *id* | *sequence* | *max-delay*}
+*icmpv6* {*type* | *code* | *checksum* | *parameter-problem* | *packet-too-big* | *id* | *sequence* | *max-delay* | *taddr* | *daddr*}
This expression refers to ICMPv6 header fields. When using it in *inet*,
*bridge* or *netdev* families, it will cause an implicit dependency on IPv6 to
@@ -288,6 +304,12 @@ integer (16 bit)
|max-delay|
maximum response delay of MLD queries|
integer (16 bit)
+|taddr|
+target address of neighbor solicit/advert, redirect or MLD|
+ipv6_addr
+|daddr|
+destination address of redirect|
+ipv6_addr
|==============================
TCP HEADER EXPRESSION
@@ -648,44 +670,6 @@ integer (24 bit)
netdev filter ingress udp dport 4789 vxlan tcp dport 80 counter
----------------------------------------------------------
-ARP HEADER EXPRESSION
-~~~~~~~~~~~~~~~~~~~~~
-[verse]
-*arp* {*htype* | *ptype* | *hlen* | *plen* | *operation* | *saddr* { *ip* | *ether* } | *daddr* { *ip* | *ether* }
-
-.ARP header expression
-[options="header"]
-|==================
-|Keyword| Description| Type
-|htype|
-ARP hardware type|
-integer (16 bit)
-|ptype|
-EtherType|
-ether_type
-|hlen|
-Hardware address len|
-integer (8 bit)
-|plen|
-Protocol address len |
-integer (8 bit)
-|operation|
-Operation |
-arp_op
-|saddr ether|
-Ethernet sender address|
-ether_addr
-|daddr ether|
-Ethernet target address|
-ether_addr
-|saddr ip|
-IPv4 sender address|
-ipv4_addr
-|daddr ip|
-IPv4 target address|
-ipv4_addr
-|======================
-
RAW PAYLOAD EXPRESSION
~~~~~~~~~~~~~~~~~~~~~~
[verse]
@@ -745,14 +729,15 @@ nftables currently supports matching (finding) a given ipv6 extension header, TC
*dst* {*nexthdr* | *hdrlength*}
*mh* {*nexthdr* | *hdrlength* | *checksum* | *type*}
*srh* {*flags* | *tag* | *sid* | *seg-left*}
-*tcp option* {*eol* | *nop* | *maxseg* | *window* | *sack-perm* | *sack* | *sack0* | *sack1* | *sack2* | *sack3* | *timestamp*} 'tcp_option_field'
+*tcp option* {*eol* | *nop* | *maxseg* | *window* | *sack-perm* | *sack* | *sack0* | *sack1* | *sack2* | *sack3* | *timestamp* | *mptcp* } 'tcp_option_field'
*ip option* { lsrr | ra | rr | ssrr } 'ip_option_field'
The following syntaxes are valid only in a relational expression with boolean type on right-hand side for checking header existence only:
[verse]
*exthdr* {*hbh* | *frag* | *rt* | *dst* | *mh*}
-*tcp option* {*eol* | *nop* | *maxseg* | *window* | *sack-perm* | *sack* | *sack0* | *sack1* | *sack2* | *sack3* | *timestamp*}
+*tcp option* {*eol* | *nop* | *maxseg* | *window* | *sack-perm* | *sack* | *sack0* | *sack1* | *sack2* | *sack3* | *timestamp* | *mptcp* }
*ip option* { lsrr | ra | rr | ssrr }
+*dccp option* 'dccp_option_type'
.IPv6 extension headers
[options="header"]
@@ -809,8 +794,13 @@ length, left, right
|timestamp|
TCP Timestamps |
length, tsval, tsecr
+|mptcp|
+Multipath TCP |
+subtype
|============================
+Data types can be queried with 'nft describe tcp option *keyword* [ *fieldname* ]'.
+
TCP option matching also supports raw expression syntax to access arbitrary options:
[verse]
*tcp option*
@@ -823,16 +813,16 @@ TCP option matching also supports raw expression syntax to access arbitrary opti
|Keyword| Description | IP option fields
|lsrr|
Loose Source Route |
-type, length, ptr, addr
+length, ptr, addr
|ra|
Router Alert |
-type, length, value
+length, value
|rr|
Record Route |
-type, length, ptr, addr
+length, ptr, addr
|ssrr|
Strict Source Route |
-type, length, ptr, addr
+length, ptr, addr
|============================
.finding TCP options
@@ -855,6 +845,11 @@ ip6 filter input frag more-fragments 1 counter
filter input ip option lsrr exists counter
---------------------------------------
+.finding DCCP option
+------------------
+filter input dccp option 40 exists counter
+---------------------------------------
+
CONNTRACK EXPRESSIONS
~~~~~~~~~~~~~~~~~~~~~
Conntrack expressions refer to meta data of the connection tracking entry associated with a packet. +
diff --git a/doc/primary-expression.txt b/doc/primary-expression.txt
index e13970cf..2266724e 100644
--- a/doc/primary-expression.txt
+++ b/doc/primary-expression.txt
@@ -117,7 +117,7 @@ devgroup
outgoing device group|
devgroup
|cgroup|
-control group id |
+control group net_cls.classid (for matching on cgroupv2, see *socket cgroupv2*)|
integer (32 bit)
|random|
pseudo-random number|
@@ -168,15 +168,18 @@ Either an integer or a date in ISO format. For example: "2019-06-06 17:00".
Hour and seconds are optional and can be omitted if desired. If omitted,
midnight will be assumed.
The following three would be equivalent: "2019-06-06", "2019-06-06 00:00"
-and "2019-06-06 00:00:00".
+and "2019-06-06 00:00:00". Use a range expression such as
+"2019-06-06 10:00"-"2019-06-10 14:00" for matching a time range.
When an integer is given, it is assumed to be a UNIX timestamp.
|day|
Either a day of week ("Monday", "Tuesday", etc.), or an integer between 0 and 6.
Strings are matched case-insensitively, and a full match is not expected (e.g. "Mon" would match "Monday").
-When an integer is given, 0 is Sunday and 6 is Saturday.
+When an integer is given, 0 is Sunday and 6 is Saturday. Use a range expression
+such as "Monday"-"Wednesday" for matching a week day range.
|hour|
A string representing an hour in 24-hour format. Seconds can optionally be specified.
-For example, 17:00 and 17:00:00 would be equivalent.
+For example, 17:00 and 17:00:00 would be equivalent. Use a range expression such
+as "17:00"-"19:00" for matching a time range.
|=============================
.Using meta expressions
@@ -190,6 +193,9 @@ filter output oif eth0
# incoming packet was subject to ipsec processing
raw prerouting meta ipsec exists accept
+
+# match incoming packet from 03:00 to 14:00 local time
+raw prerouting meta hour "03:00"-"14:00" counter accept
-----------------------
SOCKET EXPRESSION
@@ -304,44 +310,96 @@ table inet x {
FIB EXPRESSIONS
~~~~~~~~~~~~~~~
[verse]
-*fib* {*saddr* | *daddr* | *mark* | *iif* | *oif*} [*.* ...] {*oif* | *oifname* | *type*}
+*fib* 'FIB_TUPLE' 'FIB_RESULT'
+'FIB_TUPLE' := { *saddr* | *daddr* } [ *.* { *iif* | *oif* } ] [ *.* *mark* ]
+'FIB_RESULT' := { *oif* | *oifname* | *check* | *type* }
+
+
+A fib expression queries the fib (forwarding information base) to obtain information
+such as the output interface index.
+
+The first arguments to the *fib* expression are the input keys to be passed to the fib lookup function.
+One of *saddr* or *daddr* is mandatory; they are also mutually exclusive.
+
+*mark*, *iif* and *oif* keywords are optional modifiers to influence the search result, see
+the *FIB_TUPLE* keyword table below for a description.
+The *iif* and *oif* tuple keywords are also mutually exclusive.
+
+The last argument to the *fib* expression is the desired result type.
-A fib expression queries the fib (forwarding information base) to obtain
-information such as the output interface index a particular address would use.
-The input is a tuple of elements that is used as input to the fib lookup
-functions.
+*oif* asks to obtain the interface index that would be used to send packets to the packet's source
+(*saddr* key) or destination (*daddr* key). If no routing entry is found, the returned interface
+index is 0.
-.fib expression specific types
+*oifname* is like *oif*, but it fills the interface name instead. This is useful to check dynamic
+interfaces such as ppp devices. If no entry is found, an empty interface name is returned.
+
+*type* returns the address type such as unicast or multicast. A complete list of supported
+address types can be shown with *nft* *describe* *fib_addrtype*.
+
+.FIB_TUPLE keywords
[options="header"]
|==================
-|Keyword| Description| Type
+|flag| Description
+|daddr| Perform a normal route lookup: search fib for route to the *destination address* of the packet.
+|saddr| Perform a reverse route lookup: search the fib for route to the *source address* of the packet.
+|mark | consider the packet mark (nfmark) when querying the fib.
+|iif | if the fib lookup provides a route, then check that its output interface is identical to the packet's *input* interface.
+|oif | if the fib lookup provides a route, then check that its output interface is identical to the packet's *output* interface. This flag can only be used with the *type* result.
+|=======================
+
+.FIB_RESULT keywords
+[options="header"]
+|==================
+|Keyword| Description| Result Type
|oif|
Output interface index|
-integer (32 bit)
+iface_index
+|check|
+Output interface check|
+boolean
|oifname|
Output interface name|
-string
+ifname
|type|
Address type |
-fib_addrtype
+fib_addrtype (see *nft* *describe* *fib_addrtype* for a list)
|=======================
-Use *nft* *describe* *fib_addrtype* to get a list of all address types.
+The *oif* and *oifname* results are only valid in the *prerouting*, *input* and *forward* hooks.
+The *type* can be queried from any one of *prerouting*, *input*, *forward*, *output* and *postrouting*.
+
+For *type*, the presence of the *iif* keyword in the 'FIB_TUPLE' modifiers restricts the available
+hooks to those where the packet is associated with an incoming interface, i.e. *prerouting*, *input* and *forward*.
+Likewise, the *oif* keyword in the 'FIB_TUPLE' modifier list will limit the available hooks to
+*forward*, *output* and *postrouting*.
.Using fib expressions
----------------------
# drop packets without a reverse path
filter prerouting fib saddr . iif oif missing drop
-In this example, 'saddr . iif' looks up routing information based on the source address and the input interface.
-oif picks the output interface index from the routing information.
+In this example, 'saddr . iif' looks up a route to the *source address* of the packet and restricts matching
+results to the interface that the packet arrived on, then stores the output interface index from the obtained
+fib route result.
+
If no route was found for the source address/input interface combination, the output interface index is zero.
-In case the input interface is specified as part of the input key, the output interface index is always the same as the input interface index or zero.
-If only 'saddr oif' is given, then oif can be any interface index or zero.
+Hence, this rule will drop all packets that do not have a strict reverse path (hypothetical reply packet
+would be sent via the interface the tested packet arrived on).
+
+If only 'saddr oif' is used as the input key, then this rule would only drop packets where the fib cannot
+find a route. In most setups this will never drop packets because the default route is returned.
-# drop packets to address not configured on incoming interface
+# drop packets if the destination ip address is not configured on the incoming interface
filter prerouting fib daddr . iif type != { local, broadcast, multicast } drop
+This queries the fib based on the current packet's destination address and the incoming interface.
+
+If the packet is sent to a unicast address that is configured on a different interface, then the packet
+will be dropped as such an address would be classified as 'unicast' type.
+Without the 'iif' modifier, any address configured on the local machine is 'local', and unicast addresses
+not configured on any interface would return the type 'unicast'.
+
# perform lookup in a specific 'blackhole' table (0xdead, needs an appropriate ip rule)
filter prerouting meta mark set 0xdead fib daddr . mark type vmap { blackhole : drop, prohibit : jump prohibited, unreachable : drop }
----------------------
diff --git a/doc/stateful-objects.txt b/doc/stateful-objects.txt
index e3c79220..5824d53a 100644
--- a/doc/stateful-objects.txt
+++ b/doc/stateful-objects.txt
@@ -94,7 +94,7 @@ table ip filter {
ct timeout customtimeout {
protocol tcp;
l3proto ip
- policy = { established: 120, close: 20 }
+ policy = { established: 2m, close: 20s }
}
chain output {
@@ -119,7 +119,7 @@ sport=41360 dport=22
CT EXPECTATION
~~~~~~~~~~~~~~
[verse]
-*add* *ct expectation* ['family'] 'table' 'name' *{ protocol* 'protocol' *; dport* 'dport' *; timeout* 'timeout' *; size* 'size' *; [*l3proto* 'family' *;*] *}*
+*add* *ct expectation* ['family'] 'table' 'name' *{ protocol* 'protocol' *; dport* 'dport' *; timeout* 'timeout' *; size* 'size' *;* [*l3proto* 'family' *;*] *}*
*delete* *ct expectation* ['family'] 'table' 'name'
*list* *ct expectations*
diff --git a/doc/statements.txt b/doc/statements.txt
index 0532b2b1..f9460dd7 100644
--- a/doc/statements.txt
+++ b/doc/statements.txt
@@ -3,8 +3,12 @@ VERDICT STATEMENT
The verdict statement alters control flow in the ruleset and issues policy decisions for packets.
[verse]
+____
{*accept* | *drop* | *queue* | *continue* | *return*}
-{*jump* | *goto*} 'chain'
+{*jump* | *goto*} 'CHAIN'
+
+'CHAIN' := 'chain_name' | *{* 'statement' ... *}*
+____
*accept* and *drop* are absolute verdicts -- they terminate ruleset evaluation immediately.
@@ -26,15 +30,20 @@ resumes with the next base chain hook, not the rule following the queue verdict.
*return*:: Return from the current chain and continue evaluation at the
next rule in the last chain. If issued in a base chain, it is equivalent to the
base chain policy.
-*jump* 'chain':: Continue evaluation at the first rule in 'chain'. The current
+*jump* 'CHAIN':: Continue evaluation at the first rule in 'CHAIN'. The current
position in the ruleset is pushed to a call stack and evaluation will continue
there when the new chain is entirely evaluated or a *return* verdict is issued.
In case an absolute verdict is issued by a rule in the chain, ruleset evaluation
terminates immediately and the specific action is taken.
-*goto* 'chain':: Similar to *jump*, but the current position is not pushed to the
+*goto* 'CHAIN':: Similar to *jump*, but the current position is not pushed to the
call stack, meaning that after the new chain evaluation will continue at the last
chain instead of the one containing the goto statement.
+An alternative to specifying the name of an existing, regular chain in 'CHAIN'
+is to specify an anonymous chain ad-hoc. Like with anonymous sets, it can't be
+referenced from another rule and will be removed along with the rule containing
+it.
+
.Using verdict statements
-------------------
# process packets from eth0 and the internal network in from_lan
@@ -42,6 +51,10 @@ resumes with the next base chain hook, not the rule following the queue verdict.
filter input iif eth0 ip saddr 192.168.0.0/24 jump from_lan
filter input iif eth0 drop
+
+# jump and goto statements support anonymous chain creation
+filter input iif eth0 jump { ip saddr 192.168.0.0/24 drop ; udp dport domain drop; }
+
-------------------
PAYLOAD STATEMENT
@@ -56,7 +69,7 @@ set ip DSCP (diffserv) header field or ipv6 flow labels.
---------------------------------------
# redirect tcp:http from 192.168.0.0/16 to local machine for routing instead of bridging
# assumes 00:11:22:33:44:55 is local MAC address.
-bridge input meta iif eth0 ip saddr 192.168.0.0/16 tcp dport 80 meta pkttype set unicast ether daddr set 00:11:22:33:44:55
+bridge input meta iif eth0 ip saddr 192.168.0.0/16 tcp dport 80 meta pkttype set host ether daddr set 00:11:22:33:44:55
-------------------------------------------
.Set IPv4 DSCP header field
@@ -171,37 +184,77 @@ REJECT STATEMENT
____
*reject* [ *with* 'REJECT_WITH' ]
-'REJECT_WITH' := *icmp* 'icmp_code' |
- *icmpv6* 'icmpv6_code' |
- *icmpx* 'icmpx_code' |
+'REJECT_WITH' := *icmp* 'icmp_reject_code' |
+ *icmpv6* 'icmpv6_reject_code' |
+ *icmpx* 'icmpx_reject_code' |
*tcp reset*
____
A reject statement is used to send back an error packet in response to the
matched packet otherwise it is equivalent to drop so it is a terminating
statement, ending rule traversal. This statement is only valid in base chains
-using the *input*,
+using the *prerouting*, *input*,
*forward* or *output* hooks, and user-defined chains which are only called from
those chains.
-.different ICMP reject variants are meant for use in different table families
+.Keywords may be used to reject when specifying the ICMP code
+[options="header"]
+|==================
+|Keyword | Value
+|net-unreachable |
+0
+|host-unreachable |
+1
+|prot-unreachable|
+2
+|port-unreachable|
+3
+|frag-needed|
+4
+|net-prohibited|
+9
+|host-prohibited|
+10
+|admin-prohibited|
+13
+|===================
+
+.Keywords may be used to reject when specifying the ICMPv6 code
[options="header"]
|==================
-|Variant |Family | Type
-|icmp|
-ip|
-icmp_code
-|icmpv6|
-ip6|
-icmpv6_code
-|icmpx|
-inet|
-icmpx_code
+|Keyword |Value
+|no-route|
+0
+|admin-prohibited|
+1
+|addr-unreachable|
+3
+|port-unreachable|
+4
+|policy-fail|
+5
+|reject-route|
+6
|==================
-For a description of the different types and a list of supported keywords refer
-to DATA TYPES section above. The common default reject value is
-*port-unreachable*. +
+The ICMPvX Code type abstraction is a set of values which overlap between ICMP
+and ICMPv6 Code types to be used from the inet family.
+
+.Keywords may be used when specifying the ICMPvX code
+[options="header"]
+|==================
+|Keyword |Value
+|no-route|
+0
+|port-unreachable|
+1
+|host-unreachable|
+2
+|admin-prohibited|
+3
+|=================
+
+The common default ICMP code to reject is *port-unreachable*.
Note that in bridge family, reject statement is only allowed in base chains
which hook into input or prerouting.
@@ -296,7 +349,7 @@ A meta statement sets the value of a meta expression. The existing meta fields
are: priority, mark, pkttype, nftrace. +
[verse]
-*meta* {*mark* | *priority* | *pkttype* | *nftrace*} *set* 'value'
+*meta* {*mark* | *priority* | *pkttype* | *nftrace* | *broute*} *set* 'value'
A meta statement sets meta data associated with a packet. +
@@ -316,6 +369,9 @@ pkt_type
|nftrace |
ruleset packet tracing on/off. Use *monitor trace* command to watch traces|
0, 1
+|broute |
+broute on/off. Packets are routed instead of being bridged|
+0, 1
|==========================
LIMIT STATEMENT
@@ -356,8 +412,8 @@ NAT STATEMENTS
~~~~~~~~~~~~~~
[verse]
____
-*snat* [[*ip* | *ip6*] *to*] 'ADDR_SPEC' [*:*'PORT_SPEC'] ['FLAGS']
-*dnat* [[*ip* | *ip6*] *to*] 'ADDR_SPEC' [*:*'PORT_SPEC'] ['FLAGS']
+*snat* [[*ip* | *ip6*] [ *prefix* ] *to*] 'ADDR_SPEC' [*:*'PORT_SPEC'] ['FLAGS']
+*dnat* [[*ip* | *ip6*] [ *prefix* ] *to*] 'ADDR_SPEC' [*:*'PORT_SPEC'] ['FLAGS']
*masquerade* [*to :*'PORT_SPEC'] ['FLAGS']
*redirect* [*to :*'PORT_SPEC'] ['FLAGS']
@@ -395,6 +451,9 @@ Before kernel 4.18 nat statements require both prerouting and postrouting base c
to be present since otherwise packets on the return path won't be seen by
netfilter and therefore no reverse translation will take place.
+The optional *prefix* keyword allows mapping *n* source addresses to *n*
+destination addresses. See 'Advanced NAT examples' below.
+
.NAT statement values
[options="header"]
|==================
@@ -405,7 +464,7 @@ You may specify a mapping to relate a list of tuples composed of arbitrary
expression key with address value. |
ipv4_addr, ipv6_addr, e.g. abcd::1234, or you can use a mapping, e.g. meta mark map { 10 : 192.168.1.2, 20 : 192.168.1.3 }
|port|
-Specifies that the source/destination address of the packet should be modified. |
+Specifies that the source/destination port of the packet should be modified. |
port number (16 bit)
|===============================
@@ -454,6 +513,52 @@ add rule inet nat postrouting meta oif ppp0 masquerade
------------------------
+.Advanced NAT examples
+----------------------
+
+# map prefixes in one network to that of another, e.g. 10.141.11.4 is mangled to 192.168.2.4,
+# 10.141.11.5 is mangled to 192.168.2.5 and so on.
+add rule nat postrouting snat ip prefix to ip saddr map { 10.141.11.0/24 : 192.168.2.0/24 }
+
+# map a source address, destination port combination to a pool of destination addresses and ports:
+add rule nat postrouting dnat to ip saddr . tcp dport map { 192.168.1.2 . 80 : 10.141.10.2-10.141.10.5 . 8888-8999 }
+
+# The above example generates the following NAT expression:
+#
+# [ nat dnat ip addr_min reg 1 addr_max reg 10 proto_min reg 9 proto_max reg 11 ]
+#
+# which expects to obtain the following tuple:
+# IP address (min), port (min), IP address (max), port (max)
+# to be obtained from the map. The given addresses and ports are inclusive.
+
+# This also works with named maps and in combination with both concatenations and ranges:
+table ip nat {
+ map ipportmap {
+ typeof ip saddr : interval ip daddr . tcp dport
+ flags interval
+ elements = { 192.168.1.2 : 10.141.10.1-10.141.10.3 . 8888-8999, 192.168.2.0/24 : 10.141.11.5-10.141.11.20 . 8888-8999 }
+ }
+
+ chain prerouting {
+ type nat hook prerouting priority dstnat; policy accept;
+ ip protocol tcp dnat ip to ip saddr map @ipportmap
+ }
+}
+
+@ipportmap maps network prefixes to a range of hosts and ports.
+The new destination is taken from the range provided by the map element.
+Same for the destination port.
+
+Note the use of the "interval" keyword in the typeof description.
+This is required so nftables knows that it has to ask for twice the
+amount of storage for each key-value pair in the map.
+
+": ipv4_addr . inet_service" would allow associating one address and one port
+with each key. But for this case, for each key, two addresses and two ports
+(the minimum and maximum values for both) have to be stored.
+
+------------------------
+
TPROXY STATEMENT
~~~~~~~~~~~~~~~~
Tproxy redirects the packet to a local socket without changing the packet header
@@ -491,27 +596,58 @@ this case the rule will match for both families.
table ip x {
chain y {
type filter hook prerouting priority mangle; policy accept;
- tcp dport ntp tproxy to 1.1.1.1
- udp dport ssh tproxy to :2222
+ tcp dport ntp tproxy to 1.1.1.1 accept
+ udp dport ssh tproxy to :2222 accept
}
}
table ip6 x {
chain y {
type filter hook prerouting priority mangle; policy accept;
- tcp dport ntp tproxy to [dead::beef]
- udp dport ssh tproxy to :2222
+ tcp dport ntp tproxy to [dead::beef] accept
+ udp dport ssh tproxy to :2222 accept
}
}
table inet x {
chain y {
type filter hook prerouting priority mangle; policy accept;
- tcp dport 321 tproxy to :ssh
- tcp dport 99 tproxy ip to 1.1.1.1:999
- udp dport 155 tproxy ip6 to [dead::beef]:smux
+ tcp dport 321 tproxy to :22 accept
+ tcp dport 99 tproxy ip to 1.1.1.1:999 accept
+ udp dport 155 tproxy ip6 to [dead::beef]:smux accept
}
}
-------------------------------------
+Note that the tproxy statement is non-terminal to allow post-processing of
+packets. This allows packets to be logged for debugging as well as updating the
+mark to ensure that packets are delivered locally through policy routing rules.
+
+.Example ruleset for tproxy statement with logging and meta mark
+-------------------------------------
+table inet x {
+ chain y {
+ type filter hook prerouting priority mangle; policy accept;
+ udp dport 9999 goto {
+ tproxy to :1234 log prefix "packet tproxied: " meta mark set 1 accept
+ log prefix "no socket on port 1234 or not transparent?: " drop
+ }
+ }
+}
+-------------------------------------
+
+As packet headers are unchanged, packets might be forwarded instead of delivered
+locally. As mentioned above, this can be avoided by adding policy routing rules
+and the packet mark.
+
+.Example policy routing rules for local redirection
+----------------------------------------------------
+ip rule add fwmark 1 lookup 100
+ip route add local 0.0.0.0/0 dev lo table 100
+----------------------------------------------------
+
+This is a change in behavior compared to the legacy iptables TPROXY target
+which is terminal. To terminate the packet processing after the tproxy
+statement, remember to issue a verdict as in the example above.
+
SYNPROXY STATEMENT
~~~~~~~~~~~~~~~~~~
This statement will process TCP three-way-handshake parallel in netfilter