/* netfilter.c: look after the filters for various protocols. 
 * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
 *
 * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
 * way.
 *
 * Rusty Russell (C)2000 -- This code is GPL.
 *
 * February 2000: Modified by James Morris to have 1 queue per protocol.
 * 15-Mar-2000:   Added NF_REPEAT --RR.
 */
#include <linux/config.h>
#include <linux/netfilter.h>
#include <net/protocol.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/brlock.h>
#include <linux/inetdevice.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

/* In this code, we can be waiting indefinitely for userspace to
 * service a packet if a hook returns NF_QUEUE.  We could keep a count
 * of skbuffs queued for userspace, and not deregister a hook unless
 * this is zero, but that sucks.  Now, we simply check when the
 * packets come back: if the hook is gone, the packet is discarded. */
#ifdef CONFIG_NETFILTER_DEBUG
#define NFDEBUG(format, args...)  printk(format , ## args)
#else
#define NFDEBUG(format, args...)
#endif

/* Sockopts only registered and called from user context, so
   BR_NETPROTO_LOCK would be overkill.  Also, [gs]etsockopt calls may
   sleep. */
static DECLARE_MUTEX(nf_sockopt_mutex);

struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
static LIST_HEAD(nf_sockopts);

/* 
 * A queue handler may be registered for each protocol.  Each is protected by
 * long term mutex.  The handler must provide an an outfn() to accept packets
 * for queueing and must reinject all packets it receives, no matter what.
 */
static struct nf_queue_handler_t {
	nf_queue_outfn_t outfn;
	void *data;
} queue_handler[NPROTO];

int nf_register_hook(struct nf_hook_ops *reg)
{
	struct list_head *i;

	br_write_lock_bh(BR_NETPROTO_LOCK);
	for (i = nf_hooks[reg->pf][reg->hooknum].next; 
	     i != &nf_hooks[reg->pf][reg->hooknum]; 
	     i = i->next) {
		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
			break;
	}
	list_add(&reg->list, i->prev);
	br_write_unlock_bh(BR_NETPROTO_LOCK);
	return 0;
}

void nf_unregister_hook(struct nf_hook_ops *reg)
{
	br_write_lock_bh(BR_NETPROTO_LOCK);
	list_del(&reg->list);
	br_write_unlock_bh(BR_NETPROTO_LOCK);
}

/* Do exclusive ranges overlap? */
static inline int overlap(int min1, int max1, int min2, int max2)
{
	return max1 > min2 && min1 < max2;
}

/* Functions to register sockopt ranges (exclusive). */
int nf_register_sockopt(struct nf_sockopt_ops *reg)
{
	struct list_head *i;
	int ret = 0;

	if (down_interruptible(&nf_sockopt_mutex) != 0)
		return -EINTR;

	for (i = nf_sockopts.next; i != &nf_sockopts; i = i->next) {
		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
		if (ops->pf == reg->pf
		    && (overlap(ops->set_optmin, ops->set_optmax, 
				reg->set_optmin, reg->set_optmax)
			|| overlap(ops->get_optmin, ops->get_optmax, 
				   reg->get_optmin, reg->get_optmax))) {
			NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
				ops->set_optmin, ops->set_optmax, 
				ops->get_optmin, ops->get_optmax, 
				reg->set_optmin, reg->set_optmax,
				reg->get_optmin, reg->get_optmax);
			ret = -EBUSY;
			goto out;
		}
	}

	list_add(&reg->list, &nf_sockopts);
out:
	up(&nf_sockopt_mutex);
	return ret;
}

void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
{
	/* No point being interruptible: we're probably in cleanup_module() */
 restart:
	down(&nf_sockopt_mutex);
	if (reg->use != 0) {
		/* To be woken by nf_sockopt call... */
		/* FIXME: Stuart Young's name appears gratuitously. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		reg->cleanup_task = current;
		up(&nf_sockopt_mutex);
		schedule();
		goto restart;
	}
	list_del(&reg->list);
	up(&nf_sockopt_mutex);
}

#ifdef CONFIG_NETFILTER_DEBUG
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
#include <linux/netfilter_ipv4.h>

static void debug_print_hooks_ip(unsigned int nf_debug)
{
	if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
		printk("PRE_ROUTING ");
		nf_debug ^= (1 << NF_IP_PRE_ROUTING);
	}
	if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
		printk("LOCAL_IN ");
		nf_debug ^= (1 << NF_IP_LOCAL_IN);
	}
	if (nf_debug & (1 << NF_IP_FORWARD)) {
		printk("FORWARD ");
		nf_debug ^= (1 << NF_IP_FORWARD);
	}
	if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
		printk("LOCAL_OUT ");
		nf_debug ^= (1 << NF_IP_LOCAL_OUT);
	}
	if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
		printk("POST_ROUTING ");
		nf_debug ^= (1 << NF_IP_POST_ROUTING);
	}
	if (nf_debug)
		printk("Crap bits: 0x%04X", nf_debug);
	printk("\n");
}

void nf_dump_skb(int pf, struct sk_buff *skb)
{
	printk("skb: pf=%i %s dev=%s len=%u\n", 
	       pf,
	       skb->sk ? "(owned)" : "(unowned)",
	       skb->dev ? skb->dev->name : "(no dev)",
	       skb->len);
	switch (pf) {
	case PF_INET: {
		const struct iphdr *ip = skb->nh.iph;
		__u32 *opt = (__u32 *) (ip + 1);
		int opti;
		__u16 src_port = 0, dst_port = 0;

		if (ip->protocol == IPPROTO_TCP
		    || ip->protocol == IPPROTO_UDP) {
			struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
			src_port = ntohs(tcp->source);
			dst_port = ntohs(tcp->dest);
		}
	
		printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
		       " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
		       ip->protocol, NIPQUAD(ip->saddr),
		       src_port, NIPQUAD(ip->daddr),
		       dst_port,
		       ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
		       ntohs(ip->frag_off), ip->ttl);

		for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
			printk(" O=0x%8.8X", *opt++);
		printk("\n");
	}
	}
}

void nf_debug_ip_local_deliver(struct sk_buff *skb)
{
	/* If it's a loopback packet, it must have come through
	 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
	 * NF_IP_LOCAL_IN.  Otherwise, must have gone through
	 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING.  */
	if (!skb->dev) {
		printk("ip_local_deliver: skb->dev is NULL.\n");
	}
	else if (strcmp(skb->dev->name, "lo") == 0) {
		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
				      | (1 << NF_IP_POST_ROUTING)
				      | (1 << NF_IP_PRE_ROUTING)
				      | (1 << NF_IP_LOCAL_IN))) {
			printk("ip_local_deliver: bad loopback skb: ");
			debug_print_hooks_ip(skb->nf_debug);
			nf_dump_skb(PF_INET, skb);
		}
	}
	else {
		if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
				      | (1<<NF_IP_LOCAL_IN))) {
			printk("ip_local_deliver: bad non-lo skb: ");
			debug_print_hooks_ip(skb->nf_debug);
			nf_dump_skb(PF_INET, skb);
		}
	}
}

void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
{
	if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
				 | (1 << NF_IP_POST_ROUTING))) {
		printk("ip_dev_loopback_xmit: bad owned skb = %p: ", 
		       newskb);
		debug_print_hooks_ip(newskb->nf_debug);
		nf_dump_skb(PF_INET, newskb);
	}
	/* Clear to avoid confusing input check */
	newskb->nf_debug = 0;
}

void nf_debug_ip_finish_output2(struct sk_buff *skb)
{
	/* If it's owned, it must have gone through the
	 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
	 * Otherwise, must have gone through
	 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
	 */
	if (skb->sk) {
		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
				      | (1 << NF_IP_POST_ROUTING))) {
			printk("ip_finish_output: bad owned skb = %p: ", skb);
			debug_print_hooks_ip(skb->nf_debug);
			nf_dump_skb(PF_INET, skb);
		}
	} else {
		if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
				      | (1 << NF_IP_FORWARD)
				      | (1 << NF_IP_POST_ROUTING))) {
			/* Fragments, entunnelled packets, TCP RSTs
                           generated by ipt_REJECT will have no
                           owners, but still may be local */
			if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
					      | (1 << NF_IP_POST_ROUTING))){
				printk("ip_finish_output:"
				       " bad unowned skb = %p: ",skb);
				debug_print_hooks_ip(skb->nf_debug);
				nf_dump_skb(PF_INET, skb);
			}
		}
	}
}
#endif /*CONFIG_NETFILTER_DEBUG*/

/* Call get/setsockopt() */
static int nf_sockopt(struct sock *sk, int pf, int val, 
		      char *opt, int *len, int get)
{
	struct list_head *i;
	struct nf_sockopt_ops *ops;
	int ret;

	if (down_interruptible(&nf_sockopt_mutex) != 0)
		return -EINTR;

	for (i = nf_sockopts.next; i != &nf_sockopts; i = i->next) {
		ops = (struct nf_sockopt_ops *)i;
		if (ops->pf == pf) {
			if (get) {
				if (val >= ops->get_optmin
				    && val < ops->get_optmax) {
					ops->use++;
					up(&nf_sockopt_mutex);
					ret = ops->get(sk, val, opt, len);
					goto out;
				}
			} else {
				if (val >= ops->set_optmin
				    && val < ops->set_optmax) {
					ops->use++;
					up(&nf_sockopt_mutex);
					ret = ops->set(sk, val, opt, *len);
					goto out;
				}
			}
		}
	}
	up(&nf_sockopt_mutex);
	return -ENOPROTOOPT;
	
 out:
	down(&nf_sockopt_mutex);
	ops->use--;
	if (ops->cleanup_task)
		wake_up_process(ops->cleanup_task);
	up(&nf_sockopt_mutex);
	return ret;
}

int nf_setsockopt(struct sock *sk, int pf, int val, char *opt,
		  int len)
{
	return nf_sockopt(sk, pf, val, opt, &len, 0);
}

int nf_getsockopt(struct sock *sk, int pf, int val, char *opt, int *len)
{
	return nf_sockopt(sk, pf, val, opt, len, 1);
}

static unsigned int nf_iterate(struct list_head *head,
			       struct sk_buff **skb,
			       int hook,
			       const struct net_device *indev,
			       const struct net_device *outdev,
			       struct list_head **i,
			       int (*okfn)(struct sk_buff *),
			       int hook_thresh)
{
	for (*i = (*i)->next; *i != head; *i = (*i)->next) {
		struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

		if (hook_thresh > elem->priority)
			continue;

		switch (elem->hook(hook, skb, indev, outdev, okfn)) {
		case NF_QUEUE:
			return NF_QUEUE;

		case NF_STOLEN:
			return NF_STOLEN;

		case NF_DROP:
			return NF_DROP;

		case NF_REPEAT:
			*i = (*i)->prev;
			break;

#ifdef CONFIG_NETFILTER_DEBUG
		case NF_ACCEPT:
			break;

		default:
			NFDEBUG("Evil return from %p(%u).\n", 
				elem->hook, hook);
#endif
		}
	}
	return NF_ACCEPT;
}

int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
{      
	int ret;

	br_write_lock_bh(BR_NETPROTO_LOCK);
	if (queue_handler[pf].outfn)
		ret = -EBUSY;
	else {
		queue_handler[pf].outfn = outfn;
		queue_handler[pf].data = data;
		ret = 0;
	}
	br_write_unlock_bh(BR_NETPROTO_LOCK);

	return ret;
}

/* The caller must flush their queue before this */
int nf_unregister_queue_handler(int pf)
{
	br_write_lock_bh(BR_NETPROTO_LOCK);
	queue_handler[pf].outfn = NULL;
	queue_handler[pf].data = NULL;
	br_write_unlock_bh(BR_NETPROTO_LOCK);
	return 0;
}

/* 
 * Any packet that leaves via this function must come back 
 * through nf_reinject().
 */
static void nf_queue(struct sk_buff *skb, 
		     struct list_head *elem, 
		     int pf, unsigned int hook,
		     struct net_device *indev,
		     struct net_device *outdev,
		     int (*okfn)(struct sk_buff *))
{
	int status;
	struct nf_info *info;
	struct net_device *physindev = NULL;
	struct net_device *physoutdev = NULL;

	if (!queue_handler[pf].outfn) {
		kfree_skb(skb);
		return;
	}

	info = kmalloc(sizeof(*info), GFP_ATOMIC);
	if (!info) {
		if (net_ratelimit())
			printk(KERN_ERR "OOM queueing packet %p\n",
			       skb);
		kfree_skb(skb);
		return;
	}

	*info = (struct nf_info) { 
		(struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };

	/* Bump dev refs so they don't vanish while packet is out */
	if (indev) dev_hold(indev);
	if (outdev) dev_hold(outdev);

	if (skb->nf_bridge) {
		physindev = skb->nf_bridge->physindev;
		if (physindev) dev_hold(physindev);
		physoutdev = skb->nf_bridge->physoutdev;
		if (physoutdev) dev_hold(physoutdev);
	}

	status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
	if (status < 0) {
		/* James M doesn't say fuck enough. */
		if (indev) dev_put(indev);
		if (outdev) dev_put(outdev);
		if (physindev) dev_put(physindev);
		if (physoutdev) dev_put(physoutdev);
		kfree(info);
		kfree_skb(skb);
		return;
	}
}

int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
		 struct net_device *indev,
		 struct net_device *outdev,
		 int (*okfn)(struct sk_buff *),
		 int hook_thresh)
{
	struct list_head *elem;
	unsigned int verdict;
	int ret = 0;

	/* This stopgap cannot be removed until all the hooks are audited. */
	if (skb_is_nonlinear(skb) && skb_linearize(skb, GFP_ATOMIC) != 0) {
		kfree_skb(skb);
		return -ENOMEM;
	}
	if (skb->ip_summed == CHECKSUM_HW) {
		if (outdev == NULL) {
			skb->ip_summed = CHECKSUM_NONE;
		} else {
			skb_checksum_help(skb);
		}
	}

	/* We may already have this, but read-locks nest anyway */
	br_read_lock_bh(BR_NETPROTO_LOCK);

#ifdef CONFIG_NETFILTER_DEBUG
	if (skb->nf_debug & (1 << hook)) {
		printk("nf_hook: hook %i already set.\n", hook);
		nf_dump_skb(pf, skb);
	}
	skb->nf_debug |= (1 << hook);
#endif

	elem = &nf_hooks[pf][hook];
	verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev,
			     outdev, &elem, okfn, hook_thresh);
	if (verdict == NF_QUEUE) {
		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
		nf_queue(skb, elem, pf, hook, indev, outdev, okfn);
	}

	switch (verdict) {
	case NF_ACCEPT:
		ret = okfn(skb);
		break;

	case NF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;
	}

	br_read_unlock_bh(BR_NETPROTO_LOCK);
	return ret;
}

void nf_reinject(struct sk_buff *skb, struct nf_info *info,
		 unsigned int verdict)
{
	struct list_head *elem = &info->elem->list;
	struct list_head *i;

	/* We don't have BR_NETPROTO_LOCK here */
	br_read_lock_bh(BR_NETPROTO_LOCK);
	for (i = nf_hooks[info->pf][info->hook].next; i != elem; i = i->next) {
		if (i == &nf_hooks[info->pf][info->hook]) {
			/* The module which sent it to userspace is gone. */
			NFDEBUG("%s: module disappeared, dropping packet.\n",
			         __FUNCTION__);
			verdict = NF_DROP;
			break;
		}
	}

	/* Continue traversal iff userspace said ok... */
	if (verdict == NF_REPEAT) {
		elem = elem->prev;
		verdict = NF_ACCEPT;
	}

	if (verdict == NF_ACCEPT) {
		verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
				     &skb, info->hook, 
				     info->indev, info->outdev, &elem,
				     info->okfn, INT_MIN);
	}

	switch (verdict) {
	case NF_ACCEPT:
		info->okfn(skb);
		break;

	case NF_QUEUE:
		nf_queue(skb, elem, info->pf, info->hook, 
			 info->indev, info->outdev, info->okfn);
		break;

	case NF_DROP:
		kfree_skb(skb);
		break;
	}
	br_read_unlock_bh(BR_NETPROTO_LOCK);

	/* Release those devices we held, or Alexey will kill me. */
	if (info->indev) dev_put(info->indev);
	if (info->outdev) dev_put(info->outdev);
	
	kfree(info);
	return;
}

#ifdef CONFIG_INET
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
int ip_route_me_harder(struct sk_buff **pskb)
{
	struct iphdr *iph = (*pskb)->nh.iph;
	struct rtable *rt;
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = iph->daddr,
					.saddr = iph->saddr,
					.tos = RT_TOS(iph->tos)|RTO_CONN,
#ifdef CONFIG_IP_ROUTE_FWMARK
					.fwmark = (*pskb)->nfmark
#endif
				      } },
			    .oif = (*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0,
			    };
	struct net_device *dev_src = NULL;
	int err;

	/* accomodate ip_route_output_slow(), which expects the key src to be
	   0 or a local address; however some non-standard hacks like
	   ipt_REJECT.c:send_reset() can cause packets with foreign
           saddr to be appear on the NF_IP_LOCAL_OUT hook -MB */
	if(fl.fl4_src && !(dev_src = ip_dev_find(fl.fl4_src)))
		fl.fl4_src = 0;

	if ((err=ip_route_output_key(&rt, &fl)) != 0) {
		printk("route_me_harder: ip_route_output_key(dst=%u.%u.%u.%u, src=%u.%u.%u.%u, oif=%d, tos=0x%x, fwmark=0x%lx) error %d\n",
			NIPQUAD(iph->daddr), NIPQUAD(iph->saddr),
			(*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0,
			RT_TOS(iph->tos)|RTO_CONN,
#ifdef CONFIG_IP_ROUTE_FWMARK
			(*pskb)->nfmark,
#else
			0UL,
#endif
			err);
		goto out;
	}

	/* Drop old route. */
	dst_release((*pskb)->dst);

	(*pskb)->dst = &rt->u.dst;

	/* Change in oif may mean change in hh_len. */
	if (skb_headroom(*pskb) < (*pskb)->dst->dev->hard_header_len) {
		struct sk_buff *nskb;

		nskb = skb_realloc_headroom(*pskb,
					    (*pskb)->dst->dev->hard_header_len);
		if (!nskb) {
			err = -ENOMEM;
			goto out;
		}
		if ((*pskb)->sk)
			skb_set_owner_w(nskb, (*pskb)->sk);
		kfree_skb(*pskb);
		*pskb = nskb;
	}

out:
	if (dev_src)
		dev_put(dev_src);

	return err;
}
#endif /*CONFIG_INET*/

/* This does not belong here, but ipt_REJECT needs it if connection
   tracking in use: without this, connection may not be in hash table,
   and hence manufactured ICMP or RST packets will not be associated
   with it. */
void (*ip_ct_attach)(struct sk_buff *, struct nf_ct_info *);

void __init netfilter_init(void)
{
	int i, h;

	for (i = 0; i < NPROTO; i++) {
		for (h = 0; h < NF_MAX_HOOKS; h++)
			INIT_LIST_HEAD(&nf_hooks[i][h]);
	}
}