From c324c7e4baa5b24323175be0a3c8febb256deee1 Mon Sep 17 00:00:00 2001
From: Bart De Schuymer <bdschuym@pandora.be>
Date: Mon, 21 Oct 2002 17:44:04 +0000
Subject: update to 2.5.44

---
 br-nf-bds/linux2.5/include/linux/skbuff.h |    9 +
 br-nf-bds/linux2.5/net/core/netfilter.c   |   18 +-
 br-nf-bds/linux2.5/net/ipv4/ip_output.c   | 1223 +++++++++++++++++------------
 3 files changed, 743 insertions(+), 507 deletions(-)

(limited to 'br-nf-bds/linux2.5')

diff --git a/br-nf-bds/linux2.5/include/linux/skbuff.h b/br-nf-bds/linux2.5/include/linux/skbuff.h
index 0a43a95..cefecda 100644
--- a/br-nf-bds/linux2.5/include/linux/skbuff.h
+++ b/br-nf-bds/linux2.5/include/linux/skbuff.h
@@ -775,6 +775,15 @@ static inline int skb_headlen(const struct sk_buff *skb)
 	return skb->len - skb->data_len;
 }
 
+static inline int skb_pagelen(const struct sk_buff *skb)
+{
+	int frag_idx, total = skb_headlen(skb);
+	/* Head length plus every page fragment; frag_list is not counted. */
+	for (frag_idx = 0; frag_idx < (int)skb_shinfo(skb)->nr_frags; frag_idx++)
+		total += skb_shinfo(skb)->frags[frag_idx].size;
+	return total;
+}
+
 #define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) \
 					BUG(); } while (0)
 #define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) \
diff --git a/br-nf-bds/linux2.5/net/core/netfilter.c b/br-nf-bds/linux2.5/net/core/netfilter.c
index 00ea7f9..f5a5af3 100644
--- a/br-nf-bds/linux2.5/net/core/netfilter.c
+++ b/br-nf-bds/linux2.5/net/core/netfilter.c
@@ -580,13 +580,15 @@ int ip_route_me_harder(struct sk_buff **pskb)
 {
 	struct iphdr *iph = (*pskb)->nh.iph;
 	struct rtable *rt;
-	struct rt_key key = { dst:iph->daddr,
-			      src:iph->saddr,
-			      oif:(*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0,
-			      tos:RT_TOS(iph->tos)|RTO_CONN,
+	struct flowi fl = { .nl_u = { .ip4_u =
+				      { .daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos)|RTO_CONN,
 #ifdef CONFIG_IP_ROUTE_FWMARK
-			      fwmark:(*pskb)->nfmark
+					.fwmark = (*pskb)->nfmark
 #endif
+				      } },
+			    .oif = (*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0,
 			    };
 	struct net_device *dev_src = NULL;
 	int err;
@@ -595,10 +597,10 @@ int ip_route_me_harder(struct sk_buff **pskb)
 	   0 or a local address; however some non-standard hacks like
 	   ipt_REJECT.c:send_reset() can cause packets with foreign
            saddr to be appear on the NF_IP_LOCAL_OUT hook -MB */
-	if(key.src && !(dev_src = ip_dev_find(key.src)))
-		key.src = 0;
+	if(fl.fl4_src && !(dev_src = ip_dev_find(fl.fl4_src)))
+		fl.fl4_src = 0;
 
-	if ((err=ip_route_output_key(&rt, &key)) != 0) {
+	if ((err=ip_route_output_key(&rt, &fl)) != 0) {
 		printk("route_me_harder: ip_route_output_key(dst=%u.%u.%u.%u, src=%u.%u.%u.%u, oif=%d, tos=0x%x, fwmark=0x%lx) error %d\n",
 			NIPQUAD(iph->daddr), NIPQUAD(iph->saddr),
 			(*pskb)->sk ? (*pskb)->sk->bound_dev_if : 0,
diff --git a/br-nf-bds/linux2.5/net/ipv4/ip_output.c b/br-nf-bds/linux2.5/net/ipv4/ip_output.c
index c103c48..d41926e 100644
--- a/br-nf-bds/linux2.5/net/ipv4/ip_output.c
+++ b/br-nf-bds/linux2.5/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
  *
  *		The Internet Protocol (IP) output module.
  *
- * Version:	$Id: ip_output.c,v 1.6 2002/10/21 17:28:24 bdschuym Exp $
+ * Version:	$Id: ip_output.c,v 1.7 2002/10/21 17:45:17 bdschuym Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -15,6 +15,7 @@
  *		Stefan Becker, <stefanb@yello.ping.de>
  *		Jorge Cwik, <jorge@laser.satlink.net>
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
  *
  *	See ip_input.c for original log
  *
@@ -38,6 +39,9 @@
  *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
  *					silently drop skb instead of failing with -EPERM.
  *		Detlev Wengorz	:	Copy protocol for fragments.
+ *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *		Hirokazu Takahashi:	sendfile() on UDP works now.
  */
 
 #include <asm/uaccess.h>
@@ -108,16 +112,9 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 	return 0;
 }
 
-/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
-   changes route */
-static inline int
-output_maybe_reroute(struct sk_buff *skb)
-{
-	return skb->dst->output(skb);
-}
-
 /* 
  *		Add an ip header to a skbuff and send it out.
+ *
  */
 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 			  u32 saddr, u32 daddr, struct ip_options *opt)
@@ -153,15 +150,34 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	}
 	ip_send_check(iph);
 
+	skb->priority = sk->priority;
+
 	/* Send it out. */
 	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		       output_maybe_reroute);
+		       dst_output);
 }
 
 static inline int ip_finish_output2(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
 	struct hh_cache *hh = dst->hh;
+	struct net_device *dev = dst->dev;
+
+	/* Be paranoid, rather than too clever. */
+	if (unlikely(skb_headroom(skb) < dev->hard_header_len
+		     && dev->hard_header)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len&~15) + 16);
+		if (skb2 == NULL) {
+			kfree_skb(skb);
+			return -ENOMEM;
+		}
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+		kfree_skb(skb);
+		skb = skb2;
+	}
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	nf_debug_ip_finish_output2(skb);
@@ -203,10 +219,6 @@ int ip_mc_output(struct sk_buff *skb)
 	 *	If the indicated interface is up and running, send the packet.
 	 */
 	IP_INC_STATS(IpOutRequests);
-#ifdef CONFIG_IP_ROUTE_NAT
-	if (rt->rt_flags & RTCF_NAT)
-		ip_do_nat(skb);
-#endif
 
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_IP);
@@ -251,100 +263,21 @@ int ip_mc_output(struct sk_buff *skb)
 				newskb->dev, ip_dev_loopback_xmit);
 	}
 
-	return ip_finish_output(skb);
+	if (skb->len > dev->mtu || skb_shinfo(skb)->frag_list)
+		return ip_fragment(skb, ip_finish_output);
+	else
+		return ip_finish_output(skb);
 }
 
 int ip_output(struct sk_buff *skb)
 {
-#ifdef CONFIG_IP_ROUTE_NAT
-	struct rtable *rt = (struct rtable*)skb->dst;
-#endif
-
 	IP_INC_STATS(IpOutRequests);
 
-#ifdef CONFIG_IP_ROUTE_NAT
-	if (rt->rt_flags&RTCF_NAT)
-		ip_do_nat(skb);
-#endif
-
-	return ip_finish_output(skb);
-}
-
-/* Queues a packet to be sent, and starts the transmitter if necessary.  
- * This routine also needs to put in the total length and compute the 
- * checksum.  We use to do this in two stages, ip_build_header() then
- * this, but that scheme created a mess when routes disappeared etc.
- * So we do it all here, and the TCP send engine has been changed to
- * match. (No more unroutable FIN disasters, etc. wheee...)  This will
- * most likely make other reliable transport layers above IP easier
- * to implement under Linux.
- */
-static inline int ip_queue_xmit2(struct sk_buff *skb)
-{
-	struct sock *sk = skb->sk;
-	struct rtable *rt = (struct rtable *)skb->dst;
-	struct net_device *dev;
-	struct iphdr *iph = skb->nh.iph;
-
-	dev = rt->u.dst.dev;
-
-	/* This can happen when the transport layer has segments queued
-	 * with a cached route, and by the time we get here things are
-	 * re-routed to a device with a different MTU than the original
-	 * device.  Sick, but we must cover it.
-	 */
-	if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
-		kfree_skb(skb);
-		if (skb2 == NULL)
-			return -ENOMEM;
-		if (sk)
-			skb_set_owner_w(skb2, sk);
-		skb = skb2;
-		iph = skb->nh.iph;
-	}
-
-	if (skb->len > rt->u.dst.pmtu) {
-		unsigned int hlen;
-		if (!(sk->route_caps&NETIF_F_TSO))
-			goto fragment;
-
-		/* Hack zone: all this must be done by TCP. */
-		hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
-		skb_shinfo(skb)->tso_size = rt->u.dst.pmtu - hlen;
-		skb_shinfo(skb)->tso_segs =
-			(skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
-				skb_shinfo(skb)->tso_size - 1;
-	}
-
-	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
-
-	/* Add an IP checksum. */
-	ip_send_check(iph);
-
-	skb->priority = sk->priority;
-	return skb->dst->output(skb);
-
-fragment:
-	if (ip_dont_fragment(sk, &rt->u.dst)) {
-		/* Reject packet ONLY if TCP might fragment
-		 * it itself, if were careful enough.
-		 */
-		NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big (len[%u] pmtu[%u]) to self\n",
-				skb->len, rt->u.dst.pmtu));
-
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(rt->u.dst.pmtu));
-		kfree_skb(skb);
-		return -EMSGSIZE;
-	}
-	ip_select_ident(iph, &rt->u.dst, sk);
-	if (skb->ip_summed == CHECKSUM_HW &&
-	    (skb = skb_checksum_help(skb)) == NULL)
-		return -ENOMEM;
-	return ip_fragment(skb, skb->dst->output);
+	if ((skb->len > skb->dst->dev->mtu || skb_shinfo(skb)->frag_list) &&
+	    !skb_shinfo(skb)->tso_size)
+		return ip_fragment(skb, ip_finish_output);
+	else
+		return ip_finish_output(skb);
 }
 
 int ip_queue_xmit(struct sk_buff *skb)
@@ -372,14 +305,20 @@ int ip_queue_xmit(struct sk_buff *skb)
 		if(opt && opt->srr)
 			daddr = opt->faddr;
 
-		/* If this fails, retransmit mechanism of transport layer will
-		 * keep trying until route appears or the connection times itself
-		 * out.
-		 */
-		if (ip_route_output(&rt, daddr, inet->saddr,
-				    RT_CONN_FLAGS(sk),
-				    sk->bound_dev_if))
-			goto no_route;
+		{
+			struct flowi fl = { .nl_u = { .ip4_u =
+						      { .daddr = daddr,
+							.saddr = inet->saddr,
+							.tos = RT_CONN_FLAGS(sk) } },
+					    .oif = sk->bound_dev_if };
+
+			/* If this fails, retransmit mechanism of transport layer will
+			 * keep trying until route appears or the connection times itself
+			 * out.
+			 */
+			if (ip_route_output_key(&rt, &fl))
+				goto no_route;
+		}
 		__sk_dst_set(sk, &rt->u.dst);
 		tcp_v4_setup_caps(sk, &rt->u.dst);
 	}
@@ -409,348 +348,60 @@ packet_routed:
 		ip_options_build(skb, opt, inet->daddr, rt, 0);
 	}
 
-	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		       ip_queue_xmit2);
-
-no_route:
-	IP_INC_STATS(IpOutNoRoutes);
-	kfree_skb(skb);
-	return -EHOSTUNREACH;
-}
-
-/*
- *	Build and send a packet, with as little as one copy
- *
- *	Doesn't care much about ip options... option length can be
- *	different for fragment at 0 and other fragments.
- *
- *	Note that the fragment at the highest offset is sent first,
- *	so the getfrag routine can fill in the TCP/UDP checksum header
- *	field in the last fragment it sends... actually it also helps
- * 	the reassemblers, they can put most packets in at the head of
- *	the fragment queue, and they know the total size in advance. This
- *	last feature will measurably improve the Linux fragment handler one
- *	day.
- *
- *	The callback has five args, an arbitrary pointer (copy of frag),
- *	the source IP address (may depend on the routing table), the 
- *	destination address (char *), the offset to copy from, and the
- *	length to be copied.
- */
-
-static int ip_build_xmit_slow(struct sock *sk,
-		  int getfrag (const void *,
-			       char *,
-			       unsigned int,	
-			       unsigned int),
-		  const void *frag,
-		  unsigned length,
-		  struct ipcm_cookie *ipc,
-		  struct rtable *rt,
-		  int flags)
-{
-	struct inet_opt *inet = inet_sk(sk);
-	unsigned int fraglen, maxfraglen, fragheaderlen;
-	int err;
-	int offset, mf;
-	int mtu;
-	u16 id;
-
-	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
-	int nfrags=0;
-	struct ip_options *opt = ipc->opt;
-	int df = 0;
-
-	mtu = rt->u.dst.pmtu;
-	if (ip_dont_fragment(sk, &rt->u.dst))
-		df = htons(IP_DF);
-
-	length -= sizeof(struct iphdr);
-
-	if (opt) {
-		fragheaderlen = sizeof(struct iphdr) + opt->optlen;
-		maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
-	} else {
-		fragheaderlen = sizeof(struct iphdr);
-
-		/*
-		 *	Fragheaderlen is the size of 'overhead' on each buffer. Now work
-		 *	out the size of the frames to send.
-		 */
-
-		maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
-	}
-
-	if (length + fragheaderlen > 0xFFFF) {
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
-		return -EMSGSIZE;
-	}
-
-	/*
-	 *	Start at the end of the frame by handling the remainder.
-	 */
-
-	offset = length - (length % (maxfraglen - fragheaderlen));
-
-	/*
-	 *	Amount of memory to allocate for final fragment.
-	 */
-
-	fraglen = length - offset + fragheaderlen;
-
-	if (length-offset==0) {
-		fraglen = maxfraglen;
-		offset -= maxfraglen-fragheaderlen;
-	}
-
-	/*
-	 *	The last fragment will not have MF (more fragments) set.
-	 */
-
-	mf = 0;
-
-	/*
-	 *	Don't fragment packets for path mtu discovery.
-	 */
+	if (skb->len > rt->u.dst.pmtu && (sk->route_caps&NETIF_F_TSO)) {
+		unsigned int hlen;
 
-	if (offset > 0 && inet->pmtudisc == IP_PMTUDISC_DO) { 
-		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
- 		return -EMSGSIZE;
+		/* Hack zone: all this must be done by TCP. */
+		hlen = ((skb->h.raw - skb->data) + (skb->h.th->doff << 2));
+		skb_shinfo(skb)->tso_size = rt->u.dst.pmtu - hlen;
+		skb_shinfo(skb)->tso_segs =
+			(skb->len - hlen + skb_shinfo(skb)->tso_size - 1)/
+				skb_shinfo(skb)->tso_size - 1;
 	}
-	if (flags&MSG_PROBE)
-		goto out;
-
-	/*
-	 *	Begin outputting the bytes.
-	 */
-
-	id = inet->id++;
-
-	do {
-		char *data;
-		struct sk_buff * skb;
-
-		/*
-		 *	Get the memory we require with some space left for alignment.
-		 */
-		if (!(flags & MSG_DONTWAIT) || nfrags == 0) {
-			skb = sock_alloc_send_skb(sk, fraglen + hh_len + 15,
-						  (flags & MSG_DONTWAIT), &err);
-		} else {
-			/* On a non-blocking write, we check for send buffer
-			 * usage on the first fragment only.
-			 */
-			skb = sock_wmalloc(sk, fraglen + hh_len + 15, 1,
-					   sk->allocation);
-			if (!skb)
-				err = -ENOBUFS;
-		}
-		if (skb == NULL)
-			goto error;
-
-		/*
-		 *	Fill in the control structures
-		 */
-
-		skb->priority = sk->priority;
-		skb->dst = dst_clone(&rt->u.dst);
-		skb_reserve(skb, hh_len);
-
-		/*
-		 *	Find where to start putting bytes.
-		 */
-
-		data = skb_put(skb, fraglen);
-		skb->nh.iph = (struct iphdr *)data;
-
-		/*
-		 *	Only write IP header onto non-raw packets 
-		 */
-
-		{
-			struct iphdr *iph = (struct iphdr *)data;
-
-			iph->version = 4;
-			iph->ihl = 5;
-			if (opt) {
-				iph->ihl += opt->optlen>>2;
-				ip_options_build(skb, opt,
-						 ipc->addr, rt, offset);
-			}
-			iph->tos = inet->tos;
-			iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
-			iph->frag_off = htons(offset>>3)|mf|df;
-			iph->id = id;
-			if (!mf) {
-				if (offset || !df) {
-					/* Select an unpredictable ident only
-					 * for packets without DF or having
-					 * been fragmented.
-					 */
-					__ip_select_ident(iph, &rt->u.dst, 0);
-					id = iph->id;
-				}
-
-				/*
-				 *	Any further fragments will have MF set.
-				 */
-				mf = htons(IP_MF);
-			}
-			if (rt->rt_type == RTN_MULTICAST)
-				iph->ttl = inet->mc_ttl;
-			else
-				iph->ttl = inet->ttl;
-			iph->protocol = sk->protocol;
-			iph->check = 0;
-			iph->saddr = rt->rt_src;
-			iph->daddr = rt->rt_dst;
-			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-			data += iph->ihl*4;
-		}
-
-		/*
-		 *	User data callback
-		 */
-
-		if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
-			err = -EFAULT;
-			kfree_skb(skb);
-			goto error;
-		}
 
-		offset -= (maxfraglen-fragheaderlen);
-		fraglen = maxfraglen;
+	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
 
-		nfrags++;
+	/* Add an IP checksum. */
+	ip_send_check(iph);
 
-		err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
-			      skb->dst->dev, output_maybe_reroute);
-		if (err) {
-			if (err > 0)
-				err = inet->recverr ? net_xmit_errno(err) : 0;
-			if (err)
-				goto error;
-		}
-	} while (offset >= 0);
+	skb->priority = sk->priority;
 
-	if (nfrags>1)
-		ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
-out:
-	return 0;
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+		       dst_output);
 
-error:
-	IP_INC_STATS(IpOutDiscards);
-	if (nfrags>1)
-		ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
-	return err; 
+no_route:
+	IP_INC_STATS(IpOutNoRoutes);
+	kfree_skb(skb);
+	return -EHOSTUNREACH;
 }
 
-/*
- *	Fast path for unfragmented packets.
- */
-int ip_build_xmit(struct sock *sk, 
-		  int getfrag (const void *,
-			       char *,
-			       unsigned int,	
-			       unsigned int),
-		  const void *frag,
-		  unsigned length,
-		  struct ipcm_cookie *ipc,
-		  struct rtable *rt,
-		  int flags)
-{
-	struct inet_opt *inet = inet_sk(sk);
-	int err;
-	struct sk_buff *skb;
-	int df;
-	struct iphdr *iph;
-
-	/*
-	 *	Try the simple case first. This leaves fragmented frames, and by
-	 *	choice RAW frames within 20 bytes of maximum size(rare) to the long path
-	 */
-
-	if (!inet->hdrincl) {
-		length += sizeof(struct iphdr);
-
-		/*
-		 * 	Check for slow path.
-		 */
-		if (length > rt->u.dst.pmtu || ipc->opt != NULL)  
-			return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); 
-	} else {
-		if (length > rt->u.dst.dev->mtu) {
-			ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
-				       rt->u.dst.dev->mtu);
-			return -EMSGSIZE;
-		}
-	}
-	if (flags&MSG_PROBE)
-		goto out;
 
-	/*
-	 *	Do path mtu discovery if needed.
-	 */
-	df = 0;
-	if (ip_dont_fragment(sk, &rt->u.dst))
-		df = htons(IP_DF);
-
-	/* 
-	 *	Fast path for unfragmented frames without options. 
-	 */ 
-	{
-	int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
-
-	skb = sock_alloc_send_skb(sk, length+hh_len+15,
-				  flags&MSG_DONTWAIT, &err);
-	if(skb==NULL)
-		goto error; 
-	skb_reserve(skb, hh_len);
-	}
-
-	skb->priority = sk->priority;
-	skb->dst = dst_clone(&rt->u.dst);
-
-	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
-
-	if (!inet->hdrincl) {
-		iph->version=4;
-		iph->ihl=5;
-		iph->tos = inet->tos;
-		iph->tot_len = htons(length);
-		iph->frag_off = df;
-		iph->ttl = inet->mc_ttl;
-		ip_select_ident(iph, &rt->u.dst, sk);
-		if (rt->rt_type != RTN_MULTICAST)
-			iph->ttl = inet->ttl;
-		iph->protocol=sk->protocol;
-		iph->saddr=rt->rt_src;
-		iph->daddr=rt->rt_dst;
-		iph->check=0;
-		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-		err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
-	}
-	else
-		err = getfrag(frag, (void *)iph, 0, length);
-
-	if (err)
-		goto error_fault;
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{	/* Copy the per-packet metadata fields of `from` into fragment skb `to`. */
+	to->pkt_type = from->pkt_type;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	to->security = from->security;
+	to->dst = dst_clone(from->dst);	/* presumably takes its own route reference - confirm */
+	to->dev = from->dev;
 
-	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-		      output_maybe_reroute);
-	if (err > 0)
-		err = inet->recverr ? net_xmit_errno(err) : 0;
-	if (err)
-		goto error;
-out:
-	return 0;
+	/* Copy the IP control-block (IPCB) flags to each fragment. */
+	IPCB(to)->flags = IPCB(from)->flags;
 
-error_fault:
-	err = -EFAULT;
-	kfree_skb(skb);
-error:
-	IP_INC_STATS(IpOutDiscards);
-	return err; 
+#ifdef CONFIG_NET_SCHED
+	to->tc_index = from->tc_index;
+#endif
+#ifdef CONFIG_NETFILTER
+	to->nfmark = from->nfmark;
+	/* The fragment keeps the connection association of the pre-frag packet. */
+	to->nfct = from->nfct;
+	nf_conntrack_get(to->nfct);
+	to->nf_bridge = from->nf_bridge;
+	nf_bridge_get(to->nf_bridge);
+#ifdef CONFIG_NETFILTER_DEBUG
+	to->nf_debug = from->nf_debug;
+#endif
+#endif
 }
 
 /*
@@ -758,8 +409,6 @@ error:
  *	smaller pieces (each of size equal to IP header plus
  *	a block of the data of the original IP data part) that will yet fit in a
  *	single device frame, and queue such a frame for sending.
- *
- *	Yes this is inefficient, feel free to submit a quicker one.
  */
 
 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
@@ -783,13 +432,111 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 
 	iph = skb->nh.iph;
 
+	if (unlikely(iph->frag_off & htons(IP_DF))) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(rt->u.dst.pmtu));
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
 	/*
 	 *	Setup starting values.
 	 */
 
 	hlen = iph->ihl * 4;
-	left = skb->len - hlen;		/* Space per frame */
 	mtu = rt->u.dst.pmtu - hlen;	/* Size of data space */
+
+	/* When frag_list is given, use it. First, check its validity:
+	 * some transformers could create wrong frag_list or break existing
+	 * one, it is not prohibited. In this case fall back to copying.
+	 *
+	 * LATER: this step can be merged to real generation of fragments,
+	 * we can switch to copy when see the first bad fragment.
+	 */
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *frag;
+		int first_len = skb_pagelen(skb);
+
+		if (first_len - hlen > mtu ||
+		    ((first_len - hlen) & 7) ||
+		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+		    skb_cloned(skb))
+			goto slow_path;
+
+		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+			/* Correct geometry. */
+			if (frag->len > mtu ||
+			    ((frag->len & 7) && frag->next) ||
+			    skb_headroom(frag) < hlen)
+			    goto slow_path;
+
+			/* Correct socket ownership. */
+			if (frag->sk == NULL)
+				goto slow_path;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag))
+				goto slow_path;
+		}
+
+		/* Everything is OK. Generate! */
+
+		err = 0;
+		offset = 0;
+		frag = skb_shinfo(skb)->frag_list;
+		skb_shinfo(skb)->frag_list = 0;
+		skb->data_len = first_len - skb_headlen(skb);
+		skb->len = first_len;
+		iph->tot_len = htons(first_len);
+		iph->frag_off |= htons(IP_MF);
+		ip_send_check(iph);
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down. */
+			if (frag) {
+				frag->h.raw = frag->data;
+				frag->nh.raw = __skb_push(frag, hlen);
+				memcpy(frag->nh.raw, iph, hlen);
+				iph = frag->nh.iph;
+				iph->tot_len = htons(frag->len);
+				ip_copy_metadata(frag, skb);
+				if (offset == 0)
+					ip_options_fragment(frag);
+				offset += skb->len - hlen;
+				iph->frag_off = htons(offset>>3);
+				if (frag->next != NULL)
+					iph->frag_off |= htons(IP_MF);
+				/* Ready, complete checksum */
+				ip_send_check(iph);
+			}
+
+			err = output(skb);
+
+			if (err || !frag)
+				break;
+
+			skb = frag;
+			frag = skb->next;
+			skb->next = NULL;
+		}
+
+		if (err == 0) {
+			IP_INC_STATS(IpFragOKs);
+			return 0;
+		}
+
+		while (frag) {
+			skb = frag->next;
+			kfree_skb(frag);
+			frag = skb;
+		}
+		IP_INC_STATS(IpFragFails);
+		return err;
+	}
+
+slow_path:
+	left = skb->len - hlen;		/* Space per frame */
 	ptr = raw + hlen;		/* Where to start from */
 
 	/*
@@ -817,7 +564,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 		 *	Allocate buffer.
 		 */
 
-		if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
+		if ((skb2 = alloc_skb(len+hlen+rt->u.dst.dev->hard_header_len+16,GFP_ATOMIC)) == NULL) {
 			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
 			err = -ENOMEM;
 			goto fail;
@@ -827,14 +574,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 		 *	Set up data on packet
 		 */
 
-		skb2->pkt_type = skb->pkt_type;
-		skb2->priority = skb->priority;
-		skb_reserve(skb2, (dev->hard_header_len+15)&~15);
+		ip_copy_metadata(skb2, skb);
+		skb_reserve(skb2, (rt->u.dst.dev->hard_header_len&~15)+16);
 		skb_put(skb2, len + hlen);
 		skb2->nh.raw = skb2->data;
 		skb2->h.raw = skb2->data + hlen;
-		skb2->protocol = skb->protocol;
-		skb2->security = skb->security;
 
 		/*
 		 *	Charge the memory for the fragment to any owner
@@ -843,8 +587,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 
 		if (skb->sk)
 			skb_set_owner_w(skb2, skb->sk);
-		skb2->dst = dst_clone(skb->dst);
-		skb2->dev = skb->dev;
 
 		/*
 		 *	Copy the packet header into the new buffer.
@@ -874,9 +616,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 		if (offset == 0)
 			ip_options_fragment(skb);
 
-		/* Copy the flags to each fragment. */
-		IPCB(skb2)->flags = IPCB(skb)->flags;
-
 		/*
 		 *	Added AC : If we are fragmenting a fragment that's not the
 		 *		   last fragment then keep MF on each bit
@@ -886,21 +625,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 		ptr += len;
 		offset += len;
 
-#ifdef CONFIG_NET_SCHED
-		skb2->tc_index = skb->tc_index;
-#endif
-#ifdef CONFIG_NETFILTER
-		skb2->nfmark = skb->nfmark;
-		/* Connection association is same as pre-frag packet */
-		skb2->nfct = skb->nfct;
-		nf_conntrack_get(skb2->nfct);
-		skb2->nf_bridge = skb->nf_bridge;
-		nf_bridge_get(skb2->nf_bridge);
-#ifdef CONFIG_NETFILTER_DEBUG
-		skb2->nf_debug = skb->nf_debug;
-#endif
-#endif
-
 		/*
 		 *	Put this fragment into the sending queue.
 		 */
@@ -925,40 +649,525 @@ fail:
 	return err;
 }
 
+/* Copy len bytes at offset from the iovec `from` into `to`; unless the skb is
+ * marked CHECKSUM_HW, also fold the copied data's checksum into skb->csum. */
+int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+	struct iovec *vec = from;
+	unsigned int partial = 0;
+
+	if (skb->ip_summed == CHECKSUM_HW)
+		return memcpy_fromiovecend(to, vec, offset, len) < 0 ? -EFAULT : 0;
+
+	if (csum_partial_copy_fromiovecend(to, vec, offset, len, &partial) < 0)
+		return -EFAULT;
+
+	skb->csum = csum_block_add(skb->csum, partial, odd);
+	return 0;
+}
+
+/* Nonzero when (page, off) starts exactly where fragment i-1 of skb ends. */
+static inline int skb_can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
+{
+	skb_frag_t *prev;
+
+	if (i == 0)	/* no earlier fragment to extend */
+		return 0;
+	prev = &skb_shinfo(skb)->frags[i-1];
+	return page == prev->page && off == prev->page_offset+prev->size;
+}
+
+/* Record (page, off, size) in fragment slot i and set nr_frags to i+1. */
+static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
+{
+	skb_frag_t *slot = skb_shinfo(skb)->frags + i;
+	slot->size = size;
+	slot->page_offset = off;
+	slot->page = page;
+	skb_shinfo(skb)->nr_frags = i+1;
+}
+
+/* Checksum `copy` bytes at `offset` within `page`, kmap()ing it meanwhile. */
+static inline unsigned int csum_page(struct page *page, int offset, int copy)
+{
+	char *mapped = kmap(page);
+	unsigned int sum;
+
+	sum = csum_partial(mapped + offset, copy, 0);
+	kunmap(page);
+	return sum;
+}
+
 /*
- *	Fetch data from kernel space and fill in checksum if needed.
+ *	ip_append_data() and ip_append_page() can make one large IP datagram
+ *	from many pieces of data. Each piece will be held on the socket
+ *	until ip_push_pending_frames() is called. Each piece can be a page
+ *	or non-page data.
+ *	
+ *	Not only UDP, other transport protocols - e.g. raw sockets - can use
+ *	this interface potentially.
+ *
+ *	LATER: length must be adjusted by pad at tail, when it is required.
  */
-static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, 
-			      unsigned int fraglen)
+/* Returns 0 on success (and at once for MSG_PROBE), otherwise an error such as
+ * -EMSGSIZE, -EFAULT, -ENOMEM or -ENOBUFS.  Pieces stay queued on
+ * sk->write_queue; a call that finds the queue empty sets up inet->cork. */
+int ip_append_data(struct sock *sk,
+		   int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+		   void *from, int length, int transhdrlen,
+		   struct ipcm_cookie *ipc, struct rtable *rt,
+		   unsigned int flags)
 {
-        struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
-	u16 *pktp = (u16 *)to;
-	struct iovec *iov; 
-	int len; 
-	int hdrflag = 1; 
-
-	iov = &dp->iov[0]; 
-	if (offset >= iov->iov_len) { 
-		offset -= iov->iov_len;
-		iov++; 
-		hdrflag = 0; 
+	struct inet_opt *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	struct ip_options *opt = NULL;
+	int hh_len, exthdrlen, mtu, copy, err;
+	int offset = 0;
+	unsigned int maxfraglen, fragheaderlen;
+	int csummode = CHECKSUM_NONE;
+
+	if (flags&MSG_PROBE)
+		return 0;
+
+	if (skb_queue_empty(&sk->write_queue)) {
+		/* Nothing pending yet: set up the cork state. */
+		opt = ipc->opt;
+		if (opt) {
+			if (inet->cork.opt == NULL) {
+				inet->cork.opt = kmalloc(sizeof(struct ip_options)+40, sk->allocation);
+				/* No route reference or cork state taken yet: just fail. */
+				if (unlikely(inet->cork.opt == NULL))
+					return -ENOBUFS;
+			}
+			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
+			inet->cork.flags |= IPCORK_OPT;
+			inet->cork.addr = ipc->addr;
+		}
+		dst_hold(&rt->u.dst);
+		inet->cork.fragsize = mtu = rt->u.dst.pmtu;
+		inet->cork.rt = rt;
+		inet->cork.length = 0;
+		inet->sndmsg_page = NULL;
+		inet->sndmsg_off = 0;
+		if ((exthdrlen = rt->u.dst.header_len) != 0) {
+			length += exthdrlen;
+			transhdrlen += exthdrlen;
+		}
+	} else {
+		rt = inet->cork.rt;
+		if (inet->cork.flags & IPCORK_OPT)
+			opt = inet->cork.opt;
+
+		transhdrlen = 0;
+		exthdrlen = 0;
+		mtu = inet->cork.fragsize;
 	}
-	len = iov->iov_len - offset;
-	if (fraglen > len) { /* overlapping. */ 
-		dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
-					     dp->csum);
-		offset = 0;
-		fraglen -= len; 
-		to += len; 
-		iov++;
+	hh_len = (rt->u.dst.dev->hard_header_len&~15) + 16;
+
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;
+
+	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+		return -EMSGSIZE;
+	}
+
+	/*
+	 * transhdrlen > 0 means that this is the first fragment and we wish
+	 * it won't be fragmented in the future.
+	 */
+	if (transhdrlen &&
+	    length + fragheaderlen <= maxfraglen &&
+	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+	    !exthdrlen)
+		csummode = CHECKSUM_HW;
+
+	inet->cork.length += length;
+
+	if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)
+		goto alloc_new_skb;
+
+	while (length > 0) {
+		if ((copy = maxfraglen - skb->len) <= 0) {
+			char *data;
+			unsigned int datalen;
+			unsigned int fraglen;
+			unsigned int alloclen;
+			BUG_TRAP(copy == 0);
+
+alloc_new_skb:
+			datalen = maxfraglen - fragheaderlen;
+			if (datalen > length)
+				datalen = length;
+
+			fraglen = datalen + fragheaderlen;
+			if ((flags & MSG_MORE) &&
+			    !(rt->u.dst.dev->features&NETIF_F_SG))
+				alloclen = maxfraglen;
+			else
+				alloclen = datalen + fragheaderlen;
+			if (!(flags & MSG_DONTWAIT) || transhdrlen) {
+				skb = sock_alloc_send_skb(sk,
+						alloclen + hh_len + 15,
+						(flags & MSG_DONTWAIT), &err);
+			} else {
+				skb = sock_wmalloc(sk,
+						alloclen + hh_len + 15, 1,
+						sk->allocation);
+				if (unlikely(skb == NULL))
+					err = -ENOBUFS;
+			}
+			if (skb == NULL)
+				goto error;
+
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = csummode;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+
+			/*
+			 *	Find where to start putting bytes.
+			 */
+			data = skb_put(skb, fraglen);
+			skb->nh.raw = __skb_pull(skb, exthdrlen);
+			data += fragheaderlen;
+			skb->h.raw = data + exthdrlen;
+
+			copy = datalen - transhdrlen;
+			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, 0, skb) < 0) {
+				err = -EFAULT;
+				kfree_skb(skb);
+				goto error;
+			}
+
+			offset += copy;
+			length -= datalen;
+			transhdrlen = 0;
+			exthdrlen = 0;
+			csummode = CHECKSUM_NONE;
+
+			/*
+			 * Put the packet on the pending queue.
+			 */
+			__skb_queue_tail(&sk->write_queue, skb);
+			continue;
+		}
+
+		if (copy > length)
+			copy = length;
+
+		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+			unsigned int off;
+
+			off = skb->len;
+			if (getfrag(from, skb_put(skb, copy),
+					offset, copy, off, skb) < 0) {
+				__skb_trim(skb, off);
+				err = -EFAULT;
+				goto error;
+			}
+		} else {
+			int i = skb_shinfo(skb)->nr_frags;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
+			struct page *page = inet->sndmsg_page;
+			int off = inet->sndmsg_off;
+			unsigned int left;
+
+			if (page && (left = PAGE_SIZE - off) > 0) {
+				if (copy >= left)
+					copy = left;
+				if (page != frag->page) {
+					if (i == MAX_SKB_FRAGS) {
+						err = -EMSGSIZE;
+						goto error;
+					}
+					get_page(page);
+					skb_fill_page_desc(skb, i, page, inet->sndmsg_off, 0);
+					frag = &skb_shinfo(skb)->frags[i];
+				}
+			} else if (i < MAX_SKB_FRAGS) {
+				if (copy > PAGE_SIZE)
+					copy = PAGE_SIZE;
+				page = alloc_pages(sk->allocation, 0);
+				if (page == NULL)  {
+					err = -ENOMEM;
+					goto error;
+				}
+				inet->sndmsg_page = page;
+				inet->sndmsg_off = 0;
+
+				skb_fill_page_desc(skb, i, page, 0, 0);
+				frag = &skb_shinfo(skb)->frags[i];
+				skb->truesize += PAGE_SIZE;
+				atomic_add(PAGE_SIZE, &sk->wmem_alloc);
+			} else {
+				err = -EMSGSIZE;
+				goto error;
+			}
+			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+				err = -EFAULT;
+				goto error;
+			}
+			inet->sndmsg_off += copy;
+			frag->size += copy;
+			skb->len += copy;
+			skb->data_len += copy;
+		}
+		offset += copy;
+		length -= copy;
+	}
+
+	return 0;
+
+error:
+	inet->cork.length -= length;
+	IP_INC_STATS(IpOutDiscards);
+	return err;
+}
+
+ssize_t	ip_append_page(struct sock *sk, struct page *page,
+		       int offset, size_t size, int flags)
+{	/* Append size bytes of page, starting at offset, as page frags to the datagram pending on sk->write_queue; returns 0 or -errno. */
+	struct inet_opt *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	struct rtable *rt;
+	struct ip_options *opt = NULL;
+	int hh_len;
+	int mtu;
+	int len;
+	int err;
+	unsigned int maxfraglen, fragheaderlen;
+
+	if (inet->hdrincl)
+		return -EPERM;
+
+	if (flags&MSG_PROBE)
+		return 0;
+
+	if (skb_queue_empty(&sk->write_queue))	/* only extends an already queued datagram */
+		return -EINVAL;
+
+	rt = inet->cork.rt;	/* route and options are taken from the socket's cork state */
+	if (inet->cork.flags & IPCORK_OPT)
+		opt = inet->cork.opt;
+
+	if (!(rt->u.dst.dev->features&NETIF_F_SG))	/* data is attached as page frags, so the device must advertise NETIF_F_SG */
+		return -EOPNOTSUPP;
+
+	hh_len = (rt->u.dst.dev->hard_header_len&~15)+16;	/* headroom: hard_header_len rounded down to 16, plus 16 */
+	mtu = inet->cork.fragsize;
+
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu-fragheaderlen) & ~7) + fragheaderlen;	/* per-skb limit: payload rounded down to a multiple of 8, plus header */
+
+	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {	/* header plus total payload must not exceed 0xFFFF */
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+		return -EMSGSIZE;
+	}
+
+	if ((skb = skb_peek_tail(&sk->write_queue)) == NULL)	/* start with the last queued skb */
+		return -EINVAL;
+
+	inet->cork.length += size;	/* charged up front; the error path subtracts what was not appended */
+
+	while (size > 0) {
+		int i;
+		if ((len = maxfraglen - skb->len) <= 0) {	/* tail skb has no room left: queue a fresh skb */
+			char *data;
+			struct iphdr *iph;
+			BUG_TRAP(len == 0);	/* skb->len is not expected to exceed maxfraglen */
+
+			skb = sock_wmalloc(sk, fragheaderlen + hh_len + 15, 1,
+					   sk->allocation);
+			if (unlikely(!skb)) {
+				err = -ENOBUFS;
+				goto error;
+			}
+
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+
+			/*
+			 *	Find where to start putting bytes.
+			 */
+			data = skb_put(skb, fragheaderlen);	/* linear area gets room for the IP header (+options) only */
+			skb->nh.iph = iph = (struct iphdr *)data;
+			data += fragheaderlen;
+			skb->h.raw = data;
+
+			/*
+			 * Put the packet on the pending queue.
+			 */
+			__skb_queue_tail(&sk->write_queue, skb);
+			continue;	/* retry the append against the new skb */
+		}
+
+		i = skb_shinfo(skb)->nr_frags;
+		if (len > size)
+			len = size;
+		if (skb_can_coalesce(skb, i, page, offset)) {	/* grow the last frag instead of adding one */
+			skb_shinfo(skb)->frags[i-1].size += len;
+		} else if (i < MAX_SKB_FRAGS) {
+			get_page(page);	/* take a page reference for the new frag */
+			skb_fill_page_desc(skb, i, page, offset, len);
+		} else {
+			err = -EMSGSIZE;	/* no free frag slot left in this skb */
+			goto error;
+		}
+
+		if (skb->ip_summed == CHECKSUM_NONE) {	/* software checksum: add these bytes at position skb->len */
+			unsigned int csum;
+			csum = csum_page(page, offset, len);
+			skb->csum = csum_block_add(skb->csum, csum, skb->len);
+		}
+
+		skb->len += len;
+		skb->data_len += len;
+		offset += len;
+		size -= len;
+	}
+	return 0;
+
+error:
+	inet->cork.length -= size;	/* un-charge the bytes that were not appended */
+	IP_INC_STATS(IpOutDiscards);
+	return err;
+}
+
+/*
+ *	Combine all pending IP fragments on the socket into one IP datagram,
+ *	build its IP header and push it out.
+ */
+int ip_push_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb, *tmp_skb;
+	struct sk_buff **tail_skb;
+	struct inet_opt *inet = inet_sk(sk);
+	struct ip_options *opt = NULL;
+	struct rtable *rt = inet->cork.rt;
+	struct iphdr *iph;
+	int df = 0;
+	__u8 ttl;
+	int err = 0;
+
+	if ((skb = __skb_dequeue(&sk->write_queue)) == NULL)	/* nothing queued: only release cork state */
+		goto out;
+	tail_skb = &(skb_shinfo(skb)->frag_list);	/* remaining skbs are chained onto the first skb's frag_list */
+
+	while ((tmp_skb = __skb_dequeue(&sk->write_queue)) != NULL) {
+		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);	/* pull by the first skb's nh.raw-to-h.raw distance (presumably the IP header room) */
+		*tail_skb = tmp_skb;
+		tail_skb = &(tmp_skb->next);
+		skb->len += tmp_skb->len;	/* the first skb accounts for the whole datagram */
+		skb->data_len += tmp_skb->len;
+#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */
+		skb->truesize += tmp_skb->truesize;
+		__sock_put(tmp_skb->sk);
+		tmp_skb->destructor = NULL;
+		tmp_skb->sk = NULL;
+#endif
+	}
+
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	    (!skb_shinfo(skb)->frag_list && ip_dont_fragment(sk, &rt->u.dst)))
+		df = htons(IP_DF);	/* DF: forced by IP_PMTUDISC_DO, otherwise only for a single-skb datagram */
+
+	if (inet->cork.flags & IPCORK_OPT)
+		opt = inet->cork.opt;
+
+	if (rt->rt_type == RTN_MULTICAST)
+		ttl = inet->mc_ttl;
+	else
+		ttl = inet->ttl;
+
+	iph = (struct iphdr *)skb->data;	/* the IP header is written at the start of the first skb's data */
+	iph->version = 4;
+	iph->ihl = 5;
+	if (opt) {
+		iph->ihl += opt->optlen>>2;	/* optlen is in bytes, ihl in 32-bit words */
+		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
+	}
+	iph->tos = inet->tos;
+	iph->tot_len = htons(skb->len);
+	iph->frag_off = df;
+	if (!df) {
+		__ip_select_ident(iph, &rt->u.dst, 0);
+	} else {
+		iph->id = htons(inet->id++);	/* DF set: take the id from the socket's own counter */
+	}
+	iph->ttl = ttl;
+	iph->protocol = sk->protocol;
+	iph->saddr = rt->rt_src;
+	iph->daddr = rt->rt_dst;
+	ip_send_check(iph);
+
+	skb->priority = sk->priority;
+	skb->dst = dst_clone(&rt->u.dst);
+
+	/* Netfilter gets the whole, not yet fragmented skb. */
+	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
+		      skb->dst->dev, dst_output);
+	if (err) {
+		if (err > 0)
+			err = inet->recverr ? net_xmit_errno(err) : 0;	/* positive codes are reported only if the socket set recverr */
+		if (err)
+			goto error;
+	}
+
+out:
+	inet->cork.flags &= ~IPCORK_OPT;	/* reached on success, on an empty queue and after error */
+	if (inet->cork.rt) {
+		ip_rt_put(inet->cork.rt);
+		inet->cork.rt = NULL;
 	}
+	return err;
 
-	dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, 
-					     dp->csum); 
+error:
+	IP_INC_STATS(IpOutDiscards);
+	goto out;	/* count the discard, then release cork state as usual */
+}
 
-	if (hdrflag && dp->csumoffset)
-		*(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
-	return 0;	       
+/*
+ *	Throw away all pending data on the socket and release the cork state.
+ */
+void ip_flush_pending_frames(struct sock *sk)
+{
+	struct inet_opt *inet = inet_sk(sk);
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue_tail(&sk->write_queue)) != NULL)	/* drain the queue from the tail */
+		kfree_skb(skb);
+
+	inet->cork.flags &= ~IPCORK_OPT;
+	if (inet->cork.opt) {	/* free the corked options */
+		kfree(inet->cork.opt);
+		inet->cork.opt = NULL;
+	}
+	if (inet->cork.rt) {	/* drop the corked route reference */
+		ip_rt_put(inet->cork.rt);
+		inet->cork.rt = NULL;
+	}
+}
+
+
+/*
+ *	Copy len bytes from kernel space (dptr+offset) to `to` and add their checksum to skb->csum.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset, 
+			      int len, int odd, struct sk_buff *skb)
+{
+	unsigned int csum;
+
+	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);	/* copy and checksum in one pass */
+	skb->csum = csum_block_add(skb->csum, csum, odd);	/* odd is passed as the block's position argument */
+	return 0;  
 }
 
 /* 
@@ -967,6 +1176,8 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
  *
  *	Should run single threaded per socket because it uses the sock 
  *     	structure to pass arguments.
+ *
+ *	LATER: switch from ip_build_xmit to ip_append_*
  */
 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
 		   unsigned int len)
@@ -993,8 +1204,14 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 			daddr = replyopts.opt.faddr;
 	}
 
-	if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
-		return;
+	{
+		struct flowi fl = { .nl_u = { .ip4_u =
+					      { .daddr = daddr,
+						.saddr = rt->rt_spec_dst,
+						.tos = RT_TOS(skb->nh.iph->tos) } } };
+		if (ip_route_output_key(&rt, &fl))
+			return;
+	}
 
 	/* And let IP do all the hard work.
 
@@ -1006,7 +1223,15 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 	inet->tos = skb->nh.iph->tos;
 	sk->priority = skb->priority;
 	sk->protocol = skb->nh.iph->protocol;
-	ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
+	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+		       &ipc, rt, MSG_DONTWAIT);
+	if ((skb = skb_peek(&sk->write_queue)) != NULL) {
+		if (arg->csumoffset >= 0)
+			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
+		skb->ip_summed = CHECKSUM_NONE;
+		ip_push_pending_frames(sk);
+	}
+
 	bh_unlock_sock(sk);
 
 	ip_rt_put(rt);
-- 
cgit v1.2.3