Friday, July 18, 2008

Tx Path

ret_fast_syscall
sys_sendfile
do_sendfile
generic_file_sendfile
do_generic_mapping_read
file_send_actor
sock_sendpage
tcp_sendpage
tcp_push_one
tcp_transmit_skb
ip_queue_xmit
ip_output

dev_queue_xmit        (on the bridge device)
br_dev_xmit
br_deliver
dev_queue_xmit        (on the bridge port's physical device)


It seems that the control flow (commands, login, etc.) goes through tcp_sendmsg,
while the data flow goes through tcp_sendpage.



tcp_sendmsg -------> dev_queue_xmit
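
To see both entry points from user space, here is a minimal sketch (the function name, the "DATA\r\n" header, and the assumption of an already-connected TCP socket are all illustrative, not taken from the trace above): a small write() goes through tcp_sendmsg, while sendfile() follows the sys_sendfile → ... → tcp_sendpage chain traced above.

/* Minimal user-space sketch, for illustration only: the "DATA\r\n" header,
 * the function name, and the already-connected TCP socket are assumptions.
 * write() on the socket enters the kernel via tcp_sendmsg ("control"),
 * while sendfile() follows sys_sendfile -> ... -> tcp_sendpage ("data"),
 * i.e. the path traced above.
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sendfile.h>
#include <fcntl.h>
#include <unistd.h>

static int send_file_with_header(int sock, const char *path)
{
    struct stat st;
    off_t off = 0;
    int fd = open(path, O_RDONLY);

    if (fd < 0)
        return -1;
    if (fstat(fd, &st) < 0)
        goto fail;

    /* control: a short message, sent through the tcp_sendmsg path */
    if (write(sock, "DATA\r\n", 6) != 6)
        goto fail;

    /* data: the file payload, sent page-by-page through tcp_sendpage */
    while (off < st.st_size)
        if (sendfile(sock, fd, &off, st.st_size - off) < 0)
            goto fail;

    close(fd);
    return 0;

fail:
    close(fd);
    return -1;
}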




There are two places that invoke qdisc_run: dev_queue_xmit and net_tx_action.
Whenever dev_queue_xmit is called, it calls qdisc_run to transmit the packet. If the transmission fails, the packet stays on the queue and is transmitted later by net_tx_action (the NET_TX_SOFTIRQ handler).

dev->qdisc->enqueue == pfifo_fast_enqueue (pfifo_fast is the default qdisc for an ordinary device)

The bridge device has no enqueue defined (its qdisc is noqueue, whose ->enqueue is NULL), so dev_queue_xmit takes the direct hard_start_xmit path; there is no queue to fall back on, so the transmission has to succeed on the spot.
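
Why the bridge ends up without an enqueue: a sketch of the qdisc-assignment logic in dev_activate() (net/sched/sch_generic.c), paraphrased from memory rather than quoted verbatim; dev_activate_sketch is just an illustrative name. The bridge device is set up with tx_queue_len == 0, so it gets the built-in noqueue_qdisc, whose ->enqueue is NULL, while an ordinary NIC gets pfifo_fast by default.

/* Sketch of dev_activate()'s qdisc assignment (net/sched/sch_generic.c,
 * 2.6.16), paraphrased from memory -- not a verbatim quote.  A device that
 * keeps tx_queue_len == 0 (the bridge does this in its setup routine) gets
 * noqueue_qdisc, whose ->enqueue is NULL, so dev_queue_xmit() skips the
 * enqueue/qdisc_run path and calls hard_start_xmit() directly.
 */
void dev_activate_sketch(struct net_device *dev)
{
    if (dev->qdisc_sleeping == &noop_qdisc) {
        struct Qdisc *qdisc;

        if (dev->tx_queue_len)
            qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops);  /* default NIC case */
        else
            qdisc = &noqueue_qdisc;                           /* ->enqueue == NULL */

        dev->qdisc_sleeping = qdisc;
    }
    /* ... the real function then publishes dev->qdisc and starts the watchdog */
}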





linux-2.6.16/net/core/dev.c
/**
 * dev_queue_xmit - transmit a buffer
 * @skb: buffer to transmit
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * A negative errno code is returned on a failure. A success does not
 * guarantee the frame will be transmitted as it may be dropped due
 * to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 * I notice this method can also return errors from the queue disciplines,
 * including NET_XMIT_DROP, which is a positive value. So, errors can also
 * be positive.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *     --BLG
 */

int dev_queue_xmit(struct sk_buff *skb)
{
    struct net_device *dev = skb->dev;
    struct Qdisc *q;
    int rc = -ENOMEM;

    (...............................)

    /* Disable soft irqs for various locks below. Also
     * stops preemption for RCU.
     */
    local_bh_disable();

    /* Updates of qdisc are serialized by queue_lock.
     * The struct Qdisc which is pointed to by qdisc is now a
     * rcu structure - it may be accessed without acquiring
     * a lock (but the structure may be stale.) The freeing of the
     * qdisc will be deferred until it's known that there are no
     * more references to it.
     *
     * If the qdisc has an enqueue function, we still need to
     * hold the queue_lock before calling it, since queue_lock
     * also serializes access to the device queue.
     */

    q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
    skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
    if (q->enqueue) {
        /* Grab device queue */
        spin_lock(&dev->queue_lock);

        rc = q->enqueue(skb, q);
        qdisc_run(dev);

        spin_unlock(&dev->queue_lock);
        rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
        goto out;
    }

    /* The device has no queue. Common case for software devices:
       loopback, all the sorts of tunnels...

       Really, it is unlikely that xmit_lock protection is necessary here.
       (f.e. loopback and IP tunnels are clean ignoring statistics
       counters.)
       However, it is possible, that they rely on protection
       made by us here.

       Check this and shot the lock. It is not prone from deadlocks.
       Either shot noqueue qdisc, it is even simpler 8)
     */
    if (dev->flags & IFF_UP) {
        int cpu = smp_processor_id(); /* ok because BHs are off */

        if (dev->xmit_lock_owner != cpu) {

            HARD_TX_LOCK(dev, cpu);

            if (!netif_queue_stopped(dev)) {
                if (netdev_nit)
                    dev_queue_xmit_nit(skb, dev);

                rc = 0;
                if (!dev->hard_start_xmit(skb, dev)) {
                    HARD_TX_UNLOCK(dev);
                    goto out;
                }
            }
            HARD_TX_UNLOCK(dev);
            if (net_ratelimit())
                printk(KERN_CRIT "Virtual device %s asks to "
                       "queue packet!\n", dev->name);
        } else {
            /* Recursion is detected! It is possible,
             * unfortunately */
            if (net_ratelimit())
                printk(KERN_CRIT "Dead loop on virtual device "
                       "%s, fix it urgently!\n", dev->name);
        }
    }

    rc = -ENETDOWN;
    local_bh_enable();

out_kfree_skb:
    kfree_skb(skb);
    return rc;
out:
    local_bh_enable();
    return rc;
}
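
Following the --BLG comment above, a hypothetical caller sketch (the function name and the errno mapping are assumptions for illustration, not kernel code): rc may be a negative errno from the direct-xmit path or a positive NET_XMIT_* verdict from the qdisc, and the skb is consumed either way.

/* Hypothetical caller, for illustration only -- not from the kernel source.
 * Shows how to read dev_queue_xmit()'s return value per the comment above.
 * The skb is consumed in every case, so a caller that wants to retry must
 * take its own reference (e.g. skb_get()) before calling.
 */
static int xmit_and_classify(struct sk_buff *skb)
{
    int rc = dev_queue_xmit(skb);   /* skb is gone after this call */

    if (rc < 0)                     /* negative errno, e.g. -ENETDOWN */
        return rc;
    if (rc != NET_XMIT_SUCCESS)     /* positive qdisc verdict, e.g. NET_XMIT_DROP */
        return -ENOBUFS;            /* assumed mapping, for illustration */
    return 0;                       /* accepted; may still be dropped later */
}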

(.........................)


static void net_tx_action(struct softirq_action *h)
{

    (.........................)

    if (sd->output_queue) {
        struct net_device *head;

        local_irq_disable();
        head = sd->output_queue;
        sd->output_queue = NULL;
        local_irq_enable();

        while (head) {
            struct net_device *dev = head;
            head = head->next_sched;

            smp_mb__before_clear_bit();
            clear_bit(__LINK_STATE_SCHED, &dev->state);

            if (spin_trylock(&dev->queue_lock)) {
                qdisc_run(dev);
                spin_unlock(&dev->queue_lock);
            } else {
                netif_schedule(dev);
            }
        }
    }
}
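
How a device gets onto sd->output_queue in the first place: a sketch of netif_schedule()/__netif_schedule() (net/core/dev.c), paraphrased from memory rather than quoted verbatim. It chains the device onto the per-CPU softnet_data output_queue and raises NET_TX_SOFTIRQ, whose handler is net_tx_action above.

/* Sketch of netif_schedule()/__netif_schedule() (net/core/dev.c, 2.6.16),
 * paraphrased from memory -- not a verbatim quote.  When a packet cannot be
 * sent right now, the device is chained onto the per-CPU output_queue and
 * NET_TX_SOFTIRQ is raised; net_tx_action() above then calls qdisc_run()
 * for it later.
 */
void __netif_schedule_sketch(struct net_device *dev)
{
    if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = &__get_cpu_var(softnet_data);
        dev->next_sched = sd->output_queue;
        sd->output_queue = dev;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
    }
}
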
linux-2.6.16/include/net/pkt_sched.h
static inline void qdisc_run(struct net_device *dev)
{
    while (!netif_queue_stopped(dev) && qdisc_restart(dev) < 0)
        /* NOTHING */;
}


linux-2.6.16/net/sched/sch_generic.c
/*
   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.

   dev->xmit_lock serializes accesses to device driver.

   dev->queue_lock and dev->xmit_lock are mutually exclusive,
   if one is grabbed, another must be free.
 */


/* Kick device.
   Note, that this procedure can be called by a watchdog timer, so that
   we do not check dev->tbusy flag here.

   Returns:  0  - queue is empty.
            >0  - queue is not empty, but throttled.
            <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

int qdisc_restart(struct net_device *dev)
{
    struct Qdisc *q = dev->qdisc;
    struct sk_buff *skb;

    (..................................)

    if (!netif_queue_stopped(dev)) {
        int ret;
        if (netdev_nit)
            dev_queue_xmit_nit(skb, dev);

        ret = dev->hard_start_xmit(skb, dev);
        if (ret == NETDEV_TX_OK) {
            if (!nolock) {
                dev->xmit_lock_owner = -1;
                spin_unlock(&dev->xmit_lock);
            }
            spin_lock(&dev->queue_lock);
            return -1;
        }
        if (ret == NETDEV_TX_LOCKED && nolock) {
            spin_lock(&dev->queue_lock);
            goto collision;
        }
    }

    (..................................)

}

linux-2.6.16/net/sched/sch_generic.c

(.........................)

static struct Qdisc_ops pfifo_fast_ops = {
    .id        = "pfifo_fast",
    .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
    .enqueue   = pfifo_fast_enqueue,
    .dequeue   = pfifo_fast_dequeue,
    .requeue   = pfifo_fast_requeue,
    .init      = pfifo_fast_init,
    .reset     = pfifo_fast_reset,
    .dump      = pfifo_fast_dump,
    .owner     = THIS_MODULE,
};

(.........................)

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
    struct sk_buff_head *list = prio2list(skb, qdisc);

    if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
        qdisc->q.qlen++;
        return __qdisc_enqueue_tail(skb, qdisc, list);
    }

    return qdisc_drop(skb, qdisc);
}
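
For completeness, the dequeue side: a simplified sketch of pfifo_fast's dequeue policy (not a verbatim copy of the 2.6.16 source). Packets are pulled from the PFIFO_FAST_BANDS band lists in strict priority order, band 0 first.

/* Simplified sketch of pfifo_fast's dequeue policy -- not a verbatim copy of
 * the 2.6.16 source.  Enqueue maps skb->priority to one of the
 * PFIFO_FAST_BANDS (3) band lists; dequeue drains the bands in strict
 * priority order, band 0 first.
 */
static struct sk_buff *pfifo_fast_dequeue_sketch(struct Qdisc *qdisc)
{
    struct sk_buff_head *list = qdisc_priv(qdisc);
    int band;

    for (band = 0; band < PFIFO_FAST_BANDS; band++) {
        struct sk_buff *skb = __skb_dequeue(list + band);

        if (skb) {
            qdisc->q.qlen--;
            return skb;
        }
    }
    return NULL;
}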



