1、Interrupt handler
1.1、Register an interrupt handler
在函数ixgbe_open() -> ixgbe_request_irq()中会尝试注册interrupt handler:
/**
* ixgbe_request_irq - initialize interrupts
* @adapter: board private structure
*
* Attempts to configure interrupts using the best available
* capabilities of the hardware and kernel.
*
* Return: 0 on success, negative errno from request_irq() on failure.
**/
static int ixgbe_request_irq(struct ixgbe_adapter *adapter)
{
struct net_device *netdev = adapter->netdev;
int err;
if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED)
/* MSI-X mode: one handler per queue vector */
err = ixgbe_request_msix_irqs(adapter);
else if (adapter->flags & IXGBE_FLAG_MSI_ENABLED)
/* MSI mode: single exclusive interrupt line */
err = request_irq(adapter->pdev->irq, ixgbe_intr, 0,
netdev->name, adapter);
else
/* Legacy INTx mode: the line may be shared with other devices */
err = request_irq(adapter->pdev->irq, ixgbe_intr, IRQF_SHARED,
netdev->name, adapter);
if (err)
e_err(probe, "request_irq failed, Error %d\n", err);
return err;
}
目前可以注册三种类型的interrupt:
- MSI-X interrupt mode;
- MSI interrupt mode;
- legacy interrupt mode;
MSI-X interrupt handler最终会执行函数ixgbe_msix_clean_rings。
1.2、Interrupt handler
当网卡接收到frame时,通过DMA将frame搬运到对应的rx ring,然后产生一个硬中断,调用函数:
/* MSI-X interrupt handler: runs in hard-IRQ context when a queue
* vector fires; defers the actual ring cleanup to NAPI.
*/
static irqreturn_t ixgbe_msix_clean_rings(int irq, void *data)
{
struct ixgbe_q_vector *q_vector = data;
/* EIAM disabled interrupts (on this vector) for us */
if (q_vector->rx.ring || q_vector->tx.ring)
napi_schedule(&q_vector->napi);
return IRQ_HANDLED;
}
之后再经过一系列函数调用napi_schedule() -> __napi_schedule() -> ____napi_schedule():
/**
* napi_schedule - schedule NAPI poll
* @n: napi context
*
* Schedule NAPI poll routine to be called if it is not already
* running.
*/
static inline void napi_schedule(struct napi_struct *n)
{
/* napi_schedule_prep() guards against double-scheduling;
* only the caller that wins proceeds to __napi_schedule().
*/
if (napi_schedule_prep(n))
__napi_schedule(n);
}
/**
* __napi_schedule - schedule for receive
* @n: entry to schedule
*
* The entry's receive function will be scheduled to run
*/
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
/* Disable local interrupts: ____napi_schedule() manipulates the
* per-CPU softnet_data, which interrupt handlers also touch.
*/
local_irq_save(flags);
____napi_schedule(&__get_cpu_var(softnet_data), n);
local_irq_restore(flags);
}
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
/* Queue this NAPI instance on the current CPU's poll list ... */
list_add_tail(&napi->poll_list, &sd->poll_list);
/* ... and raise NET_RX_SOFTIRQ so net_rx_action() will run */
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
函数____napi_schedule完成两件事:
- napi_struct{}结构挂载到与当前CPU相关的softnet_data{}->poll_list链表中;
- 触发NET_RX_SOFTIRQ softirq。
2、Softirq handler
2.1、net_rx_action
NET_RX_SOFTIRQ对应的软中断处理函数是net_rx_action:
/* NET_RX_SOFTIRQ handler: drains this CPU's softnet_data poll_list by
* invoking each pending NAPI instance's ->poll(), bounded both by a
* global packet budget (netdev_budget) and a 2-jiffy time window.
*/
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) {
struct napi_struct *n;
int work, weight;
/* If softirq window is exhausted then punt.
* Allow this to run for 2 jiffies since which will allow
* an average latency of 1.5/HZ.
*/
if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
goto softnet_break;
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
trace_napi_poll(n);
}
WARN_ON_ONCE(work > weight);
budget -= work;
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n))) {
local_irq_enable();
napi_complete(n);
local_irq_disable();
} else {
if (n->gro_list) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
local_irq_enable();
napi_gro_flush(n, HZ >= 1000);
local_irq_disable();
}
/* Weight fully consumed: requeue at the tail for fairness */
list_move_tail(&n->poll_list, &sd->poll_list);
}
}
netpoll_poll_unlock(have);
}
out:
net_rps_action_and_irq_enable(sd);
return;
softnet_break:
/* Out of budget or time: account the squeeze and re-raise the
* softirq so remaining work is picked up on the next round.
*/
sd->time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}
while循环退出的条件有:
- 当前CPU所属的poll list中不再挂载任何napi_struct{}结构;
- 剩下的budget变为小于等于0了;
- 2 jiffies的时限已经到了。
2.2、ixgbe_poll
加载ixgbe驱动的时候,按照ixgbe_probe() -> ixgbe_init_interrupt_scheme() -> ixgbe_alloc_q_vectors() -> ixgbe_alloc_q_vector()的调用关系,在函数ixgbe_alloc_q_vector()中注册函数指针napi_struct{}->poll:
/* initialize NAPI */
netif_napi_add(adapter->netdev, &q_vector->napi,
ixgbe_poll, 64);
这里可以看到,napi_struct{}->weight被固定设置为64。
/**
* ixgbe_poll - NAPI Rx polling callback
* @napi: structure for representing this polling device
* @budget: how many packets driver is allowed to clean
*
* This function is used for legacy and MSI, NAPI mode
*
* Return: @budget while work remains (keep polling), 0 when done.
**/
int ixgbe_poll(struct napi_struct *napi, int budget)
{
...
/* Clean each Rx ring owned by this q_vector; clean_complete stays
* true only if every ring finished within its share of the budget.
*/
ixgbe_for_each_ring(ring, q_vector->rx)
clean_complete &= ixgbe_clean_rx_irq(q_vector, ring,
per_ring_budget);
/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;
/* all work done, exit the polling mode */
napi_complete(napi);
if (adapter->rx_itr_setting & 1)
ixgbe_set_itr(q_vector);
/* re-enable this vector's hardware interrupts */
if (!test_bit(__IXGBE_DOWN, &adapter->state))
ixgbe_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));
return 0;
}
2.3、ixgbe_clean_rx_irq
/**
* ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
* @q_vector: structure containing interrupt and ring information
* @rx_ring: rx descriptor ring to transact packets on
* @budget: Total limit on number of packets to process
*
* This function provides a "bounce buffer" approach to Rx interrupt
* processing. The advantage to this is that on systems that have
* expensive overhead for IOMMU access this provides a means of avoiding
* it by maintaining the mapping of the page to the system.
*
* Returns true if all work is completed without reaching budget
**/
static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
const int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
do {
union ixgbe_adv_rx_desc *rx_desc;
struct sk_buff *skb;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
rx_desc = IXGBE_RX_DESC(rx_ring, rx_ring->next_to_clean);
/* DD (descriptor done) not set means hardware has not filled
* this descriptor yet: nothing more to clean.
*/
if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_DD))
break;
/*
* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* RXD_STAT_DD bit is set
*/
rmb();
/* retrieve a buffer from the ring */
skb = ixgbe_fetch_rx_buffer(rx_ring, rx_desc);
/* exit if we failed to retrieve a buffer */
if (!skb)
break;
cleaned_count++;
/* place incomplete frames back on ring for completion */
if (ixgbe_is_non_eop(rx_ring, rx_desc, skb))
continue;
/* verify the packet layout is correct */
if (ixgbe_cleanup_headers(rx_ring, rx_desc, skb))
continue;
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
/* populate checksum, timestamp, VLAN, and protocol */
ixgbe_process_skb_fields(rx_ring, rx_desc, skb);
/* hand the skb up the stack (normally via napi_gro_receive) */
ixgbe_rx_skb(q_vector, skb);
/* update budget accounting */
total_rx_packets++;
} while (likely(total_rx_packets < budget));
u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.packets += total_rx_packets;
rx_ring->stats.bytes += total_rx_bytes;
u64_stats_update_end(&rx_ring->syncp);
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;
/* give any remaining cleaned buffers back to hardware */
if (cleaned_count)
ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
return (total_rx_packets < budget);
}
函数ixgbe_clean_rx_irq中的循环,一次处理一个接收报文,直至budget用完或者没有接收报文需要处理,循环所完成的工作有:
- 调用函数ixgbe_alloc_rx_buffers分配额外的接收缓冲区,接收IXGBE_RX_BUFFER_WRITE(16)个报文才会执行一次(首次除外),每次分配的数量为IXGBE_RX_BUFFER_WRITE(16);
- 获取rx descriptor,进而获取rx buffer,赋值给skb;
- 检查当前是否是个"End of Packet" buffer:如果是,就继续往下处理;如果不是,把它重新放回rx ring的末尾,等待报文的剩余部分到达;
- 调用函数ixgbe_cleanup_headers检查layout;
- 调用函数ixgbe_process_skb_fields,设置hash、checksum、timestamp、vlan、protocol等信息;
- 如果硬件校验checksum成功,并且接收报文是TCP或者UDP,sk_buff{}->ip_summed会被设置为CHECKSUM_UNNECESSARY;
- 调用函数eth_type_trans会获取L2 header中的protocol信息,然后记录到sk_buff{}->protocol中。
- 调用函数ixgbe_rx_skb,将skb上送给协议栈继续处理。
- 一般情况下,函数ixgbe_rx_skb会直接调用函数napi_gro_receive。
2.4、napi_gro_receive
/* GRO entry point for drivers: attempt to merge the skb into an
* existing GRO flow, then finish (pass it up the stack or free it).
*/
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
skb_gro_reset_offset(skb);
return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
函数napi_gro_receive主要执行了两个函数:
- dev_gro_receive:处理GRO;
Generic Receive Offloading (GRO) is a software implementation of a hardware optimization that is known as Large Receive Offloading (LRO).
- napi_skb_finish:调用函数netif_receive_skb继续处理,或者释放报文skb。
3、netif_receive_skb
如果不考虑RPS,函数netif_receive_skb只是简单地调用了函数__netif_receive_skb_core。
函数netif_receive_skb和之后调用的其它函数仍然是在softirq上下文中被处理的。
3.1、Packet tap delivery
这部分代码涉及捕包的实现:
/* Packet-tap delivery path of __netif_receive_skb_core (excerpt):
* every handler on the ptype_all list (e.g. a PF_PACKET/ETH_P_ALL
* socket opened by tcpdump) gets the skb. Delivery is deferred via
* pt_prev so the last matching handler can take the skb directly.
*/
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
...
list_for_each_entry_rcu(ptype, &ptype_all, list) {
/* a NULL ptype->dev means "match any device" */
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
...
}
如果执行tcpdump,可以观察到如下的系统调用:
# strace tcpdump -i eth3
...
socket(PF_PACKET, SOCK_RAW, 768) = 3
ioctl(3, SIOCGIFINDEX, {ifr_name="lo", ifr_index=1}) = 0
ioctl(3, SIOCGIFHWADDR, {ifr_name="eth3", ifr_hwaddr=0c:42:a1:10:65:cd}) = 0
ioctl(3, SIOCGIFINDEX, {ifr_name="eth3", ifr_index=11}) = 0
bind(3, {sa_family=AF_PACKET, proto=0x03, if11, pkttype=PACKET_HOST, addr(0)={0, }}, 20) = 0
...
这里可以看到,创建socket时,family指定为PF_PACKET,type指定为SOCK_RAW,protocol指定为768(实际就是htons(ETH_P_ALL))。
追踪socket()系统调用过程,针对family指定为PF_PACKET时,会调用函数packet_create来完成特定的初始化工作:
/*
* Create a packet of type SOCK_PACKET.
*/
static int packet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
...
po = pkt_sk(sk);
sk->sk_family = PF_PACKET;
...
/* packet_rcv() is what deliver_skb() will eventually invoke */
po->prot_hook.func = packet_rcv;
...
if (proto) {
/* a non-zero protocol (e.g. htons(ETH_P_ALL)) hooks this
* socket into the receive path immediately
*/
po->prot_hook.type = proto;
register_prot_hook(sk);
}
...
}
继续追踪函数调用:
/* Link the socket's packet_type hook into the kernel's receive path,
* either via a fanout group or directly via dev_add_pack().
*/
static void register_prot_hook(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
if (!po->running) {
if (po->fanout)
__fanout_link(sk, po);
else
dev_add_pack(&po->prot_hook);
/* hold a socket reference while the hook is live */
sock_hold(sk);
po->running = 1;
}
}
/* Add a protocol handler to the networking stack's receive lists:
* ETH_P_ALL handlers land on ptype_all, everything else goes into a
* ptype_base[] hash bucket (see ptype_head()).
*/
void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
}
/* Pick the list a packet_type belongs on: the catch-all ptype_all
* list for ETH_P_ALL, otherwise a hash bucket keyed by protocol.
*/
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if (pt->type == htons(ETH_P_ALL))
return &ptype_all;
else
return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
最终会在以ptype_all为头部的链表中添加一个packet_type{}结构。
回到函数__netif_receive_skb_core的执行路径上,函数deliver_skb会执行之前注册的回调函数packet_type{}->func:
/* Invoke one registered packet_type handler on the skb, taking an
* extra reference so the caller can keep delivering it to others.
*/
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
return -ENOMEM;
atomic_inc(&skb->users);
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
对于捕包的场景,最终会执行函数packet_rcv。
3.2、Protocol layer delivery
这部分代码涉及一般报文的处理,由接收报文的协议(例如IPv4 / ARP)来决定后续的处理路径:
/* Protocol-layer delivery path of __netif_receive_skb_core (excerpt):
* walk the ptype_base[] hash bucket matching skb->protocol and deliver
* the skb to each matching packet_type handler (e.g. ip_rcv for IPv4).
*/
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
...
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
/* deliver to the previous match; remember this one */
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
/* the last handler consumes the skb without an extra refcount */
if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
goto drop;
else
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
drop:
/* no registered handler for this protocol: count and drop */
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
...
}
和捕包类似,在函数inet_init的末尾会注册IPv4报文的处理路径:
/* inet_init() (excerpt): register the IPv4 receive handler so that
* __netif_receive_skb_core() delivers ETH_P_IP frames to ip_rcv().
*/
static int __init inet_init(void)
{
...
dev_add_pack(&ip_packet_type);
...
}
/* packet_type entry linking ETH_P_IP frames to ip_rcv() */
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
};
对于IPv4报文,之后内核会执行函数ip_rcv继续处理。
4、IP Layer
/*
* Main IP Receive routine.
*
* Runs the NF_INET_PRE_ROUTING netfilter hook; if the packet is not
* dropped there, processing continues in ip_rcv_finish().
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
...
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
...
}
这里会涉及netfilter的第一个hook点 — NF_INET_PRE_ROUTING,如果netfilter未丢弃报文,会继续执行函数ip_rcv_finish。
/* After PRE_ROUTING: dispatch through the skb's dst_entry->input
* (set by the elided route lookup; ip_local_deliver for packets
* addressed to this host).
*/
static int ip_rcv_finish(struct sk_buff *skb)
{
...
return dst_input(skb);
...
}
最后会执行dst_entry{}->input所指的函数,对这个函数指针的赋值来自于之前调用函数ip_route_input_noref,执行路由查找所完成的。对于目标地址是本机的,实际会设置为函数ip_local_deliver。
/*
* Deliver IP Packets to the higher protocol layers.
*/
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_is_fragment(ip_hdr(skb))) {
/* datagram still incomplete: ip_defrag() keeps it queued, stop here */
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
/* second netfilter hook on this path: NF_INET_LOCAL_IN */
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
这里会涉及netfilter的另一个hook点 — NF_INET_LOCAL_IN,如果netfilter未丢弃报文,会继续执行函数ip_local_deliver_finish。
/* Hand the skb to the L4 protocol whose handler was registered in
* inet_protos[] (e.g. tcp_v4_rcv, udp_rcv, icmp_rcv).
*/
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
/* strip the IP header so data points at the L4 payload */
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;
...
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot != NULL) {
int ret;
...
ret = ipprot->handler(skb);
if (ret < 0) {
/* a negative return encodes a new protocol to retry with */
protocol = -ret;
goto resubmit;
}
...
}
}
out:
rcu_read_unlock();
return 0;
}
最终会根据IP Header中所指定的L4 Layer协议,来执行net_protocol{}->handler所指的函数。
在函数inet_init中同样会注册几种L4 Layer协议报文的处理路径:
/* inet_init() (excerpt): register the L4 handlers that
* ip_local_deliver_finish() dispatches to via inet_protos[].
*/
static int __init inet_init(void)
{
...
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
...
}
对于UDP报文,之后内核会执行函数udp_rcv继续处理;而对于TCP报文,之后内核会执行函数tcp_v4_rcv继续处理。
5、Links