PF_PACKET抓包mmap


PACKET套接口創建

內核函數packet_create處理PF_PACKET套接口的創建工作。其參數sock->type決定了采用哪一種工作模式,如果參數type為SOCK_PACKET即第一種模式,type為SOCK_DGRAM或者SOCK_RAW即為第二種模式。

兩種模式內核會賦予不同的操作函數集合和數據包接收函數,例如后者使用packet_ops函數集,而前者使用packet_ops_spkt函數集。

接收函數一個為packet_rcv,一個為packet_rcv_spkt函數。

/*
 * Excerpt from packet_create(): attach a protocol block.
 * packet_rcv is installed as the receive hook by default; sockets of
 * type SOCK_PACKET get packet_rcv_spkt instead.
 */
    spin_lock_init(&po->bind_lock);
    mutex_init(&po->pg_vec_lock);
    po->prot_hook.func = packet_rcv;
    if (sock->type == SOCK_PACKET)
        po->prot_hook.func = packet_rcv_spkt;
    /* The receive hooks read the socket back through pt->af_packet_priv. */
    po->prot_hook.af_packet_priv = sk;

socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

類型為SOCK_DGRAM/SOCK_RAW的PF_PACKET套接口,除了普通的在內核與用戶層間拷貝數據包的方式外,還可通過setsockopt系統調用設置環形接收buffer,

通過mmap與應用層共享這部分內存。這樣就可省去拷貝操作。但是數據包的套接口地址信息就不

通過recvfrom/recvmsg調用送到用戶層,內核需將這部分信息和數據包拼接在一起,另外,數據包的一些信息如時間戳、VLAN等和環形buffer管理信息也需要在內核與用戶態交互,

所以還需要一個結構,為此內核定義了TPACKET_HEADER結構存儲這些信息

 

目前TPACKET_HEADER有三個版本,每個版本的長度略有不同,用戶層可使用setsockopt(PACKET_VERSION)設置需要的版本,另外也可通過getsockopt(PACKET_HDRLEN)獲取到每個版本對應的頭部長度,設置環形接收buffer需要此長度值。

 

    /*
     * TPACKET header format versions; user space selects one with
     * setsockopt(PACKET_VERSION). Each version has its own header length.
     */
    enum tpacket_versions {
        TPACKET_V1,     /* frame header is struct tpacket_hdr */
        TPACKET_V2,     /* frame header is struct tpacket2_hdr */
        TPACKET_V3      /* frame header is struct tpacket3_hdr */
    };

用戶層通過setsockopt(PACKET_RX_RING/PACKET_TX_RING)設置環形buffer參數,內核函數packet_set_ring進行處理,並對請求結構tpacket_req的4個字段(tp_block_size、tp_block_nr、tp_frame_size、tp_frame_nr)進行合法性檢查,來看一下其中的要求和關聯。

1)內存塊大小tp_block_size必須按照頁面大小對齊,即必須是頁面大小的整數倍;每個內存塊至少要能夠容納一個數據包;另外,tp_block_size的大小要求是頁面大小的2的指數倍(2,4,8倍);

2)數據包大小tp_frame_size必須按16字節(TPACKET_ALIGNMENT)對齊;不能太小,不得小於TPACKET頭部長度(tp_hdrlen)與保留空間(tp_reserve)之和;
3)內存塊數量tp_block_nr乘以每個內存塊容納的數據幀數目,應該等於數據包的總數tp_frame_nr。

合法性檢查通過后,內核根據tp_block_size和tp_block_nr分配相應的存儲頁面,並將相關信息保存在packet_sock套接口的成員rx_ring(packet_ring_buffer)結構體中。最后,更改數據包接收函數為tpacket_rcv,其處理環形buffer接收數據包功能。

/*
 * packet_set_ring - install, replace or tear down a packet socket ring.
 * @sk:      socket whose ring is being configured
 * @req_u:   ring request; the common struct tpacket_req view is validated
 *           here, the whole union is passed on to init_prb_bdqc() for
 *           TPACKET_V3 receive rings
 * @closing: non-zero skips the "mapped"/"pending" busy checks and forces
 *           the ring swap even while the ring is still mapped
 * @tx_ring: non-zero operates on po->tx_ring / sk_write_queue, zero on
 *           po->rx_ring / sk_receive_queue
 *
 * A request with tp_block_nr != 0 is validated and a new page vector is
 * allocated for it.  A request with tp_block_nr == 0 must also have
 * tp_frame_nr == 0 and swaps in a NULL page vector.  Whatever vector was
 * installed before the swap is freed on the way out.
 *
 * While the ring is swapped the protocol hook is unregistered (if it was
 * running) and re-registered afterwards.
 *
 * Returns 0 on success, -EINVAL for an inconsistent request, -EBUSY if the
 * ring is mapped/pending/already allocated, -ENOMEM if allocation fails.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        int closing, int tx_ring)
{
    struct pgv *pg_vec = NULL;
    struct packet_sock *po = pkt_sk(sk);
    int was_running, order = 0;
    struct packet_ring_buffer *rb;
    struct sk_buff_head *rb_queue;
    __be16 num;
    int err = -EINVAL;
    /* Added to avoid minimal code churn */
    struct tpacket_req *req = &req_u->req;

    /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
    if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
        WARN(1, "Tx-ring is not supported.\n");
        goto out;
    }

    rb = tx_ring ? &po->tx_ring : &po->rx_ring;
    rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

    err = -EBUSY;
    if (!closing) {
        if (atomic_read(&po->mapped))
            goto out;
        if (atomic_read(&rb->pending))
            goto out;
    }

    if (req->tp_block_nr) {
        /* Sanity tests and some calculations */
        err = -EBUSY;
        if (unlikely(rb->pg_vec))
            goto out;

        switch (po->tp_version) {
        case TPACKET_V1:
            po->tp_hdrlen = TPACKET_HDRLEN;
            break;
        case TPACKET_V2:
            po->tp_hdrlen = TPACKET2_HDRLEN;
            break;
        case TPACKET_V3:
            po->tp_hdrlen = TPACKET3_HDRLEN;
            break;
        }
        /*
           Frame structure:

           - Start. Frame must be aligned to TPACKET_ALIGNMENT=16
           - struct tpacket_hdr
           - pad to TPACKET_ALIGNMENT=16
           - struct sockaddr_ll
           - Gap, chosen so that packet data (Start+tp_net) alignes to TPACKET_ALIGNMENT=16
           - Start+tp_mac: [ Optional MAC header ]
           - Start+tp_net: Packet data, aligned to TPACKET_ALIGNMENT=16.
           - Pad to align to TPACKET_ALIGNMENT=16
         */

        err = -EINVAL;
        if (unlikely((int)req->tp_block_size <= 0))
            goto out;
        /* Block size must be a multiple of the page size. */
        if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
            goto out;
        /* A frame must at least hold the TPACKET header plus reserve. */
        if (unlikely(req->tp_frame_size < po->tp_hdrlen +
                    po->tp_reserve))
            goto out;
        /* Frame size must be a multiple of TPACKET_ALIGNMENT. */
        if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
            goto out;

        rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
        if (unlikely(rb->frames_per_block <= 0))
            goto out;
        /*
         * tp_frame_nr must equal frames_per_block * tp_block_nr.
         * The comparison is done by division because the 32-bit
         * product can wrap around and let an inconsistent request
         * slip through.  tp_block_nr is non-zero in this branch.
         */
        if (unlikely(req->tp_frame_nr / req->tp_block_nr !=
                    rb->frames_per_block ||
                 req->tp_frame_nr % req->tp_block_nr != 0))
            goto out;

        err = -ENOMEM;
        order = get_order(req->tp_block_size);
        /*
         * NOTE(review): alloc_pg_vec() is not shown here; presumably it
         * allocates tp_block_nr blocks of tp_block_size bytes each.
         */
        pg_vec = alloc_pg_vec(req, order);
        if (unlikely(!pg_vec))
            goto out;
        switch (po->tp_version) {
        case TPACKET_V3:
            /* Transmit path is not supported. We checked
             * it above but just being paranoid
             */
            if (!tx_ring)
                init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
            break;
        default:
            break;
        }
    } else {
        /* No blocks requested: the frame count must be zero as well. */
        err = -EINVAL;
        if (unlikely(req->tp_frame_nr))
            goto out;
    }

    lock_sock(sk);

    /* Detach socket from network */
    spin_lock(&po->bind_lock);
    was_running = po->running;
    num = po->num;
    if (was_running) {
        po->num = 0;
        __unregister_prot_hook(sk, false);
    }
    spin_unlock(&po->bind_lock);

    synchronize_net();

    err = -EBUSY;
    mutex_lock(&po->pg_vec_lock);
    if (closing || atomic_read(&po->mapped) == 0) {
        err = 0;
        spin_lock_bh(&rb_queue->lock);
        /* After the swap pg_vec holds the previous vector (or NULL). */
        swap(rb->pg_vec, pg_vec);
        rb->frame_max = (req->tp_frame_nr - 1);
        rb->head = 0;
        rb->frame_size = req->tp_frame_size;
        spin_unlock_bh(&rb_queue->lock);

        /* order / tp_block_nr now describe the old vector for freeing. */
        swap(rb->pg_vec_order, order);
        swap(rb->pg_vec_len, req->tp_block_nr);

        rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
        /* Pick the receive handler matching the presence of an RX ring. */
        po->prot_hook.func = (po->rx_ring.pg_vec) ?
                        tpacket_rcv : packet_rcv;
        skb_queue_purge(rb_queue);
        if (atomic_read(&po->mapped))
            pr_err("packet_mmap: vma is busy: %d\n",
                   atomic_read(&po->mapped));
    }
    mutex_unlock(&po->pg_vec_lock);

    /* Re-attach the socket if it was running before. */
    spin_lock(&po->bind_lock);
    if (was_running) {
        po->num = num;
        register_prot_hook(sk);
    }
    spin_unlock(&po->bind_lock);
    if (closing && (po->tp_version > TPACKET_V2)) {
        /* Because we don't support block-based V3 on tx-ring */
        if (!tx_ring)
            prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
    }
    release_sock(sk);

    /* Frees the old vector after a swap, or the new one if no swap happened. */
    if (pg_vec)
        free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
    return err;
}
/*
+ Why use PACKET_MMAP
--------------------------------------------------------------------------------
In Linux 2.4/2.6 if PACKET_MMAP is not enabled, the capture process is very inefficient. It uses very limited buffers and requires one system call to capture each packet, it requires two if you want to get packet's timestamp (like libpcap always does).
In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size configurable circular buffer mapped in user space that can be used to either send or receive packets. This way reading packets just needs to wait for them, most of the time there is no need to issue a single system call. Concerning transmission, multiple packets can be sent through one system call to get the highest bandwidth. By using a shared buffer between the kernel and the user also has the benefit of minimizing packet copies.
It's fine to use PACKET_MMAP to improve the performance of the capture and transmission process, but it isn't everything. At least, if you are capturing at high speeds (this is relative to the cpu speed), you should check if the device driver of your network interface card supports some sort of interrupt load mitigation or (even better) if it supports NAPI, also make sure it is enabled. For transmission, check the MTU (Maximum Transmission Unit) used and supported by devices of your network.
-------------------------------------------------------------------------------- + How to use mmap() to improve capture process
--------------------------------------------------------------------------------
From the user standpoint, you should use the higher level libpcap library, which is a de facto standard, portable across nearly all operating systems including Win32. 
Said that, at time of this writing, official libpcap 0.8.1 is out and doesn't include support for PACKET_MMAP, and also probably the libpcap included in your distribution. 
I'm aware of two implementations of PACKET_MMAP in libpcap:
    http://wiki.ipxwarzone.com/             (by Simon Patarin, based on libpcap 0.6.2) http://public.lanl.gov/cpw/              (by Phil Wood, based on lastest libpcap)
The rest of this document is intended for people who want to understand the low level details or want to improve libpcap by including PACKET_MMAP support.
-------------------------------------------------------------------------------- + How to use mmap() directly to improve capture process
--------------------------------------------------------------------------------
From the system calls stand point, the use of PACKET_MMAP involves the following process:
[setup]     socket() -------> creation of the capture socket setsockopt() ---> allocation of the circular buffer (ring) option: PACKET_RX_RING mmap() ---------> mapping of the allocated buffer to the user process
[capture]   poll() ---------> to wait for incoming packets
[shutdown]  close() --------> destruction of the capture socket and deallocation of all associated 
                              resources.
socket creation and destruction is straight forward, and is done the same way with or without PACKET_MMAP:
int fd;
fd= socket(PF_PACKET, mode, htons(ETH_P_ALL))
where mode is SOCK_RAW for the raw interface where link level information can be captured or SOCK_DGRAM for the cooked interface where link level information capture is not supported and a link level pseudo-header is provided by the kernel.
The destruction of the socket and all associated resources is done by a simple call to close(fd).
Next I will describe PACKET_MMAP settings and its constraints, also the mapping of the circular buffer in the user process and the use of this buffer.
-------------------------------------------------------------------------------- + How to use mmap() directly to improve transmission process
-------------------------------------------------------------------------------- Transmission process is similar to capture as shown below.
[setup]          socket() -------> creation of the transmission socket setsockopt() ---> allocation of the circular buffer (ring) option: PACKET_TX_RING bind() ---------> bind transmission socket with a network interface
                 mmap() ---------> mapping of the allocated buffer to the user process
[transmission]   poll() ---------> wait for free packets (optional) send() ---------> send all packets that are set as ready in the ring
                                   The flag MSG_DONTWAIT can be used to return before end of transfer.
[shutdown]  close() --------> destruction of the transmission socket and deallocation of all associated resources.
Binding the socket to your network interface is mandatory (with zero copy) to know the header size of frames used in the circular buffer.
As capture, each frame contains two parts:
 -------------------- | struct tpacket_hdr | Header. It contains the status of |                    | of this frame |--------------------| | data buffer        | .                    .  Data that will be sent over the network interface. .                    .
 --------------------
 bind() associates the socket to your network interface thanks to sll_ifindex parameter of struct sockaddr_ll.
 Initialization example:
 struct sockaddr_ll my_addr;
 struct ifreq s_ifr;
 ...
 strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
 /* get interface index of eth0 */
 ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
 /* fill sockaddr_ll struct to prepare binding */
 my_addr.sll_family = AF_PACKET;
 my_addr.sll_protocol = htons(ETH_P_ALL);
 my_addr.sll_ifindex =  s_ifr.ifr_ifindex;
 /* bind socket to eth0 */
 bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
 A complete tutorial is available at: http://wiki.gnu-log.net/
-------------------------------------------------------------------------------- + PACKET_MMAP settings
--------------------------------------------------------------------------------
To setup PACKET_MMAP from user level code is done with a call like
 - Capture process setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
 - Transmission process setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req))
The most significant argument in the previous call is the req parameter; this parameter must have the following structure:
    struct tpacket_req
    { unsigned int    tp_block_size;  /* Minimal size of contiguous block */ unsigned int    tp_block_nr;    /* Number of blocks */
        unsigned int    tp_frame_size;  /* Size of frame */
        unsigned int    tp_frame_nr;    /* Total number of frames */ };
This structure is defined in /usr/include/linux/if_packet.h and establishes a circular buffer (ring) of unswappable memory. Being mapped in the capture process allows reading the captured frames and related meta-information like timestamps without requiring a system call.
Frames are grouped in blocks. Each block is a physically contiguous region of memory and holds tp_block_size/tp_frame_size frames. The total number of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
    frames_per_block = tp_block_size/tp_frame_size
indeed, packet_set_ring checks that the following condition is true
    frames_per_block * tp_block_nr == tp_frame_nr
Lets see an example, with the following values:
     tp_block_size= 4096
     tp_frame_size= 2048
     tp_block_nr  = 4
     tp_frame_nr  = 8
we will get the following buffer structure:
        block #1                 block #2
+---------+---------+    +---------+---------+
| frame 1 | frame 2 |    | frame 3 | frame 4 |
+---------+---------+    +---------+---------+

        block #3                 block #4
+---------+---------+    +---------+---------+
| frame 5 | frame 6 |    | frame 7 | frame 8 |
+---------+---------+    +---------+---------+

A frame can be of any size with the only condition it can fit in a block. A block can only hold an integer number of frames, or in other words, a frame cannot be spanned across two blocks, so there are some details you have to take into account when choosing the frame_size. See "Mapping and use of the circular buffer (ring)".

Currently, the structure used to hold the pointers to each block is a dynamically allocated vector (allocated with kmalloc) called pg_vec; its size limits the number of blocks that can be allocated.

    +---+---+---+---+
    | x | x | x | x |
    +---+---+---+---+
      |   |   |   |
      |   |   |   v
      |   |   v  block #4
      |   v  block #3
      v  block #2
     block #1

kmalloc allocates any number of bytes of physically contiguous memory from a pool of pre-determined sizes. This pool of memory is maintained by the slab allocator which is at the end the responsible for doing the allocation and hence which imposes the maximum memory that kmalloc can allocate.

++ Transmission process

Those defines are also used for transmission:

     #define TP_STATUS_AVAILABLE        0 // Frame is available
     #define TP_STATUS_SEND_REQUEST     1 // Frame will be sent on next send()
     #define TP_STATUS_SENDING          2 // Frame is currently in transmission
     #define TP_STATUS_WRONG_FORMAT     4 // Frame format is not correct

First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a packet, the user fills a data buffer of an available frame, sets tp_len to current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST. This can be done on multiple frames. Once the user is ready to transmit, it calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are forwarded to the network device. The kernel updates each status of sent frames with TP_STATUS_SENDING until the end of transfer. At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE.
header->tp_len = in_i_size; header->tp_status = TP_STATUS_SEND_REQUEST; retval = send(this->socket, NULL, 0, 0); The user can also use poll() to check if a buffer is available: (status == TP_STATUS_SENDING) struct pollfd pfd; pfd.fd = fd; pfd.revents = 0; pfd.events = POLLOUT; retval = poll(&pfd, 1, timeout); ------------------------------------------------------------------------------- + PACKET_TIMESTAMP ------------------------------------------------------------------------------- The PACKET_TIMESTAMP setting determines the source of the timestamp in the packet meta information. If your NIC is capable of timestamping packets in hardware, you can request those hardware timestamps to used. Note: you may need to enable the generation of hardware timestamps with SIOCSHWTSTAMP. PACKET_TIMESTAMP accepts the same integer bit field as SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE and SOF_TIMESTAMPING_RAW_HARDWARE values are recognized by PACKET_TIMESTAMP. SOF_TIMESTAMPING_SYS_HARDWARE takes precedence over SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set. int req = 0; req |= SOF_TIMESTAMPING_SYS_HARDWARE; setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req)) If PACKET_TIMESTAMP is not set, a software timestamp generated inside the networking stack is used (the behavior before this setting was added). */

 

用戶層要訪問內核的接收環形buffer,需要通過mmap將其映射到用戶空間;

mmapbuf = mmap(0, mmapbuflen, PROT_READ|PROT_WRITE, MAP_SHARED, sk, 0);

數據幀接收

  新接收到的數據幀應當放入共享環形buffer的哪個位置?由函數packet_lookup_frame計算得到。參數position為保存在環形buffer中的可用幀空間的頭索引(rx_ring.head),根據此索引,

計算得到頁面索引(內存塊索引)和幀偏移,即得到可用來保存數據幀的地址(h.raw)。

  內核與用戶層在操作環形buffer時的同步實現,參見tpacket_hdr字段中的tp_status字段,此字段的第一個bit位來實現功能,當前為0時(TP_STATUS_KERNEL)標識內核在使用此段數據幀空間,反之,為1時(TP_STATUS_USER)標識用戶層面在使用此段空間。前面介紹的內核使用packet_lookup_frame函數查找可用的數據幀空間,找到之后使用函數__packet_get_status來判斷一下此段空間是否可用,tp_status等於TP_STATUS_KERNEL可正常使用,否則,說明用戶層還沒有處理此段空間內的數據幀,通常在環形buffer已滿的情況下出現。
內核在填充完數據幀空間之后,將tp_status的同步位設置為TP_STATUS_USER,同時調用sk->sk_data_ready(sk)通知用戶層數據已准備好。

/*
 * tpacket_rcv - receive hook that stores a packet into the mmap'ed RX ring.
 * @skb:      received packet
 * @dev:      device the packet is delivered on
 * @pt:       packet_type hook; pt->af_packet_priv carries the socket
 * @orig_dev: device recorded in sll_ifindex when po->origdev is set
 *
 * Filters the packet, picks the current RX frame, copies up to snaplen
 * bytes into it, fills the version-specific TPACKET header and the
 * trailing sockaddr_ll, then hands the frame to user space by writing its
 * status and calling sk->sk_data_ready().  If no frame is available the
 * packet is counted in po->stats.tp_drops.
 *
 * Always returns 0; the skb reference is released with kfree_skb().
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
               struct packet_type *pt, struct net_device *orig_dev)
{
    struct sock *sk;
    struct packet_sock *po;
    struct sockaddr_ll *sll;
    /* One frame pointer viewed as any of the three header layouts. */
    union {
        struct tpacket_hdr *h1;
        struct tpacket2_hdr *h2;
        struct tpacket3_hdr *h3;
        void *raw;
    } h;
    /* Saved so skb->data / skb->len can be restored at drop_n_restore. */
    u8 *skb_head = skb->data;
    int skb_len = skb->len;
    unsigned int snaplen, res;
    unsigned long status = TP_STATUS_USER;
    unsigned short macoff, netoff, hdrlen;
    struct sk_buff *copy_skb = NULL;
    struct timeval tv;
    struct timespec ts;
    struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);

    if (skb->pkt_type == PACKET_LOOPBACK)
        goto drop;

    sk = pt->af_packet_priv;
    po = pkt_sk(sk);

    /* Drop unless the device's net and the socket's net are the same. */
    if (!net_eq(dev_net(dev), sock_net(sk)))
        goto drop;

    if (dev->header_ops) {
        /* Non-DGRAM sockets: move skb->data back to the MAC header. */
        if (sk->sk_type != SOCK_DGRAM)
            skb_push(skb, skb->data - skb_mac_header(skb));
        else if (skb->pkt_type == PACKET_OUTGOING) {
            /* Special case: outgoing packets have ll header at head */
            skb_pull(skb, skb_network_offset(skb));
        }
    }

    if (skb->ip_summed == CHECKSUM_PARTIAL)
        status |= TP_STATUS_CSUMNOTREADY;

    snaplen = skb->len;

    /* A zero filter result drops the packet; otherwise it caps snaplen. */
    res = run_filter(skb, sk, snaplen);
    if (!res)
        goto drop_n_restore;
    if (snaplen > res)
        snaplen = res;

    /*
     * Compute the offsets of the MAC header (macoff) and the packet data
     * (netoff) inside the frame, leaving room for the aligned TPACKET
     * header, at least 16 bytes after it, and tp_reserve.
     */
    if (sk->sk_type == SOCK_DGRAM) {
        macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                  po->tp_reserve;
    } else {
        unsigned int maclen = skb_network_offset(skb);
        netoff = TPACKET_ALIGN(po->tp_hdrlen +
                       (maclen < 16 ? 16 : maclen)) +
            po->tp_reserve;
        macoff = netoff - maclen;
    }
    if (po->tp_version <= TPACKET_V2) {
        /* Packet does not fit the frame: truncate, optionally queue a copy. */
        if (macoff + snaplen > po->rx_ring.frame_size) {
            if (po->copy_thresh &&
                atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                if (skb_shared(skb)) {
                    copy_skb = skb_clone(skb, GFP_ATOMIC);
                } else {
                    copy_skb = skb_get(skb);
                    skb_head = skb->data;
                }
                if (copy_skb)
                    skb_set_owner_r(copy_skb, sk);
            }
            snaplen = po->rx_ring.frame_size - macoff;
            if ((int)snaplen < 0)
                snaplen = 0;
        }
    }
    /* Frame selection and statistics are done under the receive queue lock. */
    spin_lock(&sk->sk_receive_queue.lock);
    h.raw = packet_current_rx_frame(po, skb,
                    TP_STATUS_KERNEL, (macoff+snaplen));
    if (!h.raw)
        goto ring_is_full;
    if (po->tp_version <= TPACKET_V2) {
        packet_increment_rx_head(po, &po->rx_ring);
    /*
     * LOSING will be reported till you read the stats,
     * because it's COR - Clear On Read.
     * Anyways, moving it for V1/V2 only as V3 doesn't need this
     * at packet level.
     */
        if (po->stats.tp_drops)
            status |= TP_STATUS_LOSING;
    }
    po->stats.tp_packets++;
    if (copy_skb) {
        status |= TP_STATUS_COPY;
        __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
    }
    spin_unlock(&sk->sk_receive_queue.lock);

    /* Copy the (possibly truncated) packet into the frame at macoff. */
    skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

    /*
     * Fill the version-specific header.  Timestamp source precedence in
     * every version: SYS_HARDWARE stamp, RAW_HARDWARE stamp, skb->tstamp,
     * then the current time.
     */
    switch (po->tp_version) {
    case TPACKET_V1:
        h.h1->tp_len = skb->len;
        h.h1->tp_snaplen = snaplen;
        h.h1->tp_mac = macoff;
        h.h1->tp_net = netoff;
        if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
                && shhwtstamps->syststamp.tv64)
            tv = ktime_to_timeval(shhwtstamps->syststamp);
        else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
                && shhwtstamps->hwtstamp.tv64)
            tv = ktime_to_timeval(shhwtstamps->hwtstamp);
        else if (skb->tstamp.tv64)
            tv = ktime_to_timeval(skb->tstamp);
        else
            do_gettimeofday(&tv);
        h.h1->tp_sec = tv.tv_sec;
        h.h1->tp_usec = tv.tv_usec;
        hdrlen = sizeof(*h.h1);
        break;
    case TPACKET_V2:
        h.h2->tp_len = skb->len;
        h.h2->tp_snaplen = snaplen;
        h.h2->tp_mac = macoff;
        h.h2->tp_net = netoff;
        if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
                && shhwtstamps->syststamp.tv64)
            ts = ktime_to_timespec(shhwtstamps->syststamp);
        else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
                && shhwtstamps->hwtstamp.tv64)
            ts = ktime_to_timespec(shhwtstamps->hwtstamp);
        else if (skb->tstamp.tv64)
            ts = ktime_to_timespec(skb->tstamp);
        else
            getnstimeofday(&ts);
        h.h2->tp_sec = ts.tv_sec;
        h.h2->tp_nsec = ts.tv_nsec;
        if (vlan_tx_tag_present(skb)) {
            h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
            status |= TP_STATUS_VLAN_VALID;
        } else {
            h.h2->tp_vlan_tci = 0;
        }
        h.h2->tp_padding = 0;
        hdrlen = sizeof(*h.h2);
        break;
    case TPACKET_V3:
        /* tp_nxt_offset,vlan are already populated above.
         * So DONT clear those fields here
         */
        h.h3->tp_status |= status;
        h.h3->tp_len = skb->len;
        h.h3->tp_snaplen = snaplen;
        h.h3->tp_mac = macoff;
        h.h3->tp_net = netoff;
        if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
                && shhwtstamps->syststamp.tv64)
            ts = ktime_to_timespec(shhwtstamps->syststamp);
        else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
                && shhwtstamps->hwtstamp.tv64)
            ts = ktime_to_timespec(shhwtstamps->hwtstamp);
        else if (skb->tstamp.tv64)
            ts = ktime_to_timespec(skb->tstamp);
        else
            getnstimeofday(&ts);
        h.h3->tp_sec  = ts.tv_sec;
        h.h3->tp_nsec = ts.tv_nsec;
        hdrlen = sizeof(*h.h3);
        break;
    default:
        BUG();
    }

    /* The link-level address record follows the aligned TPACKET header. */
    sll = h.raw + TPACKET_ALIGN(hdrlen);
    sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
    sll->sll_family = AF_PACKET;
    sll->sll_hatype = dev->type;
    sll->sll_protocol = skb->protocol;
    sll->sll_pkttype = skb->pkt_type;
    if (unlikely(po->origdev))
        sll->sll_ifindex = orig_dev->ifindex;
    else
        sll->sll_ifindex = dev->ifindex;

    /* Barrier between filling the frame and writing its status below. */
    smp_mb();
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
    {
        u8 *start, *end;

        /* V1/V2: flush every page touched by this frame's header and data. */
        if (po->tp_version <= TPACKET_V2) {
            end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
                + macoff + snaplen);
            for (start = h.raw; start < end; start += PAGE_SIZE)
                flush_dcache_page(pgv_to_page(start));
        }
        smp_wmb();
    }
#endif
    /* V1/V2: write the frame status (TP_STATUS_USER plus flags) last. */
    if (po->tp_version <= TPACKET_V2)
        __packet_set_status(po, h.raw, status);
    else
        prb_clear_blk_fill_status(&po->rx_ring);

    /* Notify the socket that data is ready. */
    sk->sk_data_ready(sk, 0);

drop_n_restore:
    /* Undo the skb_push/skb_pull above if the skb is shared. */
    if (skb_head != skb->data && skb_shared(skb)) {
        skb->data = skb_head;
        skb->len = skb_len;
    }
drop:
    kfree_skb(skb);
    return 0;

ring_is_full:
    /* No usable frame: count the drop, still notify, release the copy. */
    po->stats.tp_drops++;
    spin_unlock(&sk->sk_receive_queue.lock);

    sk->sk_data_ready(sk, 0);
    kfree_skb(copy_skb);
    goto drop_n_restore;
}

目前能看到的是,PACKET_MMAP只實現了內核與用戶態之間的zero copy,但是內核里面仍有一次從skb(網卡DMA接收緩沖區)到ring buffer的拷貝(即tpacket_rcv中的skb_copy_bits);

而PF_RING 通過DNA支持真正的zero copy,具體實現方案有待進一步研究,RTFS

---------------------------------------------------------------------------------
相關研究討論的帖子
 
源自BSD的類似技術netmap

 

eg:

/*
 * Example: capture packets through a PACKET_RX_RING mapped with mmap().
 *
 * Uses fd, req, map, ring, names and sigproc, which are defined elsewhere
 * in the example program.  The ring geometry (4096-byte blocks holding
 * four 1024-byte frames) makes all frames contiguous in the mapping, so
 * frame i starts at map + i * tp_frame_size.
 *
 * Returns 1 if any setup step fails; the capture loop itself never exits.
 */
int main ( int argc, char **argv )
{
    struct pollfd pfd;
    struct sockaddr_ll addr;
    int i;

    (void)argc;
    (void)argv;

    signal(SIGINT, sigproc);

    /* Open the packet socket */
    if ( (fd=socket(PF_PACKET, SOCK_DGRAM, 0))<0 ) {
        perror("socket()");
        return 1;
    }

    /* Setup the fd for mmap() ring buffer */
    req.tp_block_size=4096;
    req.tp_frame_size=1024;
    req.tp_block_nr=64;
    req.tp_frame_nr=4*64;
    if ( (setsockopt(fd,
        SOL_PACKET,
        PACKET_RX_RING,
        (char *)&req,
        sizeof(req))) != 0 ) {
        perror("setsockopt()");
        close(fd);
        return 1;
    };

    /* mmap() the sucker */
    map=mmap(NULL,
        req.tp_block_size * req.tp_block_nr,
        PROT_READ|PROT_WRITE|PROT_EXEC, MAP_SHARED, fd, 0);
    if ( map==MAP_FAILED ) {
        perror("mmap()");
        close(fd);
        return 1;
    }

    /* Setup our ringbuffer: one iovec per frame of the mapping */
    ring=malloc(req.tp_frame_nr * sizeof(struct iovec));
    if ( ring==NULL ) {
        perror("malloc()");
        munmap(map, req.tp_block_size * req.tp_block_nr);
        close(fd);
        return 1;
    }
    for(i=0; i<req.tp_frame_nr; i++) {
        /* byte arithmetic on char *, not on void * (a GNU extension) */
        ring[i].iov_base=(char *)map + (i*req.tp_frame_size);
        ring[i].iov_len=req.tp_frame_size;
    }

    /* bind the packet socket */
    memset(&addr, 0, sizeof(addr));
    addr.sll_family=AF_PACKET;
    addr.sll_protocol=htons(0x03);
    addr.sll_ifindex=0;
    addr.sll_hatype=0;
    addr.sll_pkttype=0;
    addr.sll_halen=0;
    if ( bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ) {
        /* report first, so cleanup calls cannot clobber errno */
        perror("bind()");
        free(ring);
        munmap(map, req.tp_block_size * req.tp_block_nr);
        close(fd);
        return 1;
    }

    for(i=0;;) {
        /*
         * A non-zero status word means the frame has been handed to user
         * space.  It is read through a volatile pointer because the
         * kernel updates it behind the compiler's back.
         */
        while(*(volatile unsigned long*)ring[i].iov_base) {
            struct tpacket_hdr *h=ring[i].iov_base;
            /* the address record follows the aligned frame header */
            struct sockaddr_ll *sll=(struct sockaddr_ll *)
                ((char *)h + TPACKET_ALIGN(sizeof(*h)));

            /* NOTE(review): names[] must cover every sll_pkttype value */
            printf("%u.%.6u: if%u %s %u bytes\n",
                h->tp_sec, h->tp_usec,
                sll->sll_ifindex,
                names[sll->sll_pkttype],
                h->tp_len);

            /* tell the kernel this packet is done with */
            h->tp_status=0;
            mb(); /* memory barrier */

            /* advance to the next frame, wrapping at the end of the ring */
            i=(i==req.tp_frame_nr-1) ? 0 : i+1;
        }

        /* Sleep when nothings happening */
        pfd.fd=fd;
        pfd.events=POLLIN|POLLERR;
        pfd.revents=0;
        poll(&pfd, 1, -1);
    }

    return 0;
}

 

packet_poll 分析: 

1、通過datagram_poll 也就是接收緩存中的事件mask1

 2、如果開啟了ring mmap 就會檢查rx_frame 返回mask2

最后返回mask1 | mask2的值

/**
 *     datagram_poll - generic poll handler for datagram sockets
 *    @file: file struct
 *    @sock: socket
 *    @wait: poll table
 *
 *    Builds the poll event mask from the socket's error state, shutdown
 *    flags, receive queue and writeability.  It is also usable for
 *    sequenced packet sockets as long as the receive queue only ever
 *    holds data that is ready to be received.
 *
 *    Note: a protocol that does not use this routine and applies a
 *    write policy other than sock_writeable() has to supply its own
 *    write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
               poll_table *wait)
{
    struct sock *sk = sock->sk;
    unsigned int events = 0;

    /* Per the original note: with a NULL wait the callback is not run. */
    sock_poll_wait(file, sk_sleep(sk), wait);

    /* Exceptional conditions: pending error, shutdown state. */
    if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
        events |= POLLERR;
    if (sk->sk_shutdown & RCV_SHUTDOWN)
        events |= POLLIN | POLLRDNORM | POLLRDHUP;
    if (sk->sk_shutdown == SHUTDOWN_MASK)
        events |= POLLHUP;

    /* Anything queued for reception makes the socket readable. */
    if (!skb_queue_empty(&sk->sk_receive_queue))
        events |= POLLIN | POLLRDNORM;

    /* Connection-based sockets: report termination, stop early on startup. */
    if (connection_based(sk)) {
        if (sk->sk_state == TCP_CLOSE)
            events |= POLLHUP;
        /* Connection not established yet: writeability is not reported. */
        if (sk->sk_state == TCP_SYN_SENT)
            return events;
    }

    /* Not writable: flag the lack of space and return what we have. */
    if (!sock_writeable(sk)) {
        set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
        return events;
    }

    return events | POLLOUT | POLLWRNORM | POLLWRBAND;
}
/*
 * packet_poll - poll handler for packet sockets.
 *
 * Starts from the generic datagram_poll() mask and ORs in ring state:
 * readable when an RX ring exists and packet_previous_rx_frame() finds
 * no frame for TP_STATUS_KERNEL, writable when a TX ring exists and
 * packet_current_frame() finds a TP_STATUS_AVAILABLE frame.  Each ring
 * is inspected under the lock of its associated socket queue.
 */
static unsigned int packet_poll(struct file *file, struct socket *sock,
                poll_table *wait)
{
    struct sock *sk = sock->sk;
    struct packet_sock *po = pkt_sk(sk);
    unsigned int events;

    events = datagram_poll(file, sock, wait);

    /* Receive ring. */
    spin_lock_bh(&sk->sk_receive_queue.lock);
    if (po->rx_ring.pg_vec &&
        !packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
        events |= POLLIN | POLLRDNORM;
    spin_unlock_bh(&sk->sk_receive_queue.lock);

    /* Transmit ring. */
    spin_lock_bh(&sk->sk_write_queue.lock);
    if (po->tx_ring.pg_vec &&
        packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
        events |= POLLOUT | POLLWRNORM;
    spin_unlock_bh(&sk->sk_write_queue.lock);

    return events;
}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM