IP 層收發報文簡要剖析3--ip輸入報文分片重組


 在 ip_local_deliver 中,如果檢測到是分片包,則需要將報文進行重組。所有分片都被重新組合后才能提交到上層協議,每一個正在重組的數據報文用一個 ipq 結構實例來表示。

 

/*
 * struct ipq - one IPv4 datagram that is being reassembled from fragments.
 *
 * The generic fragment queue is embedded as the first member (q); the
 * remaining members are IPv4 specific.
 */
struct ipq {
    struct inet_frag_queue q;

    u32        user; /* origin of the reassembly request */
    __be32        saddr; /* source address */
    __be32        daddr; /* destination address */
    __be16        id; /* IP identification field */
    u8        protocol; /* upper-layer protocol number */
/* The four fields above come from the IP header and identify which
 * IP datagram a fragment belongs to. */
    u8        ecn; /* RFC3168 support */
    u16        max_df_size; /* largest frag with DF set seen */
    int             iif;
    int             vif;   /* L3 master device index */
    unsigned int    rid; /* received-fragment counter (per the original note; TODO confirm, not used in this excerpt) */
    struct inet_peer *peer; /* information about the sender */
   /* Original note: rid together with peer is meant to guard against
    * DoS attacks. */
};

 網絡空間分段管理結構

/*
 * struct inet_frags - per-protocol fragment management descriptor:
 * the hash table of pending queues plus the protocol callbacks.
 */
struct inet_frags {
    struct inet_frag_bucket    hash[INETFRAGS_HASHSZ]; /* hash buckets of pending queues */

    struct work_struct    frags_work; /* deferred work item */
    unsigned int next_bucket;
    unsigned long last_rebuild_jiffies;
    bool rebuild;

    /* The first call to hashfn is responsible to initialize
     * rnd. This is best done with net_get_random_once.
     *
     * rnd_seqlock is used to let hash insertion detect
     * when it needs to re-lookup the hash chain to use.
     */
    u32            rnd; /* random value used by the hash */
    seqlock_t        rnd_seqlock;
    int            qsize; /* size of one protocol-specific queue object */

    unsigned int        (*hashfn)(const struct inet_frag_queue *);
    bool            (*match)(const struct inet_frag_queue *q,
                     const void *arg); /* does queue q match lookup key arg? */
    void            (*constructor)(struct inet_frag_queue *q,
                           const void *arg);
    void            (*destructor)(struct inet_frag_queue *);
    void            (*frag_expire)(unsigned long data); /* queue expiry handler */
    struct kmem_cache    *frags_cachep;
    const char        *frags_cache_name;
};

 

/*
 * struct netns_frags - per-network-namespace fragment memory accounting
 * and tunables.
 */
struct netns_frags {
    /* The percpu_counter "mem" need to be cacheline aligned.
     *  mem.count must not share cacheline with other writers
     */
    struct percpu_counter   mem ____cacheline_aligned_in_smp;

    /* sysctls */
    int            timeout;     /* reassembly timeout */
    int            high_thresh; /* upper limit on fragment memory */
    int            low_thresh;  /* lower limit on fragment memory */
    int            max_dist;
};
View Code

 

/**
 * struct inet_frag_queue - fragment queue
 *
 * @lock: spinlock protecting the queue
 * @timer: queue expiration timer
 * @list: hash bucket list
 * @refcnt: reference count of the queue
 * @fragments: received fragments head
 * @fragments_tail: received fragments tail
 * @stamp: timestamp of the last received fragment
 * @len: total length of the original datagram
 * @meat: length of received fragments so far
 * @flags: fragment queue flags
 * @max_size: maximum received fragment size
 * @net: namespace that this frag belongs to
 * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
 */
struct inet_frag_queue {//inet分段隊列頭
    spinlock_t        lock;smp環境下 需要
    struct timer_list    timer;隊列定時器,組裝非常耗時,不能無休止的等待分片的到達
    struct hlist_node    list;哈希節點,鏈入inet分段管理結構的哈希隊列
    atomic_t        refcnt;計數器
    struct sk_buff        *fragments;分段數據包隊列
    struct sk_buff        *fragments_tail;
    ktime_t            stamp;時間戳
    int            len;數據包結束位置offset+len
    int            meat;與原數據長度的差距,如果和原數據包長度一樣代表接收完成
    __u8            flags;
    u16            max_size;
    struct netns_frags    *net;指向網絡空寂分段管理結構
    struct hlist_node    list_evictor;
};

1.1、 IP分組的初始化

/*
 * ipfrag_init - one-time setup of IPv4 fragment reassembly.
 *
 * Calls ip4_frags_ctl_register(), registers ip4_frags_ops as a per-netns
 * subsystem, fills in the IPv4 callbacks of the global ip4_frags descriptor
 * and hands it to inet_frags_init().  Failure of that last step panics.
 */
void __init ipfrag_init(void)
{
    struct inet_frags *f = &ip4_frags;

    ip4_frags_ctl_register();
    register_pernet_subsys(&ip4_frags_ops);

    /* Size of one queue object and the name of its slab cache. */
    f->qsize = sizeof(struct ipq);
    f->frags_cache_name = ip_frag_cache_name;

    /* IPv4-specific callbacks: hashing, matching, lifetime, expiry. */
    f->hashfn = ip4_hashfn;
    f->match = ip4_frag_match;
    f->constructor = ip4_frag_init;
    f->destructor = ip4_frag_free;
    f->frag_expire = ip_expire;

    if (inet_frags_init(f) != 0)
        panic("IP: failed to allocate ip4_frags cache\n");
}

int inet_frags_init(struct inet_frags *f)
{
    int i;
//初始化工作隊列
    INIT_WORK(&f->frags_work, inet_frag_worker);

    for (i = 0; i < INETFRAGS_HASHSZ; i++) {
        struct inet_frag_bucket *hb = &f->hash[i];//初始化hash 隊列頭

        spin_lock_init(&hb->chain_lock);
        INIT_HLIST_HEAD(&hb->chain);
    }

    seqlock_init(&f->rnd_seqlock);
    f->last_rebuild_jiffies = 0;
    f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
                        NULL);
    if (!f->frags_cachep)
        return -ENOMEM;

    return 0;
}
EXPORT_SYMBOL(inet_frags_init);

 

/*
 * ip_local_deliver - deliver an IPv4 packet to the local host.
 *
 * Fragments are first handed to ip_defrag(); a non-zero result from it
 * means there is nothing to pass on, so 0 is returned.  Everything else
 * goes through the NF_INET_LOCAL_IN hook and on to
 * ip_local_deliver_finish().
 */
int ip_local_deliver(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);

    /* Reassemble IP fragments. */
    if (ip_is_fragment(ip_hdr(skb)) &&
        ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
        return 0;

    /* Run the LOCAL_IN netfilter hook. */
    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
               net, NULL, skb, skb->dev, NULL,
               ip_local_deliver_finish);
}

 

1.2、 ip分片報文重組的處理

/*
 * ip_defrag - process an incoming IP datagram fragment.
 * @net:  network namespace
 * @skb:  the fragment
 * @user: identifies who asked for reassembly
 *
 * Looks up (or creates) the reassembly queue for the fragment and queues
 * the skb on it while holding the queue lock.
 *
 * Returns whatever ip_frag_queue() returns, or -ENOMEM (after freeing @skb
 * and bumping REASMFAILS) when no queue could be found or created.
 */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
    struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
    int vif = l3mdev_master_ifindex_rcu(dev);
    struct ipq *qp;
    int ret;

    /* Count the reassembly request. */
    __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
    skb_orphan(skb);

    /* Lookup (or create) queue header */
    qp = ip_find(net, ip_hdr(skb), user, vif);
    if (!qp) {
        /* No queue available: account the failure and drop. */
        __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
        kfree_skb(skb);
        return -ENOMEM;
    }

    spin_lock(&qp->q.lock);
    ret = ip_frag_queue(qp, skb);   /* add the fragment to the queue */
    spin_unlock(&qp->q.lock);

    ipq_put(qp);
    return ret;
}
EXPORT_SYMBOL(ip_defrag);

 1.2.2 ip_find 根據ip首部以及user標志 在ipq散列表中查找對應的ipq。

/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 *
 * @user is one of the values below (listed here for reference):
 *
 * enum ip_defrag_users {
 *     IP_DEFRAG_LOCAL_DELIVER,
 *     IP_DEFRAG_CALL_RA_CHAIN,
 *     IP_DEFRAG_CONNTRACK_IN,
 *     __IP_DEFRAG_CONNTRACK_IN_END    = IP_DEFRAG_CONNTRACK_IN + USHRT_MAX,
 *     IP_DEFRAG_CONNTRACK_OUT,
 *     __IP_DEFRAG_CONNTRACK_OUT_END    = IP_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
 *     IP_DEFRAG_CONNTRACK_BRIDGE_IN,
 *     __IP_DEFRAG_CONNTRACK_BRIDGE_IN = IP_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
 *     IP_DEFRAG_VS_IN,
 *     IP_DEFRAG_VS_OUT,
 *     IP_DEFRAG_VS_FWD,
 *     IP_DEFRAG_AF_PACKET,
 *     IP_DEFRAG_MACVLAN,
 * };
 *
 * Returns the matching ipq, or NULL when inet_frag_find() yields NULL or
 * an error pointer.
 */
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
               u32 user, int vif)
{
    struct ip4_create_arg arg;
    struct inet_frag_queue *q;
    unsigned int hash;

    /* Hash over id, source, destination and protocol. */
    hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);

    /* Lookup key: the IP header plus the caller-supplied context. */
    arg.vif = vif;
    arg.user = user;
    arg.iph = iph;

    /* Find the queue in the hash table, creating it when missing. */
    q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
    if (!IS_ERR_OR_NULL(q))
        return container_of(q, struct ipq, q);

    inet_frag_maybe_warn_overflow(q, pr_fmt());
    return NULL;
}

/*
 * inet_frag_find - look up a fragment queue, creating it when absent.
 * @nf:   per-netns fragment state the queue must belong to
 * @f:    protocol fragment descriptor holding the hash table
 * @key:  opaque lookup key passed to f->match()
 * @hash: hash value of the key (masked to the table size here)
 *
 * Returns a queue with an extra reference taken when a match is found,
 * the result of inet_frag_create() when the chain holds no match and is at
 * most INETFRAGS_MAXDEPTH entries long, or ERR_PTR(-ENOBUFS) when the chain
 * is longer than that.
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
                       struct inet_frags *f, void *key,
                       unsigned int hash)
{
    struct inet_frag_bucket *hb;
    struct inet_frag_queue *q;
    int walked = 0;

    /* Fragment memory is above the low threshold: kick the evictor
     * (the work item runs inet_frag_worker).
     */
    if (frag_mem_limit(nf) > nf->low_thresh)
        inet_frag_schedule_worker(f);

    /* Select the bucket. */
    hb = &f->hash[hash & (INETFRAGS_HASHSZ - 1)];

    spin_lock(&hb->chain_lock);
    hlist_for_each_entry(q, &hb->chain, list) {
        if (q->net != nf || !f->match(q, key)) {
            walked++;       /* remember how deep the chain is */
            continue;
        }

        /* Hit: hand out a new reference. */
        atomic_inc(&q->refcnt);
        spin_unlock(&hb->chain_lock);
        return q;
    }
    spin_unlock(&hb->chain_lock);

    /* Miss.  The chain is short enough: create a new queue. */
    if (walked <= INETFRAGS_MAXDEPTH)
        return inet_frag_create(nf, f, key);

    /* Chain too long: request a rebuild of the hash table if the
     * rebuild interval allows it.
     */
    if (inet_frag_may_rebuild(f)) {
        if (!f->rebuild)
            f->rebuild = true;
        inet_frag_schedule_worker(f);
    }

    return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
View Code

如果查找不到則會創建一個ipq 並將其插入鏈表中

/*
 * inet_frag_create - allocate a new fragment queue and insert it into the
 * hash table.
 *
 * Returns NULL when allocation fails, otherwise whatever
 * inet_frag_intern() returns.
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
                        struct inet_frags *f,
                        void *arg)
{
    /* Allocate and construct the queue head. */
    struct inet_frag_queue *fresh = inet_frag_alloc(nf, f, arg);

    return fresh ? inet_frag_intern(nf, fresh, f, arg) : NULL;
}
/*
 * inet_frag_alloc - allocate and initialise a fragment queue object.
 * @nf:  per-netns fragment state the queue is charged to
 * @f:   protocol fragment descriptor (cache, constructor, expiry handler)
 * @arg: opaque argument forwarded to f->constructor()
 *
 * Returns the new queue with refcnt set to 1, or NULL when fragment memory
 * is above high_thresh (the evictor is scheduled in that case) or the slab
 * allocation fails.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
                           struct inet_frags *f,
                           void *arg)
{
    struct inet_frag_queue *q;

    /* Over the high watermark: refuse and ask for memory to be reclaimed. */
    if (frag_mem_limit(nf) > nf->high_thresh) {
        inet_frag_schedule_worker(f);
        return NULL;
    }

    q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
    if (q) {
        /* Record the owning netns state before the constructor runs. */
        q->net = nf;
        f->constructor(q, arg);
        /* Charge the queue object itself to the namespace. */
        add_frag_mem_limit(nf, f->qsize);

        /* Set up the expiry timer; the queue pointer is the timer data. */
        setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
        spin_lock_init(&q->lock);
        atomic_set(&q->refcnt, 1);
    }

    return q;
}
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
    struct ipq *qp = container_of(q, struct ipq, q);//獲取分段隊列指針
    struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
                           frags);
                           
    struct net *net = container_of(ipv4, struct net, ipv4);

    const struct ip4_create_arg *arg = a;//ipv4的分段信息指針

    qp->protocol = arg->iph->protocol;//IP層頭部協議
    qp->id = arg->iph->id;//ip層id
    qp->ecn = ip4_frag_ecn(arg->iph->tos);
    qp->saddr = arg->iph->saddr;
    qp->daddr = arg->iph->daddr;
    qp->vif = arg->vif;
    qp->user = arg->user;
    //記錄對方信息
    qp->peer = q->net->max_dist ?
        inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
        NULL;
}

/*
 * inet_frag_intern - insert a newly allocated queue into the hash table.
 * @nf:    per-netns fragment state
 * @qp_in: the freshly allocated queue
 * @f:     protocol fragment descriptor
 * @arg:   lookup key, passed to f->match() for the SMP recheck
 *
 * The bucket is obtained from get_frag_bucket_locked() and its chain_lock
 * is released here before returning.
 *
 * Returns @qp_in once it has been linked into the chain, or, on SMP, an
 * already present matching queue (with an extra reference) after marking
 * @qp_in complete and dropping it.
 */
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
                        struct inet_frag_queue *qp_in,
                        struct inet_frags *f,
                        void *arg)
{
    struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
    struct inet_frag_queue *qp;

#ifdef CONFIG_SMP
    /* With SMP race we have to recheck hash table, because
     * such entry could have been created on other cpu before
     * we acquired hash bucket lock.
     */
    hlist_for_each_entry(qp, &hb->chain, list) {
        if (qp->net == nf && f->match(qp, arg)) {
            atomic_inc(&qp->refcnt);
            spin_unlock(&hb->chain_lock);
            qp_in->flags |= INET_FRAG_COMPLETE;
            inet_frag_put(qp_in, f);
            return qp;
        }
    }
#endif
    qp = qp_in;
    /* Arm the expiry timer; an extra reference is taken when mod_timer()
     * returns 0 (presumably: the timer was not pending before, so this
     * reference belongs to the timer - see the kernel timer API).
     */
    if (!mod_timer(&qp->timer, jiffies + nf->timeout))
        atomic_inc(&qp->refcnt);

    /* One more reference for the hash chain the queue is linked into. */
    atomic_inc(&qp->refcnt);
    hlist_add_head(&qp->list, &hb->chain);

    spin_unlock(&hb->chain_lock);

    return qp;
}
View Code

 1.2.3 將分片數據包加入重組隊列(ip_frag_queue)

 

/*
 * ip_frag_queue - add a new segment (fragment) to an existing queue.
 * @qp:  reassembly queue the fragment belongs to
 * @skb: the received fragment
 *
 * Checks the fragment's position against what the queue already holds,
 * trims any overlap with its neighbours, links the skb into the
 * offset-ordered fragment list and updates the queue's accounting.  Once
 * both the first and the last fragment have been seen and meat == len,
 * ip_frag_reasm() is called.
 *
 * Returns the result of ip_frag_reasm() when the datagram is complete,
 * -EINPROGRESS when more fragments are needed, or a negative errno after
 * freeing @skb on failure.
 */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
    struct sk_buff *prev, *next;
    struct net_device *dev;
    unsigned int fragsize;
    int flags, offset;
    int ihl, end;
    int err = -ENOENT;
    u8 ecn;

    /* The queue is already marked complete: free this fragment and return. */
    if (qp->q.flags & INET_FRAG_COMPLETE)
        goto err;
/*
 * Only for skbs without IPSKB_FRAG_COMPLETE set: if ip_frag_too_far()
 * flags the queue and the subsequent ip_frag_reinit() fails, give the
 * queue up.  Original author's note: fragments that were not generated
 * locally are checked by ip_frag_too_far() for a possible DoS attack, and
 * ip_frag_reinit() releases all fragments held so far.
 */
    if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
        unlikely(ip_frag_too_far(qp)) &&
        unlikely(err = ip_frag_reinit(qp))) {
        /* Original note: ipq_kill() removes the ipq from the hash table,
         * stops its timer, drops a reference and marks it complete; only
         * a complete queue may be freed.
         */
        ipq_kill(qp);
        goto err;
    }

    ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
    offset = ntohs(ip_hdr(skb)->frag_off);
    flags = offset & ~IP_OFFSET;
    offset &= IP_OFFSET;
    offset <<= 3;        /* offset is in 8-byte chunks */
    ihl = ip_hdrlen(skb);
/* At this point the flag bits, the fragment offset and the header length
 * have been extracted from the IP header. */
    /* Determine the position of this fragment. */
    end = offset + skb->len - skb_network_offset(skb) - ihl;
    err = -EINVAL;

    /* Is this the final fragment?
     * If so, validate it against what the queue already knows.
     */
    if ((flags & IP_MF) == 0) {
        /* If we already have some bits beyond end
         * or have different end, the segment is corrupted.
         * I.e. the end lies before data already recorded, or a last
         * fragment was seen earlier and its end differs from this one.
         */
        if (end < qp->q.len ||
            ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
            goto err;
        qp->q.flags |= INET_FRAG_LAST_IN;
        qp->q.len = end;
        /* Checks passed: LAST_IN is set and the full data length stored. */
    } else {
        if (end&7) { /* non-final fragments must end on an 8-byte boundary */
            end &= ~7;
            if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                skb->ip_summed = CHECKSUM_NONE;
        }
        if (end > qp->q.len) {
            /* This fragment ends beyond everything recorded so far.
             * Some bits beyond end -> corruption.
             * If the last fragment has already been received, that is
             * an error.
             */
            if (qp->q.flags & INET_FRAG_LAST_IN)
                goto err;
            qp->q.len = end; /* record the new furthest end position */
        }
    }
    if (end == offset) /* zero-length payload */
        goto err;

    /* Strip everything up to and including the IP header. */
    err = -ENOMEM;
    if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
        goto err;
    /* Trim the skb to end - offset bytes, the usable IP payload length. */
    err = pskb_trim_rcsum(skb, end - offset);
    if (err)
        goto err;

    /* Find out which fragments are in front and at the back of us
     * in the chain of fragments so far.  We must know where to put
     * this fragment, right?
     */
    prev = qp->q.fragments_tail;
    if (!prev || FRAG_CB(prev)->offset < offset) {
        next = NULL;
        goto found;
    }
    prev = NULL;
    for (next = qp->q.fragments; next != NULL; next = next->next) {
        if (FRAG_CB(next)->offset >= offset)
            break;    /* bingo! */
        prev = next;
    }
    /* The position in the list is now known.  Fragments may arrive in any
     * order; the queue keeps them sorted by fragment offset.
     */

found:
    /* We found where to put this one.  Check for overlap with
     * preceding fragment, and, if needed, align things so that
     * any overlaps are eliminated.
     */
    if (prev) {
        int i = (FRAG_CB(prev)->offset + prev->len) - offset;

        if (i > 0) { /* overlap: drop the overlapping head with pskb_pull */
            offset += i;
            err = -EINVAL;
            if (end <= offset)
                goto err;
            err = -ENOMEM;
            if (!pskb_pull(skb, i))
                goto err;
            if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                skb->ip_summed = CHECKSUM_NONE;
        }
    }

    err = -ENOMEM;
/*
 * Overlap with the following fragments may be partial or complete.
 * When the overlap covers a whole following fragment, that fragment is
 * unlinked and freed and the one after it is examined in turn; when it
 * covers only part of it, the head of that fragment is trimmed and the
 * loop ends.  Fragment offsets and the received-byte total (meat) are
 * adjusted along the way.
 */
    while (next && FRAG_CB(next)->offset < end) {
        int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

        if (i < next->len) {
            /* Eat head of the next overlapped fragment
             * and leave the loop. The next ones cannot overlap.
             */
            if (!pskb_pull(next, i))
                goto err;
            FRAG_CB(next)->offset += i;
            qp->q.meat -= i;
            if (next->ip_summed != CHECKSUM_UNNECESSARY)
                next->ip_summed = CHECKSUM_NONE;
            break;
        } else {
            struct sk_buff *free_it = next;

            /* Old fragment is completely overridden with
             * new one drop it.
             */
            next = next->next;

            if (prev)
                prev->next = next;
            else
                qp->q.fragments = next;

            qp->q.meat -= free_it->len;
            sub_frag_mem_limit(qp->q.net, free_it->truesize);
            kfree_skb(free_it);
        }
    }

    FRAG_CB(skb)->offset = offset; /* final offset of this fragment */

    /* Insert this fragment in the chain of fragments,
     * at the position determined above.
     */
    skb->next = next;
    if (!next)
        qp->q.fragments_tail = skb;
    if (prev)
        prev->next = skb;
    else
        qp->q.fragments = skb;

    dev = skb->dev;
    if (dev) {
        qp->iif = dev->ifindex;
        skb->dev = NULL;
    }
    qp->q.stamp = skb->tstamp; /* update the queue timestamp */
    qp->q.meat += skb->len; /* total payload bytes received so far */
    qp->ecn |= ecn;
    /* Charge the skb to the namespace's fragment memory accounting. */
    add_frag_mem_limit(qp->q.net, skb->truesize);
    if (offset == 0) /* this is the first fragment */
        qp->q.flags |= INET_FRAG_FIRST_IN;

    fragsize = skb->len + ihl;

    if (fragsize > qp->q.max_size)
        qp->q.max_size = fragsize;

    if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
        fragsize > qp->max_df_size)
        qp->max_df_size = fragsize;

    /* First and last fragment seen and no gaps left: reassemble. */
    if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
        qp->q.meat == qp->q.len) {
        unsigned long orefdst = skb->_skb_refdst;

        skb->_skb_refdst = 0UL;
        err = ip_frag_reasm(qp, prev, dev);
        skb->_skb_refdst = orefdst;
        return err;
    }

    skb_dst_drop(skb);
    return -EINPROGRESS;

err:
    kfree_skb(skb);
    return err;
}

 

 ip_frag_reasm 重組報文;

/* Build a new IP datagram from all its fragments. */
/*
 * Assembles the fragments once they have all arrived: this function is
 * called when every fragment of the original datagram is present.
 */
/*
 * ip_frag_reasm - glue the queued fragments into a single datagram.
 * @qp:   the completed reassembly queue
 * @prev: fragment preceding the one just received (NULL if it is the head)
 * @dev:  device stored in the reassembled skb's dev field
 *
 * Returns 0 on success; -EINVAL for an invalid ECN combination, -ENOMEM
 * when an skb cannot be cloned/allocated, or -E2BIG when the datagram
 * would exceed 65535 bytes.  Every failure bumps REASMFAILS.
 */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
             struct net_device *dev)
{
    struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
    struct iphdr *iph;
    struct sk_buff *fp, *head = qp->q.fragments;
    int len;
    int ihlen;
    int err;
    u8 ecn;
    /*
     * Reassembly is about to start.  Original note: ipq_kill() removes
     * this ipq from the hash table and deletes its timer.
     */
    ipq_kill(qp);

    ecn = ip_frag_ecn_table[qp->ecn];
    if (unlikely(ecn == 0xff)) {
        err = -EINVAL;
        goto out_fail;
    }
    /* Make the one we just received the head. */
    if (prev) {
        head = prev->next;
        fp = skb_clone(head, GFP_ATOMIC);
        if (!fp)
            goto out_nomem;

        fp->next = head->next;
        if (!fp->next)
            qp->q.fragments_tail = fp;
        prev->next = fp;

        skb_morph(head, qp->q.fragments);
        head->next = qp->q.fragments->next;

        consume_skb(qp->q.fragments);
        qp->q.fragments = head;
    }

    WARN_ON(!head);
    WARN_ON(FRAG_CB(head)->offset != 0);

    /* Allocate a new buffer for the datagram.
     * Compute the length of the original datagram; anything above
     * 65535 bytes is rejected.
     */
    ihlen = ip_hdrlen(head);
    len = ihlen + qp->q.len;

    err = -E2BIG;
    if (len > 65535)
        goto out_oversize;

    /* Head of list must not be cloned.
     * All fragments are attached to the first one, so the first one
     * must not be a clone; skb_unclone() is used to make sure of that.
     */
    if (skb_unclone(head, GFP_ATOMIC))
        goto out_nomem;

    /* If the first fragment is fragmented itself, we split
     * it to two chunks: the first with data and paged part
     * and the second, holding only fragments. */
    /*
     * The head must not carry its own frag_list in addition to its data.
     * If it does, a new zero-length skb takes over that frag_list and is
     * linked right after the head.
     */
    if (skb_has_frag_list(head)) {
        struct sk_buff *clone;
        int i, plen = 0;

        clone = alloc_skb(0, GFP_ATOMIC);
        if (!clone)
            goto out_nomem;
        clone->next = head->next;
        head->next = clone;
        skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
        skb_frag_list_init(head);
        for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
            plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
        clone->len = clone->data_len = head->data_len - plen;
        head->data_len -= clone->len;
        head->len -= clone->len;
        clone->csum = 0;
        clone->ip_summed = head->ip_summed;
        add_frag_mem_limit(qp->q.net, clone->truesize);
    }
    /*
     * Chain all remaining fragments onto the head's frag_list, then walk
     * them to accumulate lengths, truesize and the checksum state.
     */
    skb_shinfo(head)->frag_list = head->next;
    skb_push(head, head->data - skb_network_header(head));

    for (fp=head->next; fp; fp = fp->next) {
        head->data_len += fp->len;
        head->len += fp->len;
        if (head->ip_summed != fp->ip_summed)
            head->ip_summed = CHECKSUM_NONE;
        else if (head->ip_summed == CHECKSUM_COMPLETE)
            head->csum = csum_add(head->csum, fp->csum);
        head->truesize += fp->truesize;
    }
    /*
     * Uncharge the whole datagram from the fragment memory accounting,
     * then reset the header fields: total length, fragment offset and
     * flags.
     */
    sub_frag_mem_limit(qp->q.net, head->truesize);

    head->next = NULL;
    head->dev = dev;
    head->tstamp = qp->q.stamp;
    IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);

    iph = ip_hdr(head);
    iph->tot_len = htons(len);
    iph->tos |= ecn;

    /* When we set IP_DF on a refragmented skb we must also force a
     * call to ip_fragment to avoid forwarding a DF-skb of size s while
     * original sender only sent fragments of size f (where f < s).
     *
     * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
     * frag seen to avoid sending tiny DF-fragments in case skb was built
     * from one very small df-fragment and one large non-df frag.
     */
    if (qp->max_df_size == qp->q.max_size) {
        IPCB(head)->flags |= IPSKB_FRAG_PMTU;
        iph->frag_off = htons(IP_DF);
    } else {
        iph->frag_off = 0;
    }

    ip_send_check(iph);

    __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
    /*
     * All fragments now hang off the head skb, so the queue's own
     * fragment list is emptied.
     */
    qp->q.fragments = NULL;
    qp->q.fragments_tail = NULL;
    return 0;

out_nomem:
    net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
    err = -ENOMEM;
    goto out_fail;
out_oversize:
    net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
    __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
    return err;
}
View Code

 

1.4.4 ipq散列表重建(rehash)

 

/*
 * inet_frag_secret_rebuild - pick a new hash secret and rehash all queues.
 * @f: protocol fragment descriptor whose hash table is rebuilt
 *
 * Runs under the rnd_seqlock write side.  Does nothing when
 * inet_frag_may_rebuild() says a rebuild is not allowed yet.  Otherwise a
 * new random value is drawn and every queue whose hash no longer maps to
 * its current bucket is moved to the right one.
 */
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
    int i;

    write_seqlock_bh(&f->rnd_seqlock); /* sequence lock, write side */

    if (!inet_frag_may_rebuild(f))
        goto out;
    /* Draw a new random value for the hash computation. */
    get_random_bytes(&f->rnd, sizeof(u32));

    for (i = 0; i < INETFRAGS_HASHSZ; i++) {
        struct inet_frag_bucket *hb;
        struct inet_frag_queue *q;
        struct hlist_node *n;

        hb = &f->hash[i]; /* current bucket */
        spin_lock(&hb->chain_lock);

        hlist_for_each_entry_safe(q, n, &hb->chain, list) {
            unsigned int hval = inet_frag_hashfn(f, q);

            if (hval != i) { /* queue no longer belongs in this bucket */
                struct inet_frag_bucket *hb_dest;

                hlist_del(&q->list); /* unlink from the current bucket */

                /* Relink to new hash chain. */
                hb_dest = &f->hash[hval]; /* destination bucket */

                /* This is the only place where we take
                 * another chain_lock while already holding
                 * one.  As this will not run concurrently,
                 * we cannot deadlock on hb_dest lock below, if its
                 * already locked it will be released soon since
                 * other caller cannot be waiting for hb lock
                 * that we've taken above.
                 */
                spin_lock_nested(&hb_dest->chain_lock,
                         SINGLE_DEPTH_NESTING);
                /* Add the queue to the destination bucket's chain. */
                hlist_add_head(&q->list, &hb_dest->chain);
                spin_unlock(&hb_dest->chain_lock);
            }
        }
        spin_unlock(&hb->chain_lock);
    }
    /* Clear the rebuild request and record when the rebuild happened. */
    f->rebuild = false;
    f->last_rebuild_jiffies = jiffies;
out:
    write_sequnlock_bh(&f->rnd_seqlock);
}

 

1.4.5 超時IP分片的清除

會定時清除在規定時間內沒有完成重組的ipq及其所有的分片

/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 *
 * @arg is the struct inet_frag_queue pointer cast to unsigned long (this is
 * how the timer and the evictor both invoke the handler).  The queue lock
 * is held for the whole body and one reference is dropped via ipq_put()
 * before returning.
 */
static void ip_expire(unsigned long arg)
{
    struct ipq *qp;
    struct net *net;

    qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
    net = container_of(qp->q.net, struct net, ipv4.frags);

    spin_lock(&qp->q.lock);
    /* Queue already complete: nothing to do here, just drop our reference. */
    if (qp->q.flags & INET_FRAG_COMPLETE)
        goto out;

    ipq_kill(qp); /* take the queue out of the hash table */
    __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); /* statistics */

    /* Only when the queue is NOT being evicted is this counted as a
     * timeout and an ICMP error considered.
     */
    if (!inet_frag_evicting(&qp->q)) {
        struct sk_buff *head = qp->q.fragments;
        const struct iphdr *iph;
        int err;

        __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);

        /* Without the first fragment there is no header to report on. */
        if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
            goto out;

        rcu_read_lock();
        head->dev = dev_get_by_index_rcu(net, qp->iif);
        if (!head->dev)
            goto out_rcu_unlock;

        /* skb has no dst, perform route lookup again */
        iph = ip_hdr(head);
        err = ip_route_input_noref(head, iph->daddr, iph->saddr,
                       iph->tos, head->dev);
        if (err)
            goto out_rcu_unlock;

        /* Only an end host needs to send an ICMP
         * "Fragment Reassembly Timeout" message, per RFC792.
         */
        if (frag_expire_skip_icmp(qp->user) &&
            (skb_rtable(head)->rt_type != RTN_LOCAL))
            goto out_rcu_unlock;

        /* Send an ICMP "Fragment Reassembly Timeout" message. */
        icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
        rcu_read_unlock();
    }
out:
    spin_unlock(&qp->q.lock);
    ipq_put(qp);
}
View Code

 

 1.4.6 進行節點回收的工作隊列

為了控制ip組裝所占用的內存,設置了兩個閾值 low_thresh 和 high_thresh。當前ipq散列表所占用的內存存儲在 mem 變量中,這些變量存在如下結構中(netns_frags)

/*
 * struct netns_frags - per-network-namespace fragment memory accounting
 * and tunables.
 */
struct netns_frags {
    /* The percpu_counter "mem" need to be cacheline aligned.
     *  mem.count must not share cacheline with other writers
     */
    struct percpu_counter   mem ____cacheline_aligned_in_smp;

    /* sysctls */
    int            timeout;     /* reassembly timeout */
    int            high_thresh; /* upper limit on fragment memory */
    int            low_thresh;  /* lower limit on fragment memory */
    int            max_dist;
};
View Code

 

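當mem大於high_thresh 時,需要對散列表清理,直到mem值降低到low_thresh。這兩個值可以通過proc修改。(在上面列出的代碼中:inet_frag_find 在 mem 超過 low_thresh 時就會調度回收工作隊列,inet_frag_alloc 在 mem 超過 high_thresh 時拒絕創建新的隊列。)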

static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
    struct inet_frag_queue *fq;
    struct hlist_node *n;
    unsigned int evicted = 0;
    HLIST_HEAD(expired);

    spin_lock(&hb->chain_lock);
 /* 遍歷桶下的鏈表 */
    hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
        if (!inet_fragq_should_evict(fq))/* 未超過限定,無需回收 */
            continue;

        if (!del_timer(&fq->timer)) /* 定時器無法刪除 */
            continue;
/* 能夠回收的節點加入到臨時hash */
        hlist_add_head(&fq->list_evictor, &expired);
        ++evicted;
    }

    spin_unlock(&hb->chain_lock);
 /* 依次調用回收函數進行回收 */
    hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
        f->frag_expire((unsigned long) fq);

    return evicted;
}

/*
 * inet_frag_worker - work item that evicts fragment queues.
 * @work: the frags_work member embedded in a struct inet_frags
 *
 * Starting at f->next_bucket, scans up to INETFRAGS_EVICT_BUCKETS buckets
 * with bottom halves disabled, stopping early once more than
 * INETFRAGS_EVICT_MAX queues have been evicted, and remembers where to
 * resume.  Afterwards performs a pending hash rebuild if one is requested
 * and permitted.
 */
static void inet_frag_worker(struct work_struct *work)
{
    struct inet_frags *f = container_of(work, struct inet_frags, frags_work);
    unsigned int budget = INETFRAGS_EVICT_BUCKETS; /* buckets to scan this run */
    unsigned int evicted = 0;
    unsigned int i;

    BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

    local_bh_disable();

    /* Resume right after the bucket where the previous run stopped. */
    i = ACCESS_ONCE(f->next_bucket);
    while (budget) {
        /* Evict from this bucket and keep a running total. */
        evicted += inet_evict_bucket(f, &f->hash[i]);
        i = (i + 1) & (INETFRAGS_HASHSZ - 1);

        /* Enough queues evicted for one run: stop. */
        if (evicted > INETFRAGS_EVICT_MAX)
            break;

        --budget;
    }

    f->next_bucket = i;     /* where the next run starts */

    local_bh_enable();

    /* Rebuild the hash table when requested and allowed. */
    if (f->rebuild && inet_frag_may_rebuild(f))
        inet_frag_secret_rebuild(f);
}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM