概述
ip_fragment函數用於判斷是否可以進行分片:若IP頭沒有設置DF標記,則直接進入分片流程;若設置了DF標記,則繼續判斷——如果不允許本地分片(skb->ignore_df未置位),或者收到的最大分片長度(frag_max_size)大於MTU,則回復ICMP「目的不可達-需要分片」差錯報文,釋放skb並返回-EMSGSIZE;其餘情況仍然需要走分片流程。
ip_do_fragment是詳細的分片流程,整個過程分為快速分片和慢速分片兩種:如果存在分片列表frag_list,並且通過了合法性檢查,則走快速路徑,為每個分片複製IP頭並設置偏移等信息之後,逐個發送出去;如果不存在分片列表,或者分片列表檢查失敗,則走慢速路徑,慢速路徑會根據MTU大小,對整個數據進行重新劃分,為每個分片分配新的skb,進行數據拷貝,設置IP頭等信息,然後發送出去。
源碼分析
1 static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 2 unsigned int mtu, 3 int (*output)(struct net *, struct sock *, struct sk_buff *)) 4 { 5 struct iphdr *iph = ip_hdr(skb); 6 7 /* 如果沒有DF標記,則進行分片 */ 8 if ((iph->frag_off & htons(IP_DF)) == 0) 9 return ip_do_fragment(net, sk, skb, output); 10 11 /* 有DF標記則繼續判斷 */ 12 13 /* 不允許本地分片 || 分片最大長度>MTU */ 14 if (unlikely(!skb->ignore_df || 15 (IPCB(skb)->frag_max_size && 16 IPCB(skb)->frag_max_size > mtu))) { 17 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); 18 /* ICMP錯誤 */ 19 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 20 htonl(mtu)); 21 /* 釋放skb */ 22 kfree_skb(skb); 23 return -EMSGSIZE; 24 } 25 26 /* 其他情況,繼續分片 */ 27 return ip_do_fragment(net, sk, skb, output); 28 }
1 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 2 int (*output)(struct net *, struct sock *, struct sk_buff *)) 3 { 4 struct iphdr *iph; 5 int ptr; 6 struct sk_buff *skb2; 7 unsigned int mtu, hlen, left, len, ll_rs; 8 int offset; 9 __be16 not_last_frag; 10 struct rtable *rt = skb_rtable(skb); 11 int err = 0; 12 13 /* for offloaded checksums cleanup checksum before fragmentation */ 14 /* PARTIAL類型需要清除校驗和 */ 15 if (skb->ip_summed == CHECKSUM_PARTIAL && 16 (err = skb_checksum_help(skb))) 17 goto fail; 18 19 /* 20 * Point into the IP datagram header. 21 */ 22 23 iph = ip_hdr(skb); 24 25 /* 獲取mtu */ 26 mtu = ip_skb_dst_mtu(sk, skb); 27 28 /* 接收到的最大分片長度 < mtu,則將mtu設置為該值 */ 29 if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) 30 mtu = IPCB(skb)->frag_max_size; 31 32 /* 33 * Setup starting values. 34 */ 35 36 hlen = iph->ihl * 4; 37 mtu = mtu - hlen; /* Size of data space */ 38 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; 39 40 /* When frag_list is given, use it. First, check its validity: 41 * some transformers could create wrong frag_list or break existing 42 * one, it is not prohibited. In this case fall back to copying. 43 * 44 * LATER: this step can be merged to real generation of fragments, 45 * we can switch to copy when see the first bad fragment. 46 */ 47 /* 有分片列表 */ 48 if (skb_has_frag_list(skb)) { 49 struct sk_buff *frag, *frag2; 50 51 /* 線性區域和分頁區的數據長度 */ 52 unsigned int first_len = skb_pagelen(skb); 53 54 /* 以下情況,進入慢路處理 */ 55 if (first_len - hlen > mtu || /* 分片長度>MTU */ 56 ((first_len - hlen) & 7) || /* 沒有8字節對齊 */ 57 ip_is_fragment(iph) || /* 是一個分片 */ 58 skb_cloned(skb)) /* 是克隆的 */ 59 goto slow_path; 60 61 /* 遍歷分片列表 */ 62 skb_walk_frags(skb, frag) { 63 /* Correct geometry. */ 64 /* 以下情況,恢復狀態,進入慢速路徑 */ 65 if (frag->len > mtu || /* 分片長度>mtu */ 66 ((frag->len & 7) && frag->next) || /* 除最后一個分片外,其余有非8字節對齊的 */ 67 skb_headroom(frag) < hlen) /* 頭部長度過小 */ 68 goto slow_path_clean; 69 70 /* Partially cloned skb? 
*/ 71 /* 克隆的,恢復狀態,進入慢速路徑 */ 72 if (skb_shared(frag)) 73 goto slow_path_clean; 74 75 BUG_ON(frag->sk); 76 77 /* 分片關聯控制塊 */ 78 if (skb->sk) { 79 frag->sk = skb->sk; 80 frag->destructor = sock_wfree; 81 } 82 83 /* 第一個skb的長度去掉當前分片的長度 */ 84 skb->truesize -= frag->truesize; 85 } 86 87 /* Everything is OK. Generate! */ 88 89 /* 現在分片沒問題了,設置分片信息 */ 90 err = 0; 91 offset = 0; 92 frag = skb_shinfo(skb)->frag_list; 93 skb_frag_list_init(skb); 94 skb->data_len = first_len - skb_headlen(skb); 95 skb->len = first_len; 96 iph->tot_len = htons(first_len); 97 iph->frag_off = htons(IP_MF); 98 ip_send_check(iph); 99 100 /* 循環設置分片信息,並發送 */ 101 for (;;) { 102 /* Prepare header of the next frame, 103 * before previous one went down. */ 104 /* 為每一片都拷貝ip頭,設置偏移信息 */ 105 if (frag) { 106 frag->ip_summed = CHECKSUM_NONE; 107 skb_reset_transport_header(frag); 108 __skb_push(frag, hlen); 109 skb_reset_network_header(frag); 110 memcpy(skb_network_header(frag), iph, hlen); 111 iph = ip_hdr(frag); 112 iph->tot_len = htons(frag->len); 113 ip_copy_metadata(frag, skb); 114 if (offset == 0) 115 ip_options_fragment(frag); 116 offset += skb->len - hlen; 117 iph->frag_off = htons(offset>>3); 118 if (frag->next) 119 iph->frag_off |= htons(IP_MF); 120 /* Ready, complete checksum */ 121 ip_send_check(iph); 122 } 123 124 /* 調用發送回調 */ 125 err = output(net, sk, skb); 126 127 if (!err) 128 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); 129 if (err || !frag) 130 break; 131 132 skb = frag; 133 frag = skb->next; 134 skb->next = NULL; 135 } 136 137 if (err == 0) { 138 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); 139 return 0; 140 } 141 142 /* 出錯,釋放分片 */ 143 while (frag) { 144 skb = frag->next; 145 kfree_skb(frag); 146 frag = skb; 147 } 148 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); 149 return err; 150 151 slow_path_clean: 152 /* 將分片恢復原狀態 */ 153 skb_walk_frags(skb, frag2) { 154 if (frag2 == frag) 155 break; 156 frag2->sk = NULL; 157 frag2->destructor = NULL; 158 skb->truesize += frag2->truesize; 159 } 160 } 161 162 slow_path: 
163 /* 慢速分片路徑 */ 164 165 166 iph = ip_hdr(skb); 167 168 /* 除去首部的剩余空間 */ 169 left = skb->len - hlen; /* Space per frame */ 170 ptr = hlen; /* Where to start from */ 171 172 /* 二層頭部空間 */ 173 ll_rs = LL_RESERVED_SPACE(rt->dst.dev); 174 175 /* 176 * Fragment the datagram. 177 */ 178 179 /* 初始化mf和offset */ 180 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; 181 not_last_frag = iph->frag_off & htons(IP_MF); 182 183 /* 184 * Keep copying data until we run out. 185 */ 186 187 /* 開始分片了 */ 188 while (left > 0) { 189 /* len初始為剩余長度 */ 190 len = left; 191 /* IF: it doesn't fit, use 'mtu' - the data space left */ 192 /* 根據mtu確認長度 */ 193 if (len > mtu) 194 len = mtu; 195 /* IF: we are not sending up to and including the packet end 196 then align the next start on an eight byte boundary */ 197 /* 除最后分片外,其余8字節對齊 */ 198 if (len < left) { 199 len &= ~7; 200 } 201 202 /* Allocate buffer */ 203 /* 分配skb */ 204 skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); 205 if (!skb2) { 206 err = -ENOMEM; 207 goto fail; 208 } 209 210 /* 211 * Set up data on packet 212 */ 213 214 /* 拷貝元數據 */ 215 ip_copy_metadata(skb2, skb); 216 217 /* 預留空間,設置頭部偏移 */ 218 skb_reserve(skb2, ll_rs); 219 skb_put(skb2, len + hlen); 220 skb_reset_network_header(skb2); 221 skb2->transport_header = skb2->network_header + hlen; 222 223 /* 224 * Charge the memory for the fragment to any owner 225 * it might possess 226 */ 227 /* 關聯sk */ 228 if (skb->sk) 229 skb_set_owner_w(skb2, skb->sk); 230 231 /* 232 * Copy the packet header into the new buffer. 233 */ 234 235 /* 拷貝頭部 */ 236 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); 237 238 /* 239 * Copy a block of the IP datagram. 240 */ 241 /* 拷貝數據 */ 242 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) 243 BUG(); 244 left -= len; 245 246 /* 247 * Fill in the new header fields. 
248 */ 249 iph = ip_hdr(skb2); 250 251 /* 設置偏移 *// 252 iph->frag_off = htons((offset >> 3)); 253 254 /* 轉發的數據包,帶有FRAG_PMTU標記,則打上DF */ 255 if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) 256 iph->frag_off |= htons(IP_DF); 257 258 /* ANK: dirty, but effective trick. Upgrade options only if 259 * the segment to be fragmented was THE FIRST (otherwise, 260 * options are already fixed) and make it ONCE 261 * on the initial skb, so that all the following fragments 262 * will inherit fixed options. 263 */ 264 /* 第一個分片包含ip選項 */ 265 if (offset == 0) 266 ip_options_fragment(skb); 267 268 /* 269 * Added AC : If we are fragmenting a fragment that's not the 270 * last fragment then keep MF on each bit 271 */ 272 /* 不是最后分片需要設定MF標記 */ 273 if (left > 0 || not_last_frag) 274 iph->frag_off |= htons(IP_MF); 275 276 /* 指針和偏移更新 */ 277 ptr += len; 278 offset += len; 279 280 /* 281 * Put this fragment into the sending queue. 282 */ 283 /* 設置數據長度 */ 284 iph->tot_len = htons(len + hlen); 285 286 /* 校驗和 */ 287 ip_send_check(iph); 288 289 /* 發送分片 */ 290 err = output(net, sk, skb2); 291 if (err) 292 goto fail; 293 294 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); 295 } 296 297 /* 分片完成並發送,釋放skb */ 298 consume_skb(skb); 299 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); 300 return err; 301 302 fail: 303 304 /* 出錯,釋放skb */ 305 kfree_skb(skb); 306 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); 307 return err; 308 }