初始化
iptable_nat_table_init函數通過調用ipt_register_table完成NAT表注冊和鈎子函數注冊的功能;該流程與iptable_filter的初始化流程一致,此處不再重復分析,詳情請移步<iptable_filter分析>;
1 static int __net_init iptable_nat_table_init(struct net *net) 2 { 3 struct ipt_replace *repl; 4 int ret; 5 6 /* nat表已經初始化過 */ 7 if (net->ipv4.nat_table) 8 return 0; 9 10 /* 分配初始化表,用於下面的注冊 */ 11 repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); 12 if (repl == NULL) 13 return -ENOMEM; 14 /* 表注冊,鈎子函數注冊 */ 15 ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, 16 nf_nat_ipv4_ops, &net->ipv4.nat_table); 17 kfree(repl); 18 return ret; 19 }
鈎子函數分析
鈎子函數以及鈎子點
nf_nat_ipv4_ops是NAT相關鈎子函數的數組,其調用順序和鈎子點見下面注釋;其中filter工作在DNAT和SNAT之間;
這幾個鈎子函數都會調用nf_nat_ipv4_fn來完成NAT轉換,本部分最後統一分析該函數;
1 /* 鈎子函數數組 */ 2 /* 順序 DNAT->filter->SNAT */ 3 /* 輸入本機 PRE_ROUTING(DNAT)->LOCAL_IN(SNAT) */ 4 /* 轉發 PRE_ROUTING(DNAT)->POST_ROUTING(SNAT) */ 5 /* 本機輸出 LOCAL_OUT(DNAT)->POST_ROUTING(SNAT) */ 6 static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { 7 /* Before packet filtering, change destination */ 8 { 9 .hook = iptable_nat_ipv4_in, 10 .pf = NFPROTO_IPV4, 11 .hooknum = NF_INET_PRE_ROUTING, 12 .priority = NF_IP_PRI_NAT_DST, /* DNAT */ 13 }, 14 /* After packet filtering, change source */ 15 { 16 .hook = iptable_nat_ipv4_out, 17 .pf = NFPROTO_IPV4, 18 .hooknum = NF_INET_POST_ROUTING, 19 .priority = NF_IP_PRI_NAT_SRC, /* SNAT */ 20 }, 21 /* Before packet filtering, change destination */ 22 { 23 .hook = iptable_nat_ipv4_local_fn, 24 .pf = NFPROTO_IPV4, 25 .hooknum = NF_INET_LOCAL_OUT, 26 .priority = NF_IP_PRI_NAT_DST, /* DNAT */ 27 }, 28 /* After packet filtering, change source */ 29 { 30 .hook = iptable_nat_ipv4_fn, 31 .pf = NFPROTO_IPV4, 32 .hooknum = NF_INET_LOCAL_IN, 33 .priority = NF_IP_PRI_NAT_SRC, /* SNAT */ 34 }, 35 };
iptable_nat_ipv4_in
函數工作在PRE_ROUTING鈎子點,進行DNAT轉換;
1 /* PRE_ROUTING,DNAT */ 2 static unsigned int iptable_nat_ipv4_in(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain); 7 }
nf_nat_ipv4_in函數在進行DNAT轉換之前記錄了目的地址,在進行轉換之後,如果目的地址發生了改變,則需要釋放skb中的路由緩存;NAT轉換過程調用nf_nat_ipv4_fn完成,步驟見下面的該函數分析;
1 /* PRE_ROUTING, DNAT */ 2 unsigned int 3 nf_nat_ipv4_in(void *priv, struct sk_buff *skb, 4 const struct nf_hook_state *state, 5 unsigned int (*do_chain)(void *priv, 6 struct sk_buff *skb, 7 const struct nf_hook_state *state, 8 struct nf_conn *ct)) 9 { 10 unsigned int ret; 11 /* 獲取目的地址 */ 12 __be32 daddr = ip_hdr(skb)->daddr; 13 14 /* DNAT轉換 */ 15 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 16 17 /* 轉換之后,目的地址發生變化,釋放路由緩存 */ 18 if (ret != NF_DROP && ret != NF_STOLEN && 19 daddr != ip_hdr(skb)->daddr) 20 skb_dst_drop(skb); 21 22 return ret; 23 }
iptable_nat_ipv4_fn
函數工作在LOCAL_IN鈎子點,進行SNAT轉換;NAT轉換過程調用nf_nat_ipv4_fn完成,步驟見下面的該函數分析;
1 /* LOCAL_IN,SNAT */ 2 static unsigned int iptable_nat_ipv4_fn(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain); 7 }
iptable_nat_ipv4_local_fn
函數工作在LOCAL_OUT鈎子點,進行DNAT轉換;
1 /* LOCAL_OUT,DNAT */ 2 static unsigned int iptable_nat_ipv4_local_fn(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain); 7 }
nf_nat_ipv4_local_fn函數在進行DNAT轉換之後,如果地址發生變化,則需要重新進行路由查找;NAT轉換過程調用nf_nat_ipv4_fn完成,步驟見下面的該函數分析;
1 unsigned int 2 nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, 3 const struct nf_hook_state *state, 4 unsigned int (*do_chain)(void *priv, 5 struct sk_buff *skb, 6 const struct nf_hook_state *state, 7 struct nf_conn *ct)) 8 { 9 const struct nf_conn *ct; 10 enum ip_conntrack_info ctinfo; 11 unsigned int ret; 12 int err; 13 14 /* root is playing with raw sockets. */ 15 if (skb->len < sizeof(struct iphdr) || 16 ip_hdrlen(skb) < sizeof(struct iphdr)) 17 return NF_ACCEPT; 18 19 /* DNAT轉換 */ 20 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 21 22 /* 轉換成功 */ 23 if (ret != NF_DROP && ret != NF_STOLEN && 24 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 25 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 26 27 /* ip地址發生變化 */ 28 if (ct->tuplehash[dir].tuple.dst.u3.ip != 29 ct->tuplehash[!dir].tuple.src.u3.ip) { 30 /* 重新查路由 */ 31 err = ip_route_me_harder(state->net, skb, RTN_UNSPEC); 32 if (err < 0) 33 ret = NF_DROP_ERR(err); 34 } 35 #ifdef CONFIG_XFRM 36 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 37 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && 38 ct->tuplehash[dir].tuple.dst.u.all != 39 ct->tuplehash[!dir].tuple.src.u.all) { 40 err = nf_xfrm_me_harder(state->net, skb, AF_INET); 41 if (err < 0) 42 ret = NF_DROP_ERR(err); 43 } 44 #endif 45 } 46 return ret; 47 }
iptable_nat_ipv4_out
函數工作在POST_ROUTING鈎子點,進行SNAT轉換;
1 /* POST_ROUTING,SNAT */ 2 static unsigned int iptable_nat_ipv4_out(void *priv, 3 struct sk_buff *skb, 4 const struct nf_hook_state *state) 5 { 6 return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain); 7 }
1 unsigned int 2 nf_nat_ipv4_out(void *priv, struct sk_buff *skb, 3 const struct nf_hook_state *state, 4 unsigned int (*do_chain)(void *priv, 5 struct sk_buff *skb, 6 const struct nf_hook_state *state, 7 struct nf_conn *ct)) 8 { 9 #ifdef CONFIG_XFRM 10 const struct nf_conn *ct; 11 enum ip_conntrack_info ctinfo; 12 int err; 13 #endif 14 unsigned int ret; 15 16 /* root is playing with raw sockets. */ 17 if (skb->len < sizeof(struct iphdr) || 18 ip_hdrlen(skb) < sizeof(struct iphdr)) 19 return NF_ACCEPT; 20 21 /* SNAT轉換 */ 22 ret = nf_nat_ipv4_fn(priv, skb, state, do_chain); 23 #ifdef CONFIG_XFRM 24 if (ret != NF_DROP && ret != NF_STOLEN && 25 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && 26 (ct = nf_ct_get(skb, &ctinfo)) != NULL) { 27 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 28 29 if ((ct->tuplehash[dir].tuple.src.u3.ip != 30 ct->tuplehash[!dir].tuple.dst.u3.ip) || 31 (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && 32 ct->tuplehash[dir].tuple.src.u.all != 33 ct->tuplehash[!dir].tuple.dst.u.all)) { 34 err = nf_xfrm_me_harder(state->net, skb, AF_INET); 35 if (err < 0) 36 ret = NF_DROP_ERR(err); 37 } 38 } 39 #endif 40 return ret; 41 }
公共函數nf_nat_ipv4_fn
nf_nat_ipv4_fn完成具體的SNAT或者DNAT的轉換流程,上面的四個鈎子函數都會調用該函數;
1 unsigned int 2 nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, 3 const struct nf_hook_state *state, 4 unsigned int (*do_chain)(void *priv, 5 struct sk_buff *skb, 6 const struct nf_hook_state *state, 7 struct nf_conn *ct)) 8 { 9 struct nf_conn *ct; 10 enum ip_conntrack_info ctinfo; 11 struct nf_conn_nat *nat; 12 /* maniptype == SRC for postrouting. */ 13 /* 獲取是進行DNAT還是SNAT,其中PRE_ROUTING和LOCAL_OUT進行DNAT,LOCAL_IN和POST_ROUTING進行SNAT */ 14 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); 15 16 /* 獲取skb關聯的連接跟蹤sf_conn */ 17 ct = nf_ct_get(skb, &ctinfo); 18 /* Can't track? It's not due to stress, or conntrack would 19 * have dropped it. Hence it's the user's responsibilty to 20 * packet filter it out, or implement conntrack/NAT for that 21 * protocol. 8) --RR 22 */ 23 /* 沒有,返回accpet */ 24 if (!ct) 25 return NF_ACCEPT; 26 27 /* 獲取NAT擴展 */ 28 nat = nfct_nat(ct); 29 30 /* 判斷連接跟蹤狀態 */ 31 switch (ctinfo) { 32 /* 關聯連接(或者icmp錯誤)或者關聯連接的應答 */ 33 case IP_CT_RELATED: 34 case IP_CT_RELATED_REPLY: 35 /* icmp協議的NAT操作 */ 36 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { 37 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, 38 state->hook)) 39 return NF_DROP; 40 else 41 return NF_ACCEPT; 42 } 43 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ 44 case IP_CT_NEW: 45 /* Seen it before? This can happen for loopback, retrans, 46 * or local packets. 47 */ 48 /* 尚未進行過NAT轉換 */ 49 if (!nf_nat_initialized(ct, maniptype)) { 50 unsigned int ret; 51 52 /* 進行規則匹配 */ 53 ret = do_chain(priv, skb, state, ct); 54 if (ret != NF_ACCEPT) 55 return ret; 56 57 /* 打NAT轉換標記 */ 58 if (nf_nat_initialized(ct, HOOK2MANIP(state->hook))) 59 break; 60 61 /* 連接跟蹤進行NAT */ 62 ret = nf_nat_alloc_null_binding(ct, state->hook); 63 if (ret != NF_ACCEPT) 64 return ret; 65 } 66 /* 進行過NAT轉換 */ 67 else { 68 pr_debug("Already setup manip %s for ct %p\n", 69 maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", 70 ct); 71 /* 出接口發生改變 */ 72 if (nf_nat_oif_changed(state->hook, ctinfo, nat, 73 state->out)) 74 goto oif_changed; 75 } 76 break; 77 78 default: 79 /* ESTABLISHED */ 80 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || 81 ctinfo == IP_CT_ESTABLISHED_REPLY); 82 /* 出接口發生改變 */ 83 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) 84 goto oif_changed; 85 } 86 87 /* skb數據包進行NAT轉換修改 */ 88 return nf_nat_packet(ct, ctinfo, state->hook, skb); 89 90 oif_changed: 91 nf_ct_kill_acct(ct, ctinfo, skb); 92 return NF_DROP; 93 }
1 unsigned int 2 nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 3 { 4 return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); 5 }
1 static unsigned int 2 __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) 3 { 4 /* Force range to this IP; let proto decide mapping for 5 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 6 * Use reply in case it's already been mangled (eg local packet). 7 */ 8 /* 使用應答方向的ip地址,LOCAL_OUT會先經過mangle,可能改變了 */ 9 union nf_inet_addr ip = 10 (manip == NF_NAT_MANIP_SRC ? 11 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : 12 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); 13 14 /* 設置range */ 15 struct nf_nat_range range = { 16 .flags = NF_NAT_RANGE_MAP_IPS, 17 .min_addr = ip, 18 .max_addr = ip, 19 }; 20 21 /* 進行NAT轉換 */ 22 return nf_nat_setup_info(ct, &range, manip); 23 }
1 unsigned int 2 nf_nat_setup_info(struct nf_conn *ct, 3 const struct nf_nat_range *range, 4 enum nf_nat_manip_type maniptype) 5 { 6 struct nf_conntrack_tuple curr_tuple, new_tuple; 7 8 /* Can't setup nat info for confirmed ct. */ 9 /* 已經確認的,返回accpet */ 10 if (nf_ct_is_confirmed(ct)) 11 return NF_ACCEPT; 12 13 NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || 14 maniptype == NF_NAT_MANIP_DST); 15 BUG_ON(nf_nat_initialized(ct, maniptype)); 16 17 /* What we've got will look like inverse of reply. Normally 18 * this is what is in the conntrack, except for prior 19 * manipulations (future optimization: if num_manips == 0, 20 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 21 */ 22 /* 從應答tuple反向得到當前tuple */ 23 nf_ct_invert_tuplepr(&curr_tuple, 24 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 25 26 /* 根據當前tuple和range得到NAT轉換之后的的tuple */ 27 get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); 28 29 /* NAT轉換之后和之前的tuple不同 */ 30 if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { 31 struct nf_conntrack_tuple reply; 32 33 /* Alter conntrack table so will recognize replies. */ 34 /* 通過新tuple得到reply_tuple */ 35 nf_ct_invert_tuplepr(&reply, &new_tuple); 36 /* 加入到reply hash */ 37 nf_conntrack_alter_reply(ct, &reply); 38 39 /* 此時tuple類似如下 */ 40 /* 41 //內網10.1通過100.1訪問200.1,經過SNAT之后得到tuple 42 tuple SNAT(10.1->200.1, 200.1->100.1) 43 44 //外網300.1通過100.1訪問20.1,經過DNAT之后,得到tuple 45 tuple DNAT(300.1->100.1, 20.1->300.1) 46 */ 47 48 /* Non-atomic: we own this at the moment. 
*/ 49 /* 更新狀態需要做NAT */ 50 if (maniptype == NF_NAT_MANIP_SRC) 51 ct->status |= IPS_SRC_NAT; 52 else 53 ct->status |= IPS_DST_NAT; 54 55 /* 擴展項的調整 */ 56 if (nfct_help(ct)) 57 if (!nfct_seqadj_ext_add(ct)) 58 return NF_DROP; 59 } 60 61 /* SNAT */ 62 if (maniptype == NF_NAT_MANIP_SRC) { 63 struct nf_nat_conn_key key = { 64 .net = nf_ct_net(ct), 65 .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 66 .zone = nf_ct_zone(ct), 67 }; 68 int err; 69 70 /* 加入到nf_nat_bysource_table */ 71 err = rhltable_insert_key(&nf_nat_bysource_table, 72 &key, 73 &ct->nat_bysource, 74 nf_nat_bysource_params); 75 if (err) 76 return NF_DROP; 77 } 78 79 /* It's done. */ 80 /* NAT轉換完成 */ 81 if (maniptype == NF_NAT_MANIP_DST) 82 ct->status |= IPS_DST_NAT_DONE; 83 else 84 ct->status |= IPS_SRC_NAT_DONE; 85 86 return NF_ACCEPT; 87 }
1 /* 根據orig_tuple和range得到NAT轉換之后的tuple */ 2 static void 3 get_unique_tuple(struct nf_conntrack_tuple *tuple, 4 const struct nf_conntrack_tuple *orig_tuple, 5 const struct nf_nat_range *range, 6 struct nf_conn *ct, 7 enum nf_nat_manip_type maniptype) 8 { 9 const struct nf_conntrack_zone *zone; 10 const struct nf_nat_l3proto *l3proto; 11 const struct nf_nat_l4proto *l4proto; 12 struct net *net = nf_ct_net(ct); 13 14 zone = nf_ct_zone(ct); 15 16 rcu_read_lock(); 17 18 /* 查找l3proto和l4proto */ 19 l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); 20 l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num, 21 orig_tuple->dst.protonum); 22 23 /* 1) If this srcip/proto/src-proto-part is currently mapped, 24 * and that same mapping gives a unique tuple within the given 25 * range, use that. 26 * 27 * This is only required for source (ie. NAT/masq) mappings. 28 * So far, we don't do local source mappings, so multiple 29 * manips not an issue. 30 */ 31 /* SNAT && 沒有打RANDOM_ALL標記 */ 32 if (maniptype == NF_NAT_MANIP_SRC && 33 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 34 /* try the original tuple first */ 35 /* 查看orig_tuple是否滿足范圍要求 */ 36 if (in_range(l3proto, l4proto, orig_tuple, range)) { 37 /* tuple尚未被使用 */ 38 if (!nf_nat_used_tuple(orig_tuple, ct)) { 39 /* 使用原tuple */ 40 *tuple = *orig_tuple; 41 goto out; 42 } 43 } 44 /* ori_range不滿足要求,則從bysource_table中查找一個滿足范圍的tuple */ 45 else if (find_appropriate_src(net, zone, l3proto, l4proto, 46 orig_tuple, tuple, range)) { 47 pr_debug("get_unique_tuple: Found current src map\n"); 48 /* tuple尚未被使用 */ 49 if (!nf_nat_used_tuple(tuple, ct)) 50 goto out; 51 } 52 } 53 54 /* 從給定range中選擇一個最少使用的組合 */ 55 /* 2) Select the least-used IP/proto combination in the given range */ 56 *tuple = *orig_tuple; 57 find_best_ips_proto(zone, tuple, range, ct, maniptype); 58 59 /* 3) The per-protocol part of the manip is made to map into 60 * the range to make a unique tuple. 
61 */ 62 63 /* Only bother mapping if it's not already in range and unique */ 64 /* 沒有打RANDOM_ALL標記 */ 65 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 66 /* 有SPECIFIED標記,對端口號進行檢查 */ 67 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { 68 /* 端口號已經在范圍之內&&(端口最小最大范圍相等||tuple沒有使用) */ 69 if (l4proto->in_range(tuple, maniptype, 70 &range->min_proto, 71 &range->max_proto) && 72 (range->min_proto.all == range->max_proto.all || 73 !nf_nat_used_tuple(tuple, ct))) 74 goto out; 75 } 76 /* 沒有SPECIFIED標記,端口號不變,tuple沒有被使用 */ 77 else if (!nf_nat_used_tuple(tuple, ct)) { 78 goto out; 79 } 80 } 81 82 /* Last change: get protocol to try to obtain unique tuple. */ 83 /* 隨機選擇端口號 */ 84 l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); 85 out: 86 rcu_read_unlock(); 87 }
1 unsigned int nf_nat_packet(struct nf_conn *ct, 2 enum ip_conntrack_info ctinfo, 3 unsigned int hooknum, 4 struct sk_buff *skb) 5 { 6 const struct nf_nat_l3proto *l3proto; 7 const struct nf_nat_l4proto *l4proto; 8 /* 獲取方向 */ 9 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 10 unsigned long statusbit; 11 /* 獲取進行SNAT還是DNAT */ 12 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); 13 14 /* 設置NAT標記 */ 15 if (mtype == NF_NAT_MANIP_SRC) 16 statusbit = IPS_SRC_NAT; 17 else 18 statusbit = IPS_DST_NAT; 19 20 /* Invert if this is reply dir. */ 21 /* 應答方向需要取反 */ 22 if (dir == IP_CT_DIR_REPLY) 23 statusbit ^= IPS_NAT_MASK; 24 25 /* Non-atomic: these bits don't change. */ 26 27 /* 需要做NAT */ 28 if (ct->status & statusbit) { 29 struct nf_conntrack_tuple target; 30 31 /* We are aiming to look like inverse of other direction. */ 32 /* 獲取目標tuple */ 33 /* 34 //內網10.1通過100.1訪問200.1,經過SNAT之后得到tuple 35 tuple SNAT(10.1->200.1, 200.1->100.1) 36 37 //外網300.1通過100.1訪問20.1,經過DNAT之后,得到tuple 38 tuple DNAT(300.1->100.1, 20.1->300.1) 39 */ 40 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); 41 42 /* 獲取l3proto,l4proto */ 43 l3proto = __nf_nat_l3proto_find(target.src.l3num); 44 l4proto = __nf_nat_l4proto_find(target.src.l3num, 45 target.dst.protonum); 46 47 /* 將ip地址和端口的NAT轉換結果寫入skb */ 48 if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype)) 49 return NF_DROP; 50 } 51 return NF_ACCEPT; 52 }