1. Introduction
This document is a summary compiled from material on the Linux bridge found online, together with notes from reading the Linux kernel source. Given the limited time, the analysis may have gaps; corrections are welcome.
Anyone who has worked with Linux networking will be familiar with bridging. In short, a bridge does two essential things:
1. MAC learning: initially the bridge has no address-to-port mappings and must flood frames like a hub, but for every frame it handles it notes which port the source MAC arrived on, gradually building an address-port table (the CAM table).
2. Frame forwarding: for every frame it forwards, the bridge extracts the destination MAC address and looks it up in its address-port table (CAM table) to decide which port the frame should leave through.
This document aims to give the reader a complete picture of the Linux bridge, focusing on the bridge definition, bridge management, the data path and port-MAC mapping management, as well as the bridge's Netfilter. Since the bridge on the ap121 does not enable STP, STP is not covered in depth; only a brief introduction is given near the end of the document.
2. Bridge Definition and Bridge Management
Using a bridge on Linux is very simple; only two things need to be done. First, enable the CONFIG_BRIDGE or CONFIG_BRIDGE_MODULE option when building the kernel; second, install the brctl tool. The first step makes the kernel protocol stack support bridging; the second installs the userspace tool, which configures the bridge through a series of ioctl calls. The commands most often used during our development:
brctl addbr br0 (create a bridge br0, which also creates the virtual netdevice br0 in the kernel)
brctl addif br0 eth0
brctl addif br0 ath0
brctl addif br0 ath1 (add the interfaces eth0, ath0 and ath1 to bridge br0)
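Once the bridge is up, it can be checked with brctl show; typical bridge-utils output looks roughly like the following (illustrative, not captured from a device; the bridge id derives from the bridge MAC):
brctl show
bridge name     bridge id               STP enabled     interfaces
br0             8000.xxxxxxxxxxxx       no              eth0
                                                        ath0
                                                        ath1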
The goal of this chapter is to work out how these commands are implemented, and take effect, inside the kernel.
2.1 Bridge Definition
As usual, we first get familiar with the important bridge-related data structures, to ease the discussion that follows. The structures closely tied to the bridge are: the bridge itself (net_bridge), the bridge port (net_bridge_port), and the bridge port-MAC mapping entry (net_bridge_fdb_entry). The bridge itself is also a virtual network device (net_device). net_device is a large structure that we will not reproduce here; for a detailed description see the network driver chapters of "Linux Device Drivers". Below we introduce the data structures for the bridge, the port, and the port-MAC mapping entry.
The bridge definition:
struct net_bridge
{
// spinlock
spinlock_t lock;
// list of all ports on the bridge; each element is a net_bridge_port
struct list_head port_list;
// the bridge creates a virtual device for management; its MAC address is assigned dynamically, usually the MAC of one of the physical ports in the bridge group
struct net_device *dev;
// this lock protects the hash table below
spinlock_t hash_lock;
// hash table holding the forwarding database (the product of address learning, hence the hash for fast lookup); each element is a net_bridge_fdb_entry
struct hlist_head hash[BR_HASH_SIZE];
// this list is unused
struct list_head age_list;
unsigned long feature_mask;
#ifdef CONFIG_BRIDGE_NETFILTER
struct rtable fake_rtable;
#endif
unsigned long flags;
#define BR_SET_MAC_ADDR 0x00000001
// STP-related fields
bridge_id designated_root;
bridge_id bridge_id;
u32 root_path_cost;
unsigned long max_age;
unsigned long hello_time;
unsigned long forward_delay;
unsigned long bridge_max_age;
unsigned long ageing_time;
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
u8 group_addr[ETH_ALEN];
u16 root_port;
// STP mode currently in use
enum {
BR_NO_STP, /* no spanning tree */
BR_KERNEL_STP, /* old STP in kernel */
BR_USER_STP, /* new RSTP in userspace */
} stp_enabled;
unsigned char topology_change;
unsigned char topology_change_detected;
// timers used by STP
struct timer_list hello_timer;
struct timer_list tcn_timer;
struct timer_list topology_change_timer;
struct timer_list gc_timer;
struct kobject *ifobj;
};
The bridge port structure:
struct net_bridge_port
{
// the bridge this port belongs to
struct net_bridge *br;
// the physical device attached to this port
struct net_device *dev;
// list node linking the ports of the same bridge
struct list_head list;
// STP-related parameters
u8 priority;
u8 state;
u16 port_no;
unsigned char topology_change_ack;
unsigned char config_pending;
port_id port_id;
port_id designated_port;
bridge_id designated_root;
bridge_id designated_bridge;
u32 path_cost;
u32 designated_cost;
// port timers, i.e. the STP timeout timers
struct timer_list forward_delay_timer;
struct timer_list hold_timer;
struct timer_list message_age_timer;
struct kobject kobj;
struct rcu_head rcu;
};
The bridge port-MAC mapping entry:
struct net_bridge_fdb_entry
{
// list pointers linking entries in the CAM table
struct hlist_node hlist;
// the bridge port (this field and the MAC address below are the two key fields)
struct net_bridge_port *dst;
// used only by the RCU deferred-free path
struct rcu_head rcu;
// reference count
atomic_t use_count;
unsigned long ageing_timer;
// MAC address
mac_addr addr;
// whether this is a local (own) MAC address
unsigned char is_local;
// whether this is a static entry
unsigned char is_static;
};
The relationship between net_bridge, net_bridge_port and net_bridge_fdb_entry can be illustrated as follows:
Figure: relationship between the key data structures
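To tie the picture together, here is a minimal sketch of a hypothetical debug helper (not kernel code; the function name br_dump is ours) that walks all three structures, assuming rcu_read_lock() is held by the caller:
/* Dump a bridge: its ports, then the learned entries in the CAM table. */
static void br_dump(struct net_bridge *br)
{
	struct net_bridge_port *p;
	struct net_bridge_fdb_entry *f;
	struct hlist_node *h;
	int i;

	list_for_each_entry_rcu(p, &br->port_list, list)
		printk(KERN_INFO "port %u -> %s\n", p->port_no, p->dev->name);

	for (i = 0; i < BR_HASH_SIZE; i++)
		hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist)
			printk(KERN_INFO "%pM via port %u\n", f->addr.addr,
			       f->dst ? f->dst->port_no : 0);
}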
2.2 Bridge Module Initialization
In the kernel the bridge is implemented as a module; the source lives in ~/1xU/ap121/linux/kernels/mips-linux-2.6.31/net/bridge/br.c. The initialization function is br_init:
static int __init br_init(void)
{
int err;
// register the STP protocol
err = stp_proto_register(&br_stp_proto);
if (err < 0) {
printk(KERN_ERR "bridge: can't register sap for STP\n");
return err;
}
// initialize the CAM table
err = br_fdb_init();
if (err)
goto err_out;
// initialize the bridge's netfilter hooks
err = br_netfilter_init();
if (err)
goto err_out1;
// register on the netdevice notifier chain
err = register_netdevice_notifier(&br_device_notifier);
if (err)
goto err_out2;
err = br_netlink_init();
if (err)
goto err_out3;
// install the bridge do_ioctl handler, i.e. the ioctl interface exposed to userspace
brioctl_set(br_ioctl_deviceless_stub);
// install the bridge frame-processing entry point
br_handle_frame_hook = br_handle_frame;
// install the bridge CAM table accessor hooks
br_fdb_get_hook = br_fdb_get;
br_fdb_put_hook = br_fdb_put;
return 0;
// error handling omitted
…
return err;
}
2.3 Bridge Management
After the bridge module initializes, no actual bridge device has been instantiated yet; only the runtime environment is in place. For a bridge to actually operate, one must first be created.
2.3.1 Creating and Deleting a Bridge
As noted above, bridge initialization installed the bridge ioctl entry point br_ioctl_deviceless_stub. Its implementation:
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
return old_deviceless(net, uarg);
case SIOCBRADDBR:
case SIOCBRDELBR:
{
char buf[IFNAMSIZ];
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(buf, uarg, IFNAMSIZ))
return -EFAULT;
buf[IFNAMSIZ-1] = 0;
if (cmd == SIOCBRADDBR)
return br_add_bridge(net, buf); // add a bridge
return br_del_bridge(net, buf); // delete a bridge
}
}
return -EOPNOTSUPP;
}
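From userspace, brctl reaches this entry point with a plain ioctl on an ordinary socket. A minimal sketch, assuming the classic bridge-utils approach (error handling omitted):
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>

int add_bridge(const char *name)
{
	char buf[IFNAMSIZ];
	int fd = socket(AF_LOCAL, SOCK_STREAM, 0); /* any socket will do */
	int ret;

	strncpy(buf, name, IFNAMSIZ); /* bridge name, e.g. "br0" */
	ret = ioctl(fd, SIOCBRADDBR, buf); /* lands in br_ioctl_deviceless_stub */
	close(fd);
	return ret;
}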
When we run brctl addbr br0, the cmd passed in is SIOCBRADDBR, and control transfers to br_add_bridge:
int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int ret;
// create a network device
dev = new_bridge_dev(net, name);
if (!dev)
return -ENOMEM;
rtnl_lock();
// let the kernel finalize the device name
if (strchr(dev->name, '%')) {
ret = dev_alloc_name(dev, dev->name);
if (ret < 0)
goto out_free;
}
// register the network device
ret = register_netdevice(dev);
if (ret)
goto out_free;
// create the sysfs entries, for easy inspection and management
ret = br_sysfs_addbr(dev);
if (ret)
unregister_netdevice(dev);
out:
rtnl_unlock();
return ret;
out_free:
free_netdev(dev);
goto out;
}
The bridge is a virtual device, and it is registered just like a real physical network device (see the walkthrough of net_device creation and registration in the network driver chapters of "Linux Device Drivers"):
static struct net_device *new_bridge_dev(struct net *net, const char *name)
{
struct net_bridge *br;
struct net_device *dev;
// allocate the net_device, running the bridge device setup routine br_dev_setup
dev = alloc_netdev(sizeof(struct net_bridge), name,
br_dev_setup);
if (!dev)
return NULL;
// set the network namespace
dev_net_set(dev, net);
// the net_device private area holds the bridge; fill in the bridge-specific fields
br = netdev_priv(dev);
br->dev = dev;
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
spin_lock_init(&br->hash_lock);
br->bridge_id.prio[0] = 0x80;
br->bridge_id.prio[1] = 0x00;
memcpy(br->group_addr, br_group_address, ETH_ALEN);
br->feature_mask = dev->features;
br->stp_enabled = BR_NO_STP; // STP disabled by default
br->designated_root = br->bridge_id;
br->root_path_cost = 0;
br->root_port = 0;
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
br->topology_change = 0;
br->topology_change_detected = 0;
br->ageing_time = 300 * HZ;
br_netfilter_rtable_init(br);
INIT_LIST_HEAD(&br->age_list);
br_stp_timer_init(br);
return dev;
}
In more detail, the setup of the virtual bridge device:
void br_dev_setup(struct net_device *dev)
{
// assign a random MAC address
random_ether_addr(dev->dev_addr);
// the bridge device is an Ethernet device, so run the Ethernet setup
ether_setup(dev);
dev->netdev_ops = &br_netdev_ops;
dev->destructor = free_netdev;
SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
dev->tx_queue_len = 0;
dev->priv_flags = IFF_EBRIDGE;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
}
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
.get_tx_csum = ethtool_op_get_tx_csum,
.set_tx_csum = br_set_tx_csum,
.get_sg = ethtool_op_get_sg,
.set_sg = br_set_sg,
.get_tso = ethtool_op_get_tso,
.set_tso = br_set_tso,
.get_ufo = ethtool_op_get_ufo,
.get_flags = ethtool_op_get_flags,
};
static const struct net_device_ops br_netdev_ops = {
.ndo_open = br_dev_open, // open the device
.ndo_stop = br_dev_stop, // stop the device
.ndo_start_xmit = br_dev_xmit, // transmit data
.ndo_set_mac_address = br_set_mac_address, // set the MAC address
.ndo_set_multicast_list = br_dev_set_multicast_list, // set the multicast list
.ndo_change_mtu = br_change_mtu, // set the MTU
.ndo_do_ioctl = br_dev_ioctl, // device ioctl
};
That covers the whole process of creating and initializing a bridge. Deleting a bridge is essentially the reverse of the registration above: detach the ports, stop the timers, remove the sysfs entries, and unregister the virtual device:
static void del_br(struct net_bridge *br)
{
struct net_bridge_port *p, *n;
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
unregister_netdevice(br->dev);
}
2.3.2 Adding and Deleting Ports
Creating the bridge alone is not enough. A bridge in actual use needs real ports (physical interfaces) added to it, for example:
brctl addif br0 eth0
The application adds a physical interface to the bridge via ioctl, which lands in br_dev_ioctl (installed when the bridge was initialized). Code and commentary:
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
struct net_bridge *br = netdev_priv(dev);
switch(cmd) {
case SIOCDEVPRIVATE:
return old_dev_ioctl(dev, rq, cmd);
case SIOCBRADDIF:
case SIOCBRDELIF:
// add or delete a port depending on cmd
return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
}
pr_debug("Bridge does not support ioctl 0x%x\n", cmd);
return -EOPNOTSUPP;
}
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
struct net_device *dev;
int ret;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
dev = dev_get_by_index(dev_net(br->dev), ifindex);
if (dev == NULL)
return -EINVAL;
if (isadd)
ret = br_add_if(br, dev); // add a port
else
ret = br_del_if(br, dev); // delete a port
dev_put(dev);
return ret;
}
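For completeness, the userspace side of brctl addif is again a single ioctl; a sketch assuming the bridge-utils convention of putting the bridge name in ifr_name and the port's ifindex in ifr_ifindex:
#include <string.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

int add_if(int fd, const char *bridge, const char *port)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, bridge, IFNAMSIZ); /* e.g. "br0" */
	ifr.ifr_ifindex = if_nametoindex(port); /* e.g. "eth0" */
	return ioctl(fd, SIOCBRADDIF, &ifr); /* lands in br_dev_ioctl */
}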
In effect, adding a port means instantiating and filling in a net_bridge_port and appending it to the bridge's port list:
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
// refuse loopback devices and non-Ethernet devices
if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER)
return -EINVAL;
// refuse a device that is itself a bridge
if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
return -ELOOP;
// refuse a device that already belongs to another bridge
if (dev->br_port != NULL)
return -EBUSY;
// allocate a new port
p = new_nbp(br, dev);
if (IS_ERR(p))
return PTR_ERR(p);
// put the device into promiscuous mode
err = dev_set_promiscuity(dev, 1);
if (err)
goto put_back;
// some initialization
err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
goto err0;
// insert the port's own MAC into the port-MAC mapping table
err = br_fdb_insert(br, p, dev->dev_addr);
if (err)
goto err1;
// add the port to the sysfs filesystem
err = br_sysfs_addif(p);
if (err)
goto err2;
rcu_assign_pointer(dev->br_port, p);
dev_disable_lro(dev);
// add the port to the bridge's port list
list_add_rcu(&p->list, &br->port_list);
spin_lock_bh(&br->lock);
br_stp_recalculate_bridge_id(br);
br_features_recompute(br);
if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
spin_unlock_bh(&br->lock);
br_ifinfo_notify(RTM_NEWLINK, p);
dev_set_mtu(br->dev, br_min_mtu(br));
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
err2:
br_fdb_delete_by_port(br, p, 1);
err1:
kobject_put(&p->kobj);
err0:
dev_set_promiscuity(dev, -1);
put_back:
dev_put(dev);
kfree(p);
return err;
}
int br_del_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p = dev->br_port;
if (!p || p->br != br)
return -EINVAL;
del_nbp(p);
spin_lock_bh(&br->lock);
br_stp_recalculate_bridge_id(br);
br_features_recompute(br);
spin_unlock_bh(&br->lock);
return 0;
}
3. Bridge Data Path
The topology we meet most often in day-to-day development looks like this:
Figure: typical topology
The DUT has three ports: ath0 (local wireless), eth0 (wired LAN) and ath1 (wireless link to the remote end), bridged together by br0. This chapter focuses on how pc1, pc2, pc3 and the ROOT-AP exchange data through the DUT (br0) in this topology.
Suppose pc3 sends a packet to pc1. The frame is first received by the eth0 NIC, which then raises a receive interrupt to the CPU. After finishing the current instruction (with interrupts enabled), the CPU jumps into the NIC driver. The eth0 driver builds an skb, does the Ethernet-level processing, places the skb on the current CPU's input queue, and raises the receive softirq. If no other interrupt intervenes, the softirq will invoke netif_receive_skb. NIC drivers and interrupt handling are not the focus of this document, so we start from netif_receive_skb.
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
struct net_device *null_or_orig;
int ret = NET_RX_DROP;
__be16 type;
if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
return NET_RX_SUCCESS;
/* if we've gotten here through NAPI, check netpoll */
#ifdef CONFIG_MAPPING
if (skb->dev)
#endif
if (netpoll_receive_skb(skb))
return NET_RX_DROP;
if (!skb->tstamp.tv64)
net_timestamp(skb);
if (!skb->iif)
skb->iif = skb->dev->ifindex;
null_or_orig = NULL;
orig_dev = skb->dev;
if (orig_dev->master) {
if (skb_bond_should_drop(skb))
null_or_orig = orig_dev; /* deliver only exact match */
else
skb->dev = orig_dev->master;
}
__get_cpu_var(netdev_rx_stat).total++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
pt_prev = NULL;
rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
// check whether any packet socket wants this frame (e.g. a sniffer); if so, deliver a copy to it
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
ncls:
#endif
// hand the frame to the bridge; if the bridge consumed it, skb comes back as NULL
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
// pass the packet up to L3
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
To summarize, netif_receive_skb does three things:
1. If any socket wants the skb (e.g. a packet-capture application), deliver a copy to it;
2. Handle bridging: if a bridge is configured, run the bridge processing;
3. Hand the skb to the network layer.
3.1 Bridge Entry Point: handle_bridge
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev)
{
struct net_bridge_port *port;
// skip bridge processing for loopback packets, or when the ingress device does not belong to any bridge
if (skb->pkt_type == PACKET_LOOPBACK ||
(port = rcu_dereference(skb->dev->br_port)) == NULL)
return skb;
if (*pt_prev) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
// call the bridge handler, installed as br_handle_frame at bridge initialization
return br_handle_frame_hook(port, skb);
}
3.2 Bridge Processing: br_handle_frame
/*
* Called via br_handle_frame_hook.
* Return NULL if skb is handled
* note: already called with rcu_read_lock (preempt_disabled)
*/
struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
{
// extract the destination MAC
const unsigned char *dest = eth_hdr(skb)->h_dest;
int (*rhook)(struct sk_buff *skb);
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto drop;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return NULL;
#ifdef CONFIG_ATHRS_HW_NAT
skb->ath_hw_nat_fw_flags = 1;
#endif
// destination 01:80:c2:00:00:0X is the STP multicast range, so STP handling may be needed
if (unlikely(is_link_local(dest))) {
/* Pause frames shouldn't be passed up by driver anyway */
if (skb->protocol == htons(ETH_P_PAUSE))
goto drop;
/* If STP is turned off, then forward */
if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
goto forward;
// Older bridge implementations had a branch here that processed STP packets inline. In newer kernels (2.6; 'newer' being relative) STP is implemented as an upper-layer protocol, so the packet is handled there after the bridge is done with it.
if (NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
NULL, br_handle_local_finish)) // updates the CAM table
return NULL; /* frame consumed by filter */
else
return skb; // br_handle_local_finish always returns 0, so returning skb continues upper-layer processing
}
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
// decide whether the frame should be diverted to L3 for routing; this runs the broute table
rhook = rcu_dereference(br_should_route_hook);
if (rhook != NULL) {
if (rhook(skb))
return skb;
dest = eth_hdr(skb)->h_dest;
}
/* note: fall through */
case BR_STATE_LEARNING:
if (!compare_ether_addr(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
// Netfilter hook point
NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
default:
drop:
kfree_skb(skb);
}
return NULL;
}
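The is_link_local() test above matches the reserved range 01:80:c2:00:00:00 through 0f. For reference, a sketch consistent with this kernel generation (br_group_address being 01:80:c2:00:00:00):
/* Compare the first five octets, plus the high nibble of the last one. */
static inline int is_link_local(const unsigned char *dest)
{
	const __be16 *a = (const __be16 *)dest;
	static const __be16 *b = (const __be16 *)br_group_address;
	static const __be16 m = __constant_cpu_to_be16(0xfff0);

	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
}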
3.3 Bridge Processing: br_handle_frame_finish
int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
struct sk_buff *skb2;
// drop if there is no port, or the port is in the DISABLED state
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
/* insert into forwarding database after filtering to avoid spoofing */
// find the bridge this port belongs to (there may be more than one bridge)
br = p->br;
// update the port-MAC mapping table
br_fdb_update(br, p, eth_hdr(skb)->h_source);
if (p->state == BR_STATE_LEARNING)
goto drop;
/* The packet skb2 goes to the local host (NULL to skip). */
skb2 = NULL;
if (br->dev->flags & IFF_PROMISC)
skb2 = skb;
dst = NULL;
if (is_multicast_ether_addr(dest)) {
br->dev->stats.multicast++;
skb2 = skb;
} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
skb2 = skb;
/* Do not forward the packet since it's local. */
skb = NULL;
}
if (skb2 == skb)
skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
br_pass_frame_up(br, skb2); // if skb2 is non-NULL, pass the frame up the stack
if (skb) {
if (dst)
br_forward(dst->dst, skb);
else
br_flood_forward(br, skb); // multicast, or no entry in the port-MAC table: flood (send out of every port)
}
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
3.4 Bridge Processing: br_forward
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
if (should_deliver(to, skb)) {
__br_forward(to, skb);
return;
}
kfree_skb(skb);
}
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
struct net_device *indev;
indev = skb->dev;
skb->dev = to->dev; // replace the skb's dev with that of the egress port
skb->ip_summed = CHECKSUM_NONE;
// Netfilter hook processing
NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
br_forward_finish);
}
void br_flood_forward(struct net_bridge *br, struct sk_buff *skb)
{
br_flood(br, skb, __br_forward);
}
static void br_flood(struct net_bridge *br, struct sk_buff *skb,
void (*__packet_hook)(const struct net_bridge_port *p,
struct sk_buff *skb))
{
struct net_bridge_port *p;
struct net_bridge_port *prev;
prev = NULL;
/* backup multicast address. by HouXB, 07Dec10 */
#ifdef CONFIG_TP_MULTICAST
#define IS_MULTICAST_ADDR(ptr) ((ptr[0] == 0x01) && (ptr[1] == 0x00) && (ptr[2] == 0x5e) ? 1 : 0)
mac_addr multi_mac_addr;
unsigned char *pmac = multi_mac_addr.addr;
memset(pmac, 0, 6/*ETH_ALEN*/);
if(IS_MULTICAST_ADDR(skb_mac_header(skb)))
{
//backup multicast address
memcpy(pmac, skb_mac_header(skb), 6/*ETH_ALEN*/);
}
#endif
// iterate over all ports and send a copy out of each; should_deliver excludes the ingress port
list_for_each_entry_rcu(p, &br->port_list, list) {
if (should_deliver(p, skb)) {
if (prev != NULL) {
struct sk_buff *skb2;
if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
br->dev->stats.tx_dropped++;
kfree_skb(skb);
return;
}
#ifdef CONFIG_TP_MULTICAST
if(IS_MULTICAST_ADDR(pmac))
{
//restore multicast address
memcpy(skb_mac_header(skb), pmac, 6/*ETH_ALEN*/);
}
#endif
__packet_hook(prev, skb2);
}
prev = p;
}
}
if (prev != NULL) {
#ifdef CONFIG_TP_MULTICAST
if(IS_MULTICAST_ADDR(pmac))
{
//restore multicast address
memcpy(skb_mac_header(skb), pmac, 6/*ETH_ALEN*/);
}
#endif
__packet_hook(prev, skb);
return;
}
kfree_skb(skb);
}
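The should_deliver() filter used above is what keeps the flood from echoing frames back out of the ingress port; a sketch consistent with this kernel version:
static inline int should_deliver(const struct net_bridge_port *p,
				 const struct sk_buff *skb)
{
	/* never send back out of the ingress port, and only use
	 * ports that are in the forwarding state */
	return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING);
}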
3.5 Bridge Processing: br_forward_finish
int br_forward_finish(struct sk_buff *skb)
{
// Netfilter hook: NF_BR_POST_ROUTING
return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
}
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
/* drop mtu oversized packets except gso */
if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
kfree_skb(skb);
else {
/* ip_refrag calls ip_fragment, doesn't copy the MAC header. */
if (nf_bridge_maybe_copy_header(skb))
kfree_skb(skb);
else {
skb_push(skb, ETH_HLEN);
dev_queue_xmit(skb); // hand off to the driver
}
}
return 0;
}
3.6 Bridge Processing: br_pass_frame_up
static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb)
{
struct net_device *indev, *brdev = br->dev;
// update statistics
brdev->stats.rx_packets++;
brdev->stats.rx_bytes += skb->len;
indev = skb->dev;
// note: the skb's dev is forcibly changed to the bridge's dev here
skb->dev = brdev;
// Netfilter hook: NF_BR_LOCAL_IN
NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
netif_receive_skb);
}
This code is very simple: after updating the net_bridge statistics it forces the skb's dev to the bridge's dev, then re-enters netif_receive_skb through NF_HOOK at the NF_BR_LOCAL_IN hook point.
netif_receive_skb in turn calls handle_bridge, so the packet now destined for the bridge virtual device arrives back in netif_receive_skb. Will the bridge processing run yet again? No. Recall the test at the top of handle_bridge that decides whether to enter the bridge path:
// skip bridge processing for loopback packets, or when the ingress device does not belong to any bridge
if (skb->pkt_type == PACKET_LOOPBACK ||
(port = rcu_dereference(skb->dev->br_port)) == NULL)
return skb;
As the earlier code showed, br_pass_frame_up set skb->dev to br->dev, so skb->dev is now the virtual device created by the bridge. That device is the bridge itself, not one of the bridge group's ports (it belongs to no bridge; as noted earlier, a bridge device cannot be added as a port), so the test above fails, the bridge path is skipped, and the packet continues up the protocol stack instead.
3.7 Bridge Data Path Summary
Frames entering the bridge fall into several categories, and the bridge treats each differently:
1. Frames the host sends to itself: the bridge does not process them; they go up the protocol stack;
2. The receiving physical interface is not a bridge port: the bridge does not process the frame; it goes up the protocol stack;
3. Inside the bridge, if the port state is Disabled, the frame is dropped;
4. Invalid source address (broadcast, multicast, or 00:00:00:00:00:00): drop;
5. STP BPDUs: handed up the protocol stack;
6. Frames addressed to the local host: the bridge hands them up the stack and does not forward them;
7. Frames that need forwarding fall into three cases:
1) broadcast or multicast: forward a copy out of every port except the ingress port;
2) unicast with a mapping in the port-MAC table: forward out of the mapped port only;
3) unicast with no mapping: forward out of every port except the ingress port.
Finally, a recap of the main function relationships in the bridge data path:
Figure: bridge data path overview
4. Maintaining the Bridge Port-MAC Mapping Table
As is well known, a bridge maintains a MAC-address-to-port mapping table, where the port is one of the bridge's own ports and the MAC address belongs to the host attached at the far end. When the bridge receives a frame, it first takes the source MAC and updates the database, then reads the destination MAC and looks it up: if an entry is found, the frame is forwarded out of that entry's port; otherwise it is forwarded out of every port except the ingress one.
The bridge port-MAC mapping table is also called the bridge forwarding database, or CAM table; for brevity we just say 'the database' below.
4.1 Creating and Destroying the Database
The database cache is created with kmem_cache_create and destroyed with kmem_cache_destroy. Recall that bridge initialization calls br_fdb_init to set up the database:
int __init br_fdb_init(void)
{
br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
sizeof(struct net_bridge_fdb_entry),
0,
SLAB_HWCACHE_ALIGN, NULL);
if (!br_fdb_cache)
return -ENOMEM;
get_random_bytes(&fdb_salt, sizeof(fdb_salt));
return 0;
}
Destruction:
void br_fdb_fini(void)
{
kmem_cache_destroy(br_fdb_cache);
}
4.2 Updating the Database
When the bridge receives a frame, it takes the frame's source MAC and updates the database: if the MAC is not in the database, a new entry is created; if it is, the entry's expiry time is refreshed. The database is organized as a hash table for efficient lookup. The update function:
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr)
{
// hash the MAC to find the bucket it belongs to
struct hlist_head *head = &br->hash[br_mac_hash(addr)];
struct net_bridge_fdb_entry *fdb;
/* some users want to always flood. */
if (hold_time(br) == 0)
return;
/* ignore packets unless we are using this port */
if (!(source->state == BR_STATE_LEARNING ||
source->state == BR_STATE_FORWARDING))
return;
fdb = fdb_find(head, addr);
if (likely(fdb)) { // the skb's source MAC is already in the database: refresh its expiry time
/* attempt to update an entry for a local interface */
if (unlikely(fdb->is_local)) {
if (net_ratelimit())
printk(KERN_WARNING "%s: received packet with "
"own address as source address\n",
source->dev->name);
} else {
/* fastpath: update of existing entry */
fdb->dst = source;
fdb->ageing_timer = jiffies;
}
} else { // the MAC is not in the database: create a new entry
spin_lock(&br->hash_lock);
if (!fdb_find(head, addr))
fdb_create(head, source, addr, 0); // create the entry
/* else we lose race and someone else inserts
* it first, don't bother updating
*/
spin_unlock(&br->hash_lock);
}
}
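The br_mac_hash() used above folds the MAC down to a bucket index; a sketch consistent with 2.6-era kernels, where fdb_salt is the random value drawn in br_fdb_init:
static inline int br_mac_hash(const unsigned char *mac)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	u32 key = get_unaligned((u32 *)(mac + 2));
	return jhash_1word(key, fdb_salt) & (BR_HASH_SIZE - 1);
}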
4.3 Creating an Entry
As seen above, the update function has already hashed the MAC to its chain, so the create function only needs to add one entry at the head of that chain:
static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head,
struct net_bridge_port *source,
const unsigned char *addr,
int is_local)
{
struct net_bridge_fdb_entry *fdb;
fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
if (fdb) {
memcpy(fdb->addr.addr, addr, ETH_ALEN);
hlist_add_head_rcu(&fdb->hlist, head);
fdb->dst = source;
fdb->is_local = is_local; // 0 for learned entries
fdb->is_static = is_local; // 0 for learned entries
fdb->ageing_timer = jiffies;
}
return fdb;
}
4.4 Looking Up an Entry
Entry lookup in the bridge is mostly conventional, with one twist. As mentioned above, an update must locate the entry for a MAC whether or not it has expired: it just walks the MAC's hash chain and refreshes the age, after which the entry is certainly not expired. When forwarding, however, besides finding the egress port for the destination MAC, the bridge must also check that the entry has not expired. Hence there are two lookups: one for updating, one for forwarding:
static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head,
const unsigned char *addr)
{
struct hlist_node *h;
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry_rcu(fdb, h, head, hlist) {
if (!compare_ether_addr(fdb->addr.addr, addr))
return fdb;
}
return NULL;
}
/* No locking or refcounting, assumes caller has no preempt (rcu_read_lock) */
struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
const unsigned char *addr)
{
struct hlist_node *h;
struct net_bridge_fdb_entry *fdb;
hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) {
if (!compare_ether_addr(fdb->addr.addr, addr)) {
if (unlikely(has_expired(br, fdb))) // check whether the entry has expired
break;
return fdb;
}
}
return NULL;
}
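The has_expired() check compares the entry's age against the bridge's ageing time; a sketch consistent with this kernel generation (hold_time() drops to the forward delay while a topology change is in progress; static entries are exempt):
static inline unsigned long hold_time(const struct net_bridge *br)
{
	return br->topology_change ? br->forward_delay : br->ageing_time;
}

static inline int has_expired(const struct net_bridge *br,
			      const struct net_bridge_fdb_entry *fdb)
{
	return !fdb->is_static &&
		time_before_eq(fdb->ageing_timer + hold_time(br), jiffies);
}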
5. Bridge Netfilter
We have previously covered Linux's Netfilter framework in detail (focused on the IP layer at the time), so the bridge Netfilter machinery itself will not be walked through again. The bridge processing code above already showed the hook invocations, which mirror IP's Netfilter, so that part is not repeated either; what this chapter covers is where the bridge's Netfilter differs from IP's.
5.1 Netfilter Initialization
Straight to the code:
int __init br_netfilter_init(void)
{
int ret;
// register the hook ops
ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
if (ret < 0)
return ret;
#ifdef CONFIG_SYSCTL
brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
if (brnf_sysctl_header == NULL) {
printk(KERN_WARNING
"br_netfilter: can't register to sysctl.\n");
nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
return -ENOMEM;
}
#endif
printk(KERN_NOTICE "Bridge firewalling registered\n");
return 0;
}
static struct nf_hook_ops br_nf_ops[] __read_mostly = {
{ .hook = br_nf_pre_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF, }, // priority 0
{ .hook = br_nf_local_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_BRNF, },
{ .hook = br_nf_forward_ip,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF - 1, }, // priority -1: IP ahead of ARP
{ .hook = br_nf_forward_arp,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF, },
{ .hook = br_nf_local_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_FIRST, },
{ .hook = br_nf_post_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_LAST, },
{ .hook = ip_sabotage_in,
.owner = THIS_MODULE,
.pf = PF_INET,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_FIRST, },
{ .hook = ip_sabotage_in,
.owner = THIS_MODULE,
.pf = PF_INET6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_FIRST, },
};
Recall IP's Netfilter, where every nf_hook_ops belongs to some specific table. Under the bridge, however, Netfilter initialization registers a series of nf_hook_ops that belong to no table; they have priority 0 and always run by default. These hook functions evaluate no match rules; instead they do special work, such as calling the IP layer's hooks. That functionality is covered later, in the section on the bridge-IP interaction.
5.2 The Ebtables Tables
Bridge Netfilter has three built-in tables: broute, nat and filter. broute decides whether a packet should be diverted into the network layer (skipping bridge processing). Unlike conventional Netfilter table registration, broute registers no nf_hook_ops, so it cannot be invoked via NF_HOOK(); instead its table-evaluation function is called directly at the appropriate spot.
The broute table
static struct ebt_entries initial_chain = {
.name = "BROUTING",
.policy = EBT_ACCEPT,
};
static struct ebt_replace_kernel initial_table =
{
.name = "broute",
.valid_hooks = 1 << NF_BR_BROUTING,
.entries_size = sizeof(struct ebt_entries),
.hook_entry = {
[NF_BR_BROUTING] = &initial_chain,
},
.entries = (char *)&initial_chain,
};
static struct ebt_table broute_table =
{
.name = "broute",
.table = &initial_table,
.valid_hooks = 1 << NF_BR_BROUTING, // not one of the conventional hook points: a pseudo hook point defined specifically for the broute table
.check = check,
.me = THIS_MODULE,
};
static int __init ebtable_broute_init(void)
{
int ret;
ret = register_pernet_subsys(&broute_net_ops);
if (ret < 0)
return ret;
/* see br_input.c */
rcu_assign_pointer(br_should_route_hook, ebt_broute);
return 0;
}
static int ebt_broute(struct sk_buff *skb)
{
int ret;
ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL,
dev_net(skb->dev)->xt.broute_table);
if (ret == NF_DROP)
return 1; /* route it */
return 0; /* bridge it */
}
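Note the inverted verdict in ebt_broute: a DROP verdict in the BROUTING chain means 'route it', i.e. divert the frame to the network layer instead of bridging it. For example, a rule of the following shape (illustrative; the exact match options depend on the ebtables build) forces IPv4 frames for one host out of the bridge path:
ebtables -t broute -A BROUTING -p IPv4 --ip-dst 192.168.1.1 -j DROP (the frame is then routed, not bridged)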
The nat table
static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
{
.hook = ebt_nat_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_NAT_DST_OTHER, // 100
},
{
.hook = ebt_nat_out,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_NAT_SRC, //300
},
{
.hook = ebt_nat_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED, // -300
},
};
static struct ebt_table frame_nat =
{
.name = "nat",
.table = &initial_table,
.valid_hooks = NAT_VALID_HOOKS,
.check = check,
.me = THIS_MODULE,
};
static int __init ebtable_nat_init(void)
{
int ret;
ret = register_pernet_subsys(&frame_nat_net_ops);
if (ret < 0)
return ret;
ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
if (ret < 0)
unregister_pernet_subsys(&frame_nat_net_ops);
return ret;
}
Registration of the nat table matches the iptables table registration covered earlier: initialize the table structure, initialize the nf_hook_ops, then register each of them. Note that the individual nf_hook_ops are registered with different priorities. The priority definitions:
enum nf_br_hook_priorities {
NF_BR_PRI_FIRST = INT_MIN,
NF_BR_PRI_NAT_DST_BRIDGED = -300,
NF_BR_PRI_FILTER_BRIDGED = -200,
NF_BR_PRI_BRNF = 0,
NF_BR_PRI_NAT_DST_OTHER = 100,
NF_BR_PRI_FILTER_OTHER = 200,
NF_BR_PRI_NAT_SRC = 300,
NF_BR_PRI_LAST = INT_MAX,
};
The filter table
Similar to the nat table; not repeated here.
5.3 Bridge-IP Interaction
In a transparent firewall the bridge and the IP layer need to work together, since the IP layer can do far more. All of it could also be implemented inside the bridge, but modularity and the KISS principle keep the bridge out of that complexity: it does only its own job, and when the IP layer's help is needed, it calls the IP layer's hooks directly.
Below, in the spirit of "read the fucking source code", we work out how the bridge and the IP layer actually interact.
5.3.1 The NF_HOOK Macro
#define NF_HOOK(pf, hook, skb, indev, outdev, okfn) \
NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, INT_MIN) // INT_MIN: the smallest integer
#define NF_HOOK_THRESH(pf, hook, skb, indev, outdev, okfn, thresh) \
({int __ret; \
if ((__ret=nf_hook_thresh(pf, hook, (skb), indev, outdev, okfn, thresh, 1)) == 1)\
__ret = (okfn)(skb); \
__ret;})
// When nf_hook_thresh returns 1 (NF_ACCEPT), the okfn callback runs after the hooks.
unsigned int nf_iterate(struct list_head *head,
struct sk_buff *skb,
unsigned int hook,
const struct net_device *indev,
const struct net_device *outdev,
struct list_head **i,
int (*okfn)(struct sk_buff *),
int hook_thresh)
{
unsigned int verdict;
/*
* The caller must not block between calls to this
* function because of risk of continuing from deleted element.
*/
list_for_each_continue_rcu(*i, head) { // iterate over all registered nf_hook_ops
struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;
// skip this ops if the requested thresh is higher than its registered priority
if (hook_thresh > elem->priority)
continue;
/* Optimization: we don't need to hold module
reference here, since function can't sleep. --RR */
verdict = elem->hook(hook, skb, indev, outdev, okfn); // run the hook function
if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
if (unlikely((verdict & NF_VERDICT_MASK)
> NF_MAX_VERDICT)) {
NFDEBUG("Evil return from %p(%u).\n",
elem->hook, hook);
continue;
}
#endif
if (verdict != NF_REPEAT)
return verdict;
*i = (*i)->prev;
}
}
return NF_ACCEPT;
}
From the above analysis: entering through NF_HOOK() runs every nf_hook_ops registered on that hook point, since NF_HOOK passes the smallest integer as thresh. Conversely, to filter out part of the nf_hook_ops by threshold, NF_HOOK_THRESH() must be called explicitly with the desired thresh value.
5.3.2 Example: Registration at NF_BR_PRE_ROUTING
Let's look further at the data flow at the NF_BR_PRE_ROUTING hook point. Recalling sections 5.1 and 5.2, the bridge registers two nf_hook_ops there:
One is the default hook handler:
{ .hook = br_nf_pre_routing,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF, }, // priority 0
The other is registered by the nat table:
{
.hook = ebt_nat_in,
.owner = THIS_MODULE,
.pf = PF_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED, // priority -300
},
ebt_nat_in is a conventional Netfilter table-processing hook callback: it calls ebt_do_table to walk the table's rules and act on the packet:
static unsigned int
ebt_nat_in(unsigned int hook, struct sk_buff *skb, const struct net_device *in
, const struct net_device *out, int (*okfn)(struct sk_buff *))
{
return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_nat);
}
br_nf_pre_routing, however, does something different:
static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct iphdr *iph;
__u32 len = nf_bridge_encap_header_len(skb);
if (unlikely(!pskb_may_pull(skb, len)))
goto out;
if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
IS_PPPOE_IPV6(skb)) { // IPv6 traffic is handed to the IPv6 hooks
#ifdef CONFIG_SYSCTL
if (!brnf_call_ip6tables)
return NF_ACCEPT;
#endif
nf_bridge_pull_encap_header_rcsum(skb);
return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
}
#ifdef CONFIG_SYSCTL
if (!brnf_call_iptables)
return NF_ACCEPT;
#endif
if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) &&
!IS_PPPOE_IP(skb)) // not IP traffic: accept, leaving it to later rules or the upper layers
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, 4 * iph->ihl))
goto inhdr_error;
iph = ip_hdr(skb);
if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
goto inhdr_error;
len = ntohs(iph->tot_len);
if (skb->len < len || len < 4 * iph->ihl)
goto inhdr_error;
pskb_trim_rcsum(skb, len);
nf_bridge_put(skb->nf_bridge);
if (!nf_bridge_alloc(skb))
return NF_DROP;
if (!setup_pre_routing(skb))
return NF_DROP;
store_orig_dstaddr(skb);
NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
br_nf_pre_routing_finish);
return NF_STOLEN;
inhdr_error:
// IP_INC_STATS_BH(IpInHdrErrors);
out:
return NF_DROP;
}
5.3.3 br_nf_pre_routing_finish
static int br_nf_pre_routing_finish(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct iphdr *iph = ip_hdr(skb);
struct nf_bridge_info *nf_bridge = skb->nf_bridge;
struct rtable *rt;
int err;
if (nf_bridge->mask & BRNF_PKT_TYPE) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->mask ^= BRNF_PKT_TYPE;
}
nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
if (dnat_took_place(skb)) { // DNAT happened: have the IP layer route the packet
if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
struct flowi fl = {
.nl_u = {
.ip4_u = {
.daddr = iph->daddr,
.saddr = 0,
.tos = RT_TOS(iph->tos) },
},
.proto = 0,
};
struct in_device *in_dev = in_dev_get(dev);
/* If err equals -EHOSTUNREACH the error is due to a
* martian destination or due to the fact that
* forwarding is disabled. For most martian packets,
* ip_route_output_key() will fail. It won't fail for 2 types of
* martian destinations: loopback destinations and destination
* 0.0.0.0. In both cases the packet will be dropped because the
* destination is the loopback device and not the bridge. */
if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
goto free_skb;
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
/* - Bridged-and-DNAT'ed traffic doesn't
* require ip_forwarding. */
if (((struct dst_entry *)rt)->dev == dev) {
skb_dst_set(skb, (struct dst_entry *)rt);
goto bridged_dnat;
}
/* we are sure that forwarding is disabled, so printing
* this message is no problem. Note that the packet could
* still have a martian destination address, in which case
* the packet could be dropped even if forwarding were enabled */
__br_dnat_complain();
dst_release((struct dst_entry *)rt);
}
free_skb:
kfree_skb(skb);
return 0;
} else {
if (skb_dst(skb)->dev == dev) {
bridged_dnat:
/* Tell br_nf_local_out this is a
* bridged frame */
nf_bridge->mask |= BRNF_BRIDGED_DNAT;
skb->dev = nf_bridge->physindev;
nf_bridge_push_encap_header(skb);
NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING,
skb, skb->dev, NULL,
br_nf_pre_routing_finish_bridge,
1);
return 0;
}
memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN);
skb->pkt_type = PACKET_HOST;
}
} else {
rt = bridge_parent_rtable(nf_bridge->physindev);
if (!rt) {
kfree_skb(skb);
return 0;
}
dst_hold(&rt->u.dst);
skb_dst_set(skb, &rt->u.dst);
}
skb->dev = nf_bridge->physindev;
nf_bridge_push_encap_header(skb);
NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish, 1);
return 0;
}
From the analysis in 5.3.1, NF_HOOK_THRESH() called with thresh set to 1 starts from the nf_hook_ops with priority 1. The overall effect: at the bridge PREROUTING hook, the hooks with priority below 0 run first; then (via the priority-0 br_nf_pre_routing) control transfers into the IP layer, where all the hooks on the IP PREROUTING point run; finally (depending on the IP-layer outcome) control returns to the bridge PREROUTING hook and resumes from priority 1:
Figure: bridge-IP interaction
In truth the bridge-IP interaction is fairly involved; this document only illustrates its core mechanism, and many of the finer control details are not spelled out. Those implementation details are left to the reader to explore.
The interaction at the other bridge hook points is omitted here, since the principle and method are much the same and only the processing details differ. For the full picture of the bridge-IP interaction, see the thread:
http://mobile.tp-link.net/redmine/boards/103/topics/32056
For our Linux platform implementation, the bridge-IP interaction looks like this:
Figure: bridge-IP interaction on our Linux platform
6. Bridge STP
6.1 STP Overview
The Spanning Tree Protocol (STP) serves two main purposes: first, using the spanning-tree algorithm, it builds within an Ethernet a tree rooted at a particular port of a particular switch, avoiding loops; second, when the Ethernet topology changes, it re-converges to keep the network protected.
6.1.1 Terminology
STP: the spanning tree protocol and its algorithm.
BPDU: STP's protocol data unit, carrying information across the bridged LAN.
TCN: the topology change notification BPDU.
Root bridge: the bridge with the smallest bridge ID; bridge IDs are expected to be unique.
Root port: on a given bridge, the port with the lowest path cost to the root bridge. If several ports tie on cost, the one with the smallest port ID becomes the root port.
Designated bridge: a LAN sends and receives frames through its attached bridges. If exactly one bridge attaches to the LAN, it is the designated bridge; if several attach, the one with the lowest path cost to the root bridge is designated; on a cost tie, the bridge with the smallest bridge ID wins.
Designated port: the port connecting the designated bridge to the LAN. If several ports of the designated bridge attach to the LAN, the one with the smallest port ID becomes that LAN's designated port.
Root path cost: when a port is the root port, the cost of the path through it; for the bridge, the path cost is the sum of the costs to the root bridge.
Designated cost: for a LAN's designated port, the root path cost; for a port that is not designated, the cost from the LAN's designated port to the root bridge.
6.1.2 The Five Port States
BR_STATE_DISABLED (0): disabled; takes no part in the spanning tree and forwards no frames.
BR_STATE_LISTENING (1): listening; takes part in determining the root and in selecting root, designated and non-designated ports. While listening, the port learns no unicast addresses from received frames.
BR_STATE_LEARNING (2): learning; the port learns source MAC addresses from incoming frames but does not forward.
BR_STATE_FORWARDING (3): forwarding; the port forwards frames, learning the source MAC of received frames and forwarding by destination MAC.
BR_STATE_BLOCKING (4): blocking; the port does not forward, only listens to incoming BPDUs, and learns no MAC addresses from received frames.
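For reference, these states correspond to the constants used throughout the bridge code (from include/linux/if_bridge.h):
#define BR_STATE_DISABLED 0
#define BR_STATE_LISTENING 1
#define BR_STATE_LEARNING 2
#define BR_STATE_FORWARDING 3
#define BR_STATE_BLOCKING 4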
6.1.3 STP Key Points
Bridges running the spanning tree algorithm (STA) send BPDUs periodically; a unique root bridge is elected; each non-root bridge elects a unique root port; and each segment elects a unique designated port.
(1) Electing the unique root bridge: BPDUs carry the Bridge ID; Bridge ID (8B) = priority (2B) + switch MAC address (6B); many switches default the priority to 32768, which can be changed; the lowest priority value wins, and on a priority tie the smallest MAC wins, i.e. the smallest Bridge ID becomes the root bridge. The root bridge sends a BPDU every 2 seconds by default.
(2) Electing a unique root port on each non-root bridge: the root bridge has no root port; the port with the lowest cost becomes the root port; on a cost tie, the port with the smallest Port ID wins (the Port ID is usually based on the port's MAC address, so the port with the smallest MAC becomes the root port).
(3) Electing a unique designated port per segment: the port with the lowest cost becomes the designated port; the root bridge's ports have the lowest cost to each segment, so usually a root bridge port is the designated port; root ports and designated ports enter the forwarding state; losing ports enter the blocking state and only listen for BPDUs.
(4) When a blocked port receives no BPDU within the set interval (default 20 seconds), the spanning tree algorithm re-runs the election. Drawback: while the algorithm runs, the network is blocked and no port forwards; the computation takes 50 seconds by default.
6.1.4 How STP Operates
When a bridge powers up it assumes it is the root bridge and transitions to the listening state. In general, when a bridge learns that the topology has changed, two transitional states come into play: during the change, ports temporarily sit in the listening and learning states for the duration of the forward-delay timer.
While a port is listening, it sends and receives BPDUs to determine the active topology, carrying no user data while the topology is in transition; the bridge processes the BPDUs it receives; ports that are designated or root ports transition to the learning state after 15 seconds (the default forward delay); other ports fall back to blocking.
While a port is learning, it builds its MAC address table from the addresses it learns, but cannot forward user frames; the bridge still carries no user data at this point.
The learning state reduces the amount of flooding needed once forwarding starts; if a port is still a designated or root port when the learning state ends, it transitions to forwarding, while other ports fall back to blocking. In the forwarding state the port sends and receives user data. The normal time for a port to go from blocking to forwarding is 30 to 50 seconds.
Note: if a port connects to a host, forwarding on that link cannot create an STP loop, so such ports need not take part in the STP listening and learning process.
6.2 STP Implementation
6.2.1 Registering stp_proto
In earlier versions, STP packets were recognized inside the bridge data path by their specific multicast address and handled right there. Later implementations made STP a standalone protocol, carried and delivered via the IEEE 802.2 (LLC) protocol.
At bridge initialization an STP protocol is registered, with br_stp_rcv as its receive function:
static const struct stp_proto br_stp_proto = {
.rcv = br_stp_rcv, // receive function
};
br_init() registers the STP protocol:
err = stp_proto_register(&br_stp_proto);
int stp_proto_register(const struct stp_proto *proto)
{
int err = 0;
mutex_lock(&stp_proto_mutex);
if (sap_registered++ == 0) {
sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv); // register the receive handler with LLC; LLC_SAP_BSPAN (0x42): Bridge Spanning Tree Protocol
if (!sap) {
err = -ENOMEM;
goto out;
}
}
if (is_zero_ether_addr(proto->group_address))
rcu_assign_pointer(stp_proto, proto);
else
rcu_assign_pointer(garp_protos[proto->group_address[5] -
GARP_ADDR_MIN], proto); // index the protocol into garp_protos by byte 5 of the group address
out:
mutex_unlock(&stp_proto_mutex);
return err;
}
// llc_sap_open creates an llc_sap structure and adds it to the LLC protocol list
struct llc_sap *llc_sap_open(unsigned char lsap,
int (*func)(struct sk_buff *skb,
struct net_device *dev,
struct packet_type *pt,
struct net_device *orig_dev))
{
struct llc_sap *sap = NULL;
write_lock_bh(&llc_sap_list_lock);
if (__llc_sap_find(lsap)) /* SAP already exists */
goto out;
sap = llc_sap_alloc();
if (!sap)
goto out;
sap->laddr.lsap = lsap; // upper-protocol identifier
sap->rcv_func = func; // upper-protocol entry function
llc_add_sap(sap);
out:
write_unlock_bh(&llc_sap_list_lock);
return sap;
}
6.2.2 STP Data Flow
Since STP rides on LLC, the story starts at LLC's receive path. At initialization, LLC registered llc_rcv as its receive function:
int llc_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct llc_sap *sap;
struct llc_pdu_sn *pdu;
int dest;
int (*rcv)(struct sk_buff *, struct net_device *,
struct packet_type *, struct net_device *);
……
// fetch the PDU header
pdu = llc_pdu_sn_hdr(skb);
if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */
goto handle_station;
sap = llc_sap_find(pdu->dsap); // look up the owning upper-layer protocol
if (unlikely(!sap)) {/* unknown SAP */
dprintk("%s: llc_sap_find(%02X) failed!\n", __func__,
pdu->dsap);
goto drop;
}
/*
* First the upper layer protocols that don't need the full
* LLC functionality
*/
rcv = rcu_dereference(sap->rcv_func);
if (rcv) {
struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
if (cskb)
rcv(cskb, dev, pt, orig_dev); // run the upper protocol's receive function
}
dest = llc_pdu_type(skb);
if (unlikely(!dest || !llc_type_handlers[dest - 1]))
goto drop_put;
llc_type_handlers[dest - 1](sap, skb);
……
}
STP registration specified stp_pdu_rcv as the LLC-to-STP entry point:
static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
const struct ethhdr *eh = eth_hdr(skb);
const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
const struct stp_proto *proto;
if (pdu->ssap != LLC_SAP_BSPAN ||
pdu->dsap != LLC_SAP_BSPAN ||
pdu->ctrl_1 != LLC_PDU_TYPE_U)
goto err;
if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
// fetch the corresponding proto
proto = rcu_dereference(garp_protos[eh->h_dest[5] -
GARP_ADDR_MIN]);
if (proto &&
compare_ether_addr(eh->h_dest, proto->group_address))
goto err;
} else
proto = rcu_dereference(stp_proto);
if (!proto)
goto err;
proto->rcv(proto, skb, dev); // the actual STP receive function
return 0;
……
}
The actual STP receive function, br_stp_rcv:
void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
struct net_device *dev)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = rcu_dereference(dev->br_port);
struct net_bridge *br;
const unsigned char *buf;
if (!p)
goto err;
if (!pskb_may_pull(skb, 4))
goto err;
/* compare of protocol id and version */
buf = skb->data;
if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0)
goto err;
br = p->br;
spin_lock(&br->lock);
if (br->stp_enabled != BR_KERNEL_STP) // kernel STP not enabled
goto out;
if (!(br->dev->flags & IFF_UP))
goto out;
if (p->state == BR_STATE_DISABLED)
goto out;
if (compare_ether_addr(dest, br->group_addr) != 0)
goto out;
buf = skb_pull(skb, 3);
if (buf[0] == BPDU_TYPE_CONFIG) {
struct br_config_bpdu bpdu;
if (!pskb_may_pull(skb, 32))
goto out;
buf = skb->data;
bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0;
bpdu.topology_change_ack = (buf[1] & 0x80) ? 1 : 0;
bpdu.root.prio[0] = buf[2];
bpdu.root.prio[1] = buf[3];
bpdu.root.addr[0] = buf[4];
bpdu.root.addr[1] = buf[5];
bpdu.root.addr[2] = buf[6];
bpdu.root.addr[3] = buf[7];
bpdu.root.addr[4] = buf[8];
bpdu.root.addr[5] = buf[9];
bpdu.root_path_cost =
(buf[10] << 24) |
(buf[11] << 16) |
(buf[12] << 8) |
buf[13];
bpdu.bridge_id.prio[0] = buf[14];
bpdu.bridge_id.prio[1] = buf[15];
bpdu.bridge_id.addr[0] = buf[16];
bpdu.bridge_id.addr[1] = buf[17];
bpdu.bridge_id.addr[2] = buf[18];
bpdu.bridge_id.addr[3] = buf[19];
bpdu.bridge_id.addr[4] = buf[20];
bpdu.bridge_id.addr[5] = buf[21];
bpdu.port_id = (buf[22] << 8) | buf[23];
bpdu.message_age = br_get_ticks(buf+24);
bpdu.max_age = br_get_ticks(buf+26);
bpdu.hello_time = br_get_ticks(buf+28);
bpdu.forward_delay = br_get_ticks(buf+30);
br_received_config_bpdu(p, &bpdu); // process the bridge configuration BPDU
}
else if (buf[0] == BPDU_TYPE_TCN) {
br_received_tcn_bpdu(p); // process the topology change notification
}
out:
spin_unlock(&br->lock);
err:
kfree_skb(skb);
}
6.2.3 Core Implementation
6.2.3.1 br_received_config_bpdu
void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
{
struct net_bridge *br;
int was_root;
br = p->br;
// are we the root bridge? compare our bridge ID against the root ID in the BPDU
was_root = br_is_root_bridge(br);
// compare the information in the BPDU (bpdu) against the port's recorded information (p); returns nonzero if the BPDU's information supersedes it
if (br_supersedes_port_info(p, bpdu)) {
// record the new information
br_record_config_information(p, bpdu);
// re-run the root bridge / root port election
br_configuration_update(br);
// set the port states
br_port_state_selection(br);
// if this BPDU changed the topology, e.g. we used to be the root bridge and no longer are, a TCN must be sent to announce it
if (!br_is_root_bridge(br) && was_root) {
del_timer(&br->hello_timer);
if (br->topology_change_detected) {
del_timer(&br->topology_change_timer);
br_transmit_tcn(br);
mod_timer(&br->tcn_timer,
jiffies + br->bridge_hello_time);
}
}
// this BPDU needs to be propagated onward
if (p->port_no == br->root_port) {
br_record_config_timeout_values(br, bpdu);
br_config_bpdu_generation(br);
if (bpdu->topology_change_ack)
br_topology_change_acknowledged(br);
}
} else if (br_is_designated_port(p)) { // the received BPDU was not 'superior' and arrived on a designated port rather than the root port: simply reply with our own configuration
br_reply(p);
}
}
6.2.3.2 br_received_tcn_bpdu
void br_received_tcn_bpdu(struct net_bridge_port *p)
{
if (br_is_designated_port(p)) {
pr_info("%s: received tcn bpdu on port %i(%s)\n",
p->br->dev->name, p->port_no, p->dev->name);
br_topology_change_detection(p->br); // kick off topology change notification
br_topology_change_acknowledge(p); // acknowledge the change
}
}