轉載
https://segmentfault.com/a/1190000020946695
TYPE-2
添加頭端複製表項。這種情況很少出現;一般來說,對端會發送 Type-3 類型的路由用於 VTEP 發現。
/*
 * Add a head-end-replication entry for a remote VTEP to the kernel.
 *
 * Nothing is installed (and 0 is returned) unless
 * is_vxlan_flooding_head_end() is true and the VTEP's flood_control is
 * VXLAN_FLOOD_HEAD_END_REPL. When both hold, the return value of
 * kernel_add_vtep() is handed back unchanged.
 */
static int zvni_vtep_install(zebra_vni_t *zvni, zebra_vtep_t *zvtep)
{
	/* Guard: head-end flooding is not in effect. */
	if (!is_vxlan_flooding_head_end())
		return 0;

	/* Guard: this remote VTEP did not request head-end replication. */
	if (zvtep->flood_control != VXLAN_FLOOD_HEAD_END_REPL)
		return 0;

	/* Have the kernel add the head-end-replication entry. */
	return kernel_add_vtep(zvni->vni, zvni->vxlan_if, &zvtep->vtep_ip);
}
添加mac表項(用於同子網轉發)
/*
 * Push a remote MAC entry of this VNI down to the kernel.
 *
 * Returns 0 without doing anything when the MAC does not carry
 * ZEBRA_MAC_REMOTE, -1 when the VxLAN interface has no attached info,
 * and otherwise the return value of kernel_add_mac().
 */
static int zvni_mac_install(zebra_vni_t *zvni, zebra_mac_t *mac)
{
	struct zebra_if *vxlan_zif;
	struct zebra_l2info_vxlan *vxlan_info;
	bool is_sticky = false;

	/* Only remote MACs are handled here. */
	if (!(mac->flags & ZEBRA_MAC_REMOTE))
		return 0;

	vxlan_zif = zvni->vxlan_if->info;
	if (!vxlan_zif)
		return -1;

	vxlan_info = &vxlan_zif->l2info.vxl;

	/*
	 * The entry is passed to the kernel as sticky when either the
	 * sticky flag or the remote default-gateway flag is set on the MAC.
	 */
	if (CHECK_FLAG(mac->flags,
		       (ZEBRA_MAC_STICKY | ZEBRA_MAC_REMOTE_DEF_GW)))
		is_sticky = true;

	return kernel_add_mac(zvni->vxlan_if, vxlan_info->access_vlan,
			      &mac->macaddr, mac->fwd_info.r_vtep_ip,
			      is_sticky);
}
添加鄰居表項(跨子網報文轉發時,用作內層目的mac)
/*
 * Push a remote neighbor entry of this VNI down to the kernel.
 *
 * Returns 0 when the neighbor does not carry ZEBRA_NEIGH_REMOTE, and -1
 * when the VxLAN interface has no attached info or zvni_map_to_svi()
 * yields no interface. When GNU_LINUX is defined the neighbor is marked
 * active and the result of kernel_add_neigh() is returned; otherwise the
 * function returns 0 without touching the kernel.
 */
static int zvni_neigh_install(zebra_vni_t *zvni, zebra_neigh_t *n)
{
	struct zebra_if *vxlan_zif;
	struct zebra_l2info_vxlan *vxlan_info;
	struct interface *svi_if;
#ifdef GNU_LINUX
	/* Every entry installed here carries NTF_EXT_LEARNED. */
	uint8_t ntf_flags = NTF_EXT_LEARNED;
#endif
	int rc = 0;

	/* Only remote neighbors are handled here. */
	if (!(n->flags & ZEBRA_NEIGH_REMOTE))
		return 0;

	vxlan_zif = zvni->vxlan_if->info;
	if (!vxlan_zif)
		return -1;

	/* Look up the interface for the access VLAN on the bridge. */
	vxlan_info = &vxlan_zif->l2info.vxl;
	svi_if = zvni_map_to_svi(vxlan_info->access_vlan,
				 vxlan_zif->brslave_info.br_if);
	if (!svi_if)
		return -1;

#ifdef GNU_LINUX
	/* Carry the router flag over to the kernel entry when it is set. */
	if (n->flags & ZEBRA_NEIGH_ROUTER_FLAG)
		ntf_flags |= NTF_ROUTER;

	ZEBRA_NEIGH_SET_ACTIVE(n);
	rc = kernel_add_neigh(svi_if, &n->ip, &n->emac, ntf_flags);
#endif
	return rc;
}
/*
 * Add a neighbor entry in NUD_NOARP state.
 *
 * Forwards all arguments to netlink_neigh_update2() with the state fixed
 * to NUD_NOARP and the command fixed to RTM_NEWNEIGH, and returns its
 * result.
 */
int kernel_add_neigh(struct interface *ifp, struct ipaddr *ip,
		     struct ethaddr *mac, uint8_t flags)
{
	int rc;

	rc = netlink_neigh_update2(ifp, ip, mac, flags, NUD_NOARP,
				   RTM_NEWNEIGH);
	return rc;
}
不需要另外添加路由:創建 bdif 時,該 bdif 需要作為本 L2VNI 的網關,在其上配置 IP 後會生成本網段的網段路由,結合上面的鄰居表項即可完成跨子網路由轉發。
注:對於集中式路由網關,如果設置了 default-gw 標誌,發布的本地 MAC/IP 消息在設置鄰居表時標誌為 NUD_NOARP;攜帶 sticky 標誌時也會是這種類型的鄰居,其他情況則是 NTF_EXT_LEARNED 表項。
TYPE-3
添加mac值為全零的頭端復制fdb表項
/*
 * Add a head-end-replication entry for a remote VTEP to the kernel.
 *
 * Nothing is installed (and 0 is returned) unless
 * is_vxlan_flooding_head_end() is true and the VTEP's flood_control is
 * VXLAN_FLOOD_HEAD_END_REPL. When both hold, the return value of
 * kernel_add_vtep() is handed back unchanged.
 */
static int zvni_vtep_install(zebra_vni_t *zvni, zebra_vtep_t *zvtep)
{
	/* Guard: head-end flooding is not in effect. */
	if (!is_vxlan_flooding_head_end())
		return 0;

	/* Guard: this remote VTEP did not request head-end replication. */
	if (zvtep->flood_control != VXLAN_FLOOD_HEAD_END_REPL)
		return 0;

	/* Have the kernel add the head-end-replication entry. */
	return kernel_add_vtep(zvni->vni, zvni->vxlan_if, &zvtep->vtep_ip);
}
TYPE-5
FRR-BGP對於網段路由采用的是interface-less模型,如下圖所示:
在linux內核中是如下配置:
右邊的VTEP的IP為10.200.200.1(underlay-ip),其路由mac為0200.0ade.de01(這個是overlay的mac,通常作為內層報文的mac)。當右邊的設備發布一條192.168.1.0/24的網段路由的時候,左邊的BGP將會收到如下所示的type-5類型的路由:
可以看到其 NLRI 中的前綴為 192.168.1.0/24,下一跳屬性為 10.200.200.1(是一個 underlay 地址)。同時使用路由 MAC 擴展團體屬性攜帶了 overlay 網關的 MAC(0200.0ade.de01),還攜帶了 L3VNI。左邊的設備收到該路由後會進行處理。
在指定vrf中安裝路由
/*
 * Create an IPv4 gateway + ifindex nexthop and attach it to a route entry.
 *
 * The new nexthop gets type NEXTHOP_TYPE_IPV4_IFINDEX, the given VRF id,
 * gateway, ifindex and (when src is non-NULL) source address. It is added
 * to re via route_entry_nexthop_add() and also returned to the caller.
 */
struct nexthop *route_entry_nexthop_ipv4_ifindex_add(struct route_entry *re,
						     struct in_addr *ipv4,
						     struct in_addr *src,
						     ifindex_t ifindex,
						     vrf_id_t nh_vrf_id)
{
	struct nexthop *nh = nexthop_new();
	struct interface *out_if;

	nh->type = NEXTHOP_TYPE_IPV4_IFINDEX;
	nh->vrf_id = nh_vrf_id;
	nh->ifindex = ifindex;
	nh->gate.ipv4 = *ipv4;
	if (src)
		nh->src.ipv4 = *src;

	out_if = if_lookup_by_index(nh->ifindex, nh_vrf_id);

	/*
	 * Pending: need to think if null ifp here is ok during bootup?
	 * There was a crash because ifp here was coming to be NULL.
	 *
	 * NEXTHOP_FLAG_ONLINK is set only when the interface exists and
	 * connected_is_unnumbered() is true for it.
	 * NOTE(review): the original annotation states the interface must
	 * have no IP configured, otherwise the route is not installed
	 * correctly -- confirm against connected_is_unnumbered().
	 */
	if (out_if && connected_is_unnumbered(out_if))
		SET_FLAG(nh->flags, NEXTHOP_FLAG_ONLINK);

	route_entry_nexthop_add(re, nh);

	return nh;
}
通過上面的函數整理出路由的下一跳后,使用如下函數添加路由:
/*
 * Install, replace or remove a prefix in the kernel, driven by the
 * operation and route data held in a dataplane context.
 *
 * Returns ZEBRA_DPLANE_REQUEST_FAILURE for an operation other than
 * route delete/install/update or when the netlink call reports a
 * non-zero result; ZEBRA_DPLANE_REQUEST_SUCCESS otherwise. Routes whose
 * type satisfies RSYSTEM_ROUTE() are not sent to the kernel and count
 * as success.
 */
enum zebra_dplane_result kernel_route_update(struct zebra_dplane_ctx *ctx)
{
	int nl_cmd;
	int rc = 0;
	const struct prefix *dest = dplane_ctx_get_dest(ctx);
	struct nexthop *nh;

	switch (dplane_ctx_get_op(ctx)) {
	case DPLANE_OP_ROUTE_DELETE:
		nl_cmd = RTM_DELROUTE;
		break;

	case DPLANE_OP_ROUTE_INSTALL:
		nl_cmd = RTM_NEWROUTE;
		break;

	case DPLANE_OP_ROUTE_UPDATE:
		/*
		 * IPv4, or IPv6 with v6_rr_semantics set, is handled by a
		 * single 'replace' operation.
		 *
		 * Otherwise: v6 route replace semantics are not in the
		 * kernel at this point, so do a delete then an add. In the
		 * future, once v6 route replace semantics are in, we can
		 * figure out what to do here to allow working with old and
		 * new kernels.
		 *
		 * The failure case of the route delete is intentionally
		 * ignored; if that happens we are out of options anyway.
		 */
		if (dest->family != AF_INET && !v6_rr_semantics) {
			if (!RSYSTEM_ROUTE(dplane_ctx_get_old_type(ctx)))
				(void)netlink_route_multipath(RTM_DELROUTE,
							      ctx);
		}
		nl_cmd = RTM_NEWROUTE;
		break;

	default:
		return ZEBRA_DPLANE_REQUEST_FAILURE;
	}

	/* System routes are never pushed to the kernel from here. */
	if (!RSYSTEM_ROUTE(dplane_ctx_get_type(ctx)))
		rc = netlink_route_multipath(nl_cmd, ctx);

	if (rc != 0)
		return ZEBRA_DPLANE_REQUEST_FAILURE;

	if (nl_cmd == RTM_NEWROUTE) {
		/*
		 * Mark the installed nexthops: every active, non-recursive
		 * nexthop gets NEXTHOP_FLAG_FIB.
		 */
		for (ALL_NEXTHOPS_PTR(dplane_ctx_get_ng(ctx), nh)) {
			if (CHECK_FLAG(nh->flags, NEXTHOP_FLAG_RECURSIVE))
				continue;

			if (CHECK_FLAG(nh->flags, NEXTHOP_FLAG_ACTIVE))
				SET_FLAG(nh->flags, NEXTHOP_FLAG_FIB);
		}
	}

	return ZEBRA_DPLANE_REQUEST_SUCCESS;
}
可以使用如下命令達到同樣的效果:
sudo ip route add 192.168.1.0/24 via 10.200.200.1 dev br100 proto bgp metric 20 onlink
#注意:onlink 屬性一定要添加,表示下一跳是直連的鄰居,從上面的代碼可以看出這一點
提取路由mac和下一跳ip構建鄰居(這個鄰居比較特殊,其中mac是overlay的mac,而IP是underlay的IP),在linux內核中添加鄰居表項,且設置了noarp屬性。
/*
 * Add a neighbor entry in NUD_NOARP state.
 *
 * Forwards all arguments to netlink_neigh_update2() with the state fixed
 * to NUD_NOARP and the command fixed to RTM_NEWNEIGH, and returns its
 * result.
 */
int kernel_add_neigh(struct interface *ifp, struct ipaddr *ip,
		     struct ethaddr *mac, uint8_t flags)
{
	int rc;

	rc = netlink_neigh_update2(ifp, ip, mac, flags, NUD_NOARP,
				   RTM_NEWNEIGH);
	return rc;
}
可以使用ip monitor命令監聽到這一過程:
10.200.200.1 dev br100 lladdr 02:00:0a:de:de:01 NOARP
可以使用命令sudo ip neigh add 10.200.200.1 dev br100 lladdr 02:00:0a:de:de:01 nud noarp vrf evpn-vrf
達到相同的結果。
同時使用rmac和下一跳IP構建fdb表項:
/*
 * Add a MAC FDB entry pointing at a remote VTEP.
 *
 * Forwards the interface, VLAN id, MAC address, VTEP IP and sticky
 * indication to netlink_macfdb_update() with the command fixed to
 * RTM_NEWNEIGH, and returns its result.
 */
int kernel_add_mac(struct interface *ifp, vlanid_t vid, struct ethaddr *mac,
		   struct in_addr vtep_ip, bool sticky)
{
	int rc;

	rc = netlink_macfdb_update(ifp, vid, mac, vtep_ip, RTM_NEWNEIGH,
				   sticky);
	return rc;
}
可以使用如下命令得到相同的效果:
sudo bridge fdb add 02:00:0a:de:de:01 dev vxlan100 dst 10.200.200.1 self extern_learn
調用棧為:
#0 zebra_vxlan_evpn_vrf_route_add (vrf_id=11, rmac=0x7fff76e7cba0, vtep_ip=0x7fff76e7cacc, host_prefix=0x7fff76e7caf0) at zebra/zebra_vxlan.c:5680
#1 0x0000557f9485a716 in zread_route_add (client=0x557f96929790, hdr=<optimized out>, msg=<optimized out>, zvrf=<optimized out>) at zebra/zapi_msg.c:1488
#2 0x0000557f9485cebb in zserv_handle_commands (client=client@entry=0x557f96929790, msg=msg@entry=0x7ff374001040) at zebra/zapi_msg.c:2532
#3 0x0000557f9485714e in zserv_process_messages (thread=<optimized out>) at zebra/zserv.c:523
#4 0x00007ff37f3ef968 in thread_call (thread=thread@entry=0x7fff76e7e910) at lib/thread.c:1547
#5 0x00007ff37f3cc257 in frr_run (master=0x557f9672baa0) at lib/libfrr.c:1021
#6 0x0000557f9481b1be in main (argc=2, argv=0x7fff76e7ecd8) at zebra/main.c:475
(gdb) s
TYPE-4
TYPE4用於MLAG,暫時了解不多。