This is how to use SOCKMAP: SOCKMAP or specifically "BPF_MAP_TYPE_SOCKMAP", is a type of an eBPF map. This map is an "array" - indices are integers. All this is pretty standard. The magic is in the map values - they must be TCP socket descriptors.
copy from:https://blog.cloudflare.com/sockmap-tcp-splicing-of-the-future/
也就是說,eBPF 程序必須 attach 到一個 map 上,而不是 attach 到某個 socket 上。那麼 SOCKMAP 該如何使用?
sock_map = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int), sizeof(int), 2, 0) prog_parser = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...) prog_verdict = bpf_load_program(BPF_PROG_TYPE_SK_SKB, ...) bpf_prog_attach(prog_parser, sock_map, BPF_SK_SKB_STREAM_PARSER) bpf_prog_attach(prog_verdict, sock_map, BPF_SK_SKB_STREAM_VERDICT)
-
先看看 bpf_create_map的作用: 創建一個map內存塊
-
BPF map的應用場景有幾種:
- BPF程序和用戶態的交互:BPF程序運行完,得到的結果存儲到map中,供用戶態訪問;
- BPF程序內部交互:如果BPF程序內部需要用全局變量來交互,但是由於安全原因BPF程序不允許訪問全局變量,可以使用map來充當全局變量;
- BPF Tail call:Tail call是一個BPF程序跳轉到另一BPF程序,BPF程序首先通過BPF_MAP_TYPE_PROG_ARRAY類型的map來知道另一個BPF程序的指針,然后調用tail_call()的helper function來執行Tail call。
- BPF程序和內核態的交互:和BPF程序以外的內核程序交互,也可以使用map作為中介;
-
- Map 類型(
map_type
),就是上文提到的各種 Map 類型 - Map 的鍵大小(
key_size
),以字節為單位 - Map 的值大小(
value_size
),以字節為單位 - Map 的元素最大容量(
max_entries
),個數為單位
- Map 類型(
{ struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* BPF_MAP_CREATE related * flags defined above. */ __u32 inner_map_fd; /* fd pointing to the inner map */ __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; __u32 map_ifindex; /* ifindex of netdev to create on */ __u32 btf_fd; /* fd pointing to a BTF type data */ __u32 btf_key_type_id; /* BTF type_id of the key */ __u32 btf_value_type_id; /* BTF type_id of the value */ __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- * struct stored as the * map value */ }; --------------------------- }
/*
 * bpf_create_map - create a BPF map from its basic parameters.
 * @map_type:    map kind, e.g. BPF_MAP_TYPE_SOCKMAP, BPF_MAP_TYPE_HASH,
 *               BPF_MAP_TYPE_ARRAY and so on
 * @key_size:    size of the key of each key/value pair
 * @value_size:  size of the value of each key/value pair
 * @max_entries: maximum number of key/value pairs in the map
 * @map_flags:   map flag bits
 *
 * Packs the arguments into a struct bpf_create_map_attr and forwards it to
 * bpf_create_map_xattr().
 *
 * Return: whatever bpf_create_map_xattr() returns.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
		   int value_size, int max_entries, __u32 map_flags)
{
	struct bpf_create_map_attr map_attr = {};

	map_attr.map_type = map_type;
	map_attr.map_flags = map_flags;
	map_attr.key_size = key_size;
	map_attr.value_size = value_size;
	map_attr.max_entries = max_entries;

	return bpf_create_map_xattr(&map_attr);
}

int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) { union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); // 完成 bpf_attr的賦值初始化 attr.map_type = create_attr->map_type; attr.key_size = create_attr->key_size; attr.value_size = create_attr->value_size; attr.max_entries = create_attr->max_entries; attr.map_flags = create_attr->map_flags; if (create_attr->name) memcpy(attr.map_name, create_attr->name, min(strlen(create_attr->name), BPF_OBJ_NAME_LEN - 1)); attr.numa_node = create_attr->numa_node; attr.btf_fd = create_attr->btf_fd; attr.btf_key_type_id = create_attr->btf_key_type_id; attr.btf_value_type_id = create_attr->btf_value_type_id; attr.map_ifindex = create_attr->map_ifindex; if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS) attr.btf_vmlinux_value_type_id = create_attr->btf_vmlinux_value_type_id; else attr.inner_map_fd = create_attr->inner_map_fd; //調用bpf 系統調用 創建 一個map bpf 第一個參數為命令參數,比如: BPF_MAP_CREATE BPF_MAP_UPDATE_ELEM BPF_MAP_DELETE_ELEM return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); }
可以看到 實際上 會調用一個map_create 函數 分配內存 並初始化一個map
static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; if (attr->btf_vmlinux_value_type_id) { if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || attr->btf_key_type_id || attr->btf_value_type_id) return -EINVAL; } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { return -EINVAL; } f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... 分配內存使用 */ map = find_and_alloc_map(attr); if (IS_ERR(map)) return PTR_ERR(map); err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); map->spin_lock_off = -EINVAL; ---------------------------------------------- err = bpf_map_alloc_id(map); // 將map 和 idx-id 相關聯索引 if (err) goto free_map_sec; err = bpf_map_new_fd(map, f_flags);// 將map 和fd 關聯 一切皆文件 if (err < 0) { /* failed to allocate fd. * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put_with_uref(map); return err; } return err; }
map_create 會調用:對應map_type的ops去分配內存等
以map_array為例:

/* Operation table for the array map type shown in this excerpt. */
static const struct bpf_map_ops array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

/* Binds BPF_MAP_TYPE_ARRAY to the operation table above. */
static struct bpf_map_type_list array_type __read_mostly = {
	.ops = &array_ops,
	.type = BPF_MAP_TYPE_ARRAY,
};

/*
 * array_map_alloc - validate the attributes and allocate an array map.
 * @attr: map creation attributes (map_type, key_size, value_size,
 *        max_entries, map_flags are read here)
 *
 * Return: pointer to the embedded struct bpf_map on success, or
 * ERR_PTR(-EINVAL / -E2BIG / -ENOMEM) on failure.
 */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	u32 elem_size, index_mask, max_entries;
	bool unpriv = !capable(CAP_SYS_ADMIN);
	struct bpf_array *array;
	u64 array_size, mask64;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 || attr->map_flags)
		return ERR_PTR(-EINVAL);

	if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	/* (1.1.1) Compute the per-element value size. The key needs neither
	 * sizing nor storage because the key is the array index itself.
	 */
	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (unpriv) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows.
		 */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	/* (1.1.2) Compute the total size of bpf_array plus the value array.
	 * bpf_array contains the generic map structure bpf_map.
	 */
	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) max_entries * sizeof(void *);
	else
		array_size += (u64) max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	if (array_size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	/* allocate all map elements and zero-initialize them */
	/* (1.1.3) Allocate the bpf_array according to the total size. */
	array = bpf_map_area_alloc(array_size);
	if (!array)
		return ERR_PTR(-ENOMEM);
	array->index_mask = index_mask;
	array->map.unpriv_array = unpriv;

	/* copy mandatory map attributes */
	/* (1.1.4) Copy the attributes from attr into array->map. */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->elem_size = elem_size;

	if (!percpu)
		goto out;

	/* Per-CPU maps additionally account for one value copy per possible CPU. */
	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();

	if (array_size >= U32_MAX - PAGE_SIZE ||
	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}
out:
	/* Record the map footprint in pages, rounded up to a whole page. */
	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;

	return &array->map;
}
bpf_load_program:用BPF_PROG_LOAD
命令進行bpf系統調用加載 BPF 程序到內核中
- 拷貝程序到內核;
- 校驗它的安全性;
- 如果可能對它進行JIT編譯;
- 然后分配一個文件句柄fd給它
完成這一切后,后續再把這段BPF程序掛載到需要運行的鈎子上面。
static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; int err; char license[128]; bool is_gpl; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && !bpf_capable()) return -EPERM; /* copy eBPF program license from user space 根據attr->license地址,從用戶空間拷貝license字符串到內核 */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions 判斷license是否符合GPL協議*/ is_gpl = license_is_gpl_compatible(license); //判斷BPF的總指令數是否超過BPF_MAXINSNS(4k) if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; //對BPF_PROG_TYPE_SOCKET_FILTER和BPF_PROG_TYPE_CGROUP_SKB以外的BPF程序加載,需要管理員權限 if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !bpf_capable()) return -EPERM; //對 CGROUP SOCK等需要admin 權限 或者 對應net 空間的權限 if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attr->attach_btf_id, attr->attach_prog_fd)) return -EINVAL; /* plain bpf_prog allocation 根據BPF指令數分配bpf_prog空間,和bpf_prog->aux空間*/ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; if (attr->attach_prog_fd) { struct bpf_prog *tgt_prog; tgt_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(tgt_prog)) { err = PTR_ERR(tgt_prog); goto 
free_prog_nouncharge; } prog->aux->linked_prog = tgt_prog; } prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; err = bpf_prog_charge_memlock(prog); if (err) goto free_prog_sec; prog->len = attr->insn_cnt; err = -EFAULT;//把BPF代碼從用戶空間地址attr->insns,拷貝到內核空間地址prog->insns if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; } /* find program type: socket_filter vs tracing_filter 根據attr->prog_type指定的type值,找到對應的bpf_prog_types, 給bpf_prog->aux->ops賦值,這個ops是一個函數操作集*/ err = find_prog_type(type, prog); if (err < 0) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) goto free_prog; /* run eBPF verifier 使用verifer對BPF程序進行合法性掃描 */ err = bpf_check(&prog, attr, uattr); if (err < 0) goto free_used_maps; /*嘗試對BPF程序進行JIT轉換*/ prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; //給BPF程序分配關聯一個idx id索引 err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; /* Upon success of bpf_prog_alloc_id(), the BPF prog is * effectively publicly exposed. However, retrieving via * bpf_prog_get_fd_by_id() will take another reference, * therefore it cannot be gone underneath us. * * Only for the time /after/ successful bpf_prog_new_fd() * and before returning to userspace, we might just hold * one reference and any parallel close on that fd could * rip everything out. Hence, below notifications must * happen before bpf_prog_new_fd(). * * Also, any failure handling from this point onwards must * be using bpf_prog_put() given the program is exposed. 
*/ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_LOAD); //給BPF程序分配一個文件句柄fd err = bpf_prog_new_fd(prog); if (err < 0) bpf_prog_put(prog); return err; -------------------------------- }
bpf_prog_attach:如何把我的bpf程序,attach到這些類型上:
重定向程序作為BPF_SK_SKB_STREAM_VERDICT附加到sockmap; 它應返回bpf_sk_redirect_map()的結果。
一個strparser程序通過BPF_SK_SKB_STREAM_PARSER附加,並且應返回已解析數據的長度。
能夠獲取什么樣的context?
指向包含包元數據/數據的結構 __sk_buff 的指針。但是,sk_skb 程序類型可以訪問更多字段。可用的額外字段集記錄在 include/linux/bpf.h 中,如下所示:
什么時候會運行?
可以通過把 BPF_SK_SKB_STREAM_PARSER 附加到 sockmap 上,來把一個 stream parser 附加到 socket 上;之后,當 socket 接收到數據時,該程序會通過 kernel/bpf/sockmap.c 中的 smap_parse_func_strparser() 被執行。BPF_SK_SKB_STREAM_VERDICT 同樣附加到 sockmap 上,它通過 smap_verdict_func() 來執行。
/* bpf_load_program bpf_prog_attach(verdict_prog, map_fd, BPF_SMAP_STREAM_VERDICT, 0); int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, unsigned int flags) { DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, .flags = flags, ); return bpf_prog_attach_xattr(prog_fd, target_fd, type, &opts); } int bpf_prog_attach_xattr(int prog_fd, int target_fd, enum bpf_attach_type type, const struct bpf_prog_attach_opts *opts) { union bpf_attr attr; if (!OPTS_VALID(opts, bpf_prog_attach_opts)) return -EINVAL; memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; attr.attach_flags = OPTS_GET(opts, flags, 0); attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); }
int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; switch (which) { ------------------------------------------ case BPF_SK_SKB_STREAM_PARSER: pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->skb_verdict; break; } psock_set_prog(pprog, prog); return 0; }
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);---//找到對應的sk_psock_progs 並更新 fdput(f); return ret; } */ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype;BPF_SOCK_STREAM_VERDICT struct bpf_prog *prog = NULL; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; //------BPF_SK_SKB_STREAM_VERDICT-------> transmit -----BPF_PROG_TYPE_SK_SKB 也就是attach type 轉換為 prog-type ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { bpf_prog_put(prog); return -EINVAL; } /* const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto,--------------bpf_sk_redirect_map_proto----------bpf_msg_redirect_map .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; */ switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: ret = sock_map_get_from_fd(attr, prog);// 根據target_fd 找到 map 並關聯對應map break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: ret = -EINVAL; } if (ret) 
bpf_prog_put(prog); return ret; }
established sock_map
eBPF map, with two eBPF programs attached: parser and verdict.
The next step is to add a TCP socket descriptor to this map
int val = fd; bpf_map_update_elem(sock_map, &idx, &val, BPF_ANY);
bpf_map_update_elem: 將fd socket 和map相關聯
會執行系統調用 bpf(BPF_MAP_UPDATE_ELEM,-----) 最后調用map_update_elem 函數處理
static int map_update_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key);// 對應idx 索引 void __user *uvalue = u64_to_user_ptr(attr->value);//對應 鍵值 value 比如 需要執行動作的socket--fd int ufd = attr->map_fd; ----------------------- f = fdget(ufd);// map_fd--->file--->對應的map 內存 map = __bpf_map_get(f);// map_fd--->file--->對應的map 內存 f.file->private_data; ------------------------------ ----------------------------------// 將 key value 更新到map 中 err = bpf_map_update_value(map, f, key, value, attr->flags); }
/*
 * bpf_map_update_value - store one key/value pair into a map (excerpt).
 * @map:   target map
 * @f:     fd structure of the map (not used in the part shown here)
 * @key:   key to update
 * @value: value to store
 * @flags: update flags passed through to the map implementation
 *
 * Only the branches relevant to this article are shown: device-bound maps go
 * to the offload path, and CPUMAP / SOCKHASH / SOCKMAP / STRUCT_OPS maps call
 * the map's own map_update_elem operation.
 */
static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
				void *value, __u64 flags)
{
	int err;

	/* Need to create a kthread, thus must support schedule */
	if (bpf_map_is_dev_bound(map)) {
		return bpf_map_offload_update_elem(map, key, value, flags);
	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
		   map->map_type == BPF_MAP_TYPE_SOCKMAP || /* -> sock_map_update_elem */
		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
		return map->ops->map_update_elem(map, key, value, flags);
	}

	/*
	 * Handling of the remaining map types is omitted in this excerpt;
	 * NOTE(review): presumably that omitted code assigns err - confirm
	 * against the kernel source.
	 */

	return err;
}
以sock_map_update_elem 為例查看
static int sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { u32 idx = *(u32 *)key; struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; --------------------------- sock = sockfd_lookup(ufd, &ret);// 根據value:sockt-fd 找到對應的struct socket ---------- sk = sock->sk;//sock---對應的net sk 結構體 ----------- ret = sock_map_update_common(map, idx, sk, flags); }
static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; link = sk_psock_init_link();//分配內存 /* Only sockets we can redirect into/from in BPF need to hold * refs to parser/verdict progs and have their sk_data_ready * and sk_write_space callbacks overridden. */ ret = sock_map_link(map, &stab->progs, sk); psock = sk_psock(sk); WARN_ON_ONCE(!psock); raw_spin_lock_bh(&stab->lock); osk = stab->sks[idx]; sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; sock_map_unref(osk, &stab->sks[idx]); return 0; } static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict);-------賦值見-sock_map_prog_update skb_parser = READ_ONCE(progs->skb_parser); skb_progs = skb_parser && skb_verdict; --------------------- msg_parser = READ_ONCE(progs->msg_parser); ------------------ psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } ------------------- psock = sk_psock_init(sk, map->numa_node); 將sk 和psock 相關聯:創建psock ;psock->sk = sk; --------------------- //主要是sk->sk_prot=ops 替換sk 的ops 函數;替換為bpf_ops ret = sock_map_init_proto(sk, psock); if (ret < 0) goto out_drop; if (skb_progs && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock);//設置strparser cb 回調函數 if (ret) { write_unlock_bh(&sk->sk_callback_lock); goto out_drop; } psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); //設置 sk 的data_ready 數據到達喚醒函數 sk_psock_start_strp(sk, psock); } return 0; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; if (parser->enabled) return; 
parser->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; psock->parser.enabled = false; return strp_init(&psock->parser.strp, sk, &cb); } 設置strparser cb 回調函數 int strp_init(struct strparser *strp, struct sock *sk, const struct strp_callbacks *cb) { -------------------- /* The sk (sock) arg determines the mode of the stream parser. * * If the sock is set then the strparser is in receive callback mode. * The upper layer calls strp_data_ready to kick receive processing * and strparser calls the read_sock function on the socket to * get packets. * * If the sock is not set then the strparser is in general mode. * The upper layer calls strp_process for each skb to be parsed. */ --------------- memset(strp, 0, sizeof(*strp)); strp->sk = sk; strp->cb.lock = cb->lock ? : strp_sock_lock; strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? 
: strp_abort_strp; INIT_DELAYED_WORK(&strp->msg_timer_work, strp_msg_timeout); INIT_WORK(&strp->work, strp_work); return 0; } static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage; } struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; if (!psock->sk_proto) { struct proto *ops = READ_ONCE(sk->sk_prot); if (tcp_bpf_assert_proto_ops(ops)) return ERR_PTR(-EINVAL); tcp_bpf_check_v6_needs_rebuild(sk, ops); } return &tcp_bpf_prots[family][config]; } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { struct proto *prot; switch (sk->sk_type) { case SOCK_STREAM: prot = tcp_bpf_get_proto(sk, psock); break; case SOCK_DGRAM: prot = udp_bpf_get_proto(sk, psock); break; sk_psock_update_proto(sk, psock, prot); return 0; }
From now on, each time our socket sd
receives a packet,
prog_parser and prog_verdict are called
/* Stream parser: report the whole skb as one message. */
SEC("prog_parser")
int _prog_parser(struct __sk_buff *skb)
{
	return skb->len;
}

/* Stream verdict: redirect the skb to the socket stored at index 0 of sock_map. */
SEC("prog_verdict")
int _prog_verdict(struct __sk_buff *skb)
{
	uint32_t target_idx = 0;

	return bpf_sk_redirect_map(skb, &sock_map, target_idx, 0);
}
bpf_sk_redirect_map
tells the kernel: for the received packet, please oh please redirect it from a receive queue of some socket,to a transmit queue of the socket living in sock_map under index 0. In our case, these are the same sockets!Here we achieved exactly what the echo server is supposed to do, but purely in eBPF.
const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; }
參考學習:
eBPF學習用例:
Linux 內核觀測技術 BPF書籍
https://davidlovezoe.club/wordpress/archives/862
http://arthurchiao.art/blog/cilium-life-of-a-packet-pod-to-service-zh/
https://switch-router.gitee.io/blog/strparser/
https://davidlovezoe.club/wordpress/archives/963
https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/
https://jishuin.proginn.com/p/763bfbd2bc4e
https://blog.csdn.net/pwl999/article/details/82884882
https://github.com/zoidbergwill/awesome-ebpf
https://patchwork.ozlabs.org/project/netdev/patch/20170816053247.15445.69312.stgit@john-Precision-Tower-5810/
https://switch-router.gitee.io/blog/strparser/
https://blogs.oracle.com/linux/notes-on-bpf-1
總結:
- eBPF 程序處理截獲報文的例子:psock。psock 使用 strparser 將數據包的控制權轉移到 eBPF 處理程序,用戶可以在 eBPF 程序里完成網絡報文的重定向;sockmap 建立在 psock 之上,而 psock 的底層則是 strparser
strparser 的工作原理
核心數據結構:struct strparser 是 strparser 框架的核心數據結構,它綁定(attach)一個 TCP sock 結構 sk 和一組回調函數 cb
struct strparser { struct sock *sk; // code omitted .... struct strp_callbacks cb; };
回調函數一共有以下六個:
struct strparser;
struct sk_buff;

/* The six callbacks a strparser user can supply. */
struct strp_callbacks {
	int (*parse_msg)(struct strparser *strp, struct sk_buff *skb);
	void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb);
	int (*read_sock_done)(struct strparser *strp, int err);
	void (*abort_parser)(struct strparser *strp, int err);
	void (*lock)(struct strparser *strp);
	void (*unlock)(struct strparser *strp);
};
parse_msg() 在 strparser 收到報文時被框架調用。它用於從報文中提取下一個應用層消息(message)的長度。一個 TCP 報文里可能不止一個應用層消息,而 parse_msg() 就是提供給使用者去識別各個消息的手段
strparser 截獲報文
正常情況下,內核 TCP 層處理報文后,會調用 sock->sk_data_ready(sk) , 它的默認動作是 wake up 一個用戶態進程.
void tcp_data_ready(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); // code omitted sk->sk_data_ready(sk); }
我們期望報文能進入 strparser,但報文顯然不會平白無故地進入 strparser,因此,我們需要在報文的上送路徑上動一些手腳:替換掉 sk->sk_data_ready 函數
static int tls_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len){ // code omitted tsk->saved_sk_data_ready = tsk->socket->sk->sk_data_ready; tsk->saved_sk_write_space = tsk->socket->sk->sk_write_space;sk_write_space tsk->socket->sk->sk_data_ready = tls_data_ready; tsk->socket->sk->sk_write_space = tls_write_space; tsk->socket->sk->sk_user_data = tsk; // code omitted }
在 psock 的例子中, sk_psock_strp_data_ready() 被賦值到 sk->sk_data_ready
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; // code omitted parser->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; parser->enabled = true; }
替換之后,當有 TCP 報文准備上送時,用戶定義的 sk->sk_data_ready 函數就會被調用。在該函數中,KTLS/psock 需要調用框架函數 strp_data_ready() 將報文轉交給 strparser 框架。
對 KTLS

/* KTLS data-ready callback: hand the socket's RX strparser to strp_data_ready(). */
static void tls_data_ready(struct sock *sk)
{
	struct tls_context *tls = tls_get_ctx(sk);
	struct tls_sw_context_rx *rx = tls_sw_ctx_rx(tls);

	strp_data_ready(&rx->strp);
}
對 psock
/*
 * psock data-ready callback: under rcu_read_lock(), look up the psock of the
 * socket and, if there is one, call strp_data_ready() on its parser while
 * holding sk_callback_lock for writing.
 */
static void sk_psock_strp_data_ready(struct sock *sk)
{
	struct sk_psock *ps;

	rcu_read_lock();
	ps = sk_psock(sk);
	if (unlikely(!ps))
		goto out;

	write_lock_bh(&sk->sk_callback_lock);
	strp_data_ready(&ps->parser.strp);
	write_unlock_bh(&sk->sk_callback_lock);
out:
	rcu_read_unlock();
}
strparser 處理報文
strparser 框架拿到報文之后,通常會依次調用用戶設置的 parse_msg 和 rcv_msg 回調函數,用戶在回調函數里決定報文應該何去何從
strp_data_ready
  |- strp_read_sock
    |- tcp_read_sock
      |- strp_recv
        |- __strp_recv
          |- strp->cb.parse_msg(strp, head)
          ...
          |- strp->cb.rcv_msg(strp, head);
比如對 KTLS, 就是將報文上送給應用層(AF_KTLS socket) static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_sock *tsk; // code omitted tsk = strp->sk->sk_user_data; // code omitted ret = sock_queue_rcv_skb((struct sock *)tsk, skb); // code omitted }
而對於 psock, 則是運行 eBPF 程序,得到動作(verdict)。
/*
 * sk_psock_strp_read - psock rcv_msg callback: run the verdict program on
 * the skb and apply the resulting verdict.
 * @strp: strparser that produced the message
 * @skb:  message to judge
 *
 * The verdict starts as __SK_DROP and stays that way when no skb_verdict
 * program is set on the psock.
 */
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
	struct sk_psock *psock = sk_psock_from_strp(strp);
	struct bpf_prog *prog;
	int ret = __SK_DROP;

	rcu_read_lock();
	prog = READ_ONCE(psock->progs.skb_verdict);
	if (likely(prog)) {
		skb_orphan(skb);
		tcp_skb_bpf_redirect_clear(skb);
		ret = sk_psock_bpf_run(psock, prog, skb);
		/* If the program redirected the skb, its return value is SK_PASS. */
		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
	}
	rcu_read_unlock();
	sk_psock_verdict_apply(psock, skb, ret);
}
strparser 這個框架並不限定如何處理報文,而只是在內核層面提供給用戶一個提前處理 TCP 報文的時機和一組回調函數,用戶通過不同的回調函數可以實現不同的邏輯。
https://switch-router.gitee.io/blog/strparser/