页次: 1
以下基于 linux kernel 4.20.12
首先看入口 kernel/bpf/syscall.c
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
int err;
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
if (err)
return err;
size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
err = security_bpf(cmd, &attr, size);
if (err < 0)
return err;
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
break;
case BPF_MAP_UPDATE_ELEM:
err = map_update_elem(&attr);
break;
case BPF_MAP_DELETE_ELEM:
err = map_delete_elem(&attr);
break;
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
break;
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
case BPF_PROG_ATTACH:
err = bpf_prog_attach(&attr);
break;
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
case BPF_PROG_QUERY:
err = bpf_prog_query(&attr, uattr);
break;
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
case BPF_PROG_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr,
&prog_idr, &prog_idr_lock);
break;
case BPF_MAP_GET_NEXT_ID:
err = bpf_obj_get_next_id(&attr, uattr,
&map_idr, &map_idr_lock);
break;
case BPF_PROG_GET_FD_BY_ID:
err = bpf_prog_get_fd_by_id(&attr);
break;
case BPF_MAP_GET_FD_BY_ID:
err = bpf_map_get_fd_by_id(&attr);
break;
case BPF_OBJ_GET_INFO_BY_FD:
err = bpf_obj_get_info_by_fd(&attr, uattr);
break;
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
err = bpf_btf_load(&attr);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
case BPF_TASK_FD_QUERY:
err = bpf_task_fd_query(&attr, uattr);
break;
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
default:
err = -EINVAL;
break;
}
return err;
}
SYSCALL_DEFINE3,所有系统调用再内核的入口都是 SYSCALL_DEFINEx,x 代表了系统调用的参数个数,其实这是一个宏定义,可以阅读 include/linux/syscall.h
#ifndef SYSCALL_DEFINE0
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long sys_##sname(void); \
ALLOW_ERROR_INJECTION(sys_##sname, ERRNO); \
asmlinkage long sys_##sname(void)
#endif /* SYSCALL_DEFINE0 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE_MAXARGS 6
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
不断展开之后,其实就是
asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
1. 检查权限
进入函数后,先检查权限
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
CAP_SYS_ADMIN 21 允许执行系统管理任务,包括:加载/卸载文件系统、设置磁盘配额、开/关交换设备和文件等,具体可以参考 /usr/src/linux/include/linux/capability.h 文件
# kernel/bpf/syscall.c 中有以下定义
int sysctl_unprivileged_bpf_disabled __read_mostly;
经常需要被读取的数据定义为 __read_mostly 类型,这样 Linux 内核被加载时,该数据将自动被存放到 Cache 中,以提高整个系统的执行效率。另一方面,如果所在的平台没有 Cache,或者虽然有 Cache,但并不提供存放数据的接口(也就是并不允许人工放置数据在Cache 中),这样定义为 __read_mostly 类型的数据将不能存放在Linux内核中,甚至也不能够被加载到系统内存去执行,将造成 Linux 内核启动失败
2. 大小检测
如果我们处理了一个比我们所知的更大的结构,需要确保所有未知位都是0,即新的用户空间不依赖于任何我们不知道的内核特性扩展
在这个函数调用和下面的 copy_from_user() 调用之间存在一个 ToCToU(time-of-check-to-time-of-use) 。然而,这不是一个问题,因为这个函数是为了将来对位进行校对
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
/*
* If we're handed a bigger struct than we know of, ensure all the unknown bits
* are 0 - i.e. new user-space does not rely on any kernel feature extensions
* we don't know about yet.
*
* There is a ToCToU between this function call and the following
* copy_from_user() call. However, this is not a concern since this function is
* meant to be a future-proofing of bits.
*/
int bpf_check_uarg_tail_zero(void __user *uaddr,
size_t expected_size,
size_t actual_size)
{
unsigned char __user *addr;
unsigned char __user *end;
unsigned char val;
int err;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
return -E2BIG;
if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
return -EFAULT;
if (actual_size <= expected_size)
return 0;
addr = uaddr + expected_size;
end = uaddr + actual_size;
for (; addr < end; addr++) {
err = get_user(val, addr);
if (err)
return err;
if (val)
return -E2BIG;
}
return 0;
}
3 内存空间分配
static int bpf_prog_load(union bpf_attr *attr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
int err;
char license[128];
bool is_gpl;
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
return -EINVAL;
/* copy eBPF program license from user space */
// 根据 attr->license 地址,从用户空间拷贝 license 字符串到内核
if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;
/* eBPF programs must be GPL compatible to use GPL-ed functions */
// ebpf 程序必须符合 GPL 协议
is_gpl = license_is_gpl_compatible(license);
// 判断BPF的总指令数是否超过 BPF_MAXINSNS(4k)
if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
return -E2BIG;
// 如果加载 BPF_PROG_TYPE_KPROBE 类型的 BPF 程序,指定的内核版本需要和当前内核版本匹配。不然由于内核的改动,可能会附加到错误的地址上
if (type == BPF_PROG_TYPE_KPROBE &&
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
// 对 BPF_PROG_TYPE_SOCKET_FILTER 和 BPF_PROG_TYPE_CGROUP_SKB 以外的 BPF 程序加载,需要管理员权限
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
!capable(CAP_SYS_ADMIN))
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
return -EINVAL;
/* plain bpf_prog allocation */
// 根据 BPF 指令数分配 bpf_prog 空间,和 bpf_prog->aux 空间
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
prog->expected_attach_type = attr->expected_attach_type;
prog->aux->offload_requested = !!attr->prog_ifindex;
// security_bpf_prog_alloc -> bpf_prog_alloc_security: 初始化 bpf 程序中的安全字段
err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
// 把整个 bpf_prog 空间在当前进程的 memlock_limit 中锁定
err = bpf_prog_charge_memlock(prog);
if (err)
goto free_prog_sec;
prog->len = attr->insn_cnt;
err = -EFAULT;
// 把 bpf 代码从用户空间地址 attr->insns,拷贝到内核空间地址 prog->insns
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
bpf_prog_insn_size(prog)) != 0)
goto free_prog;
prog->orig_prog = NULL;
prog->jited = 0;
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
if (bpf_prog_is_dev_bound(prog->aux)) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog;
}
/* find program type: socket_filter vs tracing_filter */
// 根据 attr->prog_type 指定的 type 值,找到对应的bpf_prog_types,给 bpf_prog->aux->ops 赋值,这个 ops 是一个函数操作集
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
prog->aux->load_time = ktime_get_boot_ns();
// 复制获得目标名,成功返回0,否则 <0
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
if (err)
goto free_prog;
/* run eBPF verifier */
// 使用 verifer 对 BPF 程序进行合法性扫描
err = bpf_check(&prog, attr);
if (err < 0)
goto free_used_maps;
// 尝试对 bpf 程序进行 JIT 转换
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
// 给 BPF 程序分配一个 id,在 1 到 INT_MAX 之间,把 BPF 程序发送给用户空间,并且用户空间可以通过 BPF_PROG_GET_FD_BY_ID 引用
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
// 给 BPF 程序分配一个文件句柄 fd
err = bpf_prog_new_fd(prog);
if (err < 0) {
/* failed to allocate fd.
* bpf_prog_put() is needed because the above
* bpf_prog_alloc_id() has published the prog
* to the userspace and the userspace may
* have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
*/
bpf_prog_put(prog);
return err;
}
bpf_prog_kallsyms_add(prog);
return err;
free_used_maps:
bpf_prog_kallsyms_del_subprogs(prog);
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_sec:
security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1, /* archs need to JIT the prog */
undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1; /* callchain buffer allocated? */
enum bpf_prog_type type; /* Type of BPF program */ // 当前bpf程序的类型(kprobe/tracepoint/perf_event/sk_filter/sched_cls/sched_act/xdp/cg_skb)
enum bpf_attach_type expected_attach_type; /* For some prog types */ // 程序包含 bpf 指令的数量
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_aux *aux; /* Auxiliary fields */ // 主要用来辅助 verifier 校验和转换的数据
struct sock_fprog_kern *orig_prog; /* Original BPF program */
unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn);
/* Instructions for interpreter */ // 运行时BPF程序的入口。如果JIT转换成功,这里指向的就是BPF程序JIT转换后的映像;否则这里指向内核解析器(interpreter)的通用入口__bpf_prog_run()
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0]; // 从用户态拷贝过来的,BPF程序原始指令的存放空间
};
};
离线
页次: 1