公告

Gentoo群:87709706,OS群:838664909

#1 2024-03-23 14:24:05

batsom
管理团队
注册时间: 2022-08-03
帖子: 594
个人网站

Gentoo 之 bpf 源码阅读

以下基于 linux kernel 4.20.12


首先看入口 kernel/bpf/syscall.c


 /*
  * bpf system call entry point.
  *
  * cmd   - which BPF operation to perform (one of the BPF_* cases below).
  * uattr - user-space pointer to the union bpf_attr describing the request.
  * size  - number of bytes user space supplied at uattr.
  *
  * Returns the result of the selected command handler, or a negative errno
  * if any of the up-front checks fail (-EPERM, -EFAULT, -EINVAL for an unknown
  * cmd, or whatever bpf_check_uarg_tail_zero()/security_bpf() report).
  */
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	/* Zero-initialised so bytes not supplied by user space read as 0. */
	union bpf_attr attr = {};
	int err;

	/* When the sysctl is set, only CAP_SYS_ADMIN callers may proceed. */
	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Validate the user buffer against the size of attr before copying. */
	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
	if (err)
		return err;
	/* Never copy more than the kernel's own union bpf_attr can hold. */
	size = min_t(u32, size, sizeof(attr));

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	/* Security hook; a negative result aborts the call. */
	err = security_bpf(cmd, &attr, size);
	if (err < 0)
		return err;

	/*
	 * Dispatch to the handler for the requested command. Handlers that are
	 * also passed uattr receive the original user pointer in addition to
	 * the kernel copy.
	 */
	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	case BPF_OBJ_PIN:
		err = bpf_obj_pin(&attr);
		break;
	case BPF_OBJ_GET:
		err = bpf_obj_get(&attr);
		break;
	case BPF_PROG_ATTACH:
		err = bpf_prog_attach(&attr);
		break;
	case BPF_PROG_DETACH:
		err = bpf_prog_detach(&attr);
		break;
	case BPF_PROG_QUERY:
		err = bpf_prog_query(&attr, uattr);
		break;
	case BPF_PROG_TEST_RUN:
		err = bpf_prog_test_run(&attr, uattr);
		break;
	case BPF_PROG_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr,
					  &prog_idr, &prog_idr_lock);
		break;
	case BPF_MAP_GET_NEXT_ID:
		err = bpf_obj_get_next_id(&attr, uattr,
					  &map_idr, &map_idr_lock);
		break;
	case BPF_PROG_GET_FD_BY_ID:
		err = bpf_prog_get_fd_by_id(&attr);
		break;
	case BPF_MAP_GET_FD_BY_ID:
		err = bpf_map_get_fd_by_id(&attr);
		break;
	case BPF_OBJ_GET_INFO_BY_FD:
		err = bpf_obj_get_info_by_fd(&attr, uattr);
		break;
	case BPF_RAW_TRACEPOINT_OPEN:
		err = bpf_raw_tracepoint_open(&attr);
		break;
	case BPF_BTF_LOAD:
		err = bpf_btf_load(&attr);
		break;
	case BPF_BTF_GET_FD_BY_ID:
		err = bpf_btf_get_fd_by_id(&attr);
		break;
	case BPF_TASK_FD_QUERY:
		err = bpf_task_fd_query(&attr, uattr);
		break;
	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
		err = map_lookup_and_delete_elem(&attr);
		break;
	default:
		/* Unknown command. */
		err = -EINVAL;
		break;
	}

	return err;
}

SYSCALL_DEFINE3:所有系统调用在内核中的入口都是 SYSCALL_DEFINEx,x 代表系统调用的参数个数。它其实是一个宏定义,可以阅读 include/linux/syscalls.h

 /*
  * SYSCALL_DEFINE0(sname): definition of a system call taking no arguments.
  * Emits the syscall metadata, a prototype for sys_<sname>(void), marks the
  * function for error injection, and then opens the function definition
  * itself (the body follows the macro invocation). Only defined here if no
  * earlier definition of SYSCALL_DEFINE0 exists.
  */
 #ifndef SYSCALL_DEFINE0
#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_METADATA(_##sname, 0);				\
	asmlinkage long sys_##sname(void);			\
	ALLOW_ERROR_INJECTION(sys_##sname, ERRNO);		\
	asmlinkage long sys_##sname(void)
#endif /* SYSCALL_DEFINE0 */

/*
 * SYSCALL_DEFINE1..6: thin wrappers that forward to SYSCALL_DEFINEx with the
 * argument count and the syscall name prefixed by an underscore.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

/* Largest argument count for which a SYSCALL_DEFINEn wrapper exists above. */
#define SYSCALL_DEFINE_MAXARGS	6

/*
 * SYSCALL_DEFINEx(x, sname, ...): common expansion for x-argument syscalls.
 * Emits the metadata first and then hands off to __SYSCALL_DEFINEx, which is
 * defined elsewhere and produces the actual function.
 */
#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

不断展开之后,其实就是

/* Simplified view of the prototype that SYSCALL_DEFINE3(bpf, ...) ultimately declares. */
asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); 

1. 检查权限

进入函数后,先检查权限

 if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
		return -EPERM;

CAP_SYS_ADMIN(编号为 21)允许执行系统管理任务,包括:加载/卸载文件系统、设置磁盘配额、开/关交换设备和文件等,具体可以参考 /usr/src/linux/include/uapi/linux/capability.h 文件(由 include/linux/capability.h 引入)

# kernel/bpf/syscall.c 中有以下定义
/* Flag tested on entry to the bpf syscall: when non-zero, callers lacking CAP_SYS_ADMIN are refused with -EPERM. */
int sysctl_unprivileged_bpf_disabled __read_mostly;  

经常被读取、但很少被修改的数据可以标记为 __read_mostly。带有该标记的变量会被链接器集中放到单独的 .data..read_mostly 段中,使它们不与频繁被写入的数据共用缓存行,从而减少多核之间的缓存行失效,提高整个系统的执行效率。另一方面,如果所在的体系结构没有为这个段做特殊处理,__read_mostly 会被定义为空,这些变量就按普通数据存放,并不会影响内核的加载和启动
2. 大小检测

如果我们处理了一个比我们所知的更大的结构,需要确保所有未知位都是0,即新的用户空间不依赖于任何我们不知道的内核特性扩展

在这个函数调用和下面的 copy_from_user() 调用之间存在一个 ToCToU(time-of-check-to-time-of-use)窗口。然而,这不是一个问题,因为这个函数只是为了面向未来的兼容性检查(future-proofing):确认内核尚不认识的那些尾部字节全部为 0

err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * meant to be a future-proofing of bits.
 *
 * @uaddr:         user-space address of the structure
 * @expected_size: size of the structure as known to the kernel
 * @actual_size:   size of the structure as supplied by user space
 *
 * Returns 0 if actual_size <= expected_size or every byte beyond
 * expected_size is zero; -E2BIG if actual_size exceeds PAGE_SIZE or a
 * non-zero tail byte is found; -EFAULT if access_ok() rejects the range; or
 * the error reported by get_user() while reading a tail byte.
 */
int bpf_check_uarg_tail_zero(void __user *uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	unsigned char __user *addr;
	unsigned char __user *end;
	unsigned char val;
	int err;

	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	/* The whole user range must be readable before any byte is inspected. */
	if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size)))
		return -EFAULT;

	/* Nothing extends past what the kernel knows about: no tail to check. */
	if (actual_size <= expected_size)
		return 0;

	/* Tail is the byte range [expected_size, actual_size) of the user buffer. */
	addr = uaddr + expected_size;
	end  = uaddr + actual_size;

	/* Read the tail one byte at a time; any non-zero byte is rejected. */
	for (; addr < end; addr++) {
		err = get_user(val, addr);
		if (err)
			return err;
		if (val)
			return -E2BIG;
	}

	return 0;
} 

3. 内存空间分配

/*
 * Handler for the BPF_PROG_LOAD command: validates the request in attr,
 * allocates a struct bpf_prog, copies the instructions in from user space,
 * runs the verifier, selects the runtime, and finally allocates an id and a
 * file descriptor for the program.
 *
 * Returns the value of bpf_prog_new_fd() on success, or a negative errno.
 * On failure the labels at the bottom undo, in reverse order, whatever had
 * been set up by the point of failure.
 */
static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* BPF_F_STRICT_ALIGNMENT is the only flag accepted here. */
	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
		return -EINVAL;

	/* copy eBPF program license from user space */
	// Copy the license string from the user-space address in attr->license into the kernel buffer
	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	/* Force termination in case the user string filled the buffer. */
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	// Record whether the supplied license is GPL compatible
	is_gpl = license_is_gpl_compatible(license);

	// Reject an empty program or one whose instruction count exceeds BPF_MAXINSNS
	if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
		return -E2BIG;

	// For BPF_PROG_TYPE_KPROBE programs the requested kernel version must match the running kernel; otherwise, because of kernel changes, the program could be attached to the wrong address
	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	// Loading any program type other than BPF_PROG_TYPE_SOCKET_FILTER and BPF_PROG_TYPE_CGROUP_SKB requires CAP_SYS_ADMIN
	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
	    type != BPF_PROG_TYPE_CGROUP_SKB &&
	    !capable(CAP_SYS_ADMIN))
		return -EPERM;

	bpf_prog_load_fixup_attach_type(attr);
	if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
		return -EINVAL;

	/* plain bpf_prog allocation */
	// Allocate the bpf_prog (and, per the original annotation, its bpf_prog->aux) sized for the instruction count
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->expected_attach_type = attr->expected_attach_type;

	/* Offload is requested whenever a non-zero ifindex was supplied. */
	prog->aux->offload_requested = !!attr->prog_ifindex;

	// security_bpf_prog_alloc -> bpf_prog_alloc_security: initialise the security field of the bpf program
	err = security_bpf_prog_alloc(prog->aux);
	if (err)
		goto free_prog_nouncharge;

	// Charge the bpf_prog memory against the memlock limit (per the original annotation: of the current process)
	err = bpf_prog_charge_memlock(prog);
	if (err)
		goto free_prog_sec;

	prog->len = attr->insn_cnt;

	/* Pre-set the error so the copy failure path can jump straight out. */
	err = -EFAULT;
	// Copy the bpf instructions from the user-space address attr->insns into the kernel at prog->insns
	if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
			   bpf_prog_insn_size(prog)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = 0;

	/* The loader holds the initial reference. */
	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl ? 1 : 0;

	if (bpf_prog_is_dev_bound(prog->aux)) {
		err = bpf_prog_offload_init(prog, attr);
		if (err)
			goto free_prog;
	}

	/* find program type: socket_filter vs tracing_filter */
	// Look up the bpf_prog_types entry for attr->prog_type and, per the original annotation, set bpf_prog->aux->ops to it; ops is a table of operations
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	prog->aux->load_time = ktime_get_boot_ns();

	// Copy the program name from attr; returns 0 on success, a negative value otherwise
	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
	if (err)
		goto free_prog;

	/* run eBPF verifier */
	// Run the verifier over the BPF program to check that it is valid
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;
 
	// Select the runtime for the program (per the original annotation: attempt JIT translation)
	prog = bpf_prog_select_runtime(prog, &err);
	if (err < 0)
		goto free_used_maps;

	// Allocate an id for the BPF program, between 1 and INT_MAX; this publishes the program to user space, which can then reference it through BPF_PROG_GET_FD_BY_ID
	err = bpf_prog_alloc_id(prog);
	if (err)
		goto free_used_maps;

	// Allocate a file descriptor for the BPF program
	err = bpf_prog_new_fd(prog);
	if (err < 0) {
		/* failed to allocate fd.
		 * bpf_prog_put() is needed because the above
		 * bpf_prog_alloc_id() has published the prog
		 * to the userspace and the userspace may
		 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
		 */
		bpf_prog_put(prog);
		return err;
	}

	bpf_prog_kallsyms_add(prog);
	/* On success err holds the value returned by bpf_prog_new_fd(). */
	return err;

	/* Error unwinding: each label falls through to the ones below it. */
free_used_maps:
	bpf_prog_kallsyms_del_subprogs(prog);
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_uncharge_memlock(prog);
free_prog_sec:
	security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
	bpf_prog_free(prog);
	return err;
} 

/*
 * In-kernel representation of a loaded BPF program: bookkeeping flags, the
 * program type, the entry point used to run it, and the instruction array
 * stored at the tail of the structure.
 */
struct bpf_prog {
	u16			pages;		      /* Number of allocated pages */
	u16			jited:1,	      /* Is our filter JIT'ed? */
				jit_requested:1,      /* archs need to JIT the prog */
				undo_set_mem:1,	      /* Passed set_memory_ro() checkpoint */
				gpl_compatible:1,     /* Is filter GPL compatible? */
				cb_access:1,	      /* Is control block accessed? */
				dst_needed:1,	      /* Do we need dst entry? */
				blinded:1,	      /* Was blinded */
				is_func:1,	      /* program is a bpf function */
				kprobe_override:1,    /* Do we override a kprobe? */
				has_callchain_buf:1;  /* callchain buffer allocated? */
	enum bpf_prog_type	type;		      /* Type of BPF program */ // type of this bpf program (kprobe/tracepoint/perf_event/sk_filter/sched_cls/sched_act/xdp/cg_skb)
	enum bpf_attach_type	expected_attach_type; /* For some prog types */
	u32			len;		      /* Number of filter blocks */ // number of bpf instructions in the program
	u32			jited_len;	      /* Size of jited insns in bytes */
	u8			tag[BPF_TAG_SIZE];
	struct bpf_prog_aux	*aux;		      /* Auxiliary fields */ // mainly data that assists the verifier with checking and transforming the program
	struct sock_fprog_kern	*orig_prog;	      /* Original BPF program */
	unsigned int		(*bpf_func)(const void *ctx, const struct bpf_insn *insn);
	                                              /* Instructions for interpreter */ // runtime entry point of the BPF program: if JIT translation succeeded this points at the JITed image, otherwise at the interpreter's generic entry __bpf_prog_run()
	union {
		struct sock_filter	insns[0];
		struct bpf_insn		insnsi[0];    // storage for the original BPF instructions copied in from user space
	};
};

离线

页脚