kprobes
brief history
- Linux v2.6.9 introduced
kprobes into its mainline, but only for i386 - v2.6.10 begins to add kprobes support for
x86_64 - Linux v2.6.12 allows multiple kprobes at the same address
- v2.6.13 starts to support
kretprobe - v2.6.14 groups kprobes related functions to section
.kprobes.text, so that kprobes to these codes will be rejected - v2.6.15 updates: using
percpu infra to manage some internal variables; using RCU(Read-Copy-Update) list to manage kprobes; more checks on kprobes register, including kernel module address check - v2.6.16 updates: more kprobes rejects on kernel functions; refactor kprobes register functions to check whether we are probing a kernel module
- v2.6.17
i386 updates its kretprobe_trampoline_holder; updates kprobe_fault_handler - v2.6.18 kprobes registers for
page fault notifications when there is an active probe registered; tcpprobe module to probe tcp_sendmsg - v2.6.19 add
symbol_name and offset fields to struct kprobe so you can directly use a function name to register kprobes; dccp_probe module to probe dccp_sendmsg - v2.6.21 add basic
debugfs support for kprobes, /sys/kernel/debug/kprobes/list lists all registered probes on the system - v2.6.22 more on
debugfs, add /sys/kernel/debug/kprobes/enabled to globally turn registered kprobes on/off and the default value is on - v2.6.24 add basic
kretprobe blacklist support - v2.6.25 kretprobe supports optional user-specified
entry_handler which runs on function entry and also supports private data which can pass data between entry_handler and ret_handler - v2.6.26 add basic
kprobe blacklist support; batch registration/unregistration of a group of probes interfaces - v2.6.29 add
module notifier call back, which will check kprobes on the module; add flags field to struct kprobe to mark kprobe gone KPROBE_FLAG_GONE and remove its instruction buffer - v2.6.30 enable/disable probes interfaces, if kp->flags is set
KPROBE_FLAG_DISABLED, that kp will be registered but disabled, so, its handlers aren’t hit until calling enable_kprobe(kp) - v2.6.33 updates kprobe blacklist; check whether kprobe re-registered
- v2.6.34 kprobe optimization
KPROBE_FLAG_OPTIMIZED for i386 and x86_64; kprobes sysctl interface /proc/sys/debug/kprobes-optimization to control kprobe optimization - v2.6.39 do not optimize in the entry code due to the unstable stack handling
initialization
- when Linux is starting, it will call
init_kprobes in its process of initialization
struct notifier_block *i386die_chain;
int register_die_notifier(struct notifier_block *nb)
{
// ...
err = notifier_chain_register(&i386die_chain, nb);
// ...
return err;
}
/* Die-notifier block that routes int3/debug/fault events into kprobes;
 * registered on the i386 die chain at boot by init_kprobes. */
static struct notifier_block kprobe_exceptions_nb = {
	.notifier_call = kprobe_exceptions_notify,
	.priority = 0x7fffffff /* we need to be notified first */
};
/* Boot-time kprobes initialization: set up the kprobe/kretprobe hash
 * tables, do arch-specific setup (which registers the kretprobe
 * trampoline probe), then hook kprobes into the die-notifier chain so
 * int3/fault events reach kprobe_exceptions_notify. */
static int __init init_kprobes(void)
{
	int i, err = 0;
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		INIT_HLIST_HEAD(&kprobe_table[i]); // for kprobes
		INIT_HLIST_HEAD(&kretprobe_inst_table[i]); // for kretprobes
	}
	err = arch_init_kprobes(); // here to register a kprobe for trampoline
	if (!err)
		// register kprobe_exceptions_nb to i386die_chain
		err = register_die_notifier(&kprobe_exceptions_nb);
	return err;
}
__initcall(init_kprobes);
how does kprobe work
/* Attach to insert probes on any functions which should be ignored */
#define __kprobes __attribute__((__section__(".kprobes.text")))
void __kprobes arch_copy_kprobe(struct kprobe *p)
{
memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
p->opcode = *p->addr;
}
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
*p->addr = BREAKPOINT_INSTRUCTION;
// ...
}
static int __kprobes in_kprobes_functions(unsigned long addr)
{
if (addr >= (unsigned long)__kprobes_text_start
&& addr < (unsigned long)__kprobes_text_end)
return -EINVAL;
return 0;
}
- call
register_kprobe to register a kprobe we want to probe
int register_kprobe(struct kprobe *p)
{
// ...
if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
return ret;
// ...
// NOTE: add new kprobe to corresponding hash table slot
hlist_add_head(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
arch_arm_kprobe(p);
// ...
}
- when execution comes to the probing address, an
int3 happens and do_int3 will be called; it will then notify our registered kprobes, and from there, pre_handler/break_handler inside kprobe will be called
ENTRY(int3)
// ...
call do_int3 // here to call do_int3
// ...
#ifdef CONFIG_KPROBES
asmlinkage int do_int3(struct pt_regs *regs, long error_code)
{
// here to notify registered kprobes
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return 1;
// back to normal ...
}
#endif
static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
{
struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
// here kprobe_exceptions_notify inside kprobe_exceptions_nb will be called
return notifier_call_chain(&i386die_chain, val, &args);
}
int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
void *data)
{
struct die_args *args = (struct die_args *)data;
switch (val) {
case DIE_INT3:
// p->break_handler & p->pre_handler
if (kprobe_handler(args->regs)) // handler cb inside
return NOTIFY_STOP;
break;
// ... p->post_handler
// ... p->fault_handler
}
return NOTIFY_DONE;
}
static inline int kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
u8 *addr = (u8 *) (regs->eip - 1);
// ...
p = get_kprobe(addr);
// ...
if (p->pre_handler(p, regs)) { // NOTE: where magic happened
/* handler has already set things up, so skip ss setup */
return 1;
}
// ...
}
how does kretprobe work
- in order to support
kretprobe, at boot time, init_kprobes will first register a kprobe at the trampoline, which is an arbitrary piece of code – typically just a nop instruction
void kretprobe_trampoline_holder(void)
{
asm volatile ( ".global kretprobe_trampoline\n"
"kretprobe_trampoline: \n"
"nop\n");
}
static struct kprobe trampoline_p = {
.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
.pre_handler = trampoline_probe_handler
};
int __init arch_init_kprobes(void)
{
return register_kprobe(&trampoline_p);
}
- when you call
register_kretprobe, kprobes establishes a kprobe at the entry to the function and when the probed function is called and this probe is hit, kprobes saves a copy of the return address, and replaces the return address with the address of a “trampoline.”
void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
{
unsigned long *sara = (unsigned long *)®s->esp;
struct kretprobe_instance *ri;
if ((ri = get_free_rp_inst(rp)) != NULL) {
ri->rp = rp;
ri->task = current;
ri->ret_addr = (kprobe_opcode_t *) *sara; // important here! save original fn return addr
// replace the return addr with trampoline addr
*sara = (unsigned long) &kretprobe_trampoline;
// ...
}
// ...
}
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
/*TODO: consider to only swap the RA after the last pre_handler fired */
arch_prepare_kretprobe(rp, regs);
return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
// ...
rp->kp.pre_handler = pre_handler_kretprobe;
/* Establish function entry probe point */
if ((ret = register_kprobe(&rp->kp)) != 0) {
}
// ...
}
- so that when the function returns, control passes to the trampoline, which has already been registered as a kprobe in
init_kprobes, and that probe is hit, so its pre_handler trampoline_probe_handler will be called - after calling the user-specified handler associated with the kretprobe, the original function return address will be restored, and execution will be resumed
int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
// ...
head = kretprobe_inst_table_head(current);
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
if (ri->task != current)
continue;
if (ri->rp && ri->rp->handler)
ri->rp->handler(ri, regs); // called user-specified handler
orig_ret_address = (unsigned long)ri->ret_addr;
// ...
}
regs->eip = orig_ret_address; // restore the original return addr
// ...
}
debugfs interface for kprobes
- to list all registered probes on the system
cat /sys/kernel/debug/kprobes/list
- to globally turn registered kprobes on or off
cat /sys/kernel/debug/kprobes/enabled
# turn on
echo 1 > /sys/kernel/debug/kprobes/enabled
# turn off
echo 0 > /sys/kernel/debug/kprobes/enabled
kprobes sysctl interface
- check kprobes optimization status
cat /proc/sys/debug/kprobes-optimization
- turn on or off kprobes optimization
# turn off
echo 0 > /proc/sys/debug/kprobes-optimization
# turn on
echo 1 > /proc/sys/debug/kprobes-optimization
uprobes
brief history
- Linux v3.5 introduced
uprobe into its mainline
tracepoints
- NOTE: tracepoints implementation based on Linux v2.6.28 codes reading, may update it after reading new version of it
brief history
- Linux v2.6.28 introduced
tracepoint into its mainline - v2.6.29 splits original
DEFINE_TRACE into DECLARE_TRACE and DEFINE_TRACE; if the tracepoint has to be used in kernel modules, an EXPORT_TRACEPOINT_SYMBOL_GPL or EXPORT_TRACEPOINT_SYMBOL can be used to export the defined tracepoints - v2.6.30 the macro
TRACE_EVENT was introduced, which is far more powerful because it automates the “boilerplate” code needed to bridge the gap between a kernel function and a monitoring tool like perf or ftrace - v2.6.31 all
predefined tracepoint events were grouped under include/trace/events by organizing events into subsystems, and introduced /sys/kernel/debug/tracing/events interface, giving each event its own directory with enable file - v2.6.32 add
regfunc and unregfunc fields to struct tracepoint for more flexible functionality - v2.6.33 more macros
DECLARE_EVENT_CLASS DEFINE_EVENT DEFINE_EVENT_PRINT to facilitate tracepoint usage for other tools - v2.6.35 update
struct tracepoint and change related macros internal - v2.6.37 using
JUMP_LABEL macro to test whether a tracepoint is enabled or not, and its related stuff - v2.6.38 using
__tracepoints_ptrs section for iteration on the tracepoints; macro DECLARE_TRACE_CONDITION - v3.0 performance optimization using asm
goto __jump_table
how does tracepoint work
- using
DEFINE_TRACE, TPPROTO and TPARGS to define a tracepoint - the tracepoint name will be put in
__tracepoints_strings section, and the tracepoint itself will be put in __tracepoints section
DEFINE_TRACE(sched_wakeup_new,
TPPROTO(struct rq *rq, struct task_struct *p),
TPARGS(rq, p));
# the above example will expand to the following codes
# this is the tracepoint function called in other important places
/* Expansion of DEFINE_TRACE(sched_wakeup_new, ...): the generated inline
 * trace function. The name string is placed in the __tracepoints_strings
 * section and the tracepoint object itself in the __tracepoints section. */
static inline void trace_sched_wakeup_new(struct rq *rq,
		struct task_struct *p) {
	static const char __tpstrtab_sched_wakeup_new[]
	__attribute__((section("__tracepoints_strings"))) =
		"sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)";
	static struct tracepoint __tracepoint_sched_wakeup_new
	__attribute__((section("__tracepoints"), aligned(8))) = {
		__tpstrtab_sched_wakeup_new,
		0,
		NULL
	};
	// if the tracepoint is not activated (state == 0), just skip it
	if (unlikely(__tracepoint_sched_wakeup_new.state))
		do {
			void **it_func;
			rcu_read_lock_sched();
			it_func = rcu_dereference((&__tracepoint_sched_wakeup_new)->funcs);
			if (it_func) {
				do {
					// call tracepoint probe funcs here!!
					((void (*)(struct rq * rq, struct task_struct * p))(*it_func))(rq, p);
				} while (*(++it_func));
			}
			rcu_read_unlock_sched();
		} while (0);
}
- when you want to register a probe function to some tracepoint, just define the probe function according to the signature of proto of tracepoint definition, then call the generated register function, which will then call
tracepoint_probe_register to connect a probe to a tracepoint
static void set_tracepoint(struct tracepoint_entry **entry,
struct tracepoint *elem, int active)
{
// ...
// assign funcs in tracepoint entry to tracepoint
rcu_assign_pointer(elem->funcs, (*entry)->funcs);
elem->state = active; // then activate the tracepoint
}
void tracepoint_update_probe_range(struct tracepoint *begin,
struct tracepoint *end)
{
// ...
for (iter = begin; iter < end; iter++) {
mark_entry = get_tracepoint(iter->name);
if (mark_entry) {
set_tracepoint(&mark_entry, iter,
!!mark_entry->refcount);
} // ...
}
// ...
}
static void tracepoint_update_probes(void)
{
/* Core kernel tracepoints */
tracepoint_update_probe_range(__start___tracepoints,
__stop___tracepoints);
/* tracepoints in modules. */
module_update_tracepoints();
}
int tracepoint_probe_register(const char *name, void *probe)
{
// ...
// add probe to corresponding tracepoint entry
old = tracepoint_entry_add_probe(entry, probe);
// ...
// move probe funcs in tracepoint entry to corresponding tracepoint
tracepoint_update_probes();
// ...
}
static inline int register_trace_sched_wakeup_new(void (*probe)(struct rq *rq, struct task_struct *p)) {
return tracepoint_probe_register("sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)", (void *)probe);
}
struct tracepoint {
const char *name;
int state;
void **funcs; // all registered probe functions for this tracepoint
} __attribute__((aligned(8)));
- tracepoint actual calling example
# sys_fork will call do_fork, and it will call wake_up_new_task
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
// ...
trace_sched_wakeup_new(rq, p); // calling our above defined tracepoint here
// ...
}
tracepoints in modules
- when you run insmod or modprobe, the user-space utility eventually calls
sys_init_module to pass the binary blob (the .ko file) to the kernel
struct module
{
// ...
#ifdef CONFIG_TRACEPOINTS
struct tracepoint *tracepoints;
unsigned int num_tracepoints;
#endif
// ...
};
static noinline struct module *load_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
// ...
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
"__tracepoints",
sizeof(*mod->tracepoints),
&mod->num_tracepoints);
#endif
// ...
if (!mod->taints) {
// ...
#ifdef CONFIG_TRACEPOINTS
tracepoint_update_probe_range(mod->tracepoints,
mod->tracepoints + mod->num_tracepoints);
#endif
}
// ...
}
asmlinkage long
sys_init_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
// ...
mod = load_module(umod, len, uargs);
// ...
}
/* Re-sync tracepoint probe callbacks for tracepoints living in loaded
 * (untainted) modules.
 * BUGFIX: the return type was transcribed as "oid"; it must be "void". */
void module_update_tracepoints(void)
{
	// ...
	// walk every loaded module and refresh its tracepoint range
	list_for_each_entry(mod, &modules, list)
		if (!mod->taints)
			tracepoint_update_probe_range(mod->tracepoints,
				mod->tracepoints + mod->num_tracepoints);
	// ...
}
debugfs interface for tracepoints
- we can easily enable tracepoints by using the
event tracing framework
# to enable event 'sched_wakeup'
echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
# to disable it
echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
# to enable all events in sched subsystem
echo 1 > /sys/kernel/debug/tracing/events/sched/enable
# to enable all events
echo 1 > /sys/kernel/debug/tracing/events/enable
perf subsystem
- NOTE: perf subsystem implementation based on Linux v2.6.32 codes reading, may update it after reading new version of it
brief history
- Linux v2.6.31 first introduced
Performance Counters for Linux(PCL) into its mainline for just hardware counters, so the syscall is sys_perf_counter_open - but Linux v2.6.32 changed the syscall to
sys_perf_event_open, and expanded the framework to do more than just hw counters
how does perf event work
- the following calling procedure only means calling timeline, not caller-callee callgraph, so that we can see how perf event subsystem is inited: start_kernel –> sched_init –>
perf_event_init –> check_bugs –> identify_boot_cpu –> init_hw_perf_events –> intel_pmu_init/amd_pmu_init –> perf_events_lapic_init –> rest_init –> kernel_init –> do_basic_setup –> do_initcalls –> perf_event_sysfs_init
eBPF
- NOTE: eBPF implementation based on Linux v4.0 codes reading, may update it after reading new version of it
brief history
- v3.17 first introduced
kernel/bpf directory into linux kernel, mainly for socket filter - v3.18 introduced
bpf syscall, bpf verifier and basic bpf map - v3.19 introduced
BPF_MAP_TYPE_ARRAY bpf array and BPF_MAP_TYPE_HASH bpf hash map; basic version of bpf helpers; BPF_PROG_TYPE_SOCKET_FILTER socket filter macro - v4.1 introduced more prog types, especially
BPF_PROG_TYPE_KPROBE allowing bpf program to attach to kprobes; more bpf helper functions like bpf_get_prandom_u32 and bpf_get_smp_processor_id etc - v4.2 introduced
tail calls to allow chaining multiple eBPF programs together, effectively extending the overall execution beyond the single-program instruction limit; new map BPF_MAP_TYPE_PROG_ARRAY type to support tail calls; several support helper functions, including bpf_tail_call, bpf_get_current_pid_tgid, bpf_get_current_uid_gid, bpf_get_current_comm etc - v4.3 new map
BPF_MAP_TYPE_PERF_EVENT_ARRAY type and with perf_event_read function to do perf event monitoring; more helper functions like bpf_get_cgroup_classid skb_[gs]et_tunnel_key etc; first introduced libbpf in tools/lib/bpf - v4.4 mount and register
/sys/fs/bpf/ filesystem for new cmd BPF_OBJ_PIN and BPF_OBJ_GET; /proc/sys/kernel/unprivileged_bpf_disabled to control whether users without CAP_SYS_ADMIN privilege can use the bpf syscall; bpf_perf_event_output bpf_get_route_realm and bpf_redirect functions - v4.5 enhance
/sys/fs/bpf/ filesystem to support link and rename; bpf_skb_load_bytes to net/core/filter.c - v4.6 new map types
BPF_MAP_TYPE_PERCPU_ARRAY BPF_MAP_TYPE_PERCPU_HASH BPF_MAP_TYPE_STACK_TRACE; helper functions like bpf_skb_[gs]et_tunnel_opt, bpf_get_stackid etc, after using map BPF_MAP_TYPE_STACK_TRACE and bpf_get_stackid to get stack traces, you can use /proc/kallsyms to translate them into understandable function names - v4.7 new program type
BPF_PROG_TYPE_TRACEPOINT; sysctl interfaces /proc/sys/net/core/bpf_jit_enable and /proc/sys/net/core/bpf_jit_harden; helper functions bpf_event_output, bpf_get_stackid_tp - v4.8 new program type
BPF_PROG_TYPE_XDP; new map type BPF_MAP_TYPE_CGROUP_ARRAY; more helper functions bpf_skb_change_proto, bpf_skb_change_type, bpf_skb_under_cgroup, bpf_get_hash_recalc, bpf_get_current_task, bpf_probe_write_user - v4.9 new program type
BPF_PROG_TYPE_PERF_EVENT
bpf program loading
- calling
bpf system call with BPF_PROG_LOAD cmd arg, bpf program will be copied into kernel and run through eBPF verifier by function bpf_check. After passing verification, bpf_prog_select_runtime will be used to select interpreter running function __bpf_prog_run or jited BPF instructions to native codes
/* The eBPF interpreter main loop (excerpt): dispatches each BPF
 * instruction; shown here are the helper-call and tail-call opcodes. */
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
	// ...
	/* CALL */
	JMP_CALL:
		/* Function call scratches BPF_R1-BPF_R5 registers,
		 * preserves BPF_R6-BPF_R9, and stores return value
		 * into BPF_R0.
		 */
		BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
						BPF_R4, BPF_R5); // NOTE: where the magic happens
		CONT;
	// ...
	// how tail calls are implemented (code from v4.2)
	JMP_TAIL_CALL: {
		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
		struct bpf_array *array = container_of(map, struct bpf_array, map);
		struct bpf_prog *prog;
		u64 index = BPF_R3;
		// ...
		prog = READ_ONCE(array->prog[index]); // get new prog here
		// ...
		ARG1 = BPF_R1;
		insn = prog->insnsi; // replace insn with the new prog's instructions
		goto select_insn; // next run, we will execute insn of the new program
	out:
		CONT;
	}
	// ...
}
void bpf_int_jit_compile(struct bpf_prog *prog)
{
// ...
if (image) {
// ... after jiting related codes
prog->bpf_func = (void *)image; // here we replace the interpreter func to jited codes
prog->jited = true;
// ...
}
// ...
}
void bpf_prog_select_runtime(struct bpf_prog *fp)
{
fp->bpf_func = (void *) __bpf_prog_run; // NOTE: save interpreter running function
/* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp); // here fp->bpf_func is replaced with jited
// codes if jit is working
// ...
}
bpf program running
- two macros
BPF_PROG_RUN and SK_RUN_FILTER are used to run bpf programs
#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi)
/* tc (traffic control) action hook: runs the attached BPF filter on the
 * skb under tcf_lock and maps the filter result to a tc action. */
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
		struct tcf_result *res)
{
	struct tcf_bpf *b = a->priv;
	int action, filter_res;
	spin_lock(&b->tcf_lock);
	// ...
	filter_res = BPF_PROG_RUN(b->filter, skb); // invoke the eBPF program
	// ...
	spin_unlock(&b->tcf_lock);
	return action;
} /* BUGFIX: closing brace was missing in the transcribed excerpt */
/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) \
(*filter->prog->bpf_func)(ctx, filter->prog->insnsi)
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
int err;
struct sk_filter *filter;
// ...
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
// as we can see here, how eBPF program works
unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
rcu_read_unlock();
return err;
}
bpf program with kprobes
prerequisite knowledge
- when
start_kernel executes, it will call perf_event_init, in which perf_tp_register will be called to register the corresponding pmu perf_tracepoint
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,
.event_init = perf_tp_event_init, // tracepoint related event init func
// ...
};
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
// ...
pmu->name = name;
// ...
pmu->type = type;
// ...
list_add_rcu(&pmu->entry, &pmus);
// ...
}
static inline void perf_tp_register(void)
{
// register perf_tracepoint pmu to pmus
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
void __init perf_event_init(void)
{
// ...
perf_tp_register();
// ...
}
init_kprobe_trace will be called after perf_event_init is done, inside rest_init; it registers the probes_write write handler, which is triggered when we echo to /sys/kernel/debug/tracing/kprobe_events
/* File operations for /sys/kernel/debug/tracing/kprobe_events; writes to
 * the file are parsed into new kprobe trace events via probes_write. */
static const struct file_operations kprobe_events_ops = {
	// ...
	.write = probes_write,
}; /* BUGFIX: terminating ';' was missing in the transcribed excerpt */
static __init int init_kprobe_trace(void)
{
// ...
entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
NULL, &kprobe_events_ops);
// ...
}
fs_initcall(init_kprobe_trace);
how does bpf program connect to kprobes
- when you write SEC(“kprobe/xxxx”), you aren’t writing a standard C function call. when you compile your code using
clang -target bpf, the compiler looks at that SEC() macro and creates a custom section named kprobe/xxxx
#define SEC(NAME) __attribute__((section(NAME), used))
SEC("kprobe/xxxx")
int bpf_prog(struct pt_regs *ctx)
{
struct sk_buff *skb;
skb = (struct sk_buff *) ctx->di;
// ... get other related stuff this function need from ctx
}
- the loader (like the following
load_bpf_file) loads the compiled .o file and scans all the section headers, when it sees the prefix kprobe/, it realizes this is a bpf kprobe program and load it
int load_bpf_file(char *path)
{
// ...
fd = open(path, O_RDONLY, 0);
// ...
for (i = 1; i < ehdr.e_shnum; i++) {
if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "socket", 6) == 0)
load_and_attach(shname_prog, insns, data_prog->d_size);
}
// ...
}
- it parses the rest of the string xxxx as the target function, using
tracefs to echo to /sys/kernel/debug/tracing/kprobe_events to first create the kprobe and it will triger the corresponding .write handler probes_write registered when kprobe_events file created, through this, it will finally register the kprobe
static struct trace_kprobe *alloc_trace_kprobe(const char *group,
const char *event, void *addr,
const char *symbol, unsigned long offs,
int nargs, bool is_return)
{
// ...
if (is_return)
tk->rp.handler = kretprobe_dispatcher;
else
tk->rp.kp.pre_handler = kprobe_dispatcher; // when the kprobe hit, it will call this kprobe_dispatcher
// ...
}
static int register_kprobe_event(struct trace_kprobe *tk)
{
struct ftrace_event_call *call = &tk->tp.call;
// ...
call->flags = TRACE_EVENT_FL_KPROBE;
call->class->reg = kprobe_register;
call->data = tk;
// ...
}
static int __register_trace_kprobe(struct trace_kprobe *tk)
{
// ...
if (trace_kprobe_is_return(tk))
ret = register_kretprobe(&tk->rp);
else
ret = register_kprobe(&tk->rp.kp); // finally register the kprobe here
// ...
}
static int register_trace_kprobe(struct trace_kprobe *tk)
{
// ...
ret = register_kprobe_event(tk);
// ...
ret = __register_trace_kprobe(tk);
// ...
}
static int create_trace_kprobe(int argc, char **argv)
{
// ... parse arguments echo to kprobe_events file etc
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
is_return);
// ...
ret = register_trace_kprobe(tk); // here to register the kprobe
// ...
}
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
return traceprobe_probes_write(file, buffer, count, ppos,
create_trace_kprobe); // finally calls the create_trace_kprobe
}
- after this step, the kprobe has been registered, but the bpf program hasn’t been attached yet,
bpf_prog_load first load and verify it
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
bool is_socket = strncmp(event, "socket", 6) == 0;
bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
enum bpf_prog_type prog_type;
// ...
struct perf_event_attr attr = {};
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
if (is_socket) {
prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
} else if (is_kprobe || is_kretprobe) {
prog_type = BPF_PROG_TYPE_KPROBE; // kprobe bpf type program
} // ...
if (is_kprobe || is_kretprobe) {
if (is_kprobe)
event += 7;
else
event += 10;
snprintf(buf, sizeof(buf),
"echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
is_kprobe ? 'p' : 'r', event, event); // here echo to kprobe_events file
err = system(buf);
// ...
fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
}
// ...
strcpy(buf, DEBUGFS);
strcat(buf, "events/kprobes/");
strcat(buf, event);
strcat(buf, "/id");
efd = open(buf, O_RDONLY, 0);
// ...
err = read(efd, buf, sizeof(buf));
// ...
buf[err] = 0;
id = atoi(buf);
attr.config = id; // continue to config struct perf_event_attr
efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
// ...
ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
return 0;
}
- calling
perf_event_open syscall to create and sets up the perf_event structure, the event is usually created in a DISABLED state, it currently only knows how to send data to the standard perf ring buffer, but not a BPF program
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
{
// ...
tk->tp.flags |= TP_FLAG_PROFILE;
if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
if (trace_kprobe_is_return(tk))
ret = enable_kretprobe(&tk->rp);
else
ret = enable_kprobe(&tk->rp.kp); // here to finally enable the kprobe
}
// ...
}
static int kprobe_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
struct ftrace_event_file *file = data;
switch (type) {
// ...
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
return enable_trace_kprobe(tk, NULL);
// ...
#endif
}
// ...
}
static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
// ...
// kprobe_register assigned in register_kprobe_event
ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
// ...
}
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
// ...
ret = perf_trace_event_reg(tp_event, p_event);
// ...
}
int perf_trace_init(struct perf_event *p_event)
{
struct ftrace_event_call *tp_event;
u64 event_id = p_event->attr.config;
// ...
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
try_module_get(tp_event->mod)) {
ret = perf_trace_event_init(tp_event, p_event);
// ...
}
}
// ...
}
static int perf_tp_event_init(struct perf_event *event)
{
int err;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
// ...
err = perf_trace_init(event);
// ...
event->destroy = tp_perf_event_destroy;
return 0;
}
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
// ...
event->pmu = pmu;
// here finally perf_tp_event_init in struct pmu perf_tracepoint is called
ret = pmu->event_init(event);
// ...
}
struct pmu *perf_init_event(struct perf_event *event)
{
// ...
list_for_each_entry_rcu(pmu, &pmus, entry) {
ret = perf_try_init_event(pmu, event);
// ...
}
// ...
}
static inline void perf_event__state_init(struct perf_event *event)
{
event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
PERF_EVENT_STATE_INACTIVE;
}
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
struct perf_event *group_leader,
struct perf_event *parent_event,
perf_overflow_handler_t overflow_handler,
void *context, int cgroup_fd)
{
// ...
perf_event__state_init(event); // the event is not enabled yet
// ...
pmu = perf_init_event(event);
// ...
}
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
// ...
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL, cgroup_fd);
// ...
event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
f_flags); // here create the event_file and register perf_fops for it
// ...
}
- in
perf_event_open syscall, it will create an event_file by calling anon_inode_getfile and register perf_fops for it, which is important to attach bpf to kprobe and finally enable it
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct perf_event *event = file->private_data;
// ...
ret = _perf_ioctl(event, cmd, arg);
// ...
}
static const struct file_operations perf_fops = {
// ...
.unlocked_ioctl = perf_ioctl,
// ...
};
- using
PERF_EVENT_IOC_ENABLE to activate the perf event, and using PERF_EVENT_IOC_SET_BPF to attach bpf program to kprobe
// ... copied from above
ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
// ...
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
struct bpf_prog *prog;
// ...
prog = bpf_prog_get(prog_fd); // loads the ebpf program
// ...
event->tp_event->prog = prog; // here finally attaches the ebpf program
return 0;
}
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable; // later calls func, event->state = PERF_EVENT_STATE_ACTIVE;
break;
// ...
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
// ...
}
}
- when the kprobe is hit,
kprobe_dispatcher will be called, and finally to call the attached bpf program
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
{
unsigned int ret;
// ...
ret = BPF_PROG_RUN(prog, ctx); // here to finally run bpf program
// ...
}
static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
struct bpf_prog *prog = call->prog;
// ...
if (prog && !trace_call_bpf(prog, regs)) // call bpf program
return;
// ...
}
/* pre_handler installed by the kprobe trace event: bumps the hit count
 * and, when perf profiling is enabled for this probe (TP_FLAG_PROFILE),
 * hands control to kprobe_perf_func, which eventually runs the attached
 * BPF program via trace_call_bpf. */
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
	tk->nhit++;
	// ...
#ifdef CONFIG_PERF_EVENTS
	if (tk->tp.flags & TP_FLAG_PROFILE)
		kprobe_perf_func(tk, regs); // inside to call bpf program
#endif
	return 0; /* We don't tweak the kernel, so just return 0 */
}
how does bpf program connect to tracepoint
- ebpf program attach to tracepoints is very similar to kprobes in Linux
v4.7, so here we do not repeat all the details covered for kprobes above, only what is specific to tracepoints - read
/sys/kernel/debug/tracing/events/xxx/id file to know tracepoint id and setup struct perf_event_attr when calling perf_event_open - assigning
trace_event_reg to .reg field of struct trace_event_class, and it will call tracepoint_probe_register to register field .perf_probe callback function, which is a perf_trace_xxx function, to tracepoint
// Tracepoint boilerplate: _TRACE_PERF_INIT wires perf_trace_<call> into the
// event class as the .perf_probe callback (under CONFIG_PERF_EVENTS).
#define _TRACE_PERF_INIT(call) \
.perf_probe = perf_trace_##call,
// Per-tracepoint event class; .reg (trace_event_reg) registers .perf_probe
// with the tracepoint when perf asks for it.
static struct trace_event_class __used __refdata event_class_##call = { \
.system = TRACE_SYSTEM_STRING, \
.define_fields = trace_event_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_##call.fields),\
.raw_init = trace_event_raw_init, \
.probe = trace_event_raw_event_##call, \
.reg = trace_event_reg, \
_TRACE_PERF_INIT(call) \
};
// .reg callback of the event class: on TRACE_REG_PERF_(UN)REGISTER it
// (un)registers the class's .perf_probe callback with the tracepoint.
int trace_event_reg(struct trace_event_call *call,
enum trace_reg type, void *data)
{
struct trace_event_file *file = data;
WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
// ...
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
return tracepoint_probe_register(call->tp,
call->class->perf_probe, // to register perf_probe callback to tracepoint
call);
case TRACE_REG_PERF_UNREGISTER:
tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
call);
return 0;
// ...
#endif
}
return 0;
}
EXPORT_SYMBOL_GPL(trace_event_reg);
- when the tracepoint is hit, the registered
.perf_probe cb will be called, as stated above, which is perf_trace_xxx function
// Generated perf_trace_<call> handler, registered as .perf_probe above:
// when the tracepoint fires it captures caller regs, builds the sample, and
// forwards everything to perf_trace_run_bpf_submit.
// (Comments cannot be added inside the `\`-continued macro body below.)
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
struct trace_event_call *event_call = __data; \
// ...
struct bpf_prog *prog = event_call->prog; \
struct pt_regs *__regs; \
// ...
perf_fetch_caller_regs(__regs); \
// ...
perf_trace_run_bpf_submit(entry, __entry_size, rctx, \
event_call, __count, __regs, \
head, __task); \
}
// If an ebpf program is attached to the trace event, run it over the raw
// sample; a zero return from the program suppresses the perf sample.
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct trace_event_call *call, u64 count,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
struct bpf_prog *prog = call->prog;
if (prog) {
// stash the regs pointer at the start of raw_data so the bpf
// program can reach it (presumably the tracepoint-prog ABI — confirm)
*(struct pt_regs **)raw_data = regs;
// here to finally call trace_call_bpf to run ebpf program
if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
// ...
}
}
// ...
}
XDP bpf program
prerequisite knowledge
module_init calls mlx4_en_init to register struct mlx4_interface, where .activate function mlx4_en_activate will be called later to setup struct net_device_ops for field netdev_ops of struct net_device, and in struct net_device_ops, the field of .ndo_xdp is important for xdp to work, which is function mlx4_xdp in the following code example
// mlx4 netdev ops table: .ndo_xdp is the hook the networking core calls to
// attach, detach or query an XDP program on this device.
static const struct net_device_ops mlx4_netdev_ops = {
// ...
.ndo_xdp = mlx4_xdp,
};
// Per-port netdev setup: installs the net_device_ops table (and with it the
// .ndo_xdp hook) on the freshly created net_device.
int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
struct mlx4_en_port_profile *prof)
{
// ...
/*
* Initialize netdev entry points
*/
if (mlx4_is_master(priv->mdev->dev))
dev->netdev_ops = &mlx4_netdev_ops_master; // master function uses a separate ops table
else
dev->netdev_ops = &mlx4_netdev_ops;
// ...
}
// .activate callback of mlx4_en_interface: creates one netdev per ethernet
// port, each wired up via mlx4_en_init_netdev above.
static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
{
int i;
struct mlx4_en_dev *mdev = ctx;
/* Create a netdev for each port */
mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
mlx4_info(mdev, "Activating port:%d\n", i);
if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
mdev->pndev[i] = NULL; // init failed for this port
}
// ...
}
// Interface descriptor registered with the mlx4 core; .activate runs when a
// matching ethernet device is brought up.
static struct mlx4_interface mlx4_en_interface = {
// ...
.protocol = MLX4_PROT_ETH,
.activate = mlx4_en_activate,
};
// Module entry point: registers the ethernet interface with the mlx4 core.
static int __init mlx4_en_init(void)
{
// ...
return mlx4_register_interface(&mlx4_en_interface);
}
module_init(mlx4_en_init);
how does XDP bpf program connect to NIC
- in linux
v4.8, XDP attachment support is very limited; we need the ip command from iproute2 to do the job, and it finally sends an RTM_SETLINK Netlink message to the kernel
ip link set dev eth0 xdp obj prog.o
- The kernel receives the message and routes it through the networking stack:
rtnetlink_rcv_msg -> rtnl_setlink -> do_setlink, which parses the attributes, specifically looking for IFLA_XDP, and then calls dev_change_xdp_fd, the core kernel function that manages XDP attachment, which sets or clears a bpf program for a device rx path
// Core kernel entry point for XDP attach/detach (reached from the
// RTM_SETLINK/IFLA_XDP netlink path): resolves fd >= 0 to a verified
// BPF_PROG_TYPE_XDP program (fd < 0 means detach, prog stays NULL) and hands
// it to the driver's ndo_xdp callback.
int dev_change_xdp_fd(struct net_device *dev, int fd)
{
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
struct netdev_xdp xdp = {};
int err;
if (!ops->ndo_xdp) // driver has no XDP support
return -EOPNOTSUPP;
if (fd >= 0) {
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); // type-checked fd lookup
if (IS_ERR(prog))
return PTR_ERR(prog);
}
xdp.command = XDP_SETUP_PROG;
xdp.prog = prog;
// ndo_xdp cb will be called, as we stated above, it's mlx4_xdp function here
err = ops->ndo_xdp(dev, &xdp);
// ...
}
- function
mlx4_xdp will finally call mlx4_xdp_set to setup XDP program
// Driver-side XDP setup: carves xdp_ring_num rings out of the regular tx
// queues (presumably reserved for XDP_TX — confirm against driver docs) and
// publishes the program on every rx ring; prog == NULL detaches.
static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
// ...
xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
// ...
priv->xdp_ring_num = xdp_ring_num;
netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
priv->xdp_ring_num);
for (i = 0; i < priv->rx_ring_num; i++) {
old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog); // here ebpf is finally set
if (old_prog)
bpf_prog_put(old_prog); // drop the reference on the replaced program
}
// ...
}
// .ndo_xdp implementation: dispatches the netdev_xdp command coming from
// dev_change_xdp_fd (setup) or the program-query path.
static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
switch (xdp->command) {
case XDP_SETUP_PROG:
return mlx4_xdp_set(dev, xdp->prog);
case XDP_QUERY_PROG:
xdp->prog_attached = mlx4_xdp_attached(dev);
return 0;
default:
return -EINVAL;
}
}
- when a packet hits the NIC, after hardware interrupt and Rx CQ polling, it will finally call
mlx4_en_process_rx_cq function
// Run the XDP program on one packet buffer; the RCU read section keeps the
// program alive while it runs (the attach path swaps it with xchg).
static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
struct xdp_buff *xdp)
{
u32 ret;
rcu_read_lock();
ret = BPF_PROG_RUN(prog, (void *)xdp); // returns an XDP_* action code
rcu_read_unlock();
return ret;
}
// Rx completion-queue poll loop: when an XDP program is attached it gets
// first look at each packet, before any skb is built.
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
{
// ...
xdp_prog = READ_ONCE(ring->xdp_prog); // fetched once per poll pass
// ...
while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
cq->mcq.cons_index & cq->size)) {
// ...
/* A bpf program gets first chance to drop the packet. It may
* read bytes but not past the end of the frag.
*/
if (xdp_prog) {
struct xdp_buff xdp;
// ...
// point the xdp buffer straight at the rx fragment
xdp.data = page_address(frags[0].page) +
frags[0].page_offset;
xdp.data_end = xdp.data + length;
// here to run xdp bpf program
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
case XDP_PASS:
break; // fall through to the normal rx path below
case XDP_TX:
// ... (elided: presumably the transmit-back path — confirm in driver source)
case XDP_ABORTED:
case XDP_DROP:
if (mlx4_en_rx_recycle(ring, frags))
goto consumed;
goto next;
}
}
// ...
}
// ...
}
ebpf sysctl interface
bpf_jit_enable
cat /proc/sys/net/core/bpf_jit_enable
- turn on or off ebpf jit compiler
# turn off
echo 0 > /proc/sys/net/core/bpf_jit_enable # for interpreter only
# turn on
echo 1 > /proc/sys/net/core/bpf_jit_enable # enable jit
# debug mode
# the kernel will output the resulting native opcodes to the kernel log (dmesg)
echo 2 > /proc/sys/net/core/bpf_jit_enable
bpf_jit_harden
- check ebpf jit harden status
cat /proc/sys/net/core/bpf_jit_harden
- turn on or off ebpf jit hardening
echo 0 > /proc/sys/net/core/bpf_jit_harden # disable JIT hardening
# enable JIT hardening for unprivileged users only
echo 1 > /proc/sys/net/core/bpf_jit_harden
# enable JIT hardening for all users
echo 2 > /proc/sys/net/core/bpf_jit_harden
ftrace
- NOTE: ftrace implementation based on Linux v2.6.27 codes reading, may update it after reading new version of it
prerequisite knowledge
- Linux kernel should be compiled with gcc with
-pg flag, which will insert mcount function call at the start of every function, unless you use the function attribute no_instrument_function to suppress profiling of individual functions when compiling
gcc -c -pg -m32 ftrace.c
// Example: with -pg, gcc instruments foo() with an mcount call; marking
// bar() with no_instrument_function (the kernel's "notrace") suppresses it.
void foo()
{
printf("hello world\n");
}
#define notrace __attribute__((no_instrument_function))
void notrace bar()
{
printf("hello");
}
.LC0:
.string "hello world"
foo:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
1: call mcount // NOTE: here -pg flag will insert mcount function call
subl $12, %esp
pushl $.LC0
// ...
bar: // NOTE: no mcount call here - notrace suppressed the instrumentation
pushl %ebp
movl %esp, %ebp
subl $8, %esp
subl $12, %esp
pushl $.LC0 // NOTE(review): bar prints "hello", so presumably a distinct .LC1 was elided
// ...