kprobes
brief history
- Linux v2.6.9 introduced kprobes into its mainline, but only for i386
- v2.6.10 begins to add kprobes support for x86_64
- v2.6.12 allows multiple kprobes at the same address
- v2.6.13 starts to support kretprobe
- v2.6.14 groups kprobes-related functions into the section .kprobes.text, so that kprobes on this code will be rejected
- v2.6.15 updates: use the percpu infrastructure to manage some internal variables; use an RCU (Read-Copy-Update) list to manage kprobes; more checks on kprobe registration, including a kernel module address check
- v2.6.16 updates: more kprobe rejections on kernel functions; refactor the kprobe registration functions to check whether we are probing a kernel module
- v2.6.17 i386 updates its kretprobe_trampoline_holder; updates kprobe_fault_handler
- v2.6.18 kprobes registers for page fault notifications when there is an active probe registered; tcpprobe module to probe tcp_sendmsg
- v2.6.19 adds symbol_name and offset fields to struct kprobe so you can use a function name directly to register a kprobe; dccp_probe module to probe dccp_sendmsg
- v2.6.21 adds basic debugfs support for kprobes: /sys/kernel/debug/kprobes/list lists all registered probes on the system
- v2.6.22 more on debugfs: adds /sys/kernel/debug/kprobes/enabled to globally turn registered kprobes on/off; the default value is on
- v2.6.24 adds basic kretprobe blacklist support
- v2.6.25 kretprobe supports an optional user-specified entry_handler which runs on function entry, plus private data that can pass information between the entry_handler and the return handler
- v2.6.26 adds basic kprobe blacklist support; interfaces for batch registration/unregistration of a group of probes
- v2.6.29 adds a module notifier callback, which checks kprobes on the module; adds a flags field to struct kprobe to mark a kprobe gone (KPROBE_FLAG_GONE) and free its instruction buffer
- v2.6.30 enable/disable probe interfaces: if KPROBE_FLAG_DISABLED is set in kp->flags, that kp will be registered but disabled, so its handlers aren't hit until enable_kprobe(kp) is called
- v2.6.33 updates the kprobe blacklist; checks whether a kprobe is re-registered
- v2.6.34 kprobe optimization (KPROBE_FLAG_OPTIMIZED) for i386 and x86_64; kprobes sysctl interface /proc/sys/debug/kprobes-optimization to control kprobe optimization
- v2.6.39 do not optimize in the entry code due to the unstable stack handling
initialization
- when Linux boots, it calls init_kprobes as part of its initialization process
struct notifier_block *i386die_chain;
int register_die_notifier(struct notifier_block *nb)
{
// ...
err = notifier_chain_register(&i386die_chain, nb);
// ...
return err;
}
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
.priority = 0x7fffffff /* we need to be notified first */
};
static int __init init_kprobes(void)
{
int i, err = 0;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
INIT_HLIST_HEAD(&kprobe_table[i]); // for kprobes
INIT_HLIST_HEAD(&kretprobe_inst_table[i]); // for kretprobes
}
err = arch_init_kprobes(); // here to register a kprobe for trampoline
if (!err)
// register kprobe_exceptions_nb to i386die_chain
err = register_die_notifier(&kprobe_exceptions_nb);
return err;
}
__initcall(init_kprobes);
how does kprobe work
/* functions marked __kprobes go into .kprobes.text; inserting probes on them is rejected */
#define __kprobes __attribute__((__section__(".kprobes.text")))
void __kprobes arch_copy_kprobe(struct kprobe *p)
{
memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
p->opcode = *p->addr;
}
void __kprobes arch_arm_kprobe(struct kprobe *p)
{
*p->addr = BREAKPOINT_INSTRUCTION;
// ...
}
static int __kprobes in_kprobes_functions(unsigned long addr)
{
if (addr >= (unsigned long)__kprobes_text_start
&& addr < (unsigned long)__kprobes_text_end)
return -EINVAL;
return 0;
}
- call
register_kprobe to register a kprobe we want to probe
int register_kprobe(struct kprobe *p)
{
// ...
if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
return ret;
// ...
// NOTE: add new kprobe to corresponding hash table slot
hlist_add_head(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
arch_arm_kprobe(p);
// ...
}
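- as a usage sketch (a minimal, hypothetical module modeled on samples/kprobes/kprobe_example.c; probing do_fork and using the symbol_name field available since v2.6.19 are our own choices), registering a kprobe looks like this:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* pre_handler: called just before the probed instruction executes */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "do_fork hit, addr = %p\n", p->addr);
	return 0; /* 0: let kprobes single-step the saved instruction */
}

static struct kprobe kp = {
	.symbol_name = "do_fork", /* resolved to an address for us */
	.pre_handler = handler_pre,
};

static int __init kprobe_example_init(void)
{
	return register_kprobe(&kp);
}

static void __exit kprobe_example_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kprobe_example_init);
module_exit(kprobe_example_exit);
MODULE_LICENSE("GPL");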
- when execution reaches the probed address, an int3 trap occurs and do_int3 is called; it notifies our registered kprobes, and from there the pre_handler/break_handler inside the kprobe is called
ENTRY(int3)
// ...
call do_int3 // here to call do_int3
// ...
#ifdef CONFIG_KPROBES
asmlinkage int do_int3(struct pt_regs *regs, long error_code)
{
// here to notify registered kprobes
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return 1;
// back to normal ...
}
#endif
static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
{
struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
// here kprobe_exceptions_notify inside kprobe_exceptions_nb will be called
return notifier_call_chain(&i386die_chain, val, &args);
}
int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
void *data)
{
struct die_args *args = (struct die_args *)data;
switch (val) {
case DIE_INT3:
// p->break_handler & p->pre_handler
if (kprobe_handler(args->regs)) // handler cb inside
return NOTIFY_STOP;
break;
// ... p->post_handler
// ... p->fault_handler
}
return NOTIFY_DONE;
}
static inline int kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
u8 *addr = (u8 *) (regs->eip - 1); // int3 already advanced eip past the 1-byte breakpoint, so back up to the probed address
// ...
p = get_kprobe(addr);
// ...
if (p->pre_handler(p, regs)) { // NOTE: where magic happened
/* handler has already set things up, so skip ss setup */
return 1;
}
// ...
}
how does kretprobe work
- in order to support kretprobe, init_kprobes registers a kprobe at the trampoline at boot time; the trampoline is an arbitrary piece of code, typically just a nop instruction
void kretprobe_trampoline_holder(void)
{
asm volatile ( ".global kretprobe_trampoline\n"
"kretprobe_trampoline: \n"
"nop\n");
}
static struct kprobe trampoline_p = {
.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
.pre_handler = trampoline_probe_handler
};
int __init arch_init_kprobes(void)
{
return register_kprobe(&trampoline_p);
}
- when you call register_kretprobe, kprobes establishes a kprobe at the entry to the function; when the probed function is called and this probe is hit, kprobes saves a copy of the return address and replaces it with the address of a "trampoline"
void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
{
unsigned long *sara = (unsigned long *)&regs->esp;
struct kretprobe_instance *ri;
if ((ri = get_free_rp_inst(rp)) != NULL) {
ri->rp = rp;
ri->task = current;
ri->ret_addr = (kprobe_opcode_t *) *sara; // important here! save original fn return addr
// replace the return addr with trampoline addr
*sara = (unsigned long) &kretprobe_trampoline;
// ...
}
// ...
}
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
/*TODO: consider to only swap the RA after the last pre_handler fired */
arch_prepare_kretprobe(rp, regs);
return 0;
}
int register_kretprobe(struct kretprobe *rp)
{
// ...
rp->kp.pre_handler = pre_handler_kretprobe;
/* Establish function entry probe point */
if ((ret = register_kprobe(&rp->kp)) != 0) {
}
// ...
}
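- a minimal usage sketch (hypothetical, modeled on samples/kprobes/kretprobe_example.c; do_fork and the maxactive value are arbitrary choices):
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* handler: runs when the probed function returns, via the trampoline */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	printk(KERN_INFO "do_fork returned %ld\n", (long)regs_return_value(regs));
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler = ret_handler,
	.maxactive = 20, /* max concurrently active instances of the probed fn */
};

static int __init kretprobe_example_init(void)
{
	my_kretprobe.kp.symbol_name = "do_fork";
	return register_kretprobe(&my_kretprobe);
}

static void __exit kretprobe_example_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(kretprobe_example_init);
module_exit(kretprobe_example_exit);
MODULE_LICENSE("GPL");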
- so when the function returns, control passes to the trampoline, which has already been registered as a kprobe in init_kprobes; that probe is hit, so its pre_handler trampoline_probe_handler is called
- after calling the user-specified handler associated with the kretprobe, the original function return address is restored and execution resumes
int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
// ...
head = kretprobe_inst_table_head(current);
hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
if (ri->task != current)
continue;
if (ri->rp && ri->rp->handler)
ri->rp->handler(ri, regs); // called user-specified handler
orig_ret_address = (unsigned long)ri->ret_addr;
// ...
}
regs->eip = orig_ret_address; // restore the original return addr
// ...
}
debugfs interface for kprobes
- to list all registered probes on the system
cat /sys/kernel/debug/kprobes/list
- to check whether kprobes are globally enabled, and to turn registered kprobes on or off
cat /sys/kernel/debug/kprobes/enabled
# turn on
echo 1 > /sys/kernel/debug/kprobes/enabled
# turn off
echo 0 > /sys/kernel/debug/kprobes/enabled
kprobes sysctl interface
- to check the kprobes optimization status
cat /proc/sys/debug/kprobes-optimization
- to turn kprobes optimization on or off
# turn off
echo 0 > /proc/sys/debug/kprobes-optimization
# turn on
echo 1 > /proc/sys/debug/kprobes-optimization
uprobes
brief history
- Linux v3.5 introduced uprobes into its mainline
tracepoints
- NOTE: the tracepoints coverage is based on reading the Linux v2.6.28 code; it may be updated after reading a newer version
brief history
- Linux v2.6.28 introduced tracepoints into its mainline
- v2.6.29 splits the original DEFINE_TRACE into DECLARE_TRACE and DEFINE_TRACE; if a tracepoint has to be used in kernel modules, EXPORT_TRACEPOINT_SYMBOL_GPL or EXPORT_TRACEPOINT_SYMBOL can be used to export the defined tracepoint
- v2.6.30 introduced the TRACE_EVENT macro, which is far more powerful because it automates the "boilerplate" code needed to bridge the gap between a kernel function and a monitoring tool like perf or ftrace
- v2.6.31 grouped all predefined tracepoint events under include/trace/events, organizing events into subsystems, and introduced the /sys/kernel/debug/tracing/events interface, giving each event its own directory with an enable file
- v2.6.32 adds regfunc and unregfunc fields to struct tracepoint for more flexible functionality
- v2.6.33 more macros, DECLARE_EVENT_CLASS, DEFINE_EVENT and DEFINE_EVENT_PRINT, to facilitate tracepoint usage by other tools
- v2.6.35 updates struct tracepoint and changes the related macro internals
- v2.6.37 uses the JUMP_LABEL macro, and its related machinery, to test whether a tracepoint is enabled
- v2.6.38 uses the __tracepoints_ptrs section to iterate over tracepoints; macro DECLARE_TRACE_CONDITION
- v3.0 performance optimization using asm goto and the __jump_table section
how does tracepoint work
- use DEFINE_TRACE, TPPROTO and TPARGS to define a tracepoint
- the tracepoint name is placed in the __tracepoints_strings section, and the tracepoint itself is placed in the __tracepoints section
DEFINE_TRACE(sched_wakeup_new,
TPPROTO(struct rq *rq, struct task_struct *p),
TPARGS(rq, p));
# the above example expands to the following code
# this is the tracepoint function called from other important places
static inline void trace_sched_wakeup_new(struct rq *rq,
struct task_struct *p) {
static const char __tpstrtab_sched_wakeup_new[]
__attribute__((section("__tracepoints_strings"))) =
"sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)";
static struct tracepoint __tracepoint_sched_wakeup_new
__attribute__((section("__tracepoints"), aligned(8))) = {
__tpstrtab_sched_wakeup_new,
0,
NULL
};
// when the tracepoint is not activated, just skip it
if (unlikely(__tracepoint_sched_wakeup_new.state))
do {
void **it_func;
rcu_read_lock_sched();
it_func = rcu_dereference((&__tracepoint_sched_wakeup_new)->funcs);
if (it_func) {
do {
// call tracepoint probe funcs here!!
((void (*)(struct rq * rq, struct task_struct * p))(*it_func))(rq, p);
} while (*(++it_func));
}
rcu_read_unlock_sched();
} while (0);
}
- when you want to register a probe function on some tracepoint, just define the probe function according to the proto signature of the tracepoint definition, then call the generated register function, which in turn calls tracepoint_probe_register to connect the probe to the tracepoint
static void set_tracepoint(struct tracepoint_entry **entry,
struct tracepoint *elem, int active)
{
// ...
// assign funcs in tracepoint entry to tracepoint
rcu_assign_pointer(elem->funcs, (*entry)->funcs);
elem->state = active; // then activate the tracepoint
}
void tracepoint_update_probe_range(struct tracepoint *begin,
struct tracepoint *end)
{
// ...
for (iter = begin; iter < end; iter++) {
mark_entry = get_tracepoint(iter->name);
if (mark_entry) {
set_tracepoint(&mark_entry, iter,
!!mark_entry->refcount);
} // ...
}
// ...
}
static void tracepoint_update_probes(void)
{
/* Core kernel tracepoints */
tracepoint_update_probe_range(__start___tracepoints,
__stop___tracepoints);
/* tracepoints in modules. */
module_update_tracepoints();
}
int tracepoint_probe_register(const char *name, void *probe)
{
// ...
// add probe to corresponding tracepoint entry
old = tracepoint_entry_add_probe(entry, probe);
// ...
// move probe funcs in tracepoint entry to corresponding tracepoint
tracepoint_update_probes();
// ...
}
static inline int register_trace_sched_wakeup_new(void (*probe)(struct rq *rq, struct task_struct *p)) {
return tracepoint_probe_register("sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)", (void *)probe);
}
struct tracepoint {
const char *name;
int state;
void **funcs; // all registered probe functions for this tracepoint
} __attribute__((aligned(8)));
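- putting the pieces together, a probe for the sched_wakeup_new tracepoint above could be registered like this (a sketch against the v2.6.28-era API, assuming the sched tracepoint declarations come from trace/sched.h; the probe signature must match the TPPROTO):
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/sched.h> /* declares the sched tracepoints (v2.6.28 layout) */

/* probe: signature must match TPPROTO(struct rq *rq, struct task_struct *p) */
static void probe_sched_wakeup_new(struct rq *rq, struct task_struct *p)
{
	printk(KERN_INFO "new task woken: %s (pid %d)\n", p->comm, p->pid);
}

static int __init tp_example_init(void)
{
	/* generated wrapper; ends up in tracepoint_probe_register() */
	return register_trace_sched_wakeup_new(probe_sched_wakeup_new);
}

static void __exit tp_example_exit(void)
{
	unregister_trace_sched_wakeup_new(probe_sched_wakeup_new);
	tracepoint_synchronize_unregister(); /* wait for in-flight probe calls */
}

module_init(tp_example_init);
module_exit(tp_example_exit);
MODULE_LICENSE("GPL");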
- an actual tracepoint call example
# sys_fork calls do_fork, which calls wake_up_new_task
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
// ...
trace_sched_wakeup_new(rq, p); // calling our above defined tracepoint here
// ...
}
tracepoints in modules
- when you run insmod or modprobe, the user-space utility eventually calls sys_init_module to pass the binary blob (the .ko file) to the kernel
struct module
{
// ...
#ifdef CONFIG_TRACEPOINTS
struct tracepoint *tracepoints;
unsigned int num_tracepoints;
#endif
// ...
};
static noinline struct module *load_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
// ...
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
"__tracepoints",
sizeof(*mod->tracepoints),
&mod->num_tracepoints);
#endif
// ...
if (!mod->taints) {
// ...
#ifdef CONFIG_TRACEPOINTS
tracepoint_update_probe_range(mod->tracepoints,
mod->tracepoints + mod->num_tracepoints);
#endif
}
// ...
}
asmlinkage long
sys_init_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
// ...
mod = load_module(umod, len, uargs);
// ...
}
void module_update_tracepoints(void)
{
// ...
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
tracepoint_update_probe_range(mod->tracepoints,
mod->tracepoints + mod->num_tracepoints);
// ...
}
debugfs interface for tracepoints
- we can easily enable tracepoints by using the event tracing framework
# to enable event 'sched_wakeup'
echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
# to disable it
echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
# to enable all events in sched subsystem
echo 1 > /sys/kernel/debug/tracing/events/sched/enable
# to enable all events
echo 1 > /sys/kernel/debug/tracing/events/enable
perf subsystem
- NOTE: the perf subsystem coverage is based on reading the Linux v2.6.32 code; it may be updated after reading a newer version
brief history
- Linux v2.6.31 first introduced Performance Counters for Linux (PCL) into its mainline, for hardware counters only, so the syscall was sys_perf_counter_open
- Linux v2.6.32 renamed the syscall to sys_perf_event_open and expanded the framework to do more than just hardware counters
how does perf event work
- the following sequence is a boot-time timeline, not a caller-callee call graph; it shows how the perf event subsystem is initialized: start_kernel -> sched_init -> perf_event_init -> check_bugs -> identify_boot_cpu -> init_hw_perf_events -> intel_pmu_init/amd_pmu_init -> perf_events_lapic_init -> rest_init -> kernel_init -> do_basic_setup -> do_initcalls -> perf_event_sysfs_init
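- from user space the subsystem is driven through sys_perf_event_open; a minimal sketch that counts one process's instructions (glibc provides no wrapper, so the raw syscall is used; the perf_event_open helper below is our own):
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* no glibc wrapper exists, so invoke sys_perf_event_open directly */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;       /* start stopped; enabled explicitly below */
	attr.exclude_kernel = 1; /* count user-space instructions only */

	fd = perf_event_open(&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	printf("measuring this printf\n");
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	read(fd, &count, sizeof(count)); /* counter value: 8 bytes */
	printf("instructions: %lld\n", count);
	close(fd);
	return 0;
}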
eBPF
- NOTE: the eBPF coverage is based on reading the Linux v4.0 code; it may be updated after reading a newer version
bpf program loading
- when the bpf system call is invoked with the BPF_PROG_LOAD cmd, the bpf program is copied into the kernel and run through the eBPF verifier by the function bpf_check. After passing verification, bpf_prog_select_runtime selects either the interpreter function __bpf_prog_run or the BPF instructions JITed to native code
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
// ...
/* CALL */
JMP_CALL:
/* Function call scratches BPF_R1-BPF_R5 registers,
* preserves BPF_R6-BPF_R9, and stores return value
* into BPF_R0.
*/
BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
BPF_R4, BPF_R5); // NOTE: where the magic happens
CONT;
// ...
}
void bpf_int_jit_compile(struct bpf_prog *prog)
{
// ...
if (image) {
// ... after jiting related codes
prog->bpf_func = (void *)image; // here we replace the interpreter func with the jited code
prog->jited = true;
// ...
}
// ...
}
void bpf_prog_select_runtime(struct bpf_prog *fp)
{
fp->bpf_func = (void *) __bpf_prog_run; // NOTE: save interpreter running function
/* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp); // fp->bpf_func is replaced with jited code if the JIT works
// ...
}
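- a user-space sketch of the loading path described above (a trivial "return 0" socket filter sent through BPF_PROG_LOAD; the raw syscall is used since glibc has no bpf wrapper):
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
	/* trivial eBPF filter: mov r0, 0; exit -> "drop every packet" */
	struct bpf_insn prog[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
		{ .code = BPF_JMP | BPF_EXIT },
	};
	char log[4096] = "";
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns = (unsigned long)prog;
	attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
	attr.license = (unsigned long)"GPL";
	attr.log_buf = (unsigned long)log;
	attr.log_size = sizeof(log);
	attr.log_level = 1;

	/* the kernel copies the program in and runs bpf_check() on it here;
	 * on success the returned fd refers to the loaded program */
	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0) {
		fprintf(stderr, "load failed, verifier log:\n%s\n", log);
		return 1;
	}
	close(fd);
	return 0;
}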
bpf program running
- two macros, BPF_PROG_RUN and SK_RUN_FILTER, are used to run bpf programs
#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi)
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_bpf *b = a->priv;
int action, filter_res;
spin_lock(&b->tcf_lock);
// ...
filter_res = BPF_PROG_RUN(b->filter, skb);
// ...
spin_unlock(&b->tcf_lock);
return action;
}
/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) \
(*filter->prog->bpf_func)(ctx, filter->prog->insnsi)
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
int err;
struct sk_filter *filter;
// ...
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (filter) {
// as we can see here, how eBPF program works
unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
}
rcu_read_unlock();
return err;
}
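- to make sk_filter see a program, the fd returned by BPF_PROG_LOAD can be attached to a socket; a sketch using SO_ATTACH_BPF (added in v3.19), which stores the program in sk->sk_filter:
#include <sys/socket.h>

#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF 50 /* asm-generic value; appeared in Linux v3.19 */
#endif

/* attach an eBPF fd (from BPF_PROG_LOAD) to a socket: the kernel wraps it
 * in a struct sk_filter and stores it in sk->sk_filter, so every packet
 * received on sock now runs through SK_RUN_FILTER in sk_filter() above */
static int attach_bpf(int sock, int prog_fd)
{
	return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF,
			  &prog_fd, sizeof(prog_fd));
}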
ftrace
- NOTE: the ftrace coverage is based on reading the Linux v2.6.27 code; it may be updated after reading a newer version
prerequisite knowledge
- the Linux kernel should be compiled by gcc with the -pg flag, which inserts an mcount function call at the start of every function, unless you use the function attribute no_instrument_function to suppress profiling of individual functions at compile time
gcc -c -pg -m32 ftrace.c
#include <stdio.h>

void foo()
{
printf("hello world\n");
}
#define notrace __attribute__((no_instrument_function))
void notrace bar()
{
printf("hello");
}
.LC0:
.string "hello world"
foo:
pushl %ebp
movl %esp, %ebp
subl $8, %esp
1: call mcount // NOTE: the -pg flag inserts this mcount call
subl $12, %esp
pushl $.LC0
// ...
.LC1:
.string "hello"
bar: // NOTE: no mcount call here, thanks to no_instrument_function
pushl %ebp
movl %esp, %ebp
subl $8, %esp
subl $12, %esp
pushl $.LC1
// ...
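- once the kernel is built with mcount calls (CONFIG_FUNCTION_TRACER), the function tracer can be driven through debugfs; a usage sketch, assuming debugfs is mounted at /sys/kernel/debug:
# select the function tracer
echo function > /sys/kernel/debug/tracing/current_tracer
# read the captured trace
cat /sys/kernel/debug/tracing/trace
# turn tracing back off
echo nop > /sys/kernel/debug/tracing/current_tracer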