Linux tracing techs

kprobes

brief history

  • Linux v2.6.9 introduced kprobes into its mainline, but only for i386
  • v2.6.10 begins to add kprobes support for x86_64
  • Linux v2.6.12 allows multiple kprobes at the same address
  • v2.6.13 starts to support kretprobe
  • v2.6.14 groups kprobes-related functions into the section .kprobes.text, so that attempts to place kprobes on that code will be rejected
  • v2.6.15 updates: using percpu infra to manage some internal variables; using RCU(Read-Copy-Update) list to manage kprobes; more checks on kprobes register, including kernel module address check
  • v2.6.16 updates: more kprobes rejects on kernel functions; refactor kprobes register functions to check whether we are probing a kernel module
  • v2.6.17 i386 updates its kretprobe_trampoline_holder; updates kprobe_fault_handler
  • v2.6.18 kprobes registers for page fault notifications when there is an active probe registered; tcpprobe module to probe tcp_sendmsg
  • v2.6.19 add symbol_name and offset fields to struct kprobe so you can directly use a function name to register kprobes; dccp_probe module to probe dccp_sendmsg
  • v2.6.21 add basic debugfs support for kprobes, /sys/kernel/debug/kprobes/list lists all registered probes on the system
  • v2.6.22 more on debugfs, add /sys/kernel/debug/kprobes/enabled to globally turn registered kprobes on/off and the default value is on
  • v2.6.24 add basic kretprobe blacklist support
  • v2.6.25 kretprobe supports optional user-specified entry_handler which runs on function entry and also supports private data which can pass data between entry_handler and ret_handler
  • v2.6.26 add basic kprobe blacklist support; batch registration/unregistration of a group of probes interfaces
  • v2.6.29 add module notifier call back, which will check kprobes on the module; add flags field to struct kprobe to mark kprobe gone KPROBE_FLAG_GONE and remove its instruction buffer
  • v2.6.30 enable/disable probes interfaces, if kp->flags is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, so its handlers aren’t hit until calling enable_kprobe(kp)
  • v2.6.33 updates kprobe blacklist; check whether kprobe re-registered
  • v2.6.34 kprobe optimization KPROBE_FLAG_OPTIMIZED for i386 and x86_64; kprobes sysctl interface /proc/sys/debug/kprobes-optimization to control kprobe optimization
  • v2.6.39 do not optimize in the entry code due to the unstable stack handling

initialization

  • when Linux is starting, it will call init_kprobes in its process of initialization
struct notifier_block *i386die_chain;

int register_die_notifier(struct notifier_block *nb)
{
    // ...
    err = notifier_chain_register(&i386die_chain, nb);
    // ...
    return err;
}
/* Notifier block hooked into the die chain by init_kprobes; routes
 * breakpoint/fault notifications to kprobe_exceptions_notify. */
static struct notifier_block kprobe_exceptions_nb = {
    .notifier_call = kprobe_exceptions_notify,
    .priority = 0x7fffffff /* we need to be notified first */
};

static int __init init_kprobes(void)
{
    int i, err = 0;

    for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
        INIT_HLIST_HEAD(&kprobe_table[i]);  // for kprobes
        INIT_HLIST_HEAD(&kretprobe_inst_table[i]);  // for kretprobes
    }

    err = arch_init_kprobes();  // here to register a kprobe for trampoline
    if (!err)
        // register kprobe_exceptions_nb to i386die_chain
        err = register_die_notifier(&kprobe_exceptions_nb);

    return err;
}

__initcall(init_kprobes);

how does kprobe work

  • helper functions
/* Attach to insert probes on any functions which should be ignored */
#define __kprobes  __attribute__((__section__(".kprobes.text")))

void __kprobes arch_copy_kprobe(struct kprobe *p)
{
    memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
    p->opcode = *p->addr;
}

void __kprobes arch_arm_kprobe(struct kprobe *p)
{
    *p->addr = BREAKPOINT_INSTRUCTION;
    // ...
}

static int __kprobes in_kprobes_functions(unsigned long addr)
{
    if (addr >= (unsigned long)__kprobes_text_start
        && addr < (unsigned long)__kprobes_text_end)
        return -EINVAL;
    return 0;
}
  • call register_kprobe to register a kprobe we want to probe
int register_kprobe(struct kprobe *p)
{
    // ...
    if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
        return ret;

    // ...
    // NOTE: add new kprobe to corresponding hash table slot
    hlist_add_head(&p->hlist,
               &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
        
    arch_arm_kprobe(p);
    // ...
}
  • when execution comes to the probing address, an int3 happens and do_int3 will be called then, and it will notify our registered kprobes, from there, pre_handler/break_handler inside kprobe will be called
ENTRY(int3)
    // ...
    call do_int3  // here to call do_int3
    // ...
#ifdef CONFIG_KPROBES
asmlinkage int do_int3(struct pt_regs *regs, long error_code)
{
    // here to notify registered kprobes
    if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
            == NOTIFY_STOP)
        return 1;
    // back to normal ...
}
#endif
static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
{
    struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
    // here kprobe_exceptions_notify inside kprobe_exceptions_nb will be called
    return notifier_call_chain(&i386die_chain, val, &args);
}
int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
                 void *data)
{
    struct die_args *args = (struct die_args *)data;
    switch (val) {
    case DIE_INT3:
        // p->break_handler & p->pre_handler
        if (kprobe_handler(args->regs))  // handler cb inside
            return NOTIFY_STOP;
        break;
        // ... p->post_handler
        // ... p->fault_handler
    }
    return NOTIFY_DONE;
}
/* Called from the die-notifier path on DIE_INT3: looks up the kprobe
 * installed at the trapping address and runs its pre_handler. */
static inline int kprobe_handler(struct pt_regs *regs)
{
    struct kprobe *p;
    /* int3 is a 1-byte instruction, so at trap time eip points one past
     * the probed address — subtract 1 to recover it */
    u8 *addr = (u8 *) (regs->eip - 1);
    // ...
    p = get_kprobe(addr);
    // ...
    if (p->pre_handler(p, regs)) {  // NOTE: the user-supplied pre_handler runs here
        /* handler has already set things up, so skip ss setup */
        return 1;
    }
    // ...
}

how does kretprobe work

  • in order to support kretprobe, at boot time, init_kprobes will first register a kprobe at the trampoline, which is an arbitrary piece of code – typically just a nop instruction
void kretprobe_trampoline_holder(void)
{
asm volatile (  ".global kretprobe_trampoline\n"
        "kretprobe_trampoline: \n"
        "nop\n");
}

static struct kprobe trampoline_p = {
    .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
    .pre_handler = trampoline_probe_handler
};

int __init arch_init_kprobes(void)
{
    return register_kprobe(&trampoline_p);
}
  • when you call register_kretprobe, kprobes establishes a kprobe at the entry to the function and when the probed function is called and this probe is hit, kprobes saves a copy of the return address, and replaces the return address with the address of a “trampoline.”
void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
{
    unsigned long *sara = (unsigned long *)&regs->esp;
    struct kretprobe_instance *ri;

    if ((ri = get_free_rp_inst(rp)) != NULL) {
        ri->rp = rp;
        ri->task = current;
        ri->ret_addr = (kprobe_opcode_t *) *sara;  // important here! save original fn return addr

        // replace the return addr with trampoline addr
        *sara = (unsigned long) &kretprobe_trampoline;

        // ...
    }
    // ...
}

static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
    struct kretprobe *rp = container_of(p, struct kretprobe, kp);

    /*TODO: consider to only swap the RA after the last pre_handler fired */
    arch_prepare_kretprobe(rp, regs);
    return 0;
}

int register_kretprobe(struct kretprobe *rp)
{
    // ...
    rp->kp.pre_handler = pre_handler_kretprobe;

    /* Establish function entry probe point */
    if ((ret = register_kprobe(&rp->kp)) != 0) {
    }
    // ...
}
  • so that when the function returns, control passes to the trampoline, which has already been registered as a kprobe in init_kprobes, and that probe is hit, so its pre_handler trampoline_probe_handler will be called
  • after calling the user-specified handler associated with the kretprobe, the original function return address will be restored, and execution will be resumed
int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
    // ...
    head = kretprobe_inst_table_head(current);

    hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
        if (ri->task != current)
            continue;

        if (ri->rp && ri->rp->handler)
            ri->rp->handler(ri, regs);  // called user-specified handler

        orig_ret_address = (unsigned long)ri->ret_addr;
        // ...
    }

    regs->eip = orig_ret_address;  // restore the original return addr
    // ...
}

debugfs interface for kprobes

  • to list all registered probes on the system
cat /sys/kernel/debug/kprobes/list
  • to globally turn registered kprobes on or off
cat /sys/kernel/debug/kprobes/enabled
# turn on
echo 1 > /sys/kernel/debug/kprobes/enabled
# turn off
echo 0 > /sys/kernel/debug/kprobes/enabled

kprobes sysctl interface

  • check kprobes optimization status
cat /proc/sys/debug/kprobes-optimization
  • turn on or off kprobes optimization
# turn off
echo 0 > /proc/sys/debug/kprobes-optimization
# turn on
echo 1 > /proc/sys/debug/kprobes-optimization

uprobes

brief history

  • Linux v3.5 introduced uprobe into its mainline

tracepoints

  • NOTE: tracepoints implementation based on Linux v2.6.28 codes reading, may update it after reading new version of it

brief history

  • Linux v2.6.28 introduced tracepoint into its mainline
  • v2.6.29 splits original DEFINE_TRACE into DECLARE_TRACE and DEFINE_TRACE; if the tracepoint has to be used in kernel modules, an EXPORT_TRACEPOINT_SYMBOL_GPL or EXPORT_TRACEPOINT_SYMBOL can be used to export the defined tracepoints
  • v2.6.30 the macro TRACE_EVENT was introduced, which is far more powerful because it automates the “boilerplate” code needed to bridge the gap between a kernel function and a monitoring tool like perf or ftrace
  • v2.6.31 all predefined tracepoint events were grouped under include/trace/events by organizing events into subsystems, and introduced /sys/kernel/debug/tracing/events interface, giving each event its own directory with enable file
  • v2.6.32 add regfunc and unregfunc fields to struct tracepoint for more flexible functionality
  • v2.6.33 more macros DECLARE_EVENT_CLASS DEFINE_EVENT DEFINE_EVENT_PRINT to facilitate tracepoint usage for other tools
  • v2.6.35 update struct tracepoint and change related macros internal
  • v2.6.37 using JUMP_LABEL macro to test whether a tracepoint is enabled or not, and its related stuff
  • v2.6.38 using __tracepoints_ptrs section for iteration on the tracepoints; macro DECLARE_TRACE_CONDITION
  • v3.0 performance optimization using asm goto __jump_table

how does tracepoint work

  • using DEFINE_TRACE, TPPROTO and TPARGS to define a tracepoint
  • the tracepoint name will be put in __tracepoints_strings section, and the tracepoint itself will be put in __tracepoints section
DEFINE_TRACE(sched_wakeup_new,
    TPPROTO(struct rq *rq, struct task_struct *p),
    TPARGS(rq, p));

# the above example will expand to the following codes

# this is the tracepoint function called in other important places
# this is the tracepoint function called in other important places
static inline void trace_sched_wakeup_new(struct rq *rq,
                                          struct task_struct *p) {

  static const char __tpstrtab_sched_wakeup_new[]
      __attribute__((section("__tracepoints_strings"))) =
          "sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)";

  static struct tracepoint __tracepoint_sched_wakeup_new
      __attribute__((section("__tracepoints"), aligned(8))) = {
          __tpstrtab_sched_wakeup_new,  /* .name */
                  0,                    /* .state: starts deactivated */
                  NULL                  /* .funcs: no probes attached yet */
          };
  // if the tracepoint is not activated, just skip the probe-calling loop
  if (unlikely(__tracepoint_sched_wakeup_new.state))
    do {
      void **it_func;
      rcu_read_lock_sched();
      it_func = rcu_dereference((&__tracepoint_sched_wakeup_new)->funcs);
      if (it_func) {
        do {
          // call every registered tracepoint probe func here!!
          ((void (*)(struct rq * rq, struct task_struct * p))(*it_func))(rq, p);
        } while (*(++it_func));
      }
      rcu_read_unlock_sched();
    } while (0);
}
  • when you want to register a probe function to some tracepoint, just define the probe function according to the signature of proto of tracepoint definition, then call the generated register function, which will then call tracepoint_probe_register to connect a probe to a tracepoint
static void set_tracepoint(struct tracepoint_entry **entry,
    struct tracepoint *elem, int active)
{
    // ...
    // assign funcs in tracepoint entry to tracepoint
    rcu_assign_pointer(elem->funcs, (*entry)->funcs);
    elem->state = active;  // then activate the tracepoint
}

void tracepoint_update_probe_range(struct tracepoint *begin,
    struct tracepoint *end)
{
    // ...
    for (iter = begin; iter < end; iter++) {
        mark_entry = get_tracepoint(iter->name);
        if (mark_entry) {
            set_tracepoint(&mark_entry, iter,
                    !!mark_entry->refcount);
        } // ...
    }
    // ...
}

static void tracepoint_update_probes(void)
{
    /* Core kernel tracepoints */
    tracepoint_update_probe_range(__start___tracepoints,
        __stop___tracepoints);
    /* tracepoints in modules. */
    module_update_tracepoints();
}

int tracepoint_probe_register(const char *name, void *probe)
{
    // ...
    // add probe to corresponding tracepoint entry
    old = tracepoint_entry_add_probe(entry, probe);
    // ...
    // move probe funcs in tracepoint entry to corresponding tracepoint
    tracepoint_update_probes();
    // ...
}
static inline int register_trace_sched_wakeup_new(void (*probe)(struct rq *rq, struct task_struct *p)) {
  return tracepoint_probe_register("sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)", (void *)probe);
}
struct tracepoint {
  const char *name;
  int state;
  void **funcs;  // all registered probe functions for this tracepoint
} __attribute__((aligned(8)));
  • tracepoint actual calling example
# sys_fork will call do_fork, and it will call wake_up_new_task
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
    // ...
    trace_sched_wakeup_new(rq, p);  // calling our above defined tracepoint here
    // ...
}

tracepoints in modules

  • when you run insmod or modprobe, the user-space utility eventually calls sys_init_module to pass the binary blob (the .ko file) to the kernel
struct module
{
    // ...
#ifdef CONFIG_TRACEPOINTS
    struct tracepoint *tracepoints;
    unsigned int num_tracepoints;
#endif
    // ...
};
static noinline struct module *load_module(void __user *umod,
                  unsigned long len,
                  const char __user *uargs)
{
    // ...
#ifdef CONFIG_TRACEPOINTS
    mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
                    "__tracepoints",
                    sizeof(*mod->tracepoints),
                    &mod->num_tracepoints);
#endif
    // ...
    if (!mod->taints) {
        // ...
#ifdef CONFIG_TRACEPOINTS
        tracepoint_update_probe_range(mod->tracepoints,
            mod->tracepoints + mod->num_tracepoints);
#endif
    }
    // ...
}

asmlinkage long
sys_init_module(void __user *umod,
        unsigned long len,
        const char __user *uargs)
{
    // ...
    mod = load_module(umod, len, uargs);
    // ...
}

/* Walk every loaded module and refresh the probes attached to its
 * tracepoints; tainted modules are skipped. */
void module_update_tracepoints(void)
{
    // ...
    list_for_each_entry(mod, &modules, list)
        if (!mod->taints)
            tracepoint_update_probe_range(mod->tracepoints,
                mod->tracepoints + mod->num_tracepoints);
    // ...
}

debugfs interface for tracepoints

  • we can easily enable tracepoints by using the event tracing framework
# to enable event 'sched_wakeup'
echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable

# to disable it
echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable

# to enable all events in sched subsystem
echo 1 > /sys/kernel/debug/tracing/events/sched/enable

# to enable all events
echo 1 > /sys/kernel/debug/tracing/events/enable

perf subsystem

  • NOTE: perf subsystem implementation based on Linux v2.6.32 codes reading, may update it after reading new version of it

brief history

  • Linux v2.6.31 first introduced Performance Counters for Linux(PCL) into its mainline for just hardware counters, so the syscall is sys_perf_counter_open
  • but Linux v2.6.32 changed the syscall to sys_perf_event_open, and expanded the framework to do more than just hw counters

how does perf event work

  • the following calling procedure only means calling timeline, not caller-callee callgraph, so that we can see how perf event subsystem is inited: start_kernel –> sched_init –> perf_event_init –> check_bugs –> identify_boot_cpu –> init_hw_perf_events –> intel_pmu_init/amd_pmu_init –> perf_events_lapic_init –> rest_init –> kernel_init –> do_basic_setup –> do_initcalls –> perf_event_sysfs_init

eBPF

  • NOTE: eBPF implementation based on Linux v4.0 codes reading, may update it after reading new version of it

brief history

  • v3.17 first introduced kernel/bpf directory into linux kernel, mainly for socket filter
  • v3.18 introduced bpf syscall, bpf verifier and basic bpf map
  • v3.19 introduced BPF_MAP_TYPE_ARRAY bpf array and BPF_MAP_TYPE_HASH bpf hash map; basic version of bpf helpers; BPF_PROG_TYPE_SOCKET_FILTER socket filter macro
  • v4.1 introduced more prog types, especially BPF_PROG_TYPE_KPROBE allowing bpf program to attach to kprobes; more bpf helper functions like bpf_get_prandom_u32 and bpf_get_smp_processor_id etc
  • v4.2 introduced tail calls to allow chaining multiple eBPF programs together, effectively extending the overall execution beyond the single-program instruction limit; new map BPF_MAP_TYPE_PROG_ARRAY type to support tail calls; several support helper functions, including bpf_tail_call, bpf_get_current_pid_tgid, bpf_get_current_uid_gid, bpf_get_current_comm etc
  • v4.3 new map BPF_MAP_TYPE_PERF_EVENT_ARRAY type and with perf_event_read function to do perf event monitoring; more helper functions like bpf_get_cgroup_classid skb_[gs]et_tunnel_key etc; first introduced libbpf in tools/lib/bpf
  • v4.4 mount and register /sys/fs/bpf/ filesystem for new cmd BPF_OBJ_PIN and BPF_OBJ_GET; /proc/sys/kernel/unprivileged_bpf_disabled to control whether users without CAP_SYS_ADMIN privilege can use the bpf syscall; bpf_perf_event_output bpf_get_route_realm and bpf_redirect functions
  • v4.5 enhance /sys/fs/bpf/ filesystem to support link and rename; bpf_skb_load_bytes to net/core/filter.c
  • v4.6 new map types BPF_MAP_TYPE_PERCPU_ARRAY BPF_MAP_TYPE_PERCPU_HASH BPF_MAP_TYPE_STACK_TRACE; helper functions like bpf_skb_[gs]et_tunnel_opt, bpf_get_stackid etc, after using map BPF_MAP_TYPE_STACK_TRACE and bpf_get_stackid to get stack traces, you can use /proc/kallsyms to translate them into understandable function names
  • v4.7 new program type BPF_PROG_TYPE_TRACEPOINT; sysctl interfaces /proc/sys/net/core/bpf_jit_enable and /proc/sys/net/core/bpf_jit_harden; helper functions bpf_event_output, bpf_get_stackid_tp
  • v4.8 new program type BPF_PROG_TYPE_XDP; new map type BPF_MAP_TYPE_CGROUP_ARRAY; more helper functions bpf_skb_change_proto, bpf_skb_change_type, bpf_skb_under_cgroup, bpf_get_hash_recalc, bpf_get_current_task, bpf_probe_write_user
  • v4.9 new program type BPF_PROG_TYPE_PERF_EVENT

bpf program loading

  • calling bpf system call with BPF_PROG_LOAD cmd arg, bpf program will be copied into kernel and run through eBPF verifier by function bpf_check. After passing verification, bpf_prog_select_runtime will be used to select interpreter running function __bpf_prog_run or jited BPF instructions to native codes
/* The eBPF interpreter: dispatches one BPF instruction at a time.
 * Only the CALL and TAIL_CALL opcodes are shown here. */
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
    // ...
        /* CALL */
    JMP_CALL:
        /* Function call scratches BPF_R1-BPF_R5 registers,
         * preserves BPF_R6-BPF_R9, and stores return value
         * into BPF_R0.
         */
        BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                               BPF_R4, BPF_R5);  // NOTE: where the magic happens —
                                                 // imm is an offset from __bpf_call_base
                                                 // selecting the helper function
        CONT;
    // ...
    // how tail call is implemented, code from v4.2
    JMP_TAIL_CALL: {
        struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct bpf_prog *prog;
        u64 index = BPF_R3;
        // ...
        prog = READ_ONCE(array->prog[index]);  // get new prog here
        // ...
        ARG1 = BPF_R1;
        insn = prog->insnsi;  // replace insn with the new prog's instructions
        goto select_insn;  // next iteration executes the new program's insns
out:
        CONT;
    }
    // ...
}
void bpf_int_jit_compile(struct bpf_prog *prog)
{
    // ...
    if (image) {
        // ... after jiting related codes
        prog->bpf_func = (void *)image;  // here we replace the interpreter func to jited codes
        prog->jited = true;
        // ...
    }
    // ...
}
void bpf_prog_select_runtime(struct bpf_prog *fp)
{
    fp->bpf_func = (void *) __bpf_prog_run;  // NOTE: save interpreter running function

    /* Probe if internal BPF can be JITed */
    bpf_int_jit_compile(fp);  // here fp->bpf_func is replaced with jited
                              // codes if jit is working
    // ...
}

bpf program running

  • two macros BPF_PROG_RUN and SK_RUN_FILTER are used to run bpf programs
#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)

/* tc action entry point: runs the attached BPF filter on the skb under
 * the action's lock.  (Closing brace restored — it was dropped when the
 * excerpt was transcribed.) */
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
           struct tcf_result *res)
{
    struct tcf_bpf *b = a->priv;
    int action, filter_res;

    spin_lock(&b->tcf_lock);

    // ...
    filter_res = BPF_PROG_RUN(b->filter, skb);
    // ...

    spin_unlock(&b->tcf_lock);
    return action;
}
/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) \
    (*filter->prog->bpf_func)(ctx, filter->prog->insnsi)

int sk_filter(struct sock *sk, struct sk_buff *skb)
{
    int err;
    struct sk_filter *filter;
        
    // ...

    rcu_read_lock();
    filter = rcu_dereference(sk->sk_filter);
    if (filter) {
        // as we can see here, how eBPF program works
        unsigned int pkt_len = SK_RUN_FILTER(filter, skb);

        err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
    }
    rcu_read_unlock();

    return err;
}

bpf program with kprobes

prerequisite knowledge

  • when start_kernel executes, it will call perf_event_init, in which perf_tp_register will be called to register the corresponding pmu perf_tracepoint
static struct pmu perf_tracepoint = {
    .task_ctx_nr    = perf_sw_context,

    .event_init = perf_tp_event_init,  // tracepoint related event init func
    // ...
};

int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
    // ...
    pmu->name = name;
    // ...
    pmu->type = type;
    // ...
    list_add_rcu(&pmu->entry, &pmus);
    // ...
}

static inline void perf_tp_register(void)
{
    // register perf_tracepoint pmu to pmus
    perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}

void __init perf_event_init(void)
{
    // ...
    perf_tp_register();
    // ...
}
  • init_kprobe_trace will be called after perf_event_init is done inside rest_init, it will register the probes_write write handler, which will be triggered when we echo to /sys/kernel/debug/tracing/kprobe_events
static const struct file_operations kprobe_events_ops = {
    // ...
    .write      = probes_write,
}

static __init int init_kprobe_trace(void)
{
    // ...
    entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
                    NULL, &kprobe_events_ops);
    // ...
}
fs_initcall(init_kprobe_trace);

how does bpf program connect to kprobes

  • when you write SEC(“kprobe/xxxx”), you aren’t writing a standard C function call. when you compile your code using clang -target bpf, the compiler looks at that SEC() macro and creates a custom section named kprobe/xxxx
#define SEC(NAME) __attribute__((section(NAME), used))

SEC("kprobe/xxxx")
int bpf_prog(struct pt_regs *ctx)
{
    struct sk_buff *skb;
    skb = (struct sk_buff *) ctx->di;
    // ... get other related stuff this function need from ctx
}
  • the loader (like the following load_bpf_file) loads the compiled .o file and scans all the section headers, when it sees the prefix kprobe/, it realizes this is a bpf kprobe program and load it
int load_bpf_file(char *path)
{
    // ...
    fd = open(path, O_RDONLY, 0);
    // ...
    for (i = 1; i < ehdr.e_shnum; i++) {
        if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
            memcmp(shname_prog, "kretprobe/", 10) == 0 ||
            memcmp(shname_prog, "socket", 6) == 0)
                load_and_attach(shname_prog, insns, data_prog->d_size);
    }
    // ...
}
  • it parses the rest of the string xxxx as the target function, using tracefs to echo to /sys/kernel/debug/tracing/kprobe_events to first create the kprobe and it will trigger the corresponding .write handler probes_write registered when the kprobe_events file was created, through this, it will finally register the kprobe
static struct trace_kprobe *alloc_trace_kprobe(const char *group,
                         const char *event, void *addr,
                         const char *symbol, unsigned long offs,
                         int nargs, bool is_return)
{
    // ...
    if (is_return)
        tk->rp.handler = kretprobe_dispatcher;
    else
        tk->rp.kp.pre_handler = kprobe_dispatcher;  // when the kprobe hit, it will call this kprobe_dispatcher
    // ...
}

static int register_kprobe_event(struct trace_kprobe *tk)
{
    struct ftrace_event_call *call = &tk->tp.call;
    // ...
    call->flags = TRACE_EVENT_FL_KPROBE;
    call->class->reg = kprobe_register;
    call->data = tk;
    // ...
}

static int __register_trace_kprobe(struct trace_kprobe *tk)
{
    // ...
    if (trace_kprobe_is_return(tk))
        ret = register_kretprobe(&tk->rp);
    else
        ret = register_kprobe(&tk->rp.kp);  // finally register the kprobe here
    // ...
}

static int register_trace_kprobe(struct trace_kprobe *tk)
{
    // ...
    ret = register_kprobe_event(tk);
    // ...
    ret = __register_trace_kprobe(tk);
    // ...
}

static int create_trace_kprobe(int argc, char **argv)
{
    // ... parse arguments echo to kprobe_events file etc
        
    tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
                   is_return);
    // ...
    ret = register_trace_kprobe(tk);  // here to register the kprobe
    // ...
}

static ssize_t probes_write(struct file *file, const char __user *buffer,
                size_t count, loff_t *ppos)
{
    return traceprobe_probes_write(file, buffer, count, ppos,
            create_trace_kprobe);  // finally calls the create_trace_kprobe
}
  • after this step, the kprobe has been registered, but the bpf program hasn’t been attached yet, bpf_prog_load first load and verify it
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
    bool is_socket = strncmp(event, "socket", 6) == 0;
    bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
    bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
    enum bpf_prog_type prog_type;
    // ...
        
    struct perf_event_attr attr = {};

    attr.type = PERF_TYPE_TRACEPOINT;
    attr.sample_type = PERF_SAMPLE_RAW;
    attr.sample_period = 1;
    attr.wakeup_events = 1;

    if (is_socket) {
        prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
    } else if (is_kprobe || is_kretprobe) {
        prog_type = BPF_PROG_TYPE_KPROBE;  // kprobe bpf type program
    } // ...
        
    if (is_kprobe || is_kretprobe) {
        if (is_kprobe)
            event += 7;
        else
            event += 10;

        snprintf(buf, sizeof(buf),
             "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
             is_kprobe ? 'p' : 'r', event, event);  // here echo to kprobe_events file
        err = system(buf);
        // ...
                
        fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
    }
    // ...
    strcpy(buf, DEBUGFS);
    strcat(buf, "events/kprobes/");
    strcat(buf, event);
    strcat(buf, "/id");

    efd = open(buf, O_RDONLY, 0);
    // ...
    err = read(efd, buf, sizeof(buf));
    // ...
    buf[err] = 0;
    id = atoi(buf);
    attr.config = id;  // continue to config struct perf_event_attr
        
    efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
    // ...
    ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
    ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);

    return 0;
}
  • calling perf_event_open syscall to create and sets up the perf_event structure, the event is usually created in a DISABLED state, it currently only knows how to send data to the standard perf ring buffer, but not a BPF program
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
{
    // ...
        tk->tp.flags |= TP_FLAG_PROFILE;

    if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
        if (trace_kprobe_is_return(tk))
            ret = enable_kretprobe(&tk->rp);
        else
            ret = enable_kprobe(&tk->rp.kp);  // here to finally enable the kprobe
    }
    // ...
}

static int kprobe_register(struct ftrace_event_call *event,
               enum trace_reg type, void *data)
{
    struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
    struct ftrace_event_file *file = data;

    switch (type) {
    // ...
#ifdef CONFIG_PERF_EVENTS
    case TRACE_REG_PERF_REGISTER:
        return enable_trace_kprobe(tk, NULL);
    // ...
#endif
    }
    // ...
}

static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
                struct perf_event *p_event)
{
    // ...
    // kprobe_register assigned in register_kprobe_event
    ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
    // ...
}

static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                 struct perf_event *p_event)
{
    // ...
    ret = perf_trace_event_reg(tp_event, p_event);
    // ...
}

int perf_trace_init(struct perf_event *p_event)
{
    struct ftrace_event_call *tp_event;
    u64 event_id = p_event->attr.config;
    // ...
    list_for_each_entry(tp_event, &ftrace_events, list) {
        if (tp_event->event.type == event_id &&
            tp_event->class && tp_event->class->reg &&
            try_module_get(tp_event->mod)) {
            ret = perf_trace_event_init(tp_event, p_event);
            // ...
        }
    }
    // ...
}

static int perf_tp_event_init(struct perf_event *event)
{
    int err;

    if (event->attr.type != PERF_TYPE_TRACEPOINT)
        return -ENOENT;
    // ...
    err = perf_trace_init(event);
    // ...
    event->destroy = tp_perf_event_destroy;

    return 0;
}

static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
    // ...
    event->pmu = pmu;
    // here finally perf_tp_event_init in struct pmu perf_tracepoint is called
    ret = pmu->event_init(event);
    // ...
}

struct pmu *perf_init_event(struct perf_event *event)
{
    // ...
    list_for_each_entry_rcu(pmu, &pmus, entry) {
        ret = perf_try_init_event(pmu, event);
        // ...
    }
    // ...
}

static inline void perf_event__state_init(struct perf_event *event)
{
    event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
                          PERF_EVENT_STATE_INACTIVE;
}

/*
 * Allocate and set up a perf_event (body elided): initialize its state
 * to not-yet-enabled, then find a PMU for it via perf_init_event().
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
         struct task_struct *task,
         struct perf_event *group_leader,
         struct perf_event *parent_event,
         perf_overflow_handler_t overflow_handler,
         void *context, int cgroup_fd)
{
    // ...
    perf_event__state_init(event);  // the event is not enabled yet
    // ...
    pmu = perf_init_event(event);
    // ...
}

/*
 * perf_event_open(2): allocate the event, then expose it to userspace
 * as an anonymous-inode file whose file_operations are perf_fops --
 * the fd returned to the user is how ioctl()s reach the event later.
 */
SYSCALL_DEFINE5(perf_event_open,
        struct perf_event_attr __user *, attr_uptr,
        pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
    // ...
    event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                 NULL, NULL, cgroup_fd);
    // ...
    event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
                    f_flags);  // here create the event_file and register perf_fops for it
    // ...
}
  • in the perf_event_open syscall, an event_file is created by calling anon_inode_getfile, and perf_fops is registered for it; this file is essential for attaching a BPF program to the kprobe and finally enabling it
/*
 * ioctl entry point of the perf event fd: recover the perf_event from
 * file->private_data and dispatch to _perf_ioctl().
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
    struct perf_event *event = file->private_data;
    // ...
    ret = _perf_ioctl(event, cmd, arg);
    // ...
}

/* file_operations installed on the "[perf_event]" anon inode; ioctl is
 * the path used for PERF_EVENT_IOC_ENABLE / PERF_EVENT_IOC_SET_BPF. */
static const struct file_operations perf_fops = {
    // ...
    .unlocked_ioctl     = perf_ioctl,
    // ...
};
  • use PERF_EVENT_IOC_ENABLE to activate the perf event, and PERF_EVENT_IOC_SET_BPF to attach a bpf program to the kprobe
// ... copied from above
    ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
    ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
// ...

/*
 * PERF_EVENT_IOC_SET_BPF handler: resolve the BPF program from the
 * userspace fd and attach it to the underlying trace event.
 */
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
    struct bpf_prog *prog;

    // ...
    prog = bpf_prog_get(prog_fd);  // loads the ebpf program
    // ...
    event->tp_event->prog = prog;  // here finally attaches the ebpf program

    return 0;
}

/*
 * Core ioctl dispatch for a perf event.  The two commands shown are the
 * ones used in the kprobe+eBPF flow described above: enable the event
 * and attach a BPF program to it.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
    void (*func)(struct perf_event *);
    u32 flags = arg;

    switch (cmd) {
    case PERF_EVENT_IOC_ENABLE:
        func = _perf_event_enable;  // later calls func, event->state = PERF_EVENT_STATE_ACTIVE;
        break;
    // ...
    case PERF_EVENT_IOC_SET_BPF:
        return perf_event_set_bpf_prog(event, arg);
    // ...
    }
}
  • when the kprobe is hit, kprobe_dispatcher will be called, which eventually runs the attached bpf program
/*
 * Execute the attached BPF program with @ctx as its context; the
 * program's return value tells the caller whether to continue tracing.
 */
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
{
    unsigned int ret;
    // ...
    ret = BPF_PROG_RUN(prog, ctx);  // here to finally run bpf program
    // ...
}

/*
 * Perf-profile path of a kprobe hit: if a BPF program is attached to the
 * event, run it with the probed pt_regs; a zero return from the program
 * suppresses the rest of the perf handling.
 */
static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
    struct ftrace_event_call *call = &tk->tp.call;
    struct bpf_prog *prog = call->prog;
    // ...

    if (prog && !trace_call_bpf(prog, regs))  // call bpf program
        return;
    // ...
}

/*
 * pre_handler of the trace kprobe: count the hit and, when the probe is
 * in profiling mode, run the perf path (which may invoke a BPF program).
 */
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
    /* kp is embedded in trace_kprobe at rp.kp; recover the container */
    struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);

    tk->nhit++;
    // ...
#ifdef CONFIG_PERF_EVENTS
    if (tk->tp.flags & TP_FLAG_PROFILE)
        kprobe_perf_func(tk, regs);  // inside to call bpf program
#endif
    return 0;   /* We don't tweek kernel, so just return 0 */
}

how does bpf program connect to tracepoint

  • attaching an ebpf program to tracepoints is very similar to kprobes in Linux v4.7; here we do not repeat all the details already covered for kprobes above, only the parts specific to tracepoints
  • read /sys/kernel/debug/tracing/events/xxx/id file to know tracepoint id and setup struct perf_event_attr when calling perf_event_open
  • assigning trace_event_reg to .reg field of struct trace_event_class, and it will call tracepoint_probe_register to register field .perf_probe callback function, which is a perf_trace_xxx function, to tracepoint
/* Initializer fragment wiring the generated perf_trace_<call> function
 * into the event class's .perf_probe field. */
#define _TRACE_PERF_INIT(call)                      \
    .perf_probe     = perf_trace_##call,

/* Per-tracepoint event class emitted by the TRACE_EVENT machinery; note
 * .reg = trace_event_reg and the perf_probe added by _TRACE_PERF_INIT. */
static struct trace_event_class __used __refdata event_class_##call = { \
    .system         = TRACE_SYSTEM_STRING,          \   
    .define_fields      = trace_event_define_fields_##call, \
    .fields         = LIST_HEAD_INIT(event_class_##call.fields),\
    .raw_init       = trace_event_raw_init,         \   
    .probe          = trace_event_raw_event_##call,     \   
    .reg            = trace_event_reg,          \
    _TRACE_PERF_INIT(call)                      \   
};

/*
 * .reg callback shared by tracepoint-backed events: for the perf cases
 * it (un)registers the class's perf_probe callback on the tracepoint
 * itself via tracepoint_probe_register/unregister.
 */
int trace_event_reg(struct trace_event_call *call,
            enum trace_reg type, void *data)
{
    struct trace_event_file *file = data;

    WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
    switch (type) {
    // ...
#ifdef CONFIG_PERF_EVENTS
    case TRACE_REG_PERF_REGISTER:
        return tracepoint_probe_register(call->tp,
                         call->class->perf_probe,  // to register perf_probe callback to tracepoint
                         call);
    case TRACE_REG_PERF_UNREGISTER:
        tracepoint_probe_unregister(call->tp,
                        call->class->perf_probe,
                        call);
        return 0;
    // ...
#endif
    }    
    return 0;
}
EXPORT_SYMBOL_GPL(trace_event_reg);
  • when the tracepoint is hit, the registered .perf_probe callback will be called which, as stated above, is a perf_trace_xxx function
/* Generated perf probe body for a tracepoint <call> (macro excerpt):
 * captures caller registers and submits the sample, which runs the
 * attached BPF program via perf_trace_run_bpf_submit() below. */
static notrace void                         \
perf_trace_##call(void *__data, proto)                  \
{                                   \
    struct trace_event_call *event_call = __data;           \
    // ...
    struct bpf_prog *prog = event_call->prog;           \
    struct pt_regs *__regs;                     \
    // ...
    perf_fetch_caller_regs(__regs);                 \
    // ...
    perf_trace_run_bpf_submit(entry, __entry_size, rctx,        \
                  event_call, __count, __regs,      \
                  head, __task);            \
}

/*
 * Submit a tracepoint sample to perf; when a BPF program is attached to
 * the event, stash the pt_regs pointer at the head of the raw data and
 * run the program first -- it can filter the sample out.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, 
                   struct trace_event_call *call, u64 count,
                   struct pt_regs *regs, struct hlist_head *head,
                   struct task_struct *task)
{
    struct bpf_prog *prog = call->prog;

    if (prog) {
        *(struct pt_regs **)raw_data = regs;
        // here to finally call trace_call_bpf to run ebpf program
        if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
            // ...
        }     
    }
    // ...
}

XDP bpf program

prerequisite knowledge

  • module_init calls mlx4_en_init to register struct mlx4_interface, where .activate function mlx4_en_activate will be called later to setup struct net_device_ops for field netdev_ops of struct net_device, and in struct net_device_ops, the field of .ndo_xdp is important for xdp to work, which is function mlx4_xdp in the following code example
/* netdev ops of the mlx4 Ethernet driver; .ndo_xdp is the hook the core
 * kernel calls to install/query an XDP program on this device. */
static const struct net_device_ops mlx4_netdev_ops = {
    // ...
    .ndo_xdp        = mlx4_xdp,
};

/*
 * Per-port netdev setup (body elided): install the netdev_ops table --
 * the master function variant or the plain one shown above -- so that
 * .ndo_xdp becomes reachable for this device.
 */
int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
            struct mlx4_en_port_profile *prof)
{
    // ...
    /*
     * Initialize netdev entry points
     */
    if (mlx4_is_master(priv->mdev->dev))
        dev->netdev_ops = &mlx4_netdev_ops_master;
    else
        dev->netdev_ops = &mlx4_netdev_ops;
    // ...
}

/*
 * .activate callback of the mlx4_en interface: create one netdev per
 * Ethernet port via mlx4_en_init_netdev().
 */
static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
{   
    int i;
    struct mlx4_en_dev *mdev = ctx;

    /* Create a netdev for each port */
    mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
        mlx4_info(mdev, "Activating port:%d\n", i);
        if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
            mdev->pndev[i] = NULL;  /* init failed: no netdev for this port */
    }       
    // ...
}

/* Interface descriptor registered at module init; .activate runs later
 * to create the netdevs (see mlx4_en_activate above). */
static struct mlx4_interface mlx4_en_interface = {
    // ...
    .protocol   = MLX4_PROT_ETH,
    .activate   = mlx4_en_activate,
};

/*
 * Module entry point: register the mlx4 Ethernet interface so its
 * .activate callback fires when devices are brought up.
 */
static int __init mlx4_en_init(void)
{
    // ...
    return mlx4_register_interface(&mlx4_en_interface);
}

module_init(mlx4_en_init);

how does XDP bpf program connect to NIC

  • in Linux v4.8, XDP attachment support is very limited; we need the ip command from iproute2 to do the job, and it finally sends an RTM_SETLINK Netlink message to the kernel
ip link set dev eth0 xdp obj prog.o
  • The kernel receives the message and routes it through the networking stack
    • rtnetlink_rcv_msg
    • rtnl_setlink
    • do_setlink, parses the attributes, specifically looking for IFLA_XDP
    • dev_change_xdp_fd, the core kernel function that manages XDP attachment, which set or clear a bpf program for a device rx path
/*
 * Set or clear the XDP program on a device's rx path.  fd >= 0 installs
 * the program behind that fd; a negative fd leaves prog == NULL, which
 * clears any installed program.  The actual installation is delegated
 * to the driver's ndo_xdp callback.
 */
int dev_change_xdp_fd(struct net_device *dev, int fd)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    struct bpf_prog *prog = NULL;
    struct netdev_xdp xdp = {};
    int err; 

    if (!ops->ndo_xdp)
        return -EOPNOTSUPP;  /* driver has no XDP support */
    if (fd >= 0) { 
        /* type check: the fd must refer to a BPF_PROG_TYPE_XDP program */
        prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
        if (IS_ERR(prog))
            return PTR_ERR(prog);
    }    

    xdp.command = XDP_SETUP_PROG;
    xdp.prog = prog;
    // ndo_xdp cb will be called, as we stated above, it's mlx4_xdp function here
    err = ops->ndo_xdp(dev, &xdp);
    // ...
}
  • function mlx4_xdp will finally call mlx4_xdp_set to setup XDP program
/*
 * Driver-side XDP install (body elided): reserve tx rings for XDP_TX
 * use, then atomically swap the program pointer into every rx ring and
 * drop the reference on any previously installed program.
 */
static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
    // ...
    xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
    // ...
    priv->xdp_ring_num = xdp_ring_num;
    /* tx queues visible to the stack shrink by the rings taken for XDP */
    netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
                            priv->xdp_ring_num);

    for (i = 0; i < priv->rx_ring_num; i++) {
        old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);  // here ebpf is finally set
        if (old_prog)
            bpf_prog_put(old_prog);  /* release ref on the replaced program */
    }
    // ...
}

/*
 * ndo_xdp entry point: dispatch the netdev_xdp command to either the
 * program install path or the attachment query.
 */
static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
    switch (xdp->command) {
    case XDP_SETUP_PROG:
        return mlx4_xdp_set(dev, xdp->prog);
    case XDP_QUERY_PROG:
        xdp->prog_attached = mlx4_xdp_attached(dev);
        return 0;
    default:
        return -EINVAL;
    }    
}
  • when a packet hits the NIC, after hardware interrupt and Rx CQ polling, it will finally call mlx4_en_process_rx_cq function
/*
 * Run an XDP program on one packet under an RCU read-side critical
 * section (the program pointer is published/replaced with xchg and may
 * be freed after a grace period).  Returns the XDP verdict (XDP_PASS,
 * XDP_TX, XDP_DROP, ...).
 */
static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                   struct xdp_buff *xdp)
{
    u32 ret;

    rcu_read_lock();
    ret = BPF_PROG_RUN(prog, (void *)xdp);
    rcu_read_unlock();

    return ret;
}

/*
 * Rx completion-queue processing loop (heavily elided).  If an XDP
 * program is installed on the ring, it runs on each packet before any
 * skb is built, and its verdict decides whether the packet is passed up
 * the stack, transmitted back out, or dropped/recycled.
 */
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
{
    // ...
    xdp_prog = READ_ONCE(ring->xdp_prog);  /* paired with xchg in mlx4_xdp_set */
    // ...
    while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
            cq->mcq.cons_index & cq->size)) {
        // ...
        /* A bpf program gets first chance to drop the packet. It may
         * read bytes but not past the end of the frag.
         */
        if (xdp_prog) {
            struct xdp_buff xdp;
            // ...
            /* point the xdp_buff straight at the rx fragment */
            xdp.data = page_address(frags[0].page) +
                            frags[0].page_offset;
            xdp.data_end = xdp.data + length;
            // here to run xdp bpf program
            act = bpf_prog_run_xdp(xdp_prog, &xdp);
            switch (act) {
            case XDP_PASS:
                break;  /* continue normal rx processing */
            case XDP_TX:
                // ...
            case XDP_ABORTED:
            case XDP_DROP:
                /* try to recycle the page back to the rx ring */
                if (mlx4_en_rx_recycle(ring, frags))
                    goto consumed;
                goto next;
            }
        }
        // ...
    }
    // ...
}

ebpf sysctl interface

bpf_jit_enable

  • check ebpf jit status
cat /proc/sys/net/core/bpf_jit_enable
  • turn on or off ebpf jit compiler
# turn off
echo 0 > /proc/sys/net/core/bpf_jit_enable  # for interpreter only
# turn on
echo 1 > /proc/sys/net/core/bpf_jit_enable  # enable jit
# debug mode
# the kernel will output the resulting native opcodes to the kernel log (dmesg)
echo 2 > /proc/sys/net/core/bpf_jit_enable

bpf_jit_harden

  • check ebpf jit harden status
cat /proc/sys/net/core/bpf_jit_harden
  • turn on or off ebpf jit hardening
echo 0 > /proc/sys/net/core/bpf_jit_harden  # disable JIT hardening
# enable JIT hardening for unprivileged users only
echo 1 > /proc/sys/net/core/bpf_jit_harden
# enable JIT hardening for all users
echo 2 > /proc/sys/net/core/bpf_jit_harden

ftrace

  • NOTE: this ftrace description is based on reading the Linux v2.6.27 code; it may be updated after reading a newer version

prerequisite knowledge

  • the Linux kernel should be compiled with gcc's -pg flag, which inserts an mcount function call at the start of every function, unless you use the function attribute no_instrument_function to suppress profiling of individual functions when compiling
gcc -c -pg -m32 ftrace.c
/* Compiled with -pg: foo() gets a call to mcount inserted at its entry
 * (see the assembly listing below). */
void foo()
{
    printf("hello world\n");
}

/* no_instrument_function suppresses the -pg instrumentation for a
 * single function; the kernel's notrace macro is defined this way. */
#define notrace __attribute__((no_instrument_function))

/* bar() is marked notrace, so no mcount call appears in its assembly. */
void notrace bar()
{
    printf("hello");
}
.LC0:
        .string "hello world"
foo:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
1:      call        mcount  // NOTE: here -pg flag will insert mcount function call
        subl    $12, %esp
        pushl   $.LC0
        // ...
bar:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        subl    $12, %esp
        pushl   $.LC0
        // ...