Linux tracing technologies

kprobes

brief history

  • Linux v2.6.9 introduced kprobes into the mainline, but only for i386
  • v2.6.10 begins adding kprobes support for x86_64
  • Linux v2.6.12 allows multiple kprobes at the same address
  • v2.6.13 starts to support kretprobes
  • v2.6.14 groups kprobes-related functions into the .kprobes.text section, so that probes on this code are rejected
  • v2.6.15 updates: uses the percpu infrastructure to manage some internal variables; uses an RCU (Read-Copy-Update) list to manage kprobes; adds more checks on kprobe registration, including a kernel module address check
  • v2.6.16 updates: rejects kprobes on more kernel functions; refactors the kprobe registration functions to check whether we are probing a kernel module
  • v2.6.17 updates the i386 kretprobe_trampoline_holder; updates kprobe_fault_handler
  • v2.6.18 kprobes registers for page-fault notifications only when there is an active probe registered; adds the tcpprobe module to probe tcp_sendmsg
  • v2.6.19 adds symbol_name and offset fields to struct kprobe so you can register a kprobe directly by function name; adds the dccp_probe module to probe dccp_sendmsg
  • v2.6.21 adds basic debugfs support for kprobes: /sys/kernel/debug/kprobes/list lists all registered probes on the system
  • v2.6.22 extends debugfs support: /sys/kernel/debug/kprobes/enabled globally turns registered kprobes on/off, and the default value is on
  • v2.6.24 adds basic kretprobe blacklist support
  • v2.6.25 kretprobe supports an optional user-specified entry_handler that runs on function entry, plus private data for passing state between the entry_handler and the return handler
  • v2.6.26 adds basic kprobe blacklist support; adds interfaces for batch registration/unregistration of a group of probes
  • v2.6.29 adds a module notifier callback that checks kprobes on the module; adds a flags field to struct kprobe so a gone kprobe can be marked KPROBE_FLAG_GONE and its instruction buffer removed
  • v2.6.30 adds enable/disable interfaces for probes: if KPROBE_FLAG_DISABLED is set in kp->flags, that kprobe is registered but disabled, so its handlers aren't hit until enable_kprobe(kp) is called
  • v2.6.33 updates the kprobe blacklist; checks whether a kprobe is re-registered
  • v2.6.34 adds kprobe optimization (KPROBE_FLAG_OPTIMIZED) for i386 and x86_64; adds the sysctl interface /proc/sys/debug/kprobes-optimization to control kprobe optimization
  • v2.6.39 stops optimizing probes in entry code due to its unstable stack handling

initialization

  • during boot, Linux calls init_kprobes as part of its initialization sequence
struct notifier_block *i386die_chain;

int register_die_notifier(struct notifier_block *nb)
{
    // ...
    err = notifier_chain_register(&i386die_chain, nb);
    // ...
    return err;
}
static struct notifier_block kprobe_exceptions_nb = {
    .notifier_call = kprobe_exceptions_notify,
    .priority = 0x7fffffff /* we need to be notified first */
};

static int __init init_kprobes(void)
{
    int i, err = 0;

    for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
        INIT_HLIST_HEAD(&kprobe_table[i]);  // for kprobes
        INIT_HLIST_HEAD(&kretprobe_inst_table[i]);  // for kretprobes
    }

    err = arch_init_kprobes();  // here to register a kprobe for trampoline
    if (!err)
        // register kprobe_exceptions_nb to i386die_chain
        err = register_die_notifier(&kprobe_exceptions_nb);

    return err;
}

__initcall(init_kprobes);

how does kprobe work

  • helper functions
/* Attach to insert probes on any functions which should be ignored */
#define __kprobes  __attribute__((__section__(".kprobes.text")))

void __kprobes arch_copy_kprobe(struct kprobe *p)
{
    memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
    p->opcode = *p->addr;
}

void __kprobes arch_arm_kprobe(struct kprobe *p)
{
    *p->addr = BREAKPOINT_INSTRUCTION;
    // ...
}

static int __kprobes in_kprobes_functions(unsigned long addr)
{
    if (addr >= (unsigned long)__kprobes_text_start
        && addr < (unsigned long)__kprobes_text_end)
        return -EINVAL;
    return 0;
}
  • call register_kprobe to register a kprobe we want to probe
int register_kprobe(struct kprobe *p)
{
    // ...
    if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
        return ret;

    // ...
    // NOTE: add new kprobe to corresponding hash table slot
    hlist_add_head(&p->hlist,
               &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
        
    arch_arm_kprobe(p);
    // ...
}
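  • as a concrete illustration, a minimal probing module might look like the following sketch; this is hypothetical example code (the probed symbol do_fork and the handler are arbitrary choices), using the symbol_name field added in v2.6.19
#include <linux/module.h>
#include <linux/kprobes.h>

/* pre_handler: runs just before the probed instruction is single-stepped */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
    printk(KERN_INFO "kprobe hit at %p\n", p->addr);
    return 0;  /* 0: let kprobes single-step the saved original instruction */
}

static struct kprobe kp = {
    .symbol_name = "do_fork",  /* example symbol; resolved at register time */
    .pre_handler = handler_pre,
};

static int __init kprobe_example_init(void)
{
    return register_kprobe(&kp);
}

static void __exit kprobe_example_exit(void)
{
    unregister_kprobe(&kp);
}

module_init(kprobe_example_init);
module_exit(kprobe_example_exit);
MODULE_LICENSE("GPL");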
  • when execution reaches the probed address, an int3 trap is raised and do_int3 is called; it notifies the registered kprobes, and from there the pre_handler/break_handler inside the kprobe is invoked
ENTRY(int3)
    // ...
    call do_int3  // here to call do_int3
    // ...
#ifdef CONFIG_KPROBES
asmlinkage int do_int3(struct pt_regs *regs, long error_code)
{
    // here to notify registered kprobes
    if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
            == NOTIFY_STOP)
        return 1;
    // back to normal ...
}
#endif
static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
{
    struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
    // here kprobe_exceptions_notify inside kprobe_exceptions_nb will be called
    return notifier_call_chain(&i386die_chain, val, &args);
}
int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
                 void *data)
{
    struct die_args *args = (struct die_args *)data;
    switch (val) {
    case DIE_INT3:
        // p->break_handler & p->pre_handler
        if (kprobe_handler(args->regs))  // handler cb inside
            return NOTIFY_STOP;
        break;
        // ... p->post_handler
        // ... p->fault_handler
    }
    return NOTIFY_DONE;
}
static inline int kprobe_handler(struct pt_regs *regs)
{
    struct kprobe *p;
    u8 *addr = (u8 *) (regs->eip - 1);
    // ...
    p = get_kprobe(addr);
    // ...
    if (p->pre_handler(p, regs)) {  // NOTE: where the magic happens
        /* handler has already set things up, so skip ss setup */
        return 1;
    }
    // ...
}

how does kretprobe work

  • to support kretprobes, init_kprobes registers a kprobe at the trampoline at boot time; the trampoline is an arbitrary piece of code, typically just a nop instruction
void kretprobe_trampoline_holder(void)
{
asm volatile (  ".global kretprobe_trampoline\n"
        "kretprobe_trampoline: \n"
        "nop\n");
}

static struct kprobe trampoline_p = {
    .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
    .pre_handler = trampoline_probe_handler
};

int __init arch_init_kprobes(void)
{
    return register_kprobe(&trampoline_p);
}
  • when you call register_kretprobe, kprobes establishes a kprobe at the function's entry; when the probed function is called and this probe is hit, kprobes saves a copy of the return address and replaces it with the address of the “trampoline”
void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
{
    unsigned long *sara = (unsigned long *)&regs->esp;
    struct kretprobe_instance *ri;

    if ((ri = get_free_rp_inst(rp)) != NULL) {
        ri->rp = rp;
        ri->task = current;
        ri->ret_addr = (kprobe_opcode_t *) *sara;  // important here! save original fn return addr

        // replace the return addr with trampoline addr
        *sara = (unsigned long) &kretprobe_trampoline;

        // ...
    }
    // ...
}

static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
    struct kretprobe *rp = container_of(p, struct kretprobe, kp);

    /*TODO: consider to only swap the RA after the last pre_handler fired */
    arch_prepare_kretprobe(rp, regs);
    return 0;
}

int register_kretprobe(struct kretprobe *rp)
{
    // ...
    rp->kp.pre_handler = pre_handler_kretprobe;

    /* Establish function entry probe point */
    if ((ret = register_kprobe(&rp->kp)) != 0) {
    }
    // ...
}
  • so when the function returns, control passes to the trampoline, which has already been registered as a kprobe in init_kprobes; that probe is hit, and its pre_handler trampoline_probe_handler is called
  • after the user-specified handler associated with the kretprobe has been called, the original return address is restored and execution resumes
int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
    // ...
    head = kretprobe_inst_table_head(current);

    hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
        if (ri->task != current)
            continue;

        if (ri->rp && ri->rp->handler)
            ri->rp->handler(ri, regs);  // called user-specified handler

        orig_ret_address = (unsigned long)ri->ret_addr;
        // ...
    }

    regs->eip = orig_ret_address;  // restore the original return addr
    // ...
}
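  • putting the pieces together, a minimal kretprobe module might look like this sketch; again hypothetical example code (the probed symbol do_fork is arbitrary, and regs_return_value reads the return register out of the saved pt_regs)
#include <linux/module.h>
#include <linux/kprobes.h>
#include <asm/ptrace.h>

/* handler: runs on function return, dispatched from trampoline_probe_handler */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
    printk(KERN_INFO "do_fork returned %ld\n", (long)regs_return_value(regs));
    return 0;
}

static struct kretprobe my_kretprobe = {
    .handler = ret_handler,
    .maxactive = 20,  /* how many concurrent instances we can track */
};

static int __init kretprobe_example_init(void)
{
    my_kretprobe.kp.symbol_name = "do_fork";  /* example symbol */
    return register_kretprobe(&my_kretprobe);
}

static void __exit kretprobe_example_exit(void)
{
    unregister_kretprobe(&my_kretprobe);
}

module_init(kretprobe_example_init);
module_exit(kretprobe_example_exit);
MODULE_LICENSE("GPL");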

debugfs interface for kprobes

  • to list all registered probes on the system
cat /sys/kernel/debug/kprobes/list
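  • each line of list shows the probe's kernel address, its type (k: kprobe, r: kretprobe, j: jprobe) and the probed symbol+offset; illustrative output:
c015d71a  k  vfs_read+0x0
c011eca0  r  do_fork+0x0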
  • to check whether registered kprobes are globally enabled, and to turn them on or off
# check current status
cat /sys/kernel/debug/kprobes/enabled
# turn on
echo 1 > /sys/kernel/debug/kprobes/enabled
# turn off
echo 0 > /sys/kernel/debug/kprobes/enabled

kprobes sysctl interface

  • check kprobes optimization status
cat /proc/sys/debug/kprobes-optimization
  • turn on or off kprobes optimization
# turn off
echo 0 > /proc/sys/debug/kprobes-optimization
# turn on
echo 1 > /proc/sys/debug/kprobes-optimization

uprobes

brief history

  • Linux v3.5 introduced uprobes into its mainline

tracepoints

  • NOTE: this tracepoints walkthrough is based on reading the Linux v2.6.28 code; it may be updated after reading a newer version

brief history

  • Linux v2.6.28 introduced tracepoint into its mainline
  • v2.6.29 splits the original DEFINE_TRACE into DECLARE_TRACE and DEFINE_TRACE; if a tracepoint has to be used in kernel modules, EXPORT_TRACEPOINT_SYMBOL_GPL or EXPORT_TRACEPOINT_SYMBOL can be used to export the defined tracepoint
  • v2.6.30 introduces the TRACE_EVENT macro, which is far more powerful because it automates the “boilerplate” code needed to bridge the gap between a kernel function and a monitoring tool like perf or ftrace
  • v2.6.31 groups all predefined tracepoint events under include/trace/events by subsystem, and introduces the /sys/kernel/debug/tracing/events interface, giving each event its own directory with an enable file
  • v2.6.32 adds regfunc and unregfunc fields to struct tracepoint for more flexible functionality
  • v2.6.33 adds the macros DECLARE_EVENT_CLASS, DEFINE_EVENT, and DEFINE_EVENT_PRINT to facilitate tracepoint usage by other tools
  • v2.6.35 updates struct tracepoint and changes the related macro internals
  • v2.6.37 uses the JUMP_LABEL macro and its related machinery to test whether a tracepoint is enabled
  • v2.6.38 uses the __tracepoints_ptrs section for iterating over the tracepoints; adds the DECLARE_TRACE_CONDITION macro
  • v3.0 optimizes performance using asm goto and the __jump_table section

how does tracepoint work

  • using DEFINE_TRACE, TPPROTO and TPARGS to define a tracepoint
  • the tracepoint name will be put in __tracepoints_strings section, and the tracepoint itself will be put in __tracepoints section
DEFINE_TRACE(sched_wakeup_new,
    TPPROTO(struct rq *rq, struct task_struct *p),
    TPARGS(rq, p));

// the above example will expand to roughly the following code

// this is the tracepoint function that gets called at the instrumented call sites
static inline void trace_sched_wakeup_new(struct rq *rq,
                                          struct task_struct *p) {

  static const char __tpstrtab_sched_wakeup_new[]
      __attribute__((section("__tracepoints_strings"))) =
          "sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)";

  static struct tracepoint __tracepoint_sched_wakeup_new
      __attribute__((section("__tracepoints"), aligned(8))) = {
          __tpstrtab_sched_wakeup_new,  /* name */
          0,                            /* state */
          NULL                          /* funcs */
      };
  // if the tracepoint is not activated, this whole block is skipped
  if (unlikely(__tracepoint_sched_wakeup_new.state))
    do {
      void **it_func;
      rcu_read_lock_sched();
      it_func = rcu_dereference((&__tracepoint_sched_wakeup_new)->funcs);
      if (it_func) {
        do {
          // call tracepoint probe funcs here!!
          ((void (*)(struct rq * rq, struct task_struct * p))(*it_func))(rq, p);
        } while (*(++it_func));
      }
      rcu_read_unlock_sched();
    } while (0);
}
  • when you want to attach a probe function to a tracepoint, define the probe function to match the signature of the tracepoint's proto, then call the generated register function, which in turn calls tracepoint_probe_register to connect the probe to the tracepoint
static void set_tracepoint(struct tracepoint_entry **entry,
    struct tracepoint *elem, int active)
{
    // ...
    // assign funcs in tracepoint entry to tracepoint
    rcu_assign_pointer(elem->funcs, (*entry)->funcs);
    elem->state = active;  // then activate the tracepoint
}

void tracepoint_update_probe_range(struct tracepoint *begin,
    struct tracepoint *end)
{
    // ...
    for (iter = begin; iter < end; iter++) {
        mark_entry = get_tracepoint(iter->name);
        if (mark_entry) {
            set_tracepoint(&mark_entry, iter,
                    !!mark_entry->refcount);
        } // ...
    }
    // ...
}

static void tracepoint_update_probes(void)
{
    /* Core kernel tracepoints */
    tracepoint_update_probe_range(__start___tracepoints,
        __stop___tracepoints);
    /* tracepoints in modules. */
    module_update_tracepoints();
}

int tracepoint_probe_register(const char *name, void *probe)
{
    // ...
    // add probe to corresponding tracepoint entry
    old = tracepoint_entry_add_probe(entry, probe);
    // ...
    // move probe funcs in tracepoint entry to corresponding tracepoint
    tracepoint_update_probes();
    // ...
}
static inline int register_trace_sched_wakeup_new(void (*probe)(struct rq *rq, struct task_struct *p)) {
  return tracepoint_probe_register("sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)", (void *)probe);
}
struct tracepoint {
  const char *name;
  int state;
  void **funcs;  // all registered probe functions for this tracepoint
} __attribute__((aligned(8)));
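  • for example, a module attaching a probe to the sched_wakeup_new tracepoint defined above might look like this sketch (hypothetical example code against the v2.6.28-era API; the header location is an assumption for that era)
#include <linux/module.h>
#include <linux/sched.h>
#include <trace/sched.h>  /* where this era declares the sched tracepoints */

/* the probe's signature must match the tracepoint's TPPROTO */
static void probe_wakeup_new(struct rq *rq, struct task_struct *p)
{
    printk(KERN_INFO "new task woken up: %s (pid %d)\n", p->comm, p->pid);
}

static int __init probe_init(void)
{
    /* generated by DEFINE_TRACE; calls tracepoint_probe_register internally */
    return register_trace_sched_wakeup_new(probe_wakeup_new);
}

static void __exit probe_exit(void)
{
    unregister_trace_sched_wakeup_new(probe_wakeup_new);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");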
  • an example of the tracepoint actually being called
// sys_fork calls do_fork, which in turn calls wake_up_new_task
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
    // ...
    trace_sched_wakeup_new(rq, p);  // calling our above defined tracepoint here
    // ...
}

tracepoints in modules

  • when you run insmod or modprobe, the user-space utility eventually calls sys_init_module to pass the binary blob (the .ko file) to the kernel
struct module
{
    // ...
#ifdef CONFIG_TRACEPOINTS
    struct tracepoint *tracepoints;
    unsigned int num_tracepoints;
#endif
    // ...
};
static noinline struct module *load_module(void __user *umod,
                  unsigned long len,
                  const char __user *uargs)
{
    // ...
#ifdef CONFIG_TRACEPOINTS
    mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
                    "__tracepoints",
                    sizeof(*mod->tracepoints),
                    &mod->num_tracepoints);
#endif
    // ...
    if (!mod->taints) {
        // ...
#ifdef CONFIG_TRACEPOINTS
        tracepoint_update_probe_range(mod->tracepoints,
            mod->tracepoints + mod->num_tracepoints);
#endif
    }
    // ...
}

asmlinkage long
sys_init_module(void __user *umod,
        unsigned long len,
        const char __user *uargs)
{
    // ...
    mod = load_module(umod, len, uargs);
    // ...
}

void module_update_tracepoints(void)
{
    // ...
    list_for_each_entry(mod, &modules, list)
        if (!mod->taints)
            tracepoint_update_probe_range(mod->tracepoints,
                mod->tracepoints + mod->num_tracepoints);
    // ...
}

debugfs interface for tracepoints

  • we can easily enable tracepoints by using the event tracing framework
# to enable event 'sched_wakeup'
echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable

# to disable it
echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable

# to enable all events in sched subsystem
echo 1 > /sys/kernel/debug/tracing/events/sched/enable

# to enable all events
echo 1 > /sys/kernel/debug/tracing/events/enable
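  • once events are enabled, the recorded data can be read back from the trace ring buffer
# read everything collected so far
cat /sys/kernel/debug/tracing/trace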

perf subsystem

  • NOTE: this perf subsystem walkthrough is based on reading the Linux v2.6.32 code; it may be updated after reading a newer version

brief history

  • Linux v2.6.31 first introduced Performance Counters for Linux (PCL) into its mainline, initially just for hardware counters, so the syscall was sys_perf_counter_open
  • Linux v2.6.32 renamed the syscall to sys_perf_event_open and expanded the framework to do more than just hardware counters

how does perf event work

  • the following sequence is a timeline of calls, not a caller-callee call graph; it shows how the perf event subsystem is initialized: start_kernel –> sched_init –> perf_event_init –> check_bugs –> identify_boot_cpu –> init_hw_perf_events –> intel_pmu_init/amd_pmu_init –> perf_events_lapic_init –> rest_init –> kernel_init –> do_basic_setup –> do_initcalls –> perf_event_sysfs_init
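  • from user space, the sys_perf_event_open syscall is used roughly as in the sketch below, counting hardware instructions for a region of code (glibc provides no wrapper, so it goes through syscall(2)); this mirrors the canonical perf_event_open usage rather than anything specific to v2.6.32
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
    struct perf_event_attr attr;
    long long count;
    int fd;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_INSTRUCTIONS;
    attr.disabled = 1;
    attr.exclude_kernel = 1;

    /* pid = 0, cpu = -1: measure the calling process on any CPU */
    fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0)
        return 1;

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    printf("measured region\n");          /* the code we want to count */
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

    read(fd, &count, sizeof(count));      /* counter value comes back via read */
    printf("instructions: %lld\n", count);
    close(fd);
    return 0;
}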

eBPF

  • NOTE: this eBPF walkthrough is based on reading the Linux v4.0 code; it may be updated after reading a newer version

bpf program loading

  • calling the bpf system call with the BPF_PROG_LOAD cmd copies the bpf program into the kernel and runs it through the eBPF verifier, the function bpf_check. After verification passes, bpf_prog_select_runtime either keeps the interpreter function __bpf_prog_run or JITs the BPF instructions to native code
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
    // ...
        /* CALL */
    JMP_CALL:
        /* Function call scratches BPF_R1-BPF_R5 registers,
         * preserves BPF_R6-BPF_R9, and stores return value
         * into BPF_R0.
         */
        BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                               BPF_R4, BPF_R5);  // NOTE: where the magic happens
        CONT;
    // ...
}
void bpf_int_jit_compile(struct bpf_prog *prog)
{
    // ...
    if (image) {
        // ... after jiting related codes
        prog->bpf_func = (void *)image;  // here we replace the interpreter func with the jited code
        prog->jited = true;
        // ...
    }
    // ...
}
void bpf_prog_select_runtime(struct bpf_prog *fp)
{
    fp->bpf_func = (void *) __bpf_prog_run;  // NOTE: save interpreter running function

    /* Probe if internal BPF can be JITed */
    bpf_int_jit_compile(fp);  // here fp->bpf_func is replaced with jited
                              // codes if jit is working
    // ...
}
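  • from user space, loading a program through BPF_PROG_LOAD looks roughly like the sketch below: a trivial two-instruction socket filter that drops every packet, with the optional verifier log buffer that is handy when bpf_check rejects a program. This is illustrative example code, assuming a 64-bit build; once loaded, the returned fd can be attached, e.g. to a socket via setsockopt(SO_ATTACH_BPF), at which point the run macros in the next section execute it per packet
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
    /* trivial program: r0 = 0 (drop every packet), then exit */
    struct bpf_insn prog[] = {
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
        { .code = BPF_JMP | BPF_EXIT },
    };
    char log[4096];
    union bpf_attr attr;
    int fd;

    memset(&attr, 0, sizeof(attr));
    attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
    attr.insns = (unsigned long)prog;             /* assumes 64-bit pointers */
    attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
    attr.license = (unsigned long)"GPL";
    attr.log_buf = (unsigned long)log;
    attr.log_size = sizeof(log);
    attr.log_level = 1;

    fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    if (fd < 0) {
        fprintf(stderr, "load failed; verifier log:\n%s\n", log);
        return 1;
    }
    printf("program loaded, fd = %d\n", fd);
    close(fd);
    return 0;
}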

bpf program running

  • two macros BPF_PROG_RUN and SK_RUN_FILTER are used to run bpf programs
#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)

static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
           struct tcf_result *res)
{
    struct tcf_bpf *b = a->priv;
    int action, filter_res;

    spin_lock(&b->tcf_lock);
        
    // ...
    filter_res = BPF_PROG_RUN(b->filter, skb);
    // ...

    spin_unlock(&b->tcf_lock);
    return action;
}

/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) \
    (*filter->prog->bpf_func)(ctx, filter->prog->insnsi)

int sk_filter(struct sock *sk, struct sk_buff *skb)
{
    int err;
    struct sk_filter *filter;
        
    // ...

    rcu_read_lock();
    filter = rcu_dereference(sk->sk_filter);
    if (filter) {
        // here we can see how an eBPF program actually runs
        unsigned int pkt_len = SK_RUN_FILTER(filter, skb);

        err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
    }
    rcu_read_unlock();

    return err;
}

ftrace

  • NOTE: this ftrace walkthrough is based on reading the Linux v2.6.27 code; it may be updated after reading a newer version

prerequisite knowledge

  • the Linux kernel must be compiled with gcc's -pg flag, which inserts an mcount function call at the start of every function, unless the no_instrument_function attribute is used to suppress profiling of individual functions
# compile the following C file with profiling instrumentation
gcc -c -pg -m32 ftrace.c
void foo()
{
    printf("hello world\n");
}

#define notrace __attribute__((no_instrument_function))

void notrace bar()
{
    printf("hello");
}
.LC0:
        .string "hello world"
foo:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
1:      call        mcount  // NOTE: the -pg flag inserted this mcount call
        subl    $12, %esp
        pushl   $.LC0
        // ...
.LC1:
        .string "hello"
bar:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp        // NOTE: no mcount call here, thanks to notrace
        subl    $12, %esp
        pushl   $.LC1
        // ...
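  • the same -pg instrumentation is what user-space profilers like gprof build on: each mcount call records a caller/callee arc, and the data lands in gmon.out at exit; a quick sketch, assuming ftrace.c is given a main that calls foo
# link with profiling instrumentation (no -c this time)
gcc -pg -m32 -o demo ftrace.c
# running the binary writes gmon.out into the current directory
./demo
# inspect the recorded call counts and flat profile
gprof ./demo gmon.out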