Linux tracing techs

kprobes

brief history

  • Linux v2.6.9 introduced kprobes into its mainline, but only for i386
  • v2.6.10 begins to add kprobes support for x86_64
  • Linux v2.6.12 allows multiple kprobes at the same address
  • v2.6.13 starts to support kretprobe
  • v2.6.14 groups kprobes-related functions into the section .kprobes.text, so that attempts to place kprobes on that code will be rejected
  • v2.6.15 updates: using percpu infra to manage some internal variables; using RCU(Read-Copy-Update) list to manage kprobes; more checks on kprobes register, including kernel module address check
  • v2.6.16 updates: more kprobes rejects on kernel functions; refactor kprobes register functions to check whether we are probing a kernel module
  • v2.6.17 i386 updates its kretprobe_trampoline_holder; updates kprobe_fault_handler
  • v2.6.18 kprobes registers for page fault notifications when there is an active probe registered; tcpprobe module to probe tcp_sendmsg
  • v2.6.19 add symbol_name and offset fields to struct kprobe so you can directly use a function name to register kprobes; dccp_probe module to probe dccp_sendmsg
  • v2.6.21 add basic debugfs support for kprobes, /sys/kernel/debug/kprobes/list lists all registered probes on the system
  • v2.6.22 more on debugfs, add /sys/kernel/debug/kprobes/enabled to globally turn registered kprobes on/off and the default value is on
  • v2.6.24 add basic kretprobe blacklist support
  • v2.6.25 kretprobe supports optional user-specified entry_handler which runs on function entry and also supports private data which can pass data between entry_handler and ret_handler
  • v2.6.26 add basic kprobe blacklist support; batch registration/unregistration of a group of probes interfaces
  • v2.6.29 add module notifier call back, which will check kprobes on the module; add flags field to struct kprobe to mark kprobe gone KPROBE_FLAG_GONE and remove its instruction buffer
  • v2.6.30 enable/disable probes interfaces, if kp->flags is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, so its handlers aren’t hit until calling enable_kprobe(kp)
  • v2.6.33 updates kprobe blacklist; check whether kprobe re-registered
  • v2.6.34 kprobe optimization KPROBE_FLAG_OPTIMIZED for i386 and x86_64; kprobes sysctl interface /proc/sys/debug/kprobes-optimization to control kprobe optimization
  • v2.6.39 do not optimize in the entry code due to the unstable stack handling

initialization

  • when Linux is starting, it will call init_kprobes in its process of initialization
struct notifier_block *i386die_chain;

int register_die_notifier(struct notifier_block *nb)
{
    // ...
    err = notifier_chain_register(&i386die_chain, nb);
    // ...
    return err;
}
/* Notifier block hooked into the die chain by init_kprobes; routes
 * breakpoint/fault notifications to kprobe_exceptions_notify. */
static struct notifier_block kprobe_exceptions_nb = {
    .notifier_call = kprobe_exceptions_notify,
    .priority = 0x7fffffff /* we need to be notified first */
};

static int __init init_kprobes(void)
{
    int i, err = 0;

    for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
        INIT_HLIST_HEAD(&kprobe_table[i]);  // for kprobes
        INIT_HLIST_HEAD(&kretprobe_inst_table[i]);  // for kretprobes
    }

    err = arch_init_kprobes();  // here to register a kprobe for trampoline
    if (!err)
        // register kprobe_exceptions_nb to i386die_chain
        err = register_die_notifier(&kprobe_exceptions_nb);

    return err;
}

__initcall(init_kprobes);

how does kprobe work

  • helper functions
/* Attach to insert probes on any functions which should be ignored */
#define __kprobes  __attribute__((__section__(".kprobes.text")))

void __kprobes arch_copy_kprobe(struct kprobe *p)
{
    memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
    p->opcode = *p->addr;
}

void __kprobes arch_arm_kprobe(struct kprobe *p)
{
    *p->addr = BREAKPOINT_INSTRUCTION;
    // ...
}

static int __kprobes in_kprobes_functions(unsigned long addr)
{
    if (addr >= (unsigned long)__kprobes_text_start
        && addr < (unsigned long)__kprobes_text_end)
        return -EINVAL;
    return 0;
}
  • call register_kprobe to register a kprobe we want to probe
int register_kprobe(struct kprobe *p)
{
    // ...
    if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
        return ret;

    // ...
    // NOTE: add new kprobe to corresponding hash table slot
    hlist_add_head(&p->hlist,
               &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
        
    arch_arm_kprobe(p);
    // ...
}
  • when execution comes to the probing address, an int3 happens and do_int3 will be called then, and it will notify our registered kprobes, from there, pre_handler/break_handler inside kprobe will be called
ENTRY(int3)
    // ...
    call do_int3  // here to call do_int3
    // ...
#ifdef CONFIG_KPROBES
asmlinkage int do_int3(struct pt_regs *regs, long error_code)
{
    // here to notify registered kprobes
    if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
            == NOTIFY_STOP)
        return 1;
    // back to normal ...
}
#endif
static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
{
    struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
    // here kprobe_exceptions_notify inside kprobe_exceptions_nb will be called
    return notifier_call_chain(&i386die_chain, val, &args);
}
int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
                 void *data)
{
    struct die_args *args = (struct die_args *)data;
    switch (val) {
    case DIE_INT3:
        // p->break_handler & p->pre_handler
        if (kprobe_handler(args->regs))  // handler cb inside
            return NOTIFY_STOP;
        break;
        // ... p->post_handler
        // ... p->fault_handler
    }
    return NOTIFY_DONE;
}
/* Called from the die-notifier path on DIE_INT3: looks up the kprobe
 * installed at the trapping address and runs its pre_handler. */
static inline int kprobe_handler(struct pt_regs *regs)
{
    struct kprobe *p;
    /* int3 is a 1-byte instruction, so at trap time eip points one past
     * the probed address — subtract 1 to recover it */
    u8 *addr = (u8 *) (regs->eip - 1);
    // ...
    p = get_kprobe(addr);
    // ...
    if (p->pre_handler(p, regs)) {  // NOTE: the user-supplied pre_handler runs here
        /* handler has already set things up, so skip ss setup */
        return 1;
    }
    // ...
}

how does kretprobe work

  • in order to support kretprobe, at boot time, init_kprobes will first register a kprobe at the trampoline, which is an arbitrary piece of code – typically just a nop instruction
void kretprobe_trampoline_holder(void)
{
asm volatile (  ".global kretprobe_trampoline\n"
        "kretprobe_trampoline: \n"
        "nop\n");
}

static struct kprobe trampoline_p = {
    .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
    .pre_handler = trampoline_probe_handler
};

int __init arch_init_kprobes(void)
{
    return register_kprobe(&trampoline_p);
}
  • when you call register_kretprobe, kprobes establishes a kprobe at the entry to the function and when the probed function is called and this probe is hit, kprobes saves a copy of the return address, and replaces the return address with the address of a “trampoline.”
void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
{
    unsigned long *sara = (unsigned long *)&regs->esp;
    struct kretprobe_instance *ri;

    if ((ri = get_free_rp_inst(rp)) != NULL) {
        ri->rp = rp;
        ri->task = current;
        ri->ret_addr = (kprobe_opcode_t *) *sara;  // important here! save original fn return addr

        // replace the return addr with trampoline addr
        *sara = (unsigned long) &kretprobe_trampoline;

        // ...
    }
    // ...
}

static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
    struct kretprobe *rp = container_of(p, struct kretprobe, kp);

    /*TODO: consider to only swap the RA after the last pre_handler fired */
    arch_prepare_kretprobe(rp, regs);
    return 0;
}

int register_kretprobe(struct kretprobe *rp)
{
    // ...
    rp->kp.pre_handler = pre_handler_kretprobe;

    /* Establish function entry probe point */
    if ((ret = register_kprobe(&rp->kp)) != 0) {
    }
    // ...
}
  • so that when the function returns, control passes to the trampoline, which has already been registered as a kprobe in init_kprobes, and that probe is hit, so its pre_handler trampoline_probe_handler will be called
  • after calling the user-specified handler associated with the kretprobe, the original function return address will be restored, and execution will be resumed
int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
    // ...
    head = kretprobe_inst_table_head(current);

    hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
        if (ri->task != current)
            continue;

        if (ri->rp && ri->rp->handler)
            ri->rp->handler(ri, regs);  // called user-specified handler

        orig_ret_address = (unsigned long)ri->ret_addr;
        // ...
    }

    regs->eip = orig_ret_address;  // restore the original return addr
    // ...
}

debugfs interface for kprobes

  • to list all registered probes on the system
cat /sys/kernel/debug/kprobes/list
  • to globally turn registered kprobes on or off
cat /sys/kernel/debug/kprobes/enabled
# turn on
echo 1 > /sys/kernel/debug/kprobes/enabled
# turn off
echo 0 > /sys/kernel/debug/kprobes/enabled

kprobes sysctl interface

  • check kprobes optimization status
cat /proc/sys/debug/kprobes-optimization
  • turn on or off kprobes optimization
# turn off
echo 0 > /proc/sys/debug/kprobes-optimization
# turn on
echo 1 > /proc/sys/debug/kprobes-optimization

uprobes

brief history

  • Linux v3.5 introduced uprobe into its mainline

tracepoints

  • NOTE: tracepoints implementation based on Linux v2.6.28 codes reading, may update it after reading new version of it

brief history

  • Linux v2.6.28 introduced tracepoint into its mainline
  • v2.6.29 splits original DEFINE_TRACE into DECLARE_TRACE and DEFINE_TRACE; if the tracepoint has to be used in kernel modules, an EXPORT_TRACEPOINT_SYMBOL_GPL or EXPORT_TRACEPOINT_SYMBOL can be used to export the defined tracepoints
  • v2.6.30 the macro TRACE_EVENT was introduced, which is far more powerful because it automates the “boilerplate” code needed to bridge the gap between a kernel function and a monitoring tool like perf or ftrace
  • v2.6.31 all predefined tracepoint events were grouped under include/trace/events by organizing events into subsystems, and introduced /sys/kernel/debug/tracing/events interface, giving each event its own directory with enable file
  • v2.6.32 add regfunc and unregfunc fields to struct tracepoint for more flexible functionality
  • v2.6.33 more macros DECLARE_EVENT_CLASS DEFINE_EVENT DEFINE_EVENT_PRINT to facilitate tracepoint usage for other tools
  • v2.6.35 update struct tracepoint and change related macros internal
  • v2.6.37 using JUMP_LABEL macro to test whether a tracepoint is enabled or not, and its related stuff
  • v2.6.38 using __tracepoints_ptrs section for iteration on the tracepoints; macro DECLARE_TRACE_CONDITION
  • v3.0 performance optimization using asm goto __jump_table

how does tracepoint work

  • using DEFINE_TRACE, TPPROTO and TPARGS to define a tracepoint
  • the tracepoint name will be put in __tracepoints_strings section, and the tracepoint itself will be put in __tracepoints section
DEFINE_TRACE(sched_wakeup_new,
    TPPROTO(struct rq *rq, struct task_struct *p),
    TPARGS(rq, p));

# the above example will expand to the following codes

# this is the tracepoint function called in other important places
# this is the tracepoint function called in other important places
static inline void trace_sched_wakeup_new(struct rq *rq,
                                          struct task_struct *p) {

  static const char __tpstrtab_sched_wakeup_new[]
      __attribute__((section("__tracepoints_strings"))) =
          "sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)";

  static struct tracepoint __tracepoint_sched_wakeup_new
      __attribute__((section("__tracepoints"), aligned(8))) = {
          __tpstrtab_sched_wakeup_new,  /* .name */
                  0,                    /* .state: starts deactivated */
                  NULL                  /* .funcs: no probes attached yet */
          };
  // if the tracepoint is not activated, just skip the probe-calling loop
  if (unlikely(__tracepoint_sched_wakeup_new.state))
    do {
      void **it_func;
      rcu_read_lock_sched();
      it_func = rcu_dereference((&__tracepoint_sched_wakeup_new)->funcs);
      if (it_func) {
        do {
          // call every registered tracepoint probe func here!!
          ((void (*)(struct rq * rq, struct task_struct * p))(*it_func))(rq, p);
        } while (*(++it_func));
      }
      rcu_read_unlock_sched();
    } while (0);
}
  • when you want to register a probe function to some tracepoint, just define the probe function according to the signature of proto of tracepoint definition, then call the generated register function, which will then call tracepoint_probe_register to connect a probe to a tracepoint
static void set_tracepoint(struct tracepoint_entry **entry,
    struct tracepoint *elem, int active)
{
    // ...
    // assign funcs in tracepoint entry to tracepoint
    rcu_assign_pointer(elem->funcs, (*entry)->funcs);
    elem->state = active;  // then activate the tracepoint
}

void tracepoint_update_probe_range(struct tracepoint *begin,
    struct tracepoint *end)
{
    // ...
    for (iter = begin; iter < end; iter++) {
        mark_entry = get_tracepoint(iter->name);
        if (mark_entry) {
            set_tracepoint(&mark_entry, iter,
                    !!mark_entry->refcount);
        } // ...
    }
    // ...
}

static void tracepoint_update_probes(void)
{
    /* Core kernel tracepoints */
    tracepoint_update_probe_range(__start___tracepoints,
        __stop___tracepoints);
    /* tracepoints in modules. */
    module_update_tracepoints();
}

int tracepoint_probe_register(const char *name, void *probe)
{
    // ...
    // add probe to corresponding tracepoint entry
    old = tracepoint_entry_add_probe(entry, probe);
    // ...
    // move probe funcs in tracepoint entry to corresponding tracepoint
    tracepoint_update_probes();
    // ...
}
static inline int register_trace_sched_wakeup_new(void (*probe)(struct rq *rq, struct task_struct *p)) {
  return tracepoint_probe_register("sched_wakeup_new:TPPROTO(struct rq *rq, struct task_struct *p)", (void *)probe);
}
struct tracepoint {
  const char *name;
  int state;
  void **funcs;  // all registered probe functions for this tracepoint
} __attribute__((aligned(8)));
  • tracepoint actual calling example
# sys_fork will call do_fork, and it will call wake_up_new_task
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
    // ...
    trace_sched_wakeup_new(rq, p);  // calling our above defined tracepoint here
    // ...
}

tracepoints in modules

  • when you run insmod or modprobe, the user-space utility eventually calls sys_init_module to pass the binary blob (the .ko file) to the kernel
struct module
{
    // ...
#ifdef CONFIG_TRACEPOINTS
    struct tracepoint *tracepoints;
    unsigned int num_tracepoints;
#endif
    // ...
};
static noinline struct module *load_module(void __user *umod,
                  unsigned long len,
                  const char __user *uargs)
{
    // ...
#ifdef CONFIG_TRACEPOINTS
    mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
                    "__tracepoints",
                    sizeof(*mod->tracepoints),
                    &mod->num_tracepoints);
#endif
    // ...
    if (!mod->taints) {
        // ...
#ifdef CONFIG_TRACEPOINTS
        tracepoint_update_probe_range(mod->tracepoints,
            mod->tracepoints + mod->num_tracepoints);
#endif
    }
    // ...
}

asmlinkage long
sys_init_module(void __user *umod,
        unsigned long len,
        const char __user *uargs)
{
    // ...
    mod = load_module(umod, len, uargs);
    // ...
}

/* Walk every loaded module and refresh the probes attached to its
 * tracepoints; tainted modules are skipped. */
void module_update_tracepoints(void)
{
    // ...
    list_for_each_entry(mod, &modules, list)
        if (!mod->taints)
            tracepoint_update_probe_range(mod->tracepoints,
                mod->tracepoints + mod->num_tracepoints);
    // ...
}

debugfs interface for tracepoints

  • we can easily enable tracepoints by using the event tracing framework
# to enable event 'sched_wakeup'
echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable

# to disable it
echo 0 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable

# to enable all events in sched subsystem
echo 1 > /sys/kernel/debug/tracing/events/sched/enable

# to enable all events
echo 1 > /sys/kernel/debug/tracing/events/enable

perf subsystem

  • NOTE: perf subsystem implementation based on Linux v2.6.32 codes reading, may update it after reading new version of it

brief history

  • Linux v2.6.31 first introduced Performance Counters for Linux(PCL) into its mainline for just hardware counters, so the syscall is sys_perf_counter_open
  • but Linux v2.6.32 changed the syscall to sys_perf_event_open, and expanded the framework to do more than just hw counters

how does perf event work

  • the following calling procedure only means calling timeline, not caller-callee callgraph, so that we can see how perf event subsystem is inited: start_kernel –> sched_init –> perf_event_init –> check_bugs –> identify_boot_cpu –> init_hw_perf_events –> intel_pmu_init/amd_pmu_init –> perf_events_lapic_init –> rest_init –> kernel_init –> do_basic_setup –> do_initcalls –> perf_event_sysfs_init

eBPF

  • NOTE: eBPF implementation based on Linux v4.0 codes reading, may update it after reading new version of it

brief history

  • v3.17 first introduced kernel/bpf directory into linux kernel, mainly for socket filter
  • v3.18 introduced bpf syscall, bpf verifier and basic bpf map
  • v3.19 introduced BPF_MAP_TYPE_ARRAY bpf array and BPF_MAP_TYPE_HASH bpf hash map; basic version of bpf helpers; BPF_PROG_TYPE_SOCKET_FILTER socket filter macro
  • v4.1 introduced more prog types, especially BPF_PROG_TYPE_KPROBE allowing bpf program to attach to kprobes; more bpf helper functions like bpf_get_prandom_u32 and bpf_get_smp_processor_id etc
  • v4.2 introduced tail calls to allow chaining multiple eBPF programs together, effectively extending the overall execution beyond the single-program instruction limit; new map BPF_MAP_TYPE_PROG_ARRAY type to support tail calls; several support helper functions, including bpf_tail_call, bpf_get_current_pid_tgid, bpf_get_current_uid_gid, bpf_get_current_comm etc
  • v4.3 new map BPF_MAP_TYPE_PERF_EVENT_ARRAY type and with perf_event_read function to do perf event monitoring; more helper functions like bpf_get_cgroup_classid skb_[gs]et_tunnel_key etc; first introduced libbpf in tools/lib/bpf
  • v4.4 mount and register /sys/fs/bpf/ filesystem for new cmd BPF_OBJ_PIN and BPF_OBJ_GET; /proc/sys/kernel/unprivileged_bpf_disabled to control whether users without CAP_SYS_ADMIN privilege can use the bpf syscall; bpf_perf_event_output bpf_get_route_realm and bpf_redirect functions
  • v4.5 enhance /sys/fs/bpf/ filesystem to support link and rename; bpf_skb_load_bytes to net/core/filter.c
  • v4.6 new map types BPF_MAP_TYPE_PERCPU_ARRAY BPF_MAP_TYPE_PERCPU_HASH BPF_MAP_TYPE_STACK_TRACE; helper functions like bpf_skb_[gs]et_tunnel_opt, bpf_get_stackid etc, after using map BPF_MAP_TYPE_STACK_TRACE and bpf_get_stackid to get stack traces, you can use /proc/kallsyms to translate them into understandable function names
  • v4.7 new program type BPF_PROG_TYPE_TRACEPOINT; sysctl interfaces /proc/sys/net/core/bpf_jit_enable and /proc/sys/net/core/bpf_jit_harden; helper functions bpf_event_output, bpf_get_stackid_tp
  • v4.8 new program type BPF_PROG_TYPE_XDP; new map type BPF_MAP_TYPE_CGROUP_ARRAY; more helper functions bpf_skb_change_proto, bpf_skb_change_type, bpf_skb_under_cgroup, bpf_get_hash_recalc, bpf_get_current_task, bpf_probe_write_user
  • v4.9 new program type BPF_PROG_TYPE_PERF_EVENT

bpf program loading

  • calling bpf system call with BPF_PROG_LOAD cmd arg, bpf program will be copied into kernel and run through eBPF verifier by function bpf_check. After passing verification, bpf_prog_select_runtime will be used to select interpreter running function __bpf_prog_run or jited BPF instructions to native codes
/* The eBPF interpreter: dispatches one BPF instruction at a time.
 * Only the CALL and TAIL_CALL opcodes are shown here. */
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
    // ...
        /* CALL */
    JMP_CALL:
        /* Function call scratches BPF_R1-BPF_R5 registers,
         * preserves BPF_R6-BPF_R9, and stores return value
         * into BPF_R0.
         */
        BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                               BPF_R4, BPF_R5);  // NOTE: where the magic happens —
                                                 // imm is an offset from __bpf_call_base
                                                 // selecting the helper function
        CONT;
    // ...
    // how tail call is implemented, code from v4.2
    JMP_TAIL_CALL: {
        struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct bpf_prog *prog;
        u64 index = BPF_R3;
        // ...
        prog = READ_ONCE(array->prog[index]);  // get new prog here
        // ...
        ARG1 = BPF_R1;
        insn = prog->insnsi;  // replace insn with the new prog's instructions
        goto select_insn;  // next iteration executes the new program's insns
out:
        CONT;
    }
    // ...
}
void bpf_int_jit_compile(struct bpf_prog *prog)
{
    // ...
    if (image) {
        // ... after jiting related codes
        prog->bpf_func = (void *)image;  // here we replace the interpreter func to jited codes
        prog->jited = true;
        // ...
    }
    // ...
}
void bpf_prog_select_runtime(struct bpf_prog *fp)
{
    fp->bpf_func = (void *) __bpf_prog_run;  // NOTE: save interpreter running function

    /* Probe if internal BPF can be JITed */
    bpf_int_jit_compile(fp);  // here fp->bpf_func is replaced with jited
                              // codes if jit is working
    // ...
}

bpf program running

  • two macros BPF_PROG_RUN and SK_RUN_FILTER are used to run bpf programs
#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)

/* tc action entry point: runs the attached BPF filter on the skb under
 * the action's lock.  (Closing brace restored — it was dropped when the
 * excerpt was transcribed.) */
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a,
           struct tcf_result *res)
{
    struct tcf_bpf *b = a->priv;
    int action, filter_res;

    spin_lock(&b->tcf_lock);

    // ...
    filter_res = BPF_PROG_RUN(b->filter, skb);
    // ...

    spin_unlock(&b->tcf_lock);
    return action;
}
/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) \
    (*filter->prog->bpf_func)(ctx, filter->prog->insnsi)

int sk_filter(struct sock *sk, struct sk_buff *skb)
{
    int err;
    struct sk_filter *filter;
        
    // ...

    rcu_read_lock();
    filter = rcu_dereference(sk->sk_filter);
    if (filter) {
        // as we can see here, how eBPF program works
        unsigned int pkt_len = SK_RUN_FILTER(filter, skb);

        err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
    }
    rcu_read_unlock();

    return err;
}

bpf program with kprobes

prerequisite knowledge

  • when start_kernel executes, it will call perf_event_init, in which perf_tp_register will be called to register the corresponding pmu perf_tracepoint
static struct pmu perf_tracepoint = {
    .task_ctx_nr    = perf_sw_context,

    .event_init = perf_tp_event_init,  // tracepoint related event init func
    // ...
};

int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
    // ...
    pmu->name = name;
    // ...
    pmu->type = type;
    // ...
    list_add_rcu(&pmu->entry, &pmus);
    // ...
}

static inline void perf_tp_register(void)
{
    // register perf_tracepoint pmu to pmus
    perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}

void __init perf_event_init(void)
{
    // ...
    perf_tp_register();
    // ...
}
  • init_kprobe_trace will be called after perf_event_init is done inside rest_init, it will register the probes_write write handler, which will be triggered when we echo to /sys/kernel/debug/tracing/kprobe_events
static const struct file_operations kprobe_events_ops = {
    // ...
    .write      = probes_write,
}

static __init int init_kprobe_trace(void)
{
    // ...
    entry = tracefs_create_file("kprobe_events", 0644, d_tracer,
                    NULL, &kprobe_events_ops);
    // ...
}
fs_initcall(init_kprobe_trace);

how does bpf program connect to kprobes

  • when you write SEC(“kprobe/xxxx”), you aren’t writing a standard C function call. when you compile your code using clang -target bpf, the compiler looks at that SEC() macro and creates a custom section named kprobe/xxxx
#define SEC(NAME) __attribute__((section(NAME), used))

SEC("kprobe/xxxx")
int bpf_prog(struct pt_regs *ctx)
{
    struct sk_buff *skb;
    skb = (struct sk_buff *) ctx->di;
    // ... get other related stuff this function need from ctx
}
  • the loader (like the following load_bpf_file) loads the compiled .o file and scans all the section headers, when it sees the prefix kprobe/, it realizes this is a bpf kprobe program and load it
int load_bpf_file(char *path)
{
    // ...
    fd = open(path, O_RDONLY, 0);
    // ...
    for (i = 1; i < ehdr.e_shnum; i++) {
        if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
            memcmp(shname_prog, "kretprobe/", 10) == 0 ||
            memcmp(shname_prog, "socket", 6) == 0)
                load_and_attach(shname_prog, insns, data_prog->d_size);
    }
    // ...
}
  • it parses the rest of the string xxxx as the target function, using tracefs to echo to /sys/kernel/debug/tracing/kprobe_events to first create the kprobe and it will trigger the corresponding .write handler probes_write registered when the kprobe_events file was created, through this, it will finally register the kprobe
static struct trace_kprobe *alloc_trace_kprobe(const char *group,
                         const char *event, void *addr,
                         const char *symbol, unsigned long offs,
                         int nargs, bool is_return)
{
    // ...
    if (is_return)
        tk->rp.handler = kretprobe_dispatcher;
    else
        tk->rp.kp.pre_handler = kprobe_dispatcher;  // when the kprobe hit, it will call this kprobe_dispatcher
    // ...
}

static int register_kprobe_event(struct trace_kprobe *tk)
{
    struct ftrace_event_call *call = &tk->tp.call;
    // ...
    call->flags = TRACE_EVENT_FL_KPROBE;
    call->class->reg = kprobe_register;
    call->data = tk;
    // ...
}

static int __register_trace_kprobe(struct trace_kprobe *tk)
{
    // ...
    if (trace_kprobe_is_return(tk))
        ret = register_kretprobe(&tk->rp);
    else
        ret = register_kprobe(&tk->rp.kp);  // finally register the kprobe here
    // ...
}

static int register_trace_kprobe(struct trace_kprobe *tk)
{
    // ...
    ret = register_kprobe_event(tk);
    // ...
    ret = __register_trace_kprobe(tk);
    // ...
}

static int create_trace_kprobe(int argc, char **argv)
{
    // ... parse arguments echo to kprobe_events file etc
        
    tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc,
                   is_return);
    // ...
    ret = register_trace_kprobe(tk);  // here to register the kprobe
    // ...
}

static ssize_t probes_write(struct file *file, const char __user *buffer,
                size_t count, loff_t *ppos)
{
    return traceprobe_probes_write(file, buffer, count, ppos,
            create_trace_kprobe);  // finally calls the create_trace_kprobe
}
  • after this step, the kprobe has been registered, but the bpf program hasn’t been attached yet, bpf_prog_load first load and verify it
static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
{
    bool is_socket = strncmp(event, "socket", 6) == 0;
    bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
    bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
    enum bpf_prog_type prog_type;
    // ...
        
    struct perf_event_attr attr = {};

    attr.type = PERF_TYPE_TRACEPOINT;
    attr.sample_type = PERF_SAMPLE_RAW;
    attr.sample_period = 1;
    attr.wakeup_events = 1;

    if (is_socket) {
        prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
    } else if (is_kprobe || is_kretprobe) {
        prog_type = BPF_PROG_TYPE_KPROBE;  // kprobe bpf type program
    } // ...
        
    if (is_kprobe || is_kretprobe) {
        if (is_kprobe)
            event += 7;
        else
            event += 10;

        snprintf(buf, sizeof(buf),
             "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
             is_kprobe ? 'p' : 'r', event, event);  // here echo to kprobe_events file
        err = system(buf);
        // ...
                
        fd = bpf_prog_load(prog_type, prog, size, license, kern_version);
    }
    // ...
    strcpy(buf, DEBUGFS);
    strcat(buf, "events/kprobes/");
    strcat(buf, event);
    strcat(buf, "/id");

    efd = open(buf, O_RDONLY, 0);
    // ...
    err = read(efd, buf, sizeof(buf));
    // ...
    buf[err] = 0;
    id = atoi(buf);
    attr.config = id;  // continue to config struct perf_event_attr
        
    efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
    // ...
    ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
    ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);

    return 0;
}
  • calling perf_event_open syscall to create and sets up the perf_event structure, the event is usually created in a DISABLED state, it currently only knows how to send data to the standard perf ring buffer, but not a BPF program
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
{
    // ...
        tk->tp.flags |= TP_FLAG_PROFILE;

    if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) {
        if (trace_kprobe_is_return(tk))
            ret = enable_kretprobe(&tk->rp);
        else
            ret = enable_kprobe(&tk->rp.kp);  // here to finally enable the kprobe
    }
    // ...
}

static int kprobe_register(struct ftrace_event_call *event,
               enum trace_reg type, void *data)
{
    struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
    struct ftrace_event_file *file = data;

    switch (type) {
    // ...
#ifdef CONFIG_PERF_EVENTS
    case TRACE_REG_PERF_REGISTER:
        return enable_trace_kprobe(tk, NULL);
    // ...
#endif
    }
    // ...
}

static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
                struct perf_event *p_event)
{
    // ...
    // kprobe_register assigned in register_kprobe_event
    ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
    // ...
}

static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                 struct perf_event *p_event)
{
    // ...
    ret = perf_trace_event_reg(tp_event, p_event);
    // ...
}

int perf_trace_init(struct perf_event *p_event)
{
    struct ftrace_event_call *tp_event;
    u64 event_id = p_event->attr.config;
    // ...
    list_for_each_entry(tp_event, &ftrace_events, list) {
        if (tp_event->event.type == event_id &&
            tp_event->class && tp_event->class->reg &&
            try_module_get(tp_event->mod)) {
            ret = perf_trace_event_init(tp_event, p_event);
            // ...
        }
    }
    // ...
}

static int perf_tp_event_init(struct perf_event *event)
{
    int err;

    if (event->attr.type != PERF_TYPE_TRACEPOINT)
        return -ENOENT;
    // ...
    err = perf_trace_init(event);
    // ...
    event->destroy = tp_perf_event_destroy;

    return 0;
}

static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
    // ...
    event->pmu = pmu;
    // here finally perf_tp_event_init in struct pmu perf_tracepoint is called
    ret = pmu->event_init(event);
    // ...
}

struct pmu *perf_init_event(struct perf_event *event)
{
    // ...
    list_for_each_entry_rcu(pmu, &pmus, entry) {
        ret = perf_try_init_event(pmu, event);
        // ...
    }
    // ...
}

static inline void perf_event__state_init(struct perf_event *event)
{
    event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
                          PERF_EVENT_STATE_INACTIVE;
}

/*
 * Allocate and set up a perf_event (body elided): initialize its state
 * to not-yet-enabled, then find a PMU for it via perf_init_event().
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
         struct task_struct *task,
         struct perf_event *group_leader,
         struct perf_event *parent_event,
         perf_overflow_handler_t overflow_handler,
         void *context, int cgroup_fd)
{
    // ...
    perf_event__state_init(event);  // the event is not enabled yet
    // ...
    pmu = perf_init_event(event);
    // ...
}

/*
 * perf_event_open(2): allocate the event, then expose it to userspace
 * as an anonymous-inode file whose file_operations are perf_fops --
 * the fd returned to the user is how ioctl()s reach the event later.
 */
SYSCALL_DEFINE5(perf_event_open,
        struct perf_event_attr __user *, attr_uptr,
        pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
    // ...
    event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                 NULL, NULL, cgroup_fd);
    // ...
    event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
                    f_flags);  // here create the event_file and register perf_fops for it
    // ...
}
  • in the perf_event_open syscall, an event_file is created by calling anon_inode_getfile, and perf_fops is registered for it; this file is essential for attaching a BPF program to the kprobe and finally enabling it
/*
 * ioctl entry point of the perf event fd: recover the perf_event from
 * file->private_data and dispatch to _perf_ioctl().
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
    struct perf_event *event = file->private_data;
    // ...
    ret = _perf_ioctl(event, cmd, arg);
    // ...
}

/* file_operations installed on the "[perf_event]" anon inode; ioctl is
 * the path used for PERF_EVENT_IOC_ENABLE / PERF_EVENT_IOC_SET_BPF. */
static const struct file_operations perf_fops = {
    // ...
    .unlocked_ioctl     = perf_ioctl,
    // ...
};
  • use PERF_EVENT_IOC_ENABLE to activate the perf event, and PERF_EVENT_IOC_SET_BPF to attach a bpf program to the kprobe
// ... copied from above
    ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
    ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
// ...

/*
 * PERF_EVENT_IOC_SET_BPF handler: resolve the BPF program from the
 * userspace fd and attach it to the underlying trace event.
 */
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
    struct bpf_prog *prog;

    // ...
    prog = bpf_prog_get(prog_fd);  // loads the ebpf program
    // ...
    event->tp_event->prog = prog;  // here finally attaches the ebpf program

    return 0;
}

/*
 * Core ioctl dispatch for a perf event.  The two commands shown are the
 * ones used in the kprobe+eBPF flow described above: enable the event
 * and attach a BPF program to it.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
    void (*func)(struct perf_event *);
    u32 flags = arg;

    switch (cmd) {
    case PERF_EVENT_IOC_ENABLE:
        func = _perf_event_enable;  // later calls func, event->state = PERF_EVENT_STATE_ACTIVE;
        break;
    // ...
    case PERF_EVENT_IOC_SET_BPF:
        return perf_event_set_bpf_prog(event, arg);
    // ...
    }
}
  • when the kprobe is hit, kprobe_dispatcher will be called, which eventually runs the attached bpf program
/*
 * Execute the attached BPF program with @ctx as its context; the
 * program's return value tells the caller whether to continue tracing.
 */
unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
{
    unsigned int ret;
    // ...
    ret = BPF_PROG_RUN(prog, ctx);  // here to finally run bpf program
    // ...
}

/*
 * Perf-profile path of a kprobe hit: if a BPF program is attached to the
 * event, run it with the probed pt_regs; a zero return from the program
 * suppresses the rest of the perf handling.
 */
static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
    struct ftrace_event_call *call = &tk->tp.call;
    struct bpf_prog *prog = call->prog;
    // ...

    if (prog && !trace_call_bpf(prog, regs))  // call bpf program
        return;
    // ...
}

/*
 * pre_handler of the trace kprobe: count the hit and, when the probe is
 * in profiling mode, run the perf path (which may invoke a BPF program).
 */
static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
    /* kp is embedded in trace_kprobe at rp.kp; recover the container */
    struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);

    tk->nhit++;
    // ...
#ifdef CONFIG_PERF_EVENTS
    if (tk->tp.flags & TP_FLAG_PROFILE)
        kprobe_perf_func(tk, regs);  // inside to call bpf program
#endif
    return 0;   /* We don't tweek kernel, so just return 0 */
}

how does bpf program connect to tracepoint

  • attaching an ebpf program to tracepoints is very similar to kprobes in Linux v4.7; here we do not repeat all the details already covered for kprobes above, only the parts specific to tracepoints
  • read /sys/kernel/debug/tracing/events/xxx/id file to know tracepoint id and setup struct perf_event_attr when calling perf_event_open
  • assigning trace_event_reg to .reg field of struct trace_event_class, and it will call tracepoint_probe_register to register field .perf_probe callback function, which is a perf_trace_xxx function, to tracepoint
/* Initializer fragment wiring the generated perf_trace_<call> function
 * into the event class's .perf_probe field. */
#define _TRACE_PERF_INIT(call)                      \
    .perf_probe     = perf_trace_##call,

/* Per-tracepoint event class emitted by the TRACE_EVENT machinery; note
 * .reg = trace_event_reg and the perf_probe added by _TRACE_PERF_INIT. */
static struct trace_event_class __used __refdata event_class_##call = { \
    .system         = TRACE_SYSTEM_STRING,          \   
    .define_fields      = trace_event_define_fields_##call, \
    .fields         = LIST_HEAD_INIT(event_class_##call.fields),\
    .raw_init       = trace_event_raw_init,         \   
    .probe          = trace_event_raw_event_##call,     \   
    .reg            = trace_event_reg,          \
    _TRACE_PERF_INIT(call)                      \   
};

/*
 * .reg callback shared by tracepoint-backed events: for the perf cases
 * it (un)registers the class's perf_probe callback on the tracepoint
 * itself via tracepoint_probe_register/unregister.
 */
int trace_event_reg(struct trace_event_call *call,
            enum trace_reg type, void *data)
{
    struct trace_event_file *file = data;

    WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
    switch (type) {
    // ...
#ifdef CONFIG_PERF_EVENTS
    case TRACE_REG_PERF_REGISTER:
        return tracepoint_probe_register(call->tp,
                         call->class->perf_probe,  // to register perf_probe callback to tracepoint
                         call);
    case TRACE_REG_PERF_UNREGISTER:
        tracepoint_probe_unregister(call->tp,
                        call->class->perf_probe,
                        call);
        return 0;
    // ...
#endif
    }    
    return 0;
}
EXPORT_SYMBOL_GPL(trace_event_reg);
  • when the tracepoint is hit, the registered .perf_probe callback will be called which, as stated above, is a perf_trace_xxx function
/* Generated perf probe body for a tracepoint <call> (macro excerpt):
 * captures caller registers and submits the sample, which runs the
 * attached BPF program via perf_trace_run_bpf_submit() below. */
static notrace void                         \
perf_trace_##call(void *__data, proto)                  \
{                                   \
    struct trace_event_call *event_call = __data;           \
    // ...
    struct bpf_prog *prog = event_call->prog;           \
    struct pt_regs *__regs;                     \
    // ...
    perf_fetch_caller_regs(__regs);                 \
    // ...
    perf_trace_run_bpf_submit(entry, __entry_size, rctx,        \
                  event_call, __count, __regs,      \
                  head, __task);            \
}

/*
 * Submit a tracepoint sample to perf; when a BPF program is attached to
 * the event, stash the pt_regs pointer at the head of the raw data and
 * run the program first -- it can filter the sample out.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, 
                   struct trace_event_call *call, u64 count,
                   struct pt_regs *regs, struct hlist_head *head,
                   struct task_struct *task)
{
    struct bpf_prog *prog = call->prog;

    if (prog) {
        *(struct pt_regs **)raw_data = regs;
        // here to finally call trace_call_bpf to run ebpf program
        if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
            // ...
        }     
    }
    // ...
}

XDP bpf program

prerequisite knowledge

  • module_init calls mlx4_en_init to register struct mlx4_interface, where .activate function mlx4_en_activate will be called later to setup struct net_device_ops for field netdev_ops of struct net_device, and in struct net_device_ops, the field of .ndo_xdp is important for xdp to work, which is function mlx4_xdp in the following code example
/* netdev ops of the mlx4 Ethernet driver; .ndo_xdp is the hook the core
 * kernel calls to install/query an XDP program on this device. */
static const struct net_device_ops mlx4_netdev_ops = {
    // ...
    .ndo_xdp        = mlx4_xdp,
};

/*
 * Per-port netdev setup (body elided): install the netdev_ops table --
 * the master function variant or the plain one shown above -- so that
 * .ndo_xdp becomes reachable for this device.
 */
int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
            struct mlx4_en_port_profile *prof)
{
    // ...
    /*
     * Initialize netdev entry points
     */
    if (mlx4_is_master(priv->mdev->dev))
        dev->netdev_ops = &mlx4_netdev_ops_master;
    else
        dev->netdev_ops = &mlx4_netdev_ops;
    // ...
}

/*
 * .activate callback of the mlx4_en interface: create one netdev per
 * Ethernet port via mlx4_en_init_netdev().
 */
static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
{   
    int i;
    struct mlx4_en_dev *mdev = ctx;

    /* Create a netdev for each port */
    mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
        mlx4_info(mdev, "Activating port:%d\n", i);
        if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
            mdev->pndev[i] = NULL;  /* init failed: no netdev for this port */
    }       
    // ...
}

/* Interface descriptor registered at module init; .activate runs later
 * to create the netdevs (see mlx4_en_activate above). */
static struct mlx4_interface mlx4_en_interface = {
    // ...
    .protocol   = MLX4_PROT_ETH,
    .activate   = mlx4_en_activate,
};

/*
 * Module entry point: register the mlx4 Ethernet interface so its
 * .activate callback fires when devices are brought up.
 */
static int __init mlx4_en_init(void)
{
    // ...
    return mlx4_register_interface(&mlx4_en_interface);
}

module_init(mlx4_en_init);

how does XDP bpf program connect to NIC

  • in Linux v4.8, XDP attachment support is very limited; we need the ip command from iproute2 to do the job, and it finally sends an RTM_SETLINK Netlink message to the kernel
ip link set dev eth0 xdp obj prog.o
  • The kernel receives the message and routes it through the networking stack
    • rtnetlink_rcv_msg
    • rtnl_setlink
    • do_setlink, parses the attributes, specifically looking for IFLA_XDP
    • dev_change_xdp_fd, the core kernel function that manages XDP attachment, which set or clear a bpf program for a device rx path
/*
 * Set or clear the XDP program on a device's rx path.  fd >= 0 installs
 * the program behind that fd; a negative fd leaves prog == NULL, which
 * clears any installed program.  The actual installation is delegated
 * to the driver's ndo_xdp callback.
 */
int dev_change_xdp_fd(struct net_device *dev, int fd)
{
    const struct net_device_ops *ops = dev->netdev_ops;
    struct bpf_prog *prog = NULL;
    struct netdev_xdp xdp = {};
    int err; 

    if (!ops->ndo_xdp)
        return -EOPNOTSUPP;  /* driver has no XDP support */
    if (fd >= 0) { 
        /* type check: the fd must refer to a BPF_PROG_TYPE_XDP program */
        prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
        if (IS_ERR(prog))
            return PTR_ERR(prog);
    }    

    xdp.command = XDP_SETUP_PROG;
    xdp.prog = prog;
    // ndo_xdp cb will be called, as we stated above, it's mlx4_xdp function here
    err = ops->ndo_xdp(dev, &xdp);
    // ...
}
  • function mlx4_xdp will finally call mlx4_xdp_set to setup XDP program
/*
 * Driver-side XDP install (body elided): reserve tx rings for XDP_TX
 * use, then atomically swap the program pointer into every rx ring and
 * drop the reference on any previously installed program.
 */
static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
    // ...
    xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
    // ...
    priv->xdp_ring_num = xdp_ring_num;
    /* tx queues visible to the stack shrink by the rings taken for XDP */
    netif_set_real_num_tx_queues(dev, priv->tx_ring_num -
                            priv->xdp_ring_num);

    for (i = 0; i < priv->rx_ring_num; i++) {
        old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);  // here ebpf is finally set
        if (old_prog)
            bpf_prog_put(old_prog);  /* release ref on the replaced program */
    }
    // ...
}

/*
 * ndo_xdp entry point: dispatch the netdev_xdp command to either the
 * program install path or the attachment query.
 */
static int mlx4_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
    switch (xdp->command) {
    case XDP_SETUP_PROG:
        return mlx4_xdp_set(dev, xdp->prog);
    case XDP_QUERY_PROG:
        xdp->prog_attached = mlx4_xdp_attached(dev);
        return 0;
    default:
        return -EINVAL;
    }    
}
  • when a packet hits the NIC, after hardware interrupt and Rx CQ polling, it will finally call mlx4_en_process_rx_cq function
/*
 * Run an XDP program on one packet under an RCU read-side critical
 * section (the program pointer is published/replaced with xchg and may
 * be freed after a grace period).  Returns the XDP verdict (XDP_PASS,
 * XDP_TX, XDP_DROP, ...).
 */
static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                   struct xdp_buff *xdp)
{
    u32 ret;

    rcu_read_lock();
    ret = BPF_PROG_RUN(prog, (void *)xdp);
    rcu_read_unlock();

    return ret;
}

/*
 * Rx completion-queue processing loop (heavily elided).  If an XDP
 * program is installed on the ring, it runs on each packet before any
 * skb is built, and its verdict decides whether the packet is passed up
 * the stack, transmitted back out, or dropped/recycled.
 */
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
{
    // ...
    xdp_prog = READ_ONCE(ring->xdp_prog);  /* paired with xchg in mlx4_xdp_set */
    // ...
    while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
            cq->mcq.cons_index & cq->size)) {
        // ...
        /* A bpf program gets first chance to drop the packet. It may
         * read bytes but not past the end of the frag.
         */
        if (xdp_prog) {
            struct xdp_buff xdp;
            // ...
            /* point the xdp_buff straight at the rx fragment */
            xdp.data = page_address(frags[0].page) +
                            frags[0].page_offset;
            xdp.data_end = xdp.data + length;
            // here to run xdp bpf program
            act = bpf_prog_run_xdp(xdp_prog, &xdp);
            switch (act) {
            case XDP_PASS:
                break;  /* continue normal rx processing */
            case XDP_TX:
                // ...
            case XDP_ABORTED:
            case XDP_DROP:
                /* try to recycle the page back to the rx ring */
                if (mlx4_en_rx_recycle(ring, frags))
                    goto consumed;
                goto next;
            }
        }
        // ...
    }
    // ...
}

ebpf sysctl interface

bpf_jit_enable

  • check ebpf jit status
cat /proc/sys/net/core/bpf_jit_enable
  • turn on or off ebpf jit compiler
# turn off
echo 0 > /proc/sys/net/core/bpf_jit_enable  # for interpreter only
# turn on
echo 1 > /proc/sys/net/core/bpf_jit_enable  # enable jit
# debug mode
# the kernel will output the resulting native opcodes to the kernel log (dmesg)
echo 2 > /proc/sys/net/core/bpf_jit_enable

bpf_jit_harden

  • check ebpf jit harden status
cat /proc/sys/net/core/bpf_jit_harden
  • turn on or off ebpf jit hardening
echo 0 > /proc/sys/net/core/bpf_jit_harden  # disable JIT hardening
# enable JIT hardening for unprivileged users only
echo 1 > /proc/sys/net/core/bpf_jit_harden
# enable JIT hardening for all users
echo 2 > /proc/sys/net/core/bpf_jit_harden

ftrace

  • NOTE: this ftrace description is based on reading the Linux v2.6.27 code; it may be updated after reading a newer version

prerequisite knowledge

  • the Linux kernel should be compiled with gcc's -pg flag, which inserts an mcount function call at the start of every function, unless you use the function attribute no_instrument_function to suppress profiling of individual functions when compiling
gcc -c -pg -m32 ftrace.c
/* Compiled with -pg: foo() gets a call to mcount inserted at its entry
 * (see the assembly listing below). */
void foo()
{
    printf("hello world\n");
}

/* no_instrument_function suppresses the -pg instrumentation for a
 * single function; the kernel's notrace macro is defined this way. */
#define notrace __attribute__((no_instrument_function))

/* bar() is marked notrace, so no mcount call appears in its assembly. */
void notrace bar()
{
    printf("hello");
}
.LC0:
        .string "hello world"
foo:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
1:      call        mcount  // NOTE: here -pg flag will insert mcount function call
        subl    $12, %esp
        pushl   $.LC0
        // ...
bar:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        subl    $12, %esp
        pushl   $.LC0
        // ...