
Source file src/github.com/cilium/ebpf/link/kprobe.go

Documentation: github.com/cilium/ebpf/link

     1  package link
     3  import (
     4  	"bytes"
     5  	"crypto/rand"
     6  	"errors"
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"runtime"
    11  	"strings"
    12  	"sync"
    13  	"syscall"
    14  	"unsafe"
    16  	"github.com/cilium/ebpf"
    17  	"github.com/cilium/ebpf/internal/sys"
    18  	"github.com/cilium/ebpf/internal/unix"
    19  )
    21  var (
    22  	kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events")
    24  	kprobeRetprobeBit = struct {
    25  		once  sync.Once
    26  		value uint64
    27  		err   error
    28  	}{}
    29  )
// probeType distinguishes kernel probes (kprobeType) from user-space
// probes (uprobeType).
type probeType uint8
    33  type probeArgs struct {
    34  	symbol, group, path          string
    35  	offset, refCtrOffset, cookie uint64
    36  	pid                          int
    37  	ret                          bool
    38  }
// KprobeOptions defines additional parameters that will be used
// when loading Kprobes.
//
// Passing a nil *KprobeOptions to Kprobe or Kretprobe is equivalent to
// passing a zero-valued struct: no cookie and no offset.
type KprobeOptions struct {
	// Arbitrary value that can be fetched from an eBPF program
	// via `bpf_get_attach_cookie()`.
	//
	// Needs kernel 5.15+.
	Cookie uint64
	// Offset of the kprobe relative to the traced symbol.
	// Can be used to insert kprobes at arbitrary offsets in kernel functions,
	// e.g. in places where functions have been inlined.
	Offset uint64
}
// Probe types understood by the helpers in this file.
const (
	// kprobeType selects kernel probes (kprobes and kretprobes).
	kprobeType probeType = iota
	// uprobeType selects user-space probes (uprobes and uretprobes).
	uprobeType
)
    59  func (pt probeType) String() string {
    60  	if pt == kprobeType {
    61  		return "kprobe"
    62  	}
    63  	return "uprobe"
    64  }
    66  func (pt probeType) EventsPath() string {
    67  	if pt == kprobeType {
    68  		return kprobeEventsPath
    69  	}
    70  	return uprobeEventsPath
    71  }
    73  func (pt probeType) PerfEventType(ret bool) perfEventType {
    74  	if pt == kprobeType {
    75  		if ret {
    76  			return kretprobeEvent
    77  		}
    78  		return kprobeEvent
    79  	}
    80  	if ret {
    81  		return uretprobeEvent
    82  	}
    83  	return uprobeEvent
    84  }
    86  func (pt probeType) RetprobeBit() (uint64, error) {
    87  	if pt == kprobeType {
    88  		return kretprobeBit()
    89  	}
    90  	return uretprobeBit()
    91  }
    93  // Kprobe attaches the given eBPF program to a perf event that fires when the
    94  // given kernel symbol starts executing. See /proc/kallsyms for available
    95  // symbols. For example, printk():
    96  //
    97  //	kp, err := Kprobe("printk", prog, nil)
    98  //
    99  // Losing the reference to the resulting Link (kp) will close the Kprobe
   100  // and prevent further execution of prog. The Link must be Closed during
   101  // program shutdown to avoid leaking system resources.
   102  func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) {
   103  	k, err := kprobe(symbol, prog, opts, false)
   104  	if err != nil {
   105  		return nil, err
   106  	}
   108  	lnk, err := attachPerfEvent(k, prog)
   109  	if err != nil {
   110  		k.Close()
   111  		return nil, err
   112  	}
   114  	return lnk, nil
   115  }
   117  // Kretprobe attaches the given eBPF program to a perf event that fires right
   118  // before the given kernel symbol exits, with the function stack left intact.
   119  // See /proc/kallsyms for available symbols. For example, printk():
   120  //
   121  //	kp, err := Kretprobe("printk", prog, nil)
   122  //
   123  // Losing the reference to the resulting Link (kp) will close the Kretprobe
   124  // and prevent further execution of prog. The Link must be Closed during
   125  // program shutdown to avoid leaking system resources.
   126  func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) {
   127  	k, err := kprobe(symbol, prog, opts, true)
   128  	if err != nil {
   129  		return nil, err
   130  	}
   132  	lnk, err := attachPerfEvent(k, prog)
   133  	if err != nil {
   134  		k.Close()
   135  		return nil, err
   136  	}
   138  	return lnk, nil
   139  }
   141  // isValidKprobeSymbol implements the equivalent of a regex match
   142  // against "^[a-zA-Z_][0-9a-zA-Z_.]*$".
   143  func isValidKprobeSymbol(s string) bool {
   144  	if len(s) < 1 {
   145  		return false
   146  	}
   148  	for i, c := range []byte(s) {
   149  		switch {
   150  		case c >= 'a' && c <= 'z':
   151  		case c >= 'A' && c <= 'Z':
   152  		case c == '_':
   153  		case i > 0 && c >= '0' && c <= '9':
   155  		// Allow `.` in symbol name. GCC-compiled kernel may change symbol name
   156  		// to have a `.isra.$n` suffix, like `udp_send_skb.isra.52`.
   157  		// See: https://gcc.gnu.org/gcc-10/changes.html
   158  		case i > 0 && c == '.':
   160  		default:
   161  			return false
   162  		}
   163  	}
   165  	return true
   166  }
   168  // kprobe opens a perf event on the given symbol and attaches prog to it.
   169  // If ret is true, create a kretprobe.
   170  func kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions, ret bool) (*perfEvent, error) {
   171  	if symbol == "" {
   172  		return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput)
   173  	}
   174  	if prog == nil {
   175  		return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput)
   176  	}
   177  	if !isValidKprobeSymbol(symbol) {
   178  		return nil, fmt.Errorf("symbol '%s' must be a valid symbol in /proc/kallsyms: %w", symbol, errInvalidInput)
   179  	}
   180  	if prog.Type() != ebpf.Kprobe {
   181  		return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput)
   182  	}
   184  	args := probeArgs{
   185  		pid:    perfAllThreads,
   186  		symbol: symbol,
   187  		ret:    ret,
   188  	}
   190  	if opts != nil {
   191  		args.cookie = opts.Cookie
   192  		args.offset = opts.Offset
   193  	}
   195  	// Use kprobe PMU if the kernel has it available.
   196  	tp, err := pmuKprobe(args)
   197  	if errors.Is(err, os.ErrNotExist) {
   198  		args.symbol = platformPrefix(symbol)
   199  		tp, err = pmuKprobe(args)
   200  	}
   201  	if err == nil {
   202  		return tp, nil
   203  	}
   204  	if err != nil && !errors.Is(err, ErrNotSupported) {
   205  		return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err)
   206  	}
   208  	// Use tracefs if kprobe PMU is missing.
   209  	args.symbol = symbol
   210  	tp, err = tracefsKprobe(args)
   211  	if errors.Is(err, os.ErrNotExist) {
   212  		args.symbol = platformPrefix(symbol)
   213  		tp, err = tracefsKprobe(args)
   214  	}
   215  	if err != nil {
   216  		return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err)
   217  	}
   219  	return tp, nil
   220  }
// pmuKprobe opens a perf event based on the kprobe PMU.
// It is a thin wrapper around pmuProbe with the probe type fixed to
// kprobeType.
// Returns os.ErrNotExist if the given symbol does not exist in the kernel.
func pmuKprobe(args probeArgs) (*perfEvent, error) {
	return pmuProbe(kprobeType, args)
}
// pmuProbe opens a perf event based on a Performance Monitoring Unit.
//
// For kprobeType, args.symbol and args.offset identify the probe location;
// for uprobeType, args.path and args.offset do, and args.refCtrOffset is
// encoded into the event config when non-zero. args.ret selects a return
// probe, args.pid is passed to perf_event_open and args.cookie is stored on
// the returned perfEvent.
//
// Requires at least a 4.17 kernel.
// e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU"
// 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU"
//
// Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU
// (or when EINVAL is returned for a symbol containing a dot), and an error
// wrapping os.ErrNotExist when the kernel reports ENOENT, EINVAL or EILSEQ.
func pmuProbe(typ probeType, args probeArgs) (*perfEvent, error) {
	// Getting the PMU type will fail if the kernel doesn't support
	// the perf_[k,u]probe PMU.
	et, err := getPMUEventType(typ)
	if err != nil {
		return nil, err
	}

	// The retprobe flag lives at a PMU-specific bit position in config.
	var config uint64
	if args.ret {
		bit, err := typ.RetprobeBit()
		if err != nil {
			return nil, err
		}
		config |= 1 << bit
	}

	var (
		attr unix.PerfEventAttr
		sp   unsafe.Pointer
	)
	switch typ {
	case kprobeType:
		// Create a pointer to a NUL-terminated string for the kernel.
		sp, err = unsafeStringPtr(args.symbol)
		if err != nil {
			return nil, err
		}

		attr = unix.PerfEventAttr{
			// The minimum size required for PMU kprobes is PERF_ATTR_SIZE_VER1,
			// since it added the config2 (Ext2) field. Use Ext2 as probe_offset.
			Size:   unix.PERF_ATTR_SIZE_VER1,
			Type:   uint32(et),          // PMU event type read from sysfs
			Ext1:   uint64(uintptr(sp)), // Kernel symbol to trace
			Ext2:   args.offset,         // Kernel symbol offset
			Config: config,              // Retprobe flag
		}
	case uprobeType:
		// Same as above, but the string handed to the kernel is the path.
		sp, err = unsafeStringPtr(args.path)
		if err != nil {
			return nil, err
		}

		if args.refCtrOffset != 0 {
			config |= args.refCtrOffset << uprobeRefCtrOffsetShift
		}

		attr = unix.PerfEventAttr{
			// The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1,
			// since it added the config2 (Ext2) field. The Size field controls the
			// size of the internal buffer the kernel allocates for reading the
			// perf_event_attr argument from userspace.
			Size:   unix.PERF_ATTR_SIZE_VER1,
			Type:   uint32(et),          // PMU event type read from sysfs
			Ext1:   uint64(uintptr(sp)), // Uprobe path
			Ext2:   args.offset,         // Uprobe offset
			Config: config,              // RefCtrOffset, Retprobe flag
		}
	}

	rawFd, err := unix.PerfEventOpen(&attr, args.pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC)

	// The checks below are ordered: the dot-in-symbol case must be tested
	// before the generic EINVAL mapping, which would otherwise shadow it.

	// On some old kernels, kprobe PMU doesn't allow `.` in symbol names and
	// return -EINVAL. Return ErrNotSupported to allow falling back to tracefs.
	// https://github.com/torvalds/linux/blob/94710cac0ef4/kernel/trace/trace_kprobe.c#L340-L343
	if errors.Is(err, unix.EINVAL) && strings.Contains(args.symbol, ".") {
		return nil, fmt.Errorf("symbol '%s+%#x': older kernels don't accept dots: %w", args.symbol, args.offset, ErrNotSupported)
	}
	// Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
	// when trying to create a kretprobe for a missing symbol. Make sure ENOENT
	// is returned to the caller.
	if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
		return nil, fmt.Errorf("symbol '%s+%#x' not found: %w", args.symbol, args.offset, os.ErrNotExist)
	}
	// Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved
	// to an invalid insn boundary.
	if errors.Is(err, syscall.EILSEQ) {
		return nil, fmt.Errorf("symbol '%s+%#x' not found (bad insn boundary): %w", args.symbol, args.offset, os.ErrNotExist)
	}
	// Since at least commit cb9a19fe4aa51, ENOTSUPP is returned
	// when attempting to set a uprobe on a trap instruction.
	if errors.Is(err, unix.ENOTSUPP) {
		return nil, fmt.Errorf("failed setting uprobe on offset %#x (possible trap insn): %w", args.offset, err)
	}
	if err != nil {
		return nil, fmt.Errorf("opening perf event: %w", err)
	}

	// Ensure the string pointer is not collected before PerfEventOpen returns.
	runtime.KeepAlive(sp)

	// Wrap the raw file descriptor returned by perf_event_open.
	fd, err := sys.NewFD(rawFd)
	if err != nil {
		return nil, err
	}

	// Kernel has perf_[k,u]probe PMU available, initialize perf event.
	return &perfEvent{
		typ:    typ.PerfEventType(args.ret),
		name:   args.symbol,
		pmuID:  et,
		cookie: args.cookie,
		fd:     fd,
	}, nil
}
// tracefsKprobe creates a Kprobe tracefs entry.
// It is a thin wrapper around tracefsProbe with the probe type fixed to
// kprobeType.
func tracefsKprobe(args probeArgs) (*perfEvent, error) {
	return tracefsProbe(kprobeType, args)
}
   347  // tracefsProbe creates a trace event by writing an entry to <tracefs>/[k,u]probe_events.
   348  // A new trace event group name is generated on every call to support creating
   349  // multiple trace events for the same kernel or userspace symbol.
   350  // Path and offset are only set in the case of uprobe(s) and are used to set
   351  // the executable/library path on the filesystem and the offset where the probe is inserted.
   352  // A perf event is then opened on the newly-created trace event and returned to the caller.
   353  func tracefsProbe(typ probeType, args probeArgs) (_ *perfEvent, err error) {
   354  	// Generate a random string for each trace event we attempt to create.
   355  	// This value is used as the 'group' token in tracefs to allow creating
   356  	// multiple kprobe trace events with the same name.
   357  	group, err := randomGroup("ebpf")
   358  	if err != nil {
   359  		return nil, fmt.Errorf("randomizing group name: %w", err)
   360  	}
   361  	args.group = group
   363  	// Before attempting to create a trace event through tracefs,
   364  	// check if an event with the same group and name already exists.
   365  	// Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate
   366  	// entry, so we need to rely on reads for detecting uniqueness.
   367  	_, err = getTraceEventID(group, args.symbol)
   368  	if err == nil {
   369  		return nil, fmt.Errorf("trace event already exists: %s/%s", group, args.symbol)
   370  	}
   371  	if err != nil && !errors.Is(err, os.ErrNotExist) {
   372  		return nil, fmt.Errorf("checking trace event %s/%s: %w", group, args.symbol, err)
   373  	}
   375  	// Create the [k,u]probe trace event using tracefs.
   376  	if err := createTraceFSProbeEvent(typ, args); err != nil {
   377  		return nil, fmt.Errorf("creating probe entry on tracefs: %w", err)
   378  	}
   379  	defer func() {
   380  		if err != nil {
   381  			// Make sure we clean up the created tracefs event when we return error.
   382  			// If a livepatch handler is already active on the symbol, the write to
   383  			// tracefs will succeed, a trace event will show up, but creating the
   384  			// perf event will fail with EBUSY.
   385  			_ = closeTraceFSProbeEvent(typ, args.group, args.symbol)
   386  		}
   387  	}()
   389  	// Get the newly-created trace event's id.
   390  	tid, err := getTraceEventID(group, args.symbol)
   391  	if err != nil {
   392  		return nil, fmt.Errorf("getting trace event id: %w", err)
   393  	}
   395  	// Kprobes are ephemeral tracepoints and share the same perf event type.
   396  	fd, err := openTracepointPerfEvent(tid, args.pid)
   397  	if err != nil {
   398  		return nil, err
   399  	}
   401  	return &perfEvent{
   402  		typ:       typ.PerfEventType(args.ret),
   403  		group:     group,
   404  		name:      args.symbol,
   405  		tracefsID: tid,
   406  		cookie:    args.cookie,
   407  		fd:        fd,
   408  	}, nil
   409  }
   411  // createTraceFSProbeEvent creates a new ephemeral trace event by writing to
   412  // <tracefs>/[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid
   413  // kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist
   414  // if a probe with the same group and symbol already exists.
   415  func createTraceFSProbeEvent(typ probeType, args probeArgs) error {
   416  	// Open the kprobe_events file in tracefs.
   417  	f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
   418  	if err != nil {
   419  		return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err)
   420  	}
   421  	defer f.Close()
   423  	var pe, token string
   424  	switch typ {
   425  	case kprobeType:
   426  		// The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt):
   427  		// p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
   428  		// r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
   429  		// -:[GRP/]EVENT                                        : Clear a probe
   430  		//
   431  		// Some examples:
   432  		// r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy
   433  		// p:ebpf_5678/p_my_kprobe __x64_sys_execve
   434  		//
   435  		// Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the
   436  		// kernel default to NR_CPUS. This is desired in most eBPF cases since
   437  		// subsampling or rate limiting logic can be more accurately implemented in
   438  		// the eBPF program itself.
   439  		// See Documentation/kprobes.txt for more details.
   440  		token = kprobeToken(args)
   441  		pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, sanitizeSymbol(args.symbol), token)
   442  	case uprobeType:
   443  		// The uprobe_events syntax is as follows:
   444  		// p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe
   445  		// r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe
   446  		// -:[GRP/]EVENT                           : Clear a probe
   447  		//
   448  		// Some examples:
   449  		// r:ebpf_1234/readline /bin/bash:0x12345
   450  		// p:ebpf_5678/main_mySymbol /bin/mybin:0x12345(0x123)
   451  		//
   452  		// See Documentation/trace/uprobetracer.txt for more details.
   453  		token = uprobeToken(args)
   454  		pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.ret), args.group, args.symbol, token)
   455  	}
   456  	_, err = f.WriteString(pe)
   457  	// Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL
   458  	// when trying to create a kretprobe for a missing symbol. Make sure ENOENT
   459  	// is returned to the caller.
   460  	// EINVAL is also returned on pre-5.2 kernels when the `SYM[+offs]` token
   461  	// is resolved to an invalid insn boundary.
   462  	if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) {
   463  		return fmt.Errorf("token %s: %w", token, os.ErrNotExist)
   464  	}
   465  	// Since commit ab105a4fb894, -EILSEQ is returned when a kprobe sym+offset is resolved
   466  	// to an invalid insn boundary.
   467  	if errors.Is(err, syscall.EILSEQ) {
   468  		return fmt.Errorf("token %s: bad insn boundary: %w", token, os.ErrNotExist)
   469  	}
   470  	// ERANGE is returned when the `SYM[+offs]` token is too big and cannot
   471  	// be resolved.
   472  	if errors.Is(err, syscall.ERANGE) {
   473  		return fmt.Errorf("token %s: offset too big: %w", token, os.ErrNotExist)
   474  	}
   475  	if err != nil {
   476  		return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
   477  	}
   479  	return nil
   480  }
   482  // closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol
   483  // from <tracefs>/[k,u]probe_events.
   484  func closeTraceFSProbeEvent(typ probeType, group, symbol string) error {
   485  	f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666)
   486  	if err != nil {
   487  		return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err)
   488  	}
   489  	defer f.Close()
   491  	// See [k,u]probe_events syntax above. The probe type does not need to be specified
   492  	// for removals.
   493  	pe := fmt.Sprintf("-:%s/%s", group, sanitizeSymbol(symbol))
   494  	if _, err = f.WriteString(pe); err != nil {
   495  		return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err)
   496  	}
   498  	return nil
   499  }
   501  // randomGroup generates a pseudorandom string for use as a tracefs group name.
   502  // Returns an error when the output string would exceed 63 characters (kernel
   503  // limitation), when rand.Read() fails or when prefix contains characters not
   504  // allowed by isValidTraceID.
   505  func randomGroup(prefix string) (string, error) {
   506  	if !isValidTraceID(prefix) {
   507  		return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput)
   508  	}
   510  	b := make([]byte, 8)
   511  	if _, err := rand.Read(b); err != nil {
   512  		return "", fmt.Errorf("reading random bytes: %w", err)
   513  	}
   515  	group := fmt.Sprintf("%s_%x", prefix, b)
   516  	if len(group) > 63 {
   517  		return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput)
   518  	}
   520  	return group, nil
   521  }
   523  func probePrefix(ret bool) string {
   524  	if ret {
   525  		return "r"
   526  	}
   527  	return "p"
   528  }
   530  // determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit
   531  // from /sys/bus/event_source/devices/<pmu>/format/retprobe.
   532  func determineRetprobeBit(typ probeType) (uint64, error) {
   533  	p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe")
   535  	data, err := os.ReadFile(p)
   536  	if err != nil {
   537  		return 0, err
   538  	}
   540  	var rp uint64
   541  	n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp)
   542  	if err != nil {
   543  		return 0, fmt.Errorf("parse retprobe bit: %w", err)
   544  	}
   545  	if n != 1 {
   546  		return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n)
   547  	}
   549  	return rp, nil
   550  }
   552  func kretprobeBit() (uint64, error) {
   553  	kprobeRetprobeBit.once.Do(func() {
   554  		kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType)
   555  	})
   556  	return kprobeRetprobeBit.value, kprobeRetprobeBit.err
   557  }
   559  // kprobeToken creates the SYM[+offs] token for the tracefs api.
   560  func kprobeToken(args probeArgs) string {
   561  	po := args.symbol
   563  	if args.offset != 0 {
   564  		po += fmt.Sprintf("+%#x", args.offset)
   565  	}
   567  	return po
   568  }

View as plain text