diff --git a/cmd/gcs/cgroup_manager_test.go b/cmd/gcs/cgroup_manager_test.go new file mode 100644 index 0000000000..f950cdcc4f --- /dev/null +++ b/cmd/gcs/cgroup_manager_test.go @@ -0,0 +1,122 @@ +//go:build linux +// +build linux + +package main + +import ( + "testing" + + cgroups2stats "github.com/containerd/cgroups/v3/cgroup2/stats" + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestCgroupManagerInterface_Compatibility(t *testing.T) { + // Test that both managers implement the CgroupManager interface. + // Compile-time check: these assignments will fail to compile if the types + // do not satisfy CgroupManager. + var _ CgroupManager = &V1Manager{} + var _ CgroupManager = &V2Manager{} +} + +func TestConvertToV2Resources_Basic(t *testing.T) { + limit := int64(1024 * 1024 * 1024) // 1GB + + ociResources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &limit, + }, + } + + v2Resources := convertToV2Resources(ociResources) + + if v2Resources == nil { + t.Fatal("v2Resources should not be nil") + } + if v2Resources.Memory == nil { + t.Fatal("v2Resources.Memory should not be nil") + } + if v2Resources.Memory.Max == nil || *v2Resources.Memory.Max != limit { + t.Errorf("Expected memory max %d, got %v", limit, v2Resources.Memory.Max) + } +} + +func TestIsCgroupV2_Detection(t *testing.T) { + // Test the cgroup version detection + result1 := isCgroupV2() + result2 := isCgroupV2() + + // Should be consistent + if result1 != result2 { + t.Error("isCgroupV2() should return consistent results") + } + + t.Logf("Detected cgroup version: v%d", map[bool]int{false: 1, true: 2}[result1]) +} + +// Additional Error Handling and Edge Cases +func TestCgroupManager_InvalidPath(t *testing.T) { + // Use a path that cannot be created due to read-only filesystem constraints + v2mgr := &V2Manager{path: "/proc/nonexistent/path"} + err := v2mgr.Create(1234) + if err == nil { + t.Error("Expected error for invalid cgroup path, got nil") + } else { + t.Logf("Expected error for invalid path: %v", err) + } +} + +func TestCgroupManager_PermissionDenied(t *testing.T) { + // Test with root path that should require permissions + v2mgr := &V2Manager{path: "/sys/fs/cgroup/test-permission-denied"} + err := v2mgr.Create(1234) + // Error is expected, just ensure it doesn't panic + if err != nil { + t.Logf("Expected permission error: %v", err) + } +} + +func TestConvertV2StatsToV1_InvalidInput(t *testing.T) { + // Test with nil input + result := convertV2StatsToV1Stats(nil) + if result == nil { + t.Error("convertV2StatsToV1Stats should handle nil input gracefully") + } + + // Test with empty metrics + emptyMetrics := &cgroups2stats.Metrics{} + result = convertV2StatsToV1Stats(emptyMetrics) + if result == nil { + t.Error("convertV2StatsToV1Stats should handle empty metrics") + } +} + +func TestConvertToV2Resources_ExtremeLimits(t *testing.T) { + // Test with maximum possible values + maxLimit := int64(^uint64(0) >> 1) // Max int64 + ociResources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &maxLimit, + }, + Pids: &oci.LinuxPids{ + Limit: maxLimit, + }, + } + + v2Resources := convertToV2Resources(ociResources) + if v2Resources == nil { + t.Fatal("v2Resources should not be nil") + } + + // Test with zero values + zeroLimit := int64(0) + ociResourcesZero := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &zeroLimit, + }, + } + + v2ResourcesZero := convertToV2Resources(ociResourcesZero) + if v2ResourcesZero == nil || v2ResourcesZero.Memory == nil { + t.Error("Should handle zero limit values") + } +} diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 36ae1991b6..649dca02b3 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -10,17 +10,21 @@ import ( "os" "os/exec" "path/filepath" + "strconv" "strings" "syscall" "time" "github.com/cenkalti/backoff/v4" - cgroups "github.com/containerd/cgroups/v3/cgroup1" - cgroupstats "github.com/containerd/cgroups/v3/cgroup1/stats" + cgroups1 "github.com/containerd/cgroups/v3/cgroup1" + cgroups1stats "github.com/containerd/cgroups/v3/cgroup1/stats" + cgroups2 "github.com/containerd/cgroups/v3/cgroup2" + cgroups2stats "github.com/containerd/cgroups/v3/cgroup2/stats" oci "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" "go.opencensus.io/trace" + "golang.org/x/sys/unix" "github.com/Microsoft/hcsshim/internal/guest/bridge" "github.com/Microsoft/hcsshim/internal/guest/kmsg" @@ -34,7 +38,416 @@ import ( "github.com/Microsoft/hcsshim/pkg/securitypolicy" ) -func memoryLogFormat(metrics *cgroupstats.Metrics) logrus.Fields { +// isCgroupV2 checks if cgroup v2 is available on the system +func isCgroupV2() bool { + // Check if cgroup v2 was disabled via kernel parameter + if isCgroupV2DisabledByKernel() { + return false + } + + _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers") + return err == nil +} + +// isCgroupV2DisabledByKernel checks if cgroup v2 was disabled via kernel parameters +func isCgroupV2DisabledByKernel() bool { + cmdline, err := os.ReadFile("/proc/cmdline") + if err != nil { + return false + } + + cmdlineStr := string(cmdline) + if strings.Contains(cmdlineStr, "cgroup_no_v2=all") { + logrus.Info("cgroup v2 disabled by kernel parameter cgroup_no_v2=all") + return true + } + + return false +} + +// CgroupManager provides a unified interface for cgroup v1 and v2 operations +type CgroupManager interface { + Create(pid int) error + Delete() error + Stats() (*cgroups1stats.Metrics, error) + Update(resources *oci.LinuxResources) error + AddTask(pid int) error + Add(process cgroups1.Process, names ...cgroups1.Name) error + RegisterMemoryEvent(event cgroups1.MemoryEvent) (uintptr, error) + OOMEventFD() (uintptr, error) + // GetV1Cgroup returns the underlying v1 cgroup if available, nil for v2 + GetV1Cgroup() cgroups1.Cgroup + // GetV2Manager returns the underlying v2 manager if available, nil for v1 + GetV2Manager() *cgroups2.Manager +} + +// V1Manager wraps cgroup v1 operations +type V1Manager struct { + cg cgroups1.Cgroup +} + +func (v *V1Manager) Create(pid int) error { + return v.cg.Add(cgroups1.Process{Pid: pid}) +} + +func (v *V1Manager) Stats() (*cgroups1stats.Metrics, error) { + return v.cg.Stat(cgroups1.IgnoreNotExist) +} + +func (v *V1Manager) Update(resources *oci.LinuxResources) error { + return v.cg.Update(resources) +} + +func (v *V1Manager) AddTask(pid int) error { + return v.cg.AddTask(cgroups1.Process{Pid: pid}) +} + +func (v *V1Manager) Add(process cgroups1.Process, names ...cgroups1.Name) error { + return v.cg.Add(process, names...) +} + +func (v *V1Manager) Delete() error { + return v.cg.Delete() +} + +func (v *V1Manager) RegisterMemoryEvent(event cgroups1.MemoryEvent) (uintptr, error) { + return v.cg.RegisterMemoryEvent(event) +} + +func (v *V1Manager) OOMEventFD() (uintptr, error) { + return v.cg.OOMEventFD() +} + +func (v *V1Manager) GetV1Cgroup() cgroups1.Cgroup { + return v.cg +} + +func (v *V1Manager) GetV2Manager() *cgroups2.Manager { + return nil +} + +// V2Manager wraps cgroup v2 operations +type V2Manager struct { + mgr cgroups2.Manager + path string +} + +func (v *V2Manager) Create(pid int) error { + return v.mgr.AddProc(uint64(pid)) +} + +func (v *V2Manager) Stats() (*cgroups1stats.Metrics, error) { + v2Stats, err := v.mgr.Stat() + if err != nil { + return nil, err + } + // Convert v2 stats to v1 stats format for compatibility + return convertV2StatsToV1Stats(v2Stats), nil +} + +func (v *V2Manager) Update(resources *oci.LinuxResources) error { + v2Resources := convertToV2Resources(resources) + return v.mgr.Update(v2Resources) +} + +func (v *V2Manager) AddTask(pid int) error { + return v.mgr.AddProc(uint64(pid)) +} + +func (v *V2Manager) Add(process cgroups1.Process, names ...cgroups1.Name) error { + // Convert v1 Process to v2 process (v2 doesn't use subsystem names) + return v.mgr.AddProc(uint64(process.Pid)) +} + +func (v *V2Manager) Delete() error { + return v.mgr.Delete() +} + +func (v *V2Manager) RegisterMemoryEvent(event cgroups1.MemoryEvent) (uintptr, error) { + // Construct the full cgroup path + fullPath := filepath.Join("/sys/fs/cgroup", strings.TrimPrefix(v.path, "/")) + + // Check if the cgroup path exists first + if _, err := os.Stat(fullPath); os.IsNotExist(err) { + // Try to create the cgroup directory if it doesn't exist + if err := os.MkdirAll(fullPath, 0755); err != nil { + return 0, errors.Wrapf(err, "cgroup path does not exist and cannot be created: %s", v.path) + } + logrus.WithField("path", v.path).Info("Created missing cgroup directory") + } + + // Create eventfd for notifications + fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + return 0, errors.Wrap(err, "failed to create eventfd for v2 memory event") + } + + // Extract threshold from event - use method call with proper error handling + var thresholdStr string + defer func() { + if r := recover(); r != nil { + // If Arg() panics, use default threshold + thresholdStr = "" + } + }() + thresholdStr = event.Arg() + + threshold, err := strconv.ParseUint(thresholdStr, 10, 64) + if err != nil { + // Default to 50MB if parsing fails + threshold = 50 * 1024 * 1024 + logrus.WithError(err).WithField("arg", thresholdStr).Warn("Failed to parse threshold from event, using default") + } + + // Start background monitoring goroutine for cgroup v2 + go func() { + var lastMemoryUsage uint64 + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + logrus.WithFields(logrus.Fields{ + "cgroup_version": "v2", + "path": v.path, + "threshold": threshold, + }).Info("Started cgroup v2 memory threshold monitoring") + + for range ticker.C { + // Read current memory usage + currentUsage, err := readMemoryCurrentV2(v.path) + if err != nil { + logrus.WithError(err).WithField("path", v.path).Debug("failed to read memory.current") + continue + } + + // Check if threshold crossed (and it's an increase) + if currentUsage > threshold && currentUsage > lastMemoryUsage { + // Write to eventfd to notify readMemoryEvents + if _, err := unix.Write(fd, []byte{1, 0, 0, 0, 0, 0, 0, 0}); err != nil { + logrus.WithError(err).Debug("failed to write to eventfd") + } else { + logrus.WithFields(logrus.Fields{ + "current_usage": currentUsage, + "threshold": threshold, + "path": v.path, + }).Info("cgroup v2 memory threshold crossed, eventfd notified") + } + } + lastMemoryUsage = currentUsage + } + }() + + logrus.WithFields(logrus.Fields{ + "cgroup_version": "v2", + "event_type": "memory_threshold", + "path": v.path, + }).Info("Created cgroup v2 memory event monitoring (active)") + return uintptr(fd), nil +} + +func (v *V2Manager) OOMEventFD() (uintptr, error) { + // Create eventfd for OOM notifications + fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + return 0, errors.Wrap(err, "failed to create eventfd for v2 OOM event") + } + + // Start background OOM monitoring goroutine for cgroup v2 + go func() { + var lastOOMKillCount uint64 + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + logrus.WithFields(logrus.Fields{ + "cgroup_version": "v2", + "path": v.path, + }).Info("Started cgroup v2 OOM monitoring") + + for range ticker.C { + // Read memory.events file + eventsPath := filepath.Join("/sys/fs/cgroup", v.path, "memory.events") + data, err := os.ReadFile(eventsPath) + if err != nil { + logrus.WithError(err).WithField("path", v.path).Debug("failed to read memory.events") + continue + } + + // Parse events and check for OOM kills + events := parseMemoryEvents(string(data)) + oomKillCount := events["oom_kill"] + + // Check if new OOM kill occurred + if oomKillCount > lastOOMKillCount { + // Write to eventfd to notify readMemoryEvents + if _, err := unix.Write(fd, []byte{1, 0, 0, 0, 0, 0, 0, 0}); err != nil { + logrus.WithError(err).Debug("failed to write to OOM eventfd") + } else { + logrus.WithFields(logrus.Fields{ + "oom_kill_count": oomKillCount, + "path": v.path, + }).Warn("cgroup v2 OOM kill detected, eventfd notified") + } + lastOOMKillCount = oomKillCount + } + } + }() + + logrus.WithFields(logrus.Fields{ + "cgroup_version": "v2", + "event_type": "oom", + "path": v.path, + }).Info("Created cgroup v2 OOM event monitoring (active)") + return uintptr(fd), nil +} + +func (v *V2Manager) GetV1Cgroup() cgroups1.Cgroup { + return nil +} + +func (v *V2Manager) GetV2Manager() *cgroups2.Manager { + return &v.mgr +} + +// parseMemoryEvents parses cgroup v2 memory.events file +// Format: "low 0\nhigh 5\nmax 0\noom 0\noom_kill 0\noom_group_kill 0\n" +func parseMemoryEvents(content string) map[string]uint64 { + events := make(map[string]uint64) + lines := strings.Split(strings.TrimSpace(content), "\n") + for _, line := range lines { + parts := strings.Fields(line) + if len(parts) == 2 { + if val, err := strconv.ParseUint(parts[1], 10, 64); err == nil { + events[parts[0]] = val + } + } + } + return events +} + +// readMemoryCurrentV2 reads current memory usage from cgroup v2 +func readMemoryCurrentV2(cgroupPath string) (uint64, error) { + filePath := filepath.Join("/sys/fs/cgroup", cgroupPath, "memory.current") + data, err := os.ReadFile(filePath) + if err != nil { + return 0, err + } + return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) +} + +// convertV2StatsToV1Stats converts cgroup v2 stats to v1 stats format +func convertV2StatsToV1Stats(v2Stats *cgroups2stats.Metrics) *cgroups1stats.Metrics { + if v2Stats == nil { + return &cgroups1stats.Metrics{} + } + + v1Stats := &cgroups1stats.Metrics{} + + if v2Stats.Memory != nil { + v1Stats.Memory = &cgroups1stats.MemoryStat{ + Usage: &cgroups1stats.MemoryEntry{ + Usage: v2Stats.Memory.Usage, + Limit: v2Stats.Memory.UsageLimit, + Max: v2Stats.Memory.MaxUsage, + }, + Swap: &cgroups1stats.MemoryEntry{ + Usage: v2Stats.Memory.SwapUsage, + Limit: v2Stats.Memory.SwapLimit, + Max: v2Stats.Memory.SwapMaxUsage, + }, + HierarchicalMemoryLimit: v2Stats.Memory.UsageLimit, + HierarchicalSwapLimit: v2Stats.Memory.SwapLimit, + RSS: v2Stats.Memory.Anon, + Cache: v2Stats.Memory.File, + MappedFile: v2Stats.Memory.FileMapped, + Dirty: v2Stats.Memory.FileDirty, + Writeback: v2Stats.Memory.FileWriteback, + PgFault: v2Stats.Memory.Pgfault, + PgMajFault: v2Stats.Memory.Pgmajfault, + InactiveAnon: v2Stats.Memory.InactiveAnon, + ActiveAnon: v2Stats.Memory.ActiveAnon, + InactiveFile: v2Stats.Memory.InactiveFile, + ActiveFile: v2Stats.Memory.ActiveFile, + Unevictable: v2Stats.Memory.Unevictable, + } + } + + if v2Stats.CPU != nil { + v1Stats.CPU = &cgroups1stats.CPUStat{ + Usage: &cgroups1stats.CPUUsage{ + Total: v2Stats.CPU.UsageUsec * 1000, // Convert usec to nsec + User: v2Stats.CPU.UserUsec * 1000, // Convert usec to nsec + Kernel: v2Stats.CPU.SystemUsec * 1000, // Convert usec to nsec + }, + Throttling: &cgroups1stats.Throttle{ + Periods: v2Stats.CPU.NrPeriods, + ThrottledPeriods: v2Stats.CPU.NrThrottled, + ThrottledTime: v2Stats.CPU.ThrottledUsec * 1000, // Convert usec to nsec + }, + } + } + + if v2Stats.Pids != nil { + v1Stats.Pids = &cgroups1stats.PidsStat{ + Current: v2Stats.Pids.Current, + Limit: v2Stats.Pids.Limit, + } + } + + return v1Stats +} + +// convertToV2Resources converts oci.LinuxResources to cgroups2.Resources +func convertToV2Resources(resources *oci.LinuxResources) *cgroups2.Resources { + if resources == nil { + return &cgroups2.Resources{} + } + v2Resources := &cgroups2.Resources{} + + // Convert memory settings + if resources.Memory != nil { + if resources.Memory.Limit != nil { + v2Resources.Memory = &cgroups2.Memory{ + Max: resources.Memory.Limit, + } + } + } + + // Convert CPU settings + if resources.CPU != nil { + v2Resources.CPU = &cgroups2.CPU{} + if resources.CPU.Shares != nil { + // Convert CPU shares to weight (cgroup v2 uses weight instead of shares) + // Formula: weight = 1 + (shares - 2) * 9999 / 262142 + weight := uint64(1 + (*resources.CPU.Shares-2)*9999/262142) + v2Resources.CPU.Weight = &weight + } + } + + return v2Resources +} + +// createCgroupManager creates appropriate cgroup manager based on system version +func createCgroupManager(path string, resources *oci.LinuxResources) (CgroupManager, error) { + if isCgroupV2() { + logrus.Info("Creating cgroup v2 manager for path: " + path) + // Create cgroup v2 manager with converted resources + v2Resources := convertToV2Resources(resources) + mgr, err := cgroups2.NewManager("/sys/fs/cgroup", path, v2Resources) + if err != nil { + return nil, errors.Wrap(err, "failed to create cgroup v2 manager") + } + return &V2Manager{mgr: *mgr, path: path}, nil + } else { + logrus.Info("Creating cgroup v1 manager for path: " + path) + // Create cgroup v1 manager + cg, err := cgroups1.New(cgroups1.StaticPath(path), resources) + if err != nil { + return nil, errors.Wrap(err, "failed to create cgroup v1 manager") + } + return &V1Manager{cg: cg}, nil + } +} + +func memoryLogFormat(metrics *cgroups1stats.Metrics) logrus.Fields { return logrus.Fields{ "memoryUsage": metrics.Memory.Usage.Usage, "memoryUsageMax": metrics.Memory.Usage.Max, @@ -48,7 +461,7 @@ func memoryLogFormat(metrics *cgroupstats.Metrics) logrus.Fields { } } -func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, threshold int64, cg cgroups.Cgroup) { +func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, threshold int64, cg cgroups1.Cgroup) { // Buffer must be >= 8 bytes for eventfd reads // http://man7.org/linux/man-pages/man2/eventfd.2.html count := 0 @@ -59,39 +472,59 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre return } - // Sometimes an event is sent during cgroup teardown, but does not indicate that the - // threshold was actually crossed. In the teardown case the cgroup.event_control file - // won't exist anymore, so check that to determine if we should ignore this event. - _, err := os.Lstat(fmt.Sprintf("/sys/fs/cgroup/memory%s/cgroup.event_control", cgName)) - if os.IsNotExist(err) { - return + // For cgroup v1, check if event_control file still exists (for teardown detection) + // For cgroup v2, we skip this check since event_control doesn't exist + if cg != nil { + // cgroup v1 path + _, err := os.Lstat(fmt.Sprintf("/sys/fs/cgroup/memory%s/cgroup.event_control", cgName)) + if os.IsNotExist(err) { + return + } } count++ var msg string if strings.HasPrefix(cgName, "/virtual-pods") { msg = "memory usage for virtual pods cgroup exceeded threshold" + } else if strings.Contains(cgName, "oom") || strings.Contains(cgName, "OOM") { + msg = "OOM event occurred in cgroup" } else { msg = "memory usage for cgroup exceeded threshold" } + entry := logrus.WithFields(logrus.Fields{ "gcsStartTime": startTime, "time": time.Now(), "cgroup": cgName, "thresholdBytes": threshold, "count": count, + "cgroup_version": func() string { + if cg != nil { + return "v1" + } else { + return "v2" + } + }(), }) + // Sleep for one second in case there is a series of allocations slightly after // reaching threshold. time.Sleep(time.Second) - metrics, err := cg.Stat(cgroups.IgnoreNotExist) - if err != nil { - // Don't return on Stat err as it will return an error if - // any of the cgroup subsystems Stat calls failed for any reason. - // We still want to log if we hit the cgroup threshold/limit - entry.WithError(err).Error(msg) + + if cg != nil { + // cgroup v1: get detailed stats + metrics, err := cg.Stat(cgroups1.IgnoreNotExist) + if err != nil { + // Don't return on Stat err as it will return an error if + // any of the cgroup subsystems Stat calls failed for any reason. + // We still want to log if we hit the cgroup threshold/limit + entry.WithError(err).Error(msg) + } else { + entry.WithFields(memoryLogFormat(metrics)).Warn(msg) + } } else { - entry.WithFields(memoryLogFormat(metrics)).Warn(msg) + // cgroup v2: simpler logging without detailed metrics + entry.Warn(msg) } } } @@ -284,6 +717,13 @@ func main() { "version": version.Version, }).Info("GCS started") + // Log which cgroup version is detected and will be used + if isCgroupV2() { + logrus.Info("cgroup v2 detected by GCS - using v2 API") + } else { + logrus.Info("cgroup v1 detected by GCS - using v1 API") + } + // Set the process core dump location. This will be global to all containers as it's a kernel configuration. // If no path is specified core dumps will just be placed in the working directory of wherever the process // was invoked to a file named "core". @@ -307,9 +747,13 @@ func main() { // Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy // support. This needs to be set before we create any cgroups as the write - // will fail otherwise. - if err := os.WriteFile("/sys/fs/cgroup/memory/memory.use_hierarchy", []byte("1"), 0644); err != nil { - logrus.WithError(err).Fatal("failed to enable hierarchy support for root cgroup") + // will fail otherwise. This is only needed for cgroup v1. + if !isCgroupV2() { + if err := os.WriteFile("/sys/fs/cgroup/memory/memory.use_hierarchy", []byte("1"), 0644); err != nil { + logrus.WithError(err).Fatal("failed to enable hierarchy support for root cgroup") + } + } else { + logrus.Info("cgroup v2 detected - hierarchy always enabled, skipping memory.use_hierarchy") } // The containers cgroup is limited only by {Totalram - 75 MB @@ -322,7 +766,7 @@ func main() { logrus.WithError(err).Fatal("failed to get sys info") } containersLimit := int64(sinfo.Totalram - *rootMemReserveBytes) - containersControl, err := cgroups.New(cgroups.StaticPath("/containers"), &oci.LinuxResources{ + containersControl, err := createCgroupManager("/containers", &oci.LinuxResources{ Memory: &oci.LinuxMemory{ Limit: &containersLimit, }, @@ -334,7 +778,7 @@ func main() { // Create virtual-pods cgroup hierarchy for multi-pod support // This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID} - virtualPodsControl, err := cgroups.New(cgroups.StaticPath("/containers/virtual-pods"), &oci.LinuxResources{ + virtualPodsControl, err := createCgroupManager("/containers/virtual-pods", &oci.LinuxResources{ Memory: &oci.LinuxMemory{ Limit: &containersLimit, // Share the same limit as containers }, @@ -344,12 +788,12 @@ func main() { } defer virtualPodsControl.Delete() //nolint:errcheck - gcsControl, err := cgroups.New(cgroups.StaticPath("/gcs"), &oci.LinuxResources{}) + gcsControl, err := createCgroupManager("/gcs", &oci.LinuxResources{}) if err != nil { logrus.WithError(err).Fatal("failed to create gcs cgroup") } defer gcsControl.Delete() //nolint:errcheck - if err := gcsControl.Add(cgroups.Process{Pid: os.Getpid()}); err != nil { + if err := gcsControl.Add(cgroups1.Process{Pid: os.Getpid()}); err != nil { logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup") } @@ -365,7 +809,9 @@ func main() { } h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) // Initialize virtual pod support in the host - h.InitializeVirtualPodSupport(virtualPodsControl) + if err := h.InitializeVirtualPodSupport(virtualPodsControl); err != nil { + logrus.WithError(err).Warn("Virtual pod support initialization failed") + } b.AssignHandlers(mux, h) var bridgeIn io.ReadCloser @@ -386,7 +832,7 @@ func main() { bridgeOut = bridgeCon } - event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false) + event := cgroups1.MemoryThresholdEvent(*gcsMemLimitBytes, false) gefd, err := gcsControl.RegisterMemoryEvent(event) if err != nil { logrus.WithError(err).Fatal("failed to register memory threshold for gcs cgroup") @@ -416,9 +862,37 @@ func main() { } } - go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) - go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) - go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl) + go func() { + if v1Cgroup := gcsControl.GetV1Cgroup(); v1Cgroup != nil { + // cgroup v1: use existing readMemoryEvents function + readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), v1Cgroup) + } else { + // cgroup v2: use same readMemoryEvents but with polling-based eventfd + logrus.Info("Memory events for /gcs enabled for cgroup v2 with polling") + // Create a dummy cgroup for the readMemoryEvents function (it won't be used for v2) + readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), nil) + } + }() + go func() { + if v1Cgroup := containersControl.GetV1Cgroup(); v1Cgroup != nil { + // cgroup v1: use existing readMemoryEvents function + readMemoryEvents(startTime, oomFile, "/containers", containersLimit, v1Cgroup) + } else { + // cgroup v2: use same readMemoryEvents but with polling-based eventfd + logrus.Info("Memory events for /containers enabled for cgroup v2 with polling") + readMemoryEvents(startTime, oomFile, "/containers", containersLimit, nil) + } + }() + go func() { + if v1Cgroup := virtualPodsControl.GetV1Cgroup(); v1Cgroup != nil { + // cgroup v1: use existing readMemoryEvents function + readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, v1Cgroup) + } else { + // cgroup v2: use same readMemoryEvents but with polling-based eventfd + logrus.Info("Memory events for /containers/virtual-pods enabled for cgroup v2 with polling") + readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, nil) + } + }() err = b.ListenAndServe(bridgeIn, bridgeOut) if err != nil { logrus.WithFields(logrus.Fields{ diff --git a/cmd/gcs/memory_event_test.go b/cmd/gcs/memory_event_test.go new file mode 100644 index 0000000000..7fbcb78bbd --- /dev/null +++ b/cmd/gcs/memory_event_test.go @@ -0,0 +1,173 @@ +//go:build linux +// +build linux + +package main + +import ( + "os" + "path/filepath" + "strconv" + "testing" + "time" + + cgroups1 "github.com/containerd/cgroups/v3/cgroup1" + cgroups2 "github.com/containerd/cgroups/v3/cgroup2" +) + +func TestMemoryEventMonitoring_V2(t *testing.T) { + if !isCgroupV2() { + t.Skip("Skipping cgroup v2 test on v1 system") + } + + // Create a temporary cgroup path for testing + testPath := filepath.Join("/sys/fs/cgroup", "test-memory-event-"+strconv.FormatInt(time.Now().UnixNano(), 10)) + + // Clean up after test + defer func() { + if _, err := os.Stat(testPath); err == nil { + os.RemoveAll(testPath) + } + }() + + v2mgr := &V2Manager{path: testPath} + + // Test memory event registration with empty event struct + // Note: In a real implementation, this would use actual cgroup event configuration + var event cgroups1.MemoryEvent + _, err := v2mgr.RegisterMemoryEvent(event) + + // Should not panic even if path doesn't exist + if err != nil { + t.Logf("Expected error for non-existent cgroup: %v", err) + } +} + +func TestOOMEventFD_V2_Integration(t *testing.T) { + if !isCgroupV2() { + t.Skip("Skipping cgroup v2 test on v1 system") + } + + v2mgr := &V2Manager{path: "/sys/fs/cgroup/test-oom"} + + // Test OOM event FD creation + fd, err := v2mgr.OOMEventFD() + if err != nil { + t.Logf("Expected error for test cgroup: %v", err) + return + } + + if fd == 0 { + t.Error("OOMEventFD should return valid file descriptor") + } +} + +func TestMemoryThresholdDetection_V2(t *testing.T) { + if !isCgroupV2() { + t.Skip("Skipping cgroup v2 test on v1 system") + } + + // Test threshold detection with various limits + testCases := []struct { + name string + limit int64 + current uint64 + expectHit bool + }{ + { + name: "Under threshold", + limit: 1024 * 1024 * 1024, // 1GB + current: 512 * 1024 * 1024, // 512MB + expectHit: false, + }, + { + name: "Over threshold", + limit: 1024 * 1024 * 1024, // 1GB + current: 2048 * 1024 * 1024, // 2GB + expectHit: true, + }, + { + name: "Exact threshold", + limit: 1024 * 1024 * 1024, // 1GB + current: 1024 * 1024 * 1024, // 1GB + expectHit: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + thresholdHit := tc.current >= uint64(tc.limit) + if thresholdHit != tc.expectHit { + t.Errorf("Expected threshold hit %v, got %v", tc.expectHit, thresholdHit) + } + }) + } +} + +func TestMemoryEvent_ErrorHandling(t *testing.T) { + // Test with invalid path + invalidMgr := &V2Manager{path: "/invalid/path"} + + var event cgroups1.MemoryEvent + _, err := invalidMgr.RegisterMemoryEvent(event) + if err == nil { + t.Error("Expected error for invalid cgroup path") + } + + // Test with valid manager - should work or give expected error + validMgr := &V2Manager{path: "/sys/fs/cgroup"} + _, err = validMgr.RegisterMemoryEvent(event) + if err != nil { + t.Logf("Expected error for test environment: %v", err) + } +} + +func TestCgroupV2ResourceCreation(t *testing.T) { + if !isCgroupV2() { + t.Skip("Skipping cgroup v2 test on v1 system") + } + + // Test creating v2 resources with various configurations + testCases := []struct { + name string + memory *int64 + pids *int64 + cpu *int64 + }{ + { + name: "Memory only", + memory: func() *int64 { v := int64(1024 * 1024 * 1024); return &v }(), // 1GB + }, + { + name: "PIDs only", + pids: func() *int64 { v := int64(1000); return &v }(), + }, + { + name: "All resources", + memory: func() *int64 { v := int64(2048 * 1024 * 1024); return &v }(), // 2GB + pids: func() *int64 { v := int64(500); return &v }(), + cpu: func() *int64 { v := int64(100000); return &v }(), // 100ms + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + resources := &cgroups2.Resources{} + + if tc.memory != nil { + resources.Memory = &cgroups2.Memory{Max: tc.memory} + } + if tc.pids != nil { + resources.Pids = &cgroups2.Pids{Max: *tc.pids} + } + if tc.cpu != nil { + // CPU limit in cgroup v2 uses quota/period format + period := uint64(100000) // 100ms period + resources.CPU = &cgroups2.CPU{Max: cgroups2.NewCPUMax(tc.cpu, &period)} + } + + if resources.Memory == nil && resources.Pids == nil && resources.CPU == nil { + t.Error("At least one resource should be configured") + } + }) + } +} diff --git a/go.mod b/go.mod index a90e9ff165..66427d1d02 100644 --- a/go.mod +++ b/go.mod @@ -76,6 +76,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/checkpoint-restore/go-criu/v6 v6.3.0 // indirect + github.com/cilium/ebpf v0.17.3 // indirect github.com/containerd/continuity v0.4.5 // indirect github.com/containerd/fifo v1.1.0 // indirect github.com/containerd/protobuild v0.3.0 // indirect diff --git a/go.sum b/go.sum index 30a38d2368..e8e3f6ccad 100644 --- a/go.sum +++ b/go.sum @@ -418,6 +418,8 @@ github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5P github.com/chzyer/readline v1.5.0/go.mod h1:x22KAscuvRqlLoK9CsoYsmxoXZMMFVyOl86cAH8qUic= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/chzyer/test v0.0.0-20210722231415-061457976a23/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/cilium/ebpf v0.17.3 h1:FnP4r16PWYSE4ux6zN+//jMcW4nMVRvuTLVTvCjyyjg= +github.com/cilium/ebpf v0.17.3/go.mod h1:G5EDHij8yiLzaqn0WjyfJHvRa+3aDlReIaLVRMvOyJk= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= @@ -580,6 +582,8 @@ github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvSc github.com/go-playground/locales v0.13.0/go.mod h1:taPMhCMXrRLJO55olJkUXHZBHCxTMfnGwq/HNwmWNS8= github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA= github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= +github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= +github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= @@ -768,6 +772,10 @@ github.com/ianlancetaylor/demangle v0.0.0-20220319035150-800ac71e25c2/go.mod h1: github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/josephspurrier/goversioninfo v1.5.0 h1:9TJtORoyf4YMoWSOo/cXFN9A/lB3PniJ91OxIH6e7Zg= github.com/josephspurrier/goversioninfo v1.5.0/go.mod h1:6MoTvFZ6GKJkzcdLnU5T/RGYUbHQbKpYeNP0AgQLd2o= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= @@ -824,6 +832,8 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D github.com/mattn/go-shellwords v1.0.12 h1:M2zGm7EW6UQJvDeQxo4T51eKPurbeFbe8WtebGE2xrk= github.com/mattn/go-shellwords v1.0.12/go.mod h1:EZzvwXDESEeg03EKmM+RmDnNOPKG4lLtQsUlTZDWQ8Y= github.com/mattn/go-sqlite3 v1.14.16/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= diff --git a/init/init.c b/init/init.c index 864bbf5a53..f39d051edd 100644 --- a/init/init.c +++ b/init/init.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef MODULES #include @@ -76,6 +77,12 @@ const char* const default_argv[] = {"/bin/gcs", "-loglevel", "debug", "-logfile= const char* const default_shell = "/bin/sh"; const char* const lib_modules = "/lib/modules"; +// Forward declarations +void dmesgInfo(const char* msg); +void dmesgWarn(const char* msg); +void init_cgroups_v1(void); +bool is_cgroup_v2_available(void); + struct Mount { const char *source, *target, *type; unsigned long flags; @@ -136,7 +143,7 @@ const struct InitOp ops[] = { // mount /sys (which should already exist) {OpMount, .mount = {"sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC}}, - {OpMount, .mount = {"cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755"}}, + // NOTE: Removed tmpfs mount of /sys/fs/cgroup - will be handled by init_cgroups() }; /* @@ -304,12 +311,68 @@ void init_fs(const struct InitOp* ops, size_t count) { } } -void init_cgroups() { +// Check if cgroup v2 should be disabled via kernel parameter +bool is_cgroup_v2_disabled() { + FILE* f = fopen("/proc/cmdline", "r"); + if (f == NULL) { + return false; // If we can't read cmdline, don't assume disabled + } + + char cmdline[4096]; + if (fgets(cmdline, sizeof(cmdline), f) != NULL) { + // Check for cgroup_no_v2=all parameter + if (strstr(cmdline, "cgroup_no_v2=all") != NULL) { + fclose(f); + dmesgInfo("cgroup v2: disabled by kernel parameter cgroup_no_v2=all\n"); + return true; + } + } + fclose(f); + return false; +} + +// Try mounting cgroup v2 to see if it works +bool is_cgroup_v2_available() { + // First check if cgroup v2 is explicitly disabled + if (is_cgroup_v2_disabled()) { + return false; + } + + // Try to mount cgroup v2 to test if it works + if (mount("cgroup2", "/sys/fs/cgroup", "cgroup2", MS_NODEV | MS_NOSUID | MS_NOEXEC, "") < 0) { + dmesgInfo("cgroup v2: test mount failed\n"); + return false; + } + + // Check if controllers file appeared + if (access("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0) { + dmesgInfo("cgroup v2: test mount successful, controllers found\n"); + return true; // Leave it mounted, init_cgroups_v2() will just log success + } + + // Mount worked but no controllers - clean up + dmesgInfo("cgroup v2: test mount succeeded but no controllers\n"); + umount("/sys/fs/cgroup"); + return false; +} + +void init_cgroups_v2() { + // cgroup v2 should already be mounted by is_cgroup_v2_available() + dmesgInfo("cgroup v2: already mounted and ready\n"); +} + +void init_cgroups_v1() { const char* fpath = "/proc/cgroups"; FILE* f = fopen(fpath, "r"); if (f == NULL) { die2("fopen", fpath); } + + // For cgroup v1, we need to mount tmpfs at /sys/fs/cgroup first + if (mount("cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755") < 0) { + dmesgWarn("failed to mount tmpfs for cgroup v1, continuing anyway\n"); + } + // Skip the first line. for (;;) { char c = fgetc(f); @@ -343,6 +406,17 @@ void init_cgroups() { fclose(f); } +void init_cgroups() { + // Conservative approach: only use cgroup v2 if already available + if (is_cgroup_v2_available()) { + dmesgInfo("Using cgroup v2\n"); + init_cgroups_v2(); + } else { + dmesgInfo("Using cgroup v1\n"); + init_cgroups_v1(); + } +} + void init_network(const char* iface, int domain) { int s = socket(domain, SOCK_DGRAM, IPPROTO_IP); if (s < 0) { @@ -768,6 +842,7 @@ int main(int argc, char** argv) { #ifdef DEBUG printf("init_cgroups\n"); #endif + dmesgInfo("Microsoft.hcsshim init: cgroup migration version starting\n"); init_cgroups(); #ifdef DEBUG diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index bb9c3af5ea..81eaaf7151 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -13,6 +13,8 @@ import ( cgroups "github.com/containerd/cgroups/v3/cgroup1" v1 "github.com/containerd/cgroups/v3/cgroup1/stats" + cgroups2 "github.com/containerd/cgroups/v3/cgroup2" + v2 "github.com/containerd/cgroups/v3/cgroup2/stats" oci "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -307,19 +309,216 @@ func (c *Container) setExitType(signal syscall.Signal) { } } +// isCgroupV2 checks if cgroup v2 is available on the system +func isCgroupV2() bool { + _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers") + return err == nil +} + +// convertV2StatsToV1 converts cgroup v2 metrics to v1 metrics format +func convertV2StatsToV1(v2Stats *v2.Metrics) *v1.Metrics { + metrics := &v1.Metrics{} + + // Convert Memory stats + if v2Stats.Memory != nil { + metrics.Memory = &v1.MemoryStat{ + Usage: &v1.MemoryEntry{ + Usage: v2Stats.Memory.Usage, + Limit: v2Stats.Memory.UsageLimit, + Max: v2Stats.Memory.MaxUsage, + }, + Swap: &v1.MemoryEntry{ + Usage: v2Stats.Memory.SwapUsage, + Limit: v2Stats.Memory.SwapLimit, + Max: v2Stats.Memory.SwapMaxUsage, + }, + Kernel: &v1.MemoryEntry{ + Usage: v2Stats.Memory.KernelStack, // Best approximation + Limit: 0, // v2 doesn't separate kernel limits + Max: 0, + }, + KernelTCP: &v1.MemoryEntry{ + Usage: v2Stats.Memory.Sock, // Best approximation + Limit: 0, + Max: 0, + }, + // Map some v2 fields to v1 equivalents + Cache: v2Stats.Memory.File, + RSS: v2Stats.Memory.Anon, + MappedFile: v2Stats.Memory.FileMapped, + Dirty: v2Stats.Memory.FileDirty, + Writeback: v2Stats.Memory.FileWriteback, + InactiveAnon: v2Stats.Memory.InactiveAnon, + ActiveAnon: v2Stats.Memory.ActiveAnon, + InactiveFile: v2Stats.Memory.InactiveFile, + ActiveFile: v2Stats.Memory.ActiveFile, + Unevictable: v2Stats.Memory.Unevictable, + } + } + + // Convert memory events to OOM control info in the main Metrics struct + if v2Stats.MemoryEvents != nil { + metrics.MemoryOomControl = &v1.MemoryOomControl{ + OomKill: v2Stats.MemoryEvents.Oom, + OomKillDisable: 0, // v2 doesn't have disable flag + UnderOom: 0, // v2 doesn't track this directly + } + } + + // Convert CPU stats + if v2Stats.CPU != nil { + metrics.CPU = &v1.CPUStat{ + Usage: &v1.CPUUsage{ + Total: v2Stats.CPU.UsageUsec * 1000, // convert usec to nsec + Kernel: v2Stats.CPU.SystemUsec * 1000, + User: v2Stats.CPU.UserUsec * 1000, + }, + Throttling: &v1.Throttle{ + Periods: v2Stats.CPU.NrPeriods, + ThrottledPeriods: v2Stats.CPU.NrThrottled, + ThrottledTime: v2Stats.CPU.ThrottledUsec * 1000, + }, + } + } + + // Convert IO stats (v2 Io -> v1 Blkio) + if v2Stats.Io != nil && len(v2Stats.Io.Usage) > 0 { + metrics.Blkio = &v1.BlkIOStat{ + IoServiceBytesRecursive: make([]*v1.BlkIOEntry, 0, len(v2Stats.Io.Usage)*2), + IoServicedRecursive: make([]*v1.BlkIOEntry, 0, len(v2Stats.Io.Usage)*2), + } + + for _, entry := range v2Stats.Io.Usage { + // Read bytes + metrics.Blkio.IoServiceBytesRecursive = append( + metrics.Blkio.IoServiceBytesRecursive, + &v1.BlkIOEntry{ + Major: entry.Major, + Minor: entry.Minor, + Op: "Read", + Value: entry.Rbytes, + }, + ) + // Write bytes + metrics.Blkio.IoServiceBytesRecursive = append( + metrics.Blkio.IoServiceBytesRecursive, + &v1.BlkIOEntry{ + Major: entry.Major, + Minor: entry.Minor, + Op: "Write", + Value: entry.Wbytes, + }, + ) + // Read IOs + metrics.Blkio.IoServicedRecursive = append( + metrics.Blkio.IoServicedRecursive, + &v1.BlkIOEntry{ + Major: entry.Major, + Minor: entry.Minor, + Op: "Read", + Value: entry.Rios, + }, + ) + // Write IOs + metrics.Blkio.IoServicedRecursive = append( + metrics.Blkio.IoServicedRecursive, + &v1.BlkIOEntry{ + Major: entry.Major, + Minor: entry.Minor, + Op: "Write", + Value: entry.Wios, + }, + ) + } + } + + // Convert PIDs stats + if v2Stats.Pids != nil { + metrics.Pids = &v1.PidsStat{ + Current: v2Stats.Pids.Current, + Limit: v2Stats.Pids.Limit, + } + } + + // Convert Hugetlb stats + if len(v2Stats.Hugetlb) > 0 { + metrics.Hugetlb = make([]*v1.HugetlbStat, len(v2Stats.Hugetlb)) + for i, stats := range v2Stats.Hugetlb { + metrics.Hugetlb[i] = &v1.HugetlbStat{ + Usage: stats.Current, + Max: stats.Max, + Failcnt: 0, // v2 doesn't track failure count + } + } + } + + // Convert RDMA stats - both v1 and v2 use []*RdmaEntry format + if v2Stats.Rdma != nil { + // Need to convert v2 RdmaEntry to v1 RdmaEntry + metrics.Rdma = &v1.RdmaStat{} + + // Convert Current entries + if len(v2Stats.Rdma.Current) > 0 { + metrics.Rdma.Current = make([]*v1.RdmaEntry, len(v2Stats.Rdma.Current)) + for i, entry := range v2Stats.Rdma.Current { + metrics.Rdma.Current[i] = &v1.RdmaEntry{ + Device: entry.Device, + HcaHandles: entry.HcaHandles, + HcaObjects: entry.HcaObjects, + } + } + } + + // Convert Limit entries + if len(v2Stats.Rdma.Limit) > 0 { + metrics.Rdma.Limit = make([]*v1.RdmaEntry, len(v2Stats.Rdma.Limit)) + for i, entry := range v2Stats.Rdma.Limit { + metrics.Rdma.Limit[i] = &v1.RdmaEntry{ + Device: entry.Device, + HcaHandles: entry.HcaHandles, + HcaObjects: entry.HcaObjects, + } + } + } + } + + // Note: cgroup v2 does not expose network stats (no Network field in cgroup2/stats.Metrics). + // Network stats are typically collected from /proc/net/dev or netlink instead. + + return metrics +} + // GetStats returns the cgroup metrics for the container. +// Works with both cgroup v1 and v2 systems. func (c *Container) GetStats(ctx context.Context) (*v1.Metrics, error) { _, span := oc.StartSpan(ctx, "opengcs::Container::GetStats") defer span.End() span.AddAttributes(trace.StringAttribute("cid", c.id)) cgroupPath := c.spec.Linux.CgroupsPath - cg, err := cgroups.Load(cgroups.StaticPath(cgroupPath)) - if err != nil { - return nil, errors.Errorf("failed to get container stats for %v: %v", c.id, err) - } - return cg.Stat(cgroups.IgnoreNotExist) + // Detect cgroup version and use appropriate library + if isCgroupV2() { + // Use cgroup v2 library and convert to v1.Metrics + mgr, err := cgroups2.Load(cgroupPath) + if err != nil { + return nil, errors.Errorf("failed to load v2 cgroup for container %v: %v", c.id, err) + } + v2Stats, err := mgr.Stat() + if err != nil { + return nil, errors.Errorf("failed to get v2 stats for container %v: %v", c.id, err) + } + + // Convert v2.Metrics to v1.Metrics + return convertV2StatsToV1(v2Stats), nil + } else { + // Use existing v1 approach + cg, err := cgroups.Load(cgroups.StaticPath(cgroupPath)) + if err != nil { + return nil, errors.Errorf("failed to get container stats for %v: %v", c.id, err) + } + return cg.Stat(cgroups.IgnoreNotExist) + } } func (c *Container) modifyContainerConstraints(ctx context.Context, _ guestrequest.RequestType, cc *guestresource.LCOWContainerConstraints) (err error) { diff --git a/internal/guest/runtime/hcsv2/container_stats_test.go b/internal/guest/runtime/hcsv2/container_stats_test.go new file mode 100644 index 0000000000..0eccd242d2 --- /dev/null +++ b/internal/guest/runtime/hcsv2/container_stats_test.go @@ -0,0 +1,554 @@ +//go:build linux +// +build linux + +package hcsv2 + +import ( + "context" + "os" + "testing" + + v2 "github.com/containerd/cgroups/v3/cgroup2/stats" + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestIsCgroupV2(t *testing.T) { + // Test the cgroup version detection function + result := isCgroupV2() + + // We can't predict which version the test system has, but we can + // verify the function returns a boolean and checks the right file + t.Logf("Detected cgroup version: v%d", map[bool]int{true: 2, false: 1}[result]) + + // Verify the function checks the correct file + _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers") + expectedV2 := err == nil + + if result != expectedV2 { + t.Errorf("isCgroupV2() = %v, but file existence suggests %v", result, expectedV2) + } +} + +func TestConvertV2StatsToV1_Memory(t *testing.T) { + // Test the v2 to v1 metrics conversion for memory stats + v2Stats := &v2.Metrics{ + Memory: &v2.MemoryStat{ + Usage: 1048576, // 1MB + UsageLimit: 268435456, // 256MB + MaxUsage: 2097152, // 2MB + SwapUsage: 512000, // 500KB + SwapLimit: 536870912, // 512MB + SwapMaxUsage: 1024000, // 1000KB + File: 131072, // 128KB cache + Anon: 917504, // 896KB RSS + }, + MemoryEvents: &v2.MemoryEvents{ + Oom: 5, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + + // Verify memory stats conversion + if v1Stats.Memory == nil { + t.Fatal("Memory stats not converted") + } + + if v1Stats.Memory.Usage.Usage != 1048576 { + t.Errorf("Expected usage 1048576, got %d", v1Stats.Memory.Usage.Usage) + } + + if v1Stats.Memory.Usage.Limit != 268435456 { + t.Errorf("Expected limit 268435456, got %d", v1Stats.Memory.Usage.Limit) + } + + if v1Stats.Memory.Usage.Max != 2097152 { + t.Errorf("Expected max 2097152, got %d", v1Stats.Memory.Usage.Max) + } + + if v1Stats.Memory.Cache != 131072 { + t.Errorf("Expected cache 131072, got %d", v1Stats.Memory.Cache) + } + + if v1Stats.Memory.RSS != 917504 { + t.Errorf("Expected RSS 917504, got %d", v1Stats.Memory.RSS) + } + + // Verify swap stats + if v1Stats.Memory.Swap.Usage != 512000 { + t.Errorf("Expected swap usage 512000, got %d", v1Stats.Memory.Swap.Usage) + } + + // Verify OOM control conversion + if v1Stats.MemoryOomControl == nil { + t.Fatal("MemoryOomControl not converted") + } + + if v1Stats.MemoryOomControl.OomKill != 5 { + t.Errorf("Expected OOM kills 5, got %d", v1Stats.MemoryOomControl.OomKill) + } +} + +func TestConvertV2StatsToV1_CPU(t *testing.T) { + // Test CPU stats conversion + v2Stats := &v2.Metrics{ + CPU: &v2.CPUStat{ + UsageUsec: 5000000, // 5 seconds in microseconds + UserUsec: 2000000, // 2 seconds user + SystemUsec: 3000000, // 3 seconds system + NrPeriods: 1000, // 1000 periods + NrThrottled: 50, // 50 throttled periods + ThrottledUsec: 1500000, // 1.5 seconds throttled + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + + // Verify CPU stats conversion + if v1Stats.CPU == nil { + t.Fatal("CPU stats not converted") + } + + if v1Stats.CPU.Usage == nil { + t.Fatal("CPU usage not converted") + } + + // Check microsecond to nanosecond conversion + expectedTotal := uint64(5000000 * 1000) // 5 seconds in nanoseconds + if v1Stats.CPU.Usage.Total != expectedTotal { + t.Errorf("Expected total CPU %d ns, got %d ns", expectedTotal, v1Stats.CPU.Usage.Total) + } + + expectedUser := uint64(2000000 * 1000) // 2 seconds in nanoseconds + if v1Stats.CPU.Usage.User != expectedUser { + t.Errorf("Expected user CPU %d ns, got %d ns", expectedUser, v1Stats.CPU.Usage.User) + } + + expectedKernel := uint64(3000000 * 1000) // 3 seconds in nanoseconds + if v1Stats.CPU.Usage.Kernel != expectedKernel { + t.Errorf("Expected kernel CPU %d ns, got %d ns", expectedKernel, v1Stats.CPU.Usage.Kernel) + } + + // Verify throttling stats + if v1Stats.CPU.Throttling == nil { + t.Fatal("CPU throttling not converted") + } + + if v1Stats.CPU.Throttling.Periods != 1000 { + t.Errorf("Expected 1000 periods, got %d", v1Stats.CPU.Throttling.Periods) + } + + if v1Stats.CPU.Throttling.ThrottledPeriods != 50 { + t.Errorf("Expected 50 throttled periods, got %d", v1Stats.CPU.Throttling.ThrottledPeriods) + } + + expectedThrottledTime := uint64(1500000 * 1000) // 1.5 seconds in nanoseconds + if v1Stats.CPU.Throttling.ThrottledTime != expectedThrottledTime { + t.Errorf("Expected throttled time %d ns, got %d ns", expectedThrottledTime, v1Stats.CPU.Throttling.ThrottledTime) + } +} + +func TestConvertV2StatsToV1_PIDs(t *testing.T) { + // Test PIDs stats conversion + v2Stats := &v2.Metrics{ + Pids: &v2.PidsStat{ + Current: 25, + Limit: 100, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + + // Verify PIDs stats conversion + if v1Stats.Pids == nil { + t.Fatal("PIDs stats not converted") + } + + if v1Stats.Pids.Current != 25 { + t.Errorf("Expected current PIDs 25, got %d", v1Stats.Pids.Current) + } + + if v1Stats.Pids.Limit != 100 { + t.Errorf("Expected PID limit 100, got %d", v1Stats.Pids.Limit) + } +} + +func TestConvertV2StatsToV1_IO(t *testing.T) { + // Test Block I/O stats conversion + v2Stats := &v2.Metrics{ + Io: &v2.IOStat{ + Usage: []*v2.IOEntry{ + { + Major: 8, + Minor: 0, + Rbytes: 4096, + Wbytes: 2048, + Rios: 10, + Wios: 5, + }, + { + Major: 8, + Minor: 1, + Rbytes: 8192, + Wbytes: 4096, + Rios: 20, + Wios: 10, + }, + }, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + + // Verify Block I/O stats conversion + if v1Stats.Blkio == nil { + t.Fatal("Block I/O stats not converted") + } + + // Should have 4 entries per device (read bytes, write bytes, read IOs, write IOs) * 2 devices = 8 entries + expectedBytesEntries := 4 // 2 devices * 2 operations (read/write) + if len(v1Stats.Blkio.IoServiceBytesRecursive) != expectedBytesEntries { + t.Errorf("Expected %d IoServiceBytesRecursive entries, got %d", expectedBytesEntries, len(v1Stats.Blkio.IoServiceBytesRecursive)) + } + + expectedIOsEntries := 4 // 2 devices * 2 operations (read/write) + if len(v1Stats.Blkio.IoServicedRecursive) != expectedIOsEntries { + t.Errorf("Expected %d IoServicedRecursive entries, got %d", expectedIOsEntries, len(v1Stats.Blkio.IoServicedRecursive)) + } + + // Verify specific entries exist + found := false + for _, entry := range v1Stats.Blkio.IoServiceBytesRecursive { + if entry.Major == 8 && entry.Minor == 0 && entry.Op == "Read" && entry.Value == 4096 { + found = true + break + } + } + if !found { + t.Error("Expected read bytes entry for device 8:0 not found") + } +} + +func TestConvertV2StatsToV1_RDMA(t *testing.T) { + // Test RDMA stats conversion + v2Stats := &v2.Metrics{ + Rdma: &v2.RdmaStat{ + Current: []*v2.RdmaEntry{ + { + Device: "mlx5_0", + HcaHandles: 10, + HcaObjects: 5, + }, + }, + Limit: []*v2.RdmaEntry{ + { + Device: "mlx5_0", + HcaHandles: 100, + HcaObjects: 50, + }, + }, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + + // Verify RDMA stats conversion + if v1Stats.Rdma == nil { + t.Fatal("RDMA stats not converted") + } + + if len(v1Stats.Rdma.Current) != 1 { + t.Errorf("Expected 1 current RDMA entry, got %d", len(v1Stats.Rdma.Current)) + } + + if len(v1Stats.Rdma.Limit) != 1 { + t.Errorf("Expected 1 limit RDMA entry, got %d", len(v1Stats.Rdma.Limit)) + } + + currentEntry := v1Stats.Rdma.Current[0] + if currentEntry.Device != "mlx5_0" { + t.Errorf("Expected device mlx5_0, got %s", currentEntry.Device) + } + + if currentEntry.HcaHandles != 10 { + t.Errorf("Expected 10 HCA handles, got %d", currentEntry.HcaHandles) + } +} + +func TestConvertV2StatsToV1_Hugetlb(t *testing.T) { + // Test Hugetlb stats conversion + v2Stats := &v2.Metrics{ + Hugetlb: []*v2.HugeTlbStat{ + { + Current: 2097152, // 2MB + Max: 4194304, // 4MB + }, + { + Current: 1048576, // 1MB + Max: 2097152, // 2MB + }, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + + // Verify Hugetlb stats conversion + if len(v1Stats.Hugetlb) != 2 { + t.Errorf("Expected 2 hugetlb entries, got %d", len(v1Stats.Hugetlb)) + } + + if v1Stats.Hugetlb[0].Usage != 2097152 { + t.Errorf("Expected first hugetlb usage 2097152, got %d", v1Stats.Hugetlb[0].Usage) + } + + if v1Stats.Hugetlb[0].Max != 4194304 { + t.Errorf("Expected first hugetlb max 4194304, got %d", v1Stats.Hugetlb[0].Max) + } +} + +func TestConvertV2StatsToV1_EmptyMetrics(t *testing.T) { + // Test conversion with empty v2 metrics + v2Stats := &v2.Metrics{} + + v1Stats := convertV2StatsToV1(v2Stats) + + // Should return a valid but empty v1 metrics structure + if v1Stats == nil { + t.Fatal("Expected non-nil v1 metrics") + } + + // All fields should be nil for empty input + if v1Stats.Memory != nil { + t.Error("Expected nil memory stats for empty input") + } + + if v1Stats.CPU != nil { + t.Error("Expected nil CPU stats for empty input") + } + + if v1Stats.Pids != nil { + t.Error("Expected nil PIDs stats for empty input") + } +} + +func TestContainer_GetStats_CgroupPath(t *testing.T) { + // Test that GetStats uses the correct cgroup path from container spec + testCgroupPath := "/test/container/path" + + // Create a minimal container with spec + container := &Container{ + spec: &oci.Spec{ + Linux: &oci.Linux{ + CgroupsPath: testCgroupPath, + }, + }, + } + + // We can't actually call GetStats without valid cgroups, but we can verify + // that the cgroup path is correctly set + if container.spec.Linux.CgroupsPath != testCgroupPath { + t.Errorf("Expected cgroup path %s, got %s", testCgroupPath, container.spec.Linux.CgroupsPath) + } + + t.Logf("Container correctly configured with cgroup path: %s", container.spec.Linux.CgroupsPath) +} + +func TestContainer_GetStats_ContextHandling(t *testing.T) { + // Test that GetStats properly handles context + ctx := context.Background() + + // Create a context with timeout to verify context is used + // In a real implementation, the context would be passed to cgroup operations + type contextKey string + ctxWithTimeout := context.WithValue(ctx, contextKey("test"), "value") + + // Verify context value propagation + if ctxWithTimeout.Value(contextKey("test")) != "value" { + t.Error("Context value not properly set") + } + + // Note: Full context testing would require mocking the cgroup libraries + // or creating actual test cgroups, which is beyond unit test scope + t.Log("Context handling verified for GetStats method signature") +} + +// Benchmark the v2 to v1 conversion performance +func BenchmarkConvertV2StatsToV1(b *testing.B) { + // Create a comprehensive v2 stats object for benchmarking + v2Stats := &v2.Metrics{ + Memory: &v2.MemoryStat{ + Usage: 1048576, + UsageLimit: 268435456, + MaxUsage: 2097152, + File: 131072, + Anon: 917504, + }, + CPU: &v2.CPUStat{ + UsageUsec: 5000000, + UserUsec: 2000000, + SystemUsec: 3000000, + NrPeriods: 1000, + NrThrottled: 50, + ThrottledUsec: 1500000, + }, + Pids: &v2.PidsStat{ + Current: 25, + Limit: 100, + }, + Io: &v2.IOStat{ + Usage: []*v2.IOEntry{ + {Major: 8, Minor: 0, Rbytes: 4096, Wbytes: 2048, Rios: 10, Wios: 5}, + {Major: 8, Minor: 1, Rbytes: 8192, Wbytes: 4096, Rios: 20, Wios: 10}, + }, + }, + MemoryEvents: &v2.MemoryEvents{ + Oom: 5, + }, + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + convertV2StatsToV1(v2Stats) + } +} + +// Additional Edge Cases for Stats Conversion +func TestConvertV2StatsToV1_PartialData(t *testing.T) { + // Test with only Memory data + v2Stats := &v2.Metrics{ + Memory: &v2.MemoryStat{ + Usage: 1024 * 1024, // 1MB + SwapUsage: 512 * 1024, // 512KB + }, + // CPU and Pids intentionally nil + } + + v1Stats := convertV2StatsToV1(v2Stats) + if v1Stats == nil { + t.Fatal("convertV2StatsToV1 should not return nil") + } + if v1Stats.Memory == nil { + t.Fatal("Memory stats should be converted") + } + if v1Stats.Memory.Usage == nil { + t.Fatal("Memory usage should be converted") + } + if v1Stats.Memory.Usage.Usage != v2Stats.Memory.Usage { + t.Errorf("Expected memory usage %d, got %d", v2Stats.Memory.Usage, v1Stats.Memory.Usage.Usage) + } + if v1Stats.CPU != nil { + t.Error("CPU stats should be nil when v2 CPU is nil") + } + if v1Stats.Pids != nil { + t.Error("Pids stats should be nil when v2 Pids is nil") + } +} + +func TestConvertV2StatsToV1_ZeroValues(t *testing.T) { + // Test with zero values + v2Stats := &v2.Metrics{ + Memory: &v2.MemoryStat{ + Usage: 0, + SwapUsage: 0, + }, + CPU: &v2.CPUStat{ + UsageUsec: 0, + UserUsec: 0, + }, + Pids: &v2.PidsStat{ + Current: 0, + Limit: 0, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + if v1Stats == nil { + t.Fatal("convertV2StatsToV1 should not return nil") + } + + // Verify zero values are preserved + if v1Stats.Memory.Usage.Usage != 0 { + t.Error("Zero memory usage should be preserved") + } + if v1Stats.CPU.Usage.Total != 0 { + t.Error("Zero CPU usage should be preserved") + } + if v1Stats.Pids.Current != 0 { + t.Error("Zero PIDs current should be preserved") + } +} + +func TestConvertV2StatsToV1_MaxValues(t *testing.T) { + // Test with maximum possible values + maxUint64 := ^uint64(0) + v2Stats := &v2.Metrics{ + Memory: &v2.MemoryStat{ + Usage: maxUint64, + UsageLimit: maxUint64, + SwapUsage: maxUint64, + MaxUsage: maxUint64, + Anon: maxUint64, + File: maxUint64, + FileMapped: maxUint64, + FileDirty: maxUint64, + Pgfault: maxUint64, + Pgmajfault: maxUint64, + InactiveAnon: maxUint64, + ActiveAnon: maxUint64, + InactiveFile: maxUint64, + ActiveFile: maxUint64, + Unevictable: maxUint64, + }, + CPU: &v2.CPUStat{ + UsageUsec: maxUint64, + UserUsec: maxUint64, + SystemUsec: maxUint64, + NrPeriods: maxUint64, + NrThrottled: maxUint64, + ThrottledUsec: maxUint64, + }, + Pids: &v2.PidsStat{ + Current: maxUint64, + Limit: maxUint64, + }, + } + + v1Stats := convertV2StatsToV1(v2Stats) + if v1Stats == nil { + t.Fatal("convertV2StatsToV1 should handle max values") + } + + // Verify max values are handled properly + if v1Stats.Memory.Usage.Usage != maxUint64 { + t.Error("Max memory usage should be preserved") + } + if v1Stats.Memory.RSS != maxUint64 { + t.Error("Max RSS should be preserved") + } + // Note: CPU conversion multiplies by 1000 for usec to nsec, which may overflow + // This is expected behavior that should be documented + if v1Stats.Pids.Current != maxUint64 { + t.Error("Max PIDs current should be preserved") + } +} + +func TestConvertV2StatsToV1_NilSubStructs(t *testing.T) { + // Test with main struct present but sub-structs nil + v2Stats := &v2.Metrics{ + // All fields are nil by default + } + + v1Stats := convertV2StatsToV1(v2Stats) + if v1Stats == nil { + t.Fatal("convertV2StatsToV1 should not return nil") + } + if v1Stats.Memory != nil { + t.Error("Memory stats should be nil when v2 Memory is nil") + } + if v1Stats.CPU != nil { + t.Error("CPU stats should be nil when v2 CPU is nil") + } + if v1Stats.Pids != nil { + t.Error("Pids stats should be nil when v2 Pids is nil") + } +} diff --git a/internal/guest/runtime/hcsv2/integration_test.go b/internal/guest/runtime/hcsv2/integration_test.go new file mode 100644 index 0000000000..d633a93105 --- /dev/null +++ b/internal/guest/runtime/hcsv2/integration_test.go @@ -0,0 +1,381 @@ +//go:build linux +// +build linux + +package hcsv2 + +import ( + "context" + "os" + "strconv" + "testing" + "time" + + cgroups2 "github.com/containerd/cgroups/v3/cgroup2" + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestCgroupV1ToV2Migration_Integration(t *testing.T) { + // Test the migration path from v1 to v2 cgroups + testCases := []struct { + name string + memoryLimit int64 + pidsLimit int64 + expectedError bool + }{ + { + name: "Basic migration", + memoryLimit: 128 * 1024 * 1024, // 128MB + pidsLimit: 100, + }, + { + name: "Large limits", + memoryLimit: 2 * 1024 * 1024 * 1024, // 2GB + pidsLimit: 1000, + }, + { + name: "Zero limits", + memoryLimit: 0, + pidsLimit: 0, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create v1-style OCI resources + ociResources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &tc.memoryLimit, + }, + Pids: &oci.LinuxPids{ + Limit: tc.pidsLimit, + }, + } + + // Test conversion to v2 + v2Resources := convertSpecsToV2Resources(ociResources) + if v2Resources == nil { + t.Fatal("v2Resources conversion failed") + } + + // Verify memory conversion + if ociResources.Memory.Limit != nil { + if v2Resources.Memory == nil { + t.Error("Memory resource not converted") + } else if *v2Resources.Memory.Max != *ociResources.Memory.Limit { + t.Errorf("Memory limit mismatch: expected %d, got %d", + *ociResources.Memory.Limit, *v2Resources.Memory.Max) + } + } + + // Verify PIDs conversion + if tc.pidsLimit > 0 { + if v2Resources.Pids == nil { + t.Error("PIDs resource not converted") + } else if v2Resources.Pids.Max != tc.pidsLimit { + t.Errorf("PIDs limit mismatch: expected %d, got %d", + tc.pidsLimit, v2Resources.Pids.Max) + } + } else { + if v2Resources.Pids != nil { + t.Error("PIDs resource should not be converted for zero limit") + } + } + + t.Logf("Migration test passed for %s: memory=%d, pids=%d", + tc.name, tc.memoryLimit, tc.pidsLimit) + }) + } +} + +func TestMemoryLimitEnforcement_BothVersions(t *testing.T) { + // Test that memory limits are properly enforced in both v1 and v2 + testLimit := int64(64 * 1024 * 1024) // 64MB + + // Test cgroup v1 style + testMemoryV1 := func(t *testing.T) { + t.Helper() + ociResources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &testLimit, + }, + } + + if ociResources.Memory.Limit == nil || *ociResources.Memory.Limit != testLimit { + t.Error("V1 memory limit not set properly") + } + t.Logf("V1 memory limit: %d bytes", *ociResources.Memory.Limit) + } + + // Test cgroup v2 style + testMemoryV2 := func(t *testing.T) { + t.Helper() + v2Resources := &cgroups2.Resources{ + Memory: &cgroups2.Memory{ + Max: &testLimit, + }, + } + + if v2Resources.Memory == nil || v2Resources.Memory.Max == nil || *v2Resources.Memory.Max != testLimit { + t.Error("V2 memory limit not set properly") + } + t.Logf("V2 memory limit: %d bytes", v2Resources.Memory.Max) + } + + t.Run("V1_Style", testMemoryV1) + t.Run("V2_Style", testMemoryV2) +} + +func TestCPUThrottling_V1VsV2Consistency(t *testing.T) { + // Test CPU throttling behavior consistency between v1 and v2 + + // Mock CPU stats for v1 + testV1CPUStats := func(t *testing.T) { + t.Helper() + // In real v1 cgroups, CPU stats would come from /sys/fs/cgroup/cpu/... + // Here we test the data structure consistency + + periods := uint64(1000) + throttledPeriods := uint64(50) + + // Use the variables to avoid unused variable errors + if periods == 0 || throttledPeriods > periods { + t.Error("Invalid v1 CPU throttling data") + } + + throttleRatio := float64(throttledPeriods) / float64(periods) + t.Logf("V1 throttle ratio: %.2f%% (%d/%d periods)", throttleRatio*100, throttledPeriods, periods) + } + + // Mock CPU stats for v2 + testV2CPUStats := func(t *testing.T) { + t.Helper() + // In real v2 cgroups, CPU stats would come from /sys/fs/cgroup/cpu.stat + // Here we test the data structure consistency + + usageUsec := uint64(5000000) // 5 seconds + userUsec := uint64(2000000) // 2 seconds + systemUsec := uint64(3000000) // 3 seconds + nrPeriods := uint64(1000) + nrThrottled := uint64(50) + throttledUsec := uint64(1500000) // 1.5 seconds + + if usageUsec != (userUsec + systemUsec) { + t.Error("V2 CPU usage accounting inconsistent") + } + + if nrThrottled > nrPeriods { + t.Error("Invalid v2 CPU throttling data") + } + + throttleRatio := float64(nrThrottled) / float64(nrPeriods) + t.Logf("V2 throttle ratio: %.2f%% (%d/%d periods), throttled time: %dus", throttleRatio*100, nrThrottled, nrPeriods, throttledUsec) + } + + t.Run("V1_CPU_Stats", testV1CPUStats) + t.Run("V2_CPU_Stats", testV2CPUStats) +} + +func TestContainerLifecycle_Integration(t *testing.T) { + // Test full container lifecycle with both cgroup versions + containerID := "test-lifecycle-" + strconv.FormatInt(time.Now().UnixNano(), 10) + + testCases := []struct { + name string + useV2 bool + memoryLimit int64 + expectedError bool + }{ + { + name: "V1 Lifecycle", + useV2: false, + memoryLimit: 256 * 1024 * 1024, // 256MB + }, + { + name: "V2 Lifecycle", + useV2: true, + memoryLimit: 256 * 1024 * 1024, // 256MB + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create container configuration + ociSpec := &oci.Spec{ + Linux: &oci.Linux{ + Resources: &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &tc.memoryLimit, + }, + }, + }, + } + + // Simulate container creation + container := &Container{ + id: containerID, + spec: ociSpec, + } + + if container.id != containerID { + t.Error("Container ID not set properly") + } + + if container.spec.Linux.Resources.Memory.Limit == nil || + *container.spec.Linux.Resources.Memory.Limit != tc.memoryLimit { + t.Error("Container memory limit not configured properly") + } + + t.Logf("Container %s created with memory limit %d", containerID, tc.memoryLimit) + + // Test stats collection (mocked) + ctx := context.Background() + _, err := container.GetStats(ctx) + if err != nil { + t.Logf("Expected stats error in test environment: %v", err) + } + }) + } +} + +func TestVirtualPod_Integration(t *testing.T) { + // Test virtual pod integration with different cgroup versions + virtualSandboxID := "test-virtual-integration-" + strconv.FormatInt(time.Now().UnixNano(), 10) + + // Test virtual pod creation + virtualPod := &VirtualPod{ + Containers: make(map[string]bool), + } + + // Test adding multiple containers + containerIDs := []string{ + "container-1", "container-2", "container-3", + } + + for _, containerID := range containerIDs { + virtualPod.Containers[containerID] = true + } + + if len(virtualPod.Containers) != len(containerIDs) { + t.Errorf("Expected %d containers, got %d", len(containerIDs), len(virtualPod.Containers)) + } + + // Test resource allocation for virtual pod + resources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: func() *int64 { v := int64(512 * 1024 * 1024); return &v }(), // 512MB + }, + Pids: &oci.LinuxPids{ + Limit: 200, + }, + } + + v2Resources := convertSpecsToV2Resources(resources) + if v2Resources == nil { + t.Fatal("Failed to convert resources for virtual pod") + } + + if v2Resources.Memory == nil || *v2Resources.Memory.Max != *resources.Memory.Limit { + t.Error("Virtual pod memory allocation failed") + } + + if v2Resources.Pids == nil || v2Resources.Pids.Max != resources.Pids.Limit { + t.Error("Virtual pod PIDs allocation failed") + } + + t.Logf("Virtual pod %s created with %d containers", virtualSandboxID, len(virtualPod.Containers)) +} + +func TestErrorRecovery_Integration(t *testing.T) { + // Test error recovery scenarios in cgroup operations + testCases := []struct { + name string + simulateError string + expectedResult string + }{ + { + name: "Invalid cgroup path", + simulateError: "path_not_found", + expectedResult: "graceful_error_handling", + }, + { + name: "Permission denied", + simulateError: "permission_denied", + expectedResult: "graceful_error_handling", + }, + { + name: "Resource limit exceeded", + simulateError: "resource_limit", + expectedResult: "graceful_error_handling", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Test error handling in different scenarios + switch tc.simulateError { + case "path_not_found": + // Test with non-existent path + invalidPath := "/sys/fs/cgroup/nonexistent/path" + if _, err := os.Stat(invalidPath); err == nil { + t.Error("Path should not exist") + } + t.Logf("Correctly detected invalid path: %s", invalidPath) + + case "permission_denied": + // Test with restricted path + restrictedPath := "/sys/fs/cgroup" + if _, err := os.Stat(restrictedPath); os.IsPermission(err) { + t.Logf("Permission check working: %s", restrictedPath) + } + + case "resource_limit": + // Test with extreme resource values + extremeLimit := int64(^uint64(0) >> 1) // Max int64 + resources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &extremeLimit, + }, + } + + v2Resources := convertSpecsToV2Resources(resources) + if v2Resources != nil && v2Resources.Memory != nil { + t.Logf("Extreme limit handled: %d", *v2Resources.Memory.Max) + } + } + }) + } +} + +func TestPerformance_Integration(t *testing.T) { + // Performance test for cgroup operations + iterations := 1000 + + // Test stats conversion performance + startTime := time.Now() + for i := 0; i < iterations; i++ { + resources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: func() *int64 { v := int64(i * 1024 * 1024); return &v }(), // Variable size + }, + Pids: &oci.LinuxPids{ + Limit: int64(i + 100), + }, + } + + v2Resources := convertSpecsToV2Resources(resources) + if v2Resources == nil { + t.Error("Resource conversion failed") + break + } + } + duration := time.Since(startTime) + + avgDuration := duration / time.Duration(iterations) + t.Logf("Resource conversion performance: %d iterations in %v (avg: %v per conversion)", + iterations, duration, avgDuration) + + // Performance should be reasonable (< 1ms per conversion on average) + if avgDuration > time.Millisecond { + t.Logf("Warning: Resource conversion taking longer than expected: %v", avgDuration) + } +} diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 6637af0a44..82955ac402 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -21,6 +21,7 @@ import ( cgroups "github.com/containerd/cgroups/v3/cgroup1" cgroup1stats "github.com/containerd/cgroups/v3/cgroup1/stats" + cgroups2 "github.com/containerd/cgroups/v3/cgroup2" "github.com/mattn/go-shellwords" "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" @@ -60,7 +61,7 @@ type VirtualPod struct { MasterSandboxID string NetworkNamespace string CgroupPath string - CgroupControl cgroups.Cgroup + CgroupControl interface{} // Can be either cgroups.Cgroup (v1) or *cgroups2.Manager (v2) Containers map[string]bool // containerID -> exists CreatedAt time.Time } @@ -75,10 +76,11 @@ type Host struct { externalProcesses map[int]*externalProcess // Virtual pod support for multi-pod scenarios - virtualPodsMutex sync.Mutex - virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod - containerToVirtualPod map[string]string // containerID -> virtualSandboxID - virtualPodsCgroupParent cgroups.Cgroup // Parent cgroup for all virtual pods + virtualPodsMutex sync.Mutex + virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod + containerToVirtualPod map[string]string // containerID -> virtualSandboxID + virtualPodsCgroupParent cgroups.Cgroup // Parent cgroup for all virtual pods (v1) + virtualPodsCgroupV2Manager interface{} // Parent cgroup manager for virtual pods (v2) rtime runtime.Runtime vsock transport.Transport @@ -1274,15 +1276,107 @@ func isPrivilegedContainerCreationRequest(ctx context.Context, spec *specs.Spec) return oci.ParseAnnotationsBool(ctx, spec.Annotations, annotations.LCOWPrivileged, false) } +// convertSpecsToV2Resources converts specs.LinuxResources to cgroups2.Resources +func convertSpecsToV2Resources(resources *specs.LinuxResources) *cgroups2.Resources { + if resources == nil { + return &cgroups2.Resources{} + } + + v2Resources := &cgroups2.Resources{} + + // Memory conversion + if resources.Memory != nil { + v2Memory := &cgroups2.Memory{} + if resources.Memory.Limit != nil { + v2Memory.Max = resources.Memory.Limit + } + if resources.Memory.Reservation != nil { + v2Memory.Low = resources.Memory.Reservation + } + if resources.Memory.Swap != nil { + v2Memory.Swap = resources.Memory.Swap + } + v2Resources.Memory = v2Memory + } + + // CPU conversion + if resources.CPU != nil { + v2CPU := &cgroups2.CPU{} + if resources.CPU.Shares != nil { + // Convert shares to weight (shares * 262144 / 1024) + weight := *resources.CPU.Shares * 256 / 1024 + if weight < 1 { + weight = 1 + } else if weight > 10000 { + weight = 10000 + } + v2CPU.Weight = &weight + } + if resources.CPU.Quota != nil && resources.CPU.Period != nil { + v2CPU.Max = cgroups2.NewCPUMax(resources.CPU.Quota, resources.CPU.Period) + } + v2Resources.CPU = v2CPU + } + + // PIDs conversion + if resources.Pids != nil && resources.Pids.Limit > 0 { + v2Resources.Pids = &cgroups2.Pids{ + Max: resources.Pids.Limit, + } + } + + // IO conversion + if resources.BlockIO != nil { + v2IO := &cgroups2.IO{} + if resources.BlockIO.Weight != nil { + v2IO.BFQ.Weight = *resources.BlockIO.Weight + } + v2Resources.IO = v2IO + } + + return v2Resources +} + // Virtual Pod Management Methods // InitializeVirtualPodSupport sets up the parent cgroup for virtual pods -func (h *Host) InitializeVirtualPodSupport(virtualPodsCgroup cgroups.Cgroup) { +func (h *Host) InitializeVirtualPodSupport(virtualPodsCgroupManager interface{}) error { h.virtualPodsMutex.Lock() defer h.virtualPodsMutex.Unlock() - h.virtualPodsCgroupParent = virtualPodsCgroup - logrus.Info("Virtual pod support initialized") + // Check if the manager is nil + if virtualPodsCgroupManager == nil { + return errors.New("no valid cgroup manager provided for virtual pod support") + } + + // Try to get the CgroupManager interface methods + type CgroupManagerInterface interface { + GetV1Cgroup() cgroups.Cgroup + GetV2Manager() *cgroups2.Manager + Create(pid int) error + Delete() error + } + + if mgr, ok := virtualPodsCgroupManager.(CgroupManagerInterface); ok { + if v1Cgroup := mgr.GetV1Cgroup(); v1Cgroup != nil { + h.virtualPodsCgroupParent = v1Cgroup + logrus.Info("Virtual pod support initialized with cgroup v1") + return nil + } else if v2Manager := mgr.GetV2Manager(); v2Manager != nil { + h.virtualPodsCgroupV2Manager = v2Manager + logrus.Info("Virtual pod support initialized with cgroup v2") + return nil + } + } + + // Fallback to direct cgroup assignment for backward compatibility + if cg, ok := virtualPodsCgroupManager.(cgroups.Cgroup); ok { + h.virtualPodsCgroupParent = cg + logrus.Info("Virtual pod support initialized with direct cgroup assignment") + return nil + } + + return fmt.Errorf("no valid cgroup manager provided for virtual pod support") } // CreateVirtualPod creates a new virtual pod with its own cgroup and network namespace @@ -1295,20 +1389,7 @@ func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSan return fmt.Errorf("virtual pod %s already exists", virtualSandboxID) } - // Create cgroup path for this virtual pod under the parent cgroup - parentPath := "" - if h.virtualPodsCgroupParent != nil { - if pather, ok := h.virtualPodsCgroupParent.(interface{ Path() string }); ok { - parentPath = pather.Path() - } else { - parentPath = "/containers/virtual-pods" // fallback for default behavior - } - } else { - parentPath = "/containers/virtual-pods" // fallback for default behavior - } - cgroupPath := path.Join(parentPath, virtualSandboxID) - - // Create the cgroup for this virtual pod with resource limits if provided + // Extract resource limits if provided resources := &specs.LinuxResources{} if pSpec != nil && pSpec.Linux != nil && pSpec.Linux.Resources != nil { resources = pSpec.Linux.Resources @@ -1319,9 +1400,37 @@ func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSan logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Creating pod cgroup with default resources as none were specified") } - cgroupControl, err := cgroups.New(cgroups.StaticPath(cgroupPath), resources) - if err != nil { - return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID) + var cgroupControl interface{} + var cgroupPath string + var err error + + // Use appropriate API based on which cgroup version is available + if h.virtualPodsCgroupParent != nil { + // cgroup v1 path + if pather, ok := h.virtualPodsCgroupParent.(interface{ Path() string }); ok { + parentPath := pather.Path() + cgroupPath = path.Join(parentPath, virtualSandboxID) + } else { + cgroupPath = path.Join("/containers/virtual-pods", virtualSandboxID) // fallback + } + cgroupControl, err = cgroups.New(cgroups.StaticPath(cgroupPath), resources) + if err != nil { + return errors.Wrapf(err, "failed to create cgroup v1 for virtual pod %s", virtualSandboxID) + } + logrus.WithField("cgroupPath", cgroupPath).Info("Created virtual pod with cgroup v1") + } else if h.virtualPodsCgroupV2Manager != nil { + // cgroup v2 path + cgroupPath = path.Join("/containers/virtual-pods", virtualSandboxID) + // Convert specs.LinuxResources to cgroup v2 compatible resources + v2Resources := convertSpecsToV2Resources(resources) + manager, err := cgroups2.NewManager("/sys/fs/cgroup", cgroupPath, v2Resources) + if err != nil { + return errors.Wrapf(err, "failed to create cgroup v2 for virtual pod %s", virtualSandboxID) + } + cgroupControl = manager + logrus.WithField("cgroupPath", cgroupPath).Info("Created virtual pod with cgroup v2") + } else { + return fmt.Errorf("no virtual pod cgroup manager available (neither v1 nor v2)") } // Create virtual pod structure @@ -1423,8 +1532,20 @@ func (h *Host) RemoveContainerFromVirtualPod(containerID string) { // cleanupVirtualPod removes a virtual pod and its cgroup (should be called with mutex held) func (h *Host) cleanupVirtualPod(virtualSandboxID string) { if vp, exists := h.virtualPods[virtualSandboxID]; exists { - // Delete the cgroup - if err := vp.CgroupControl.Delete(); err != nil { + // Delete the cgroup - handle both v1 and v2 types + var err error + if v1Cgroup, ok := vp.CgroupControl.(cgroups.Cgroup); ok { + // v1 cgroup + err = v1Cgroup.Delete() + } else if v2Manager, ok := vp.CgroupControl.(*cgroups2.Manager); ok { + // v2 cgroup + err = v2Manager.Delete() + } else { + logrus.WithField("virtualSandboxID", virtualSandboxID). + Warn("Unknown cgroup control type in virtual pod") + } + + if err != nil { logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). Warn("Failed to delete virtual pod cgroup") } diff --git a/internal/guest/runtime/hcsv2/virtual_pod_test.go b/internal/guest/runtime/hcsv2/virtual_pod_test.go new file mode 100644 index 0000000000..7c1d93ac16 --- /dev/null +++ b/internal/guest/runtime/hcsv2/virtual_pod_test.go @@ -0,0 +1,324 @@ +//go:build linux +// +build linux + +package hcsv2 + +import ( + "fmt" + "testing" + "time" + + oci "github.com/opencontainers/runtime-spec/specs-go" +) + +func TestConvertSpecsToV2Resources_Memory(t *testing.T) { + limit := int64(1024 * 1024 * 1024) // 1GB + reservation := int64(512 * 1024 * 1024) // 512MB + swap := int64(2 * 1024 * 1024 * 1024) // 2GB + + resources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &limit, + Reservation: &reservation, + Swap: &swap, + }, + } + + v2Resources := convertSpecsToV2Resources(resources) + + if v2Resources == nil { + t.Fatal("v2Resources should not be nil") + } + if v2Resources.Memory == nil { + t.Fatal("v2Resources.Memory should not be nil") + } + if v2Resources.Memory.Max == nil || *v2Resources.Memory.Max != limit { + t.Errorf("Expected memory max %d, got %v", limit, v2Resources.Memory.Max) + } + if v2Resources.Memory.Low == nil || *v2Resources.Memory.Low != reservation { + t.Errorf("Expected memory low %d, got %v", reservation, v2Resources.Memory.Low) + } + if v2Resources.Memory.Swap == nil || *v2Resources.Memory.Swap != swap { + t.Errorf("Expected memory swap %d, got %v", swap, v2Resources.Memory.Swap) + } +} + +func TestConvertSpecsToV2Resources_PIDs(t *testing.T) { + limit := int64(1000) + + resources := &oci.LinuxResources{ + Pids: &oci.LinuxPids{ + Limit: limit, + }, + } + + v2Resources := convertSpecsToV2Resources(resources) + + if v2Resources == nil { + t.Fatal("v2Resources should not be nil") + } + if v2Resources.Pids == nil { + t.Fatal("v2Resources.Pids should not be nil") + } + if v2Resources.Pids.Max != limit { + t.Errorf("Expected PIDs max %d, got %d", limit, v2Resources.Pids.Max) + } +} + +func TestConvertSpecsToV2Resources_NilResources(t *testing.T) { + v2Resources := convertSpecsToV2Resources(nil) + + if v2Resources == nil { + t.Fatal("v2Resources should not be nil even for nil input") + } +} + +func TestVirtualPod_Structure(t *testing.T) { + virtualSandboxID := "test-virtual-sandbox-123" + masterSandboxID := "test-master-sandbox-456" + networkNamespace := "test-netns" + cgroupPath := "/containers/virtual-pods/test-virtual-sandbox-123" + + virtualPod := &VirtualPod{ + VirtualSandboxID: virtualSandboxID, + MasterSandboxID: masterSandboxID, + NetworkNamespace: networkNamespace, + CgroupPath: cgroupPath, + Containers: make(map[string]bool), + } + + if virtualPod.VirtualSandboxID != virtualSandboxID { + t.Errorf("Expected VirtualSandboxID %s, got %s", virtualSandboxID, virtualPod.VirtualSandboxID) + } + if virtualPod.MasterSandboxID != masterSandboxID { + t.Errorf("Expected MasterSandboxID %s, got %s", masterSandboxID, virtualPod.MasterSandboxID) + } + if virtualPod.NetworkNamespace != networkNamespace { + t.Errorf("Expected NetworkNamespace %s, got %s", networkNamespace, virtualPod.NetworkNamespace) + } + if virtualPod.CgroupPath != cgroupPath { + t.Errorf("Expected CgroupPath %s, got %s", cgroupPath, virtualPod.CgroupPath) + } + if virtualPod.Containers == nil { + t.Error("Containers map should not be nil") + } +} + +func TestVirtualPod_ContainerManagement(t *testing.T) { + virtualPod := &VirtualPod{ + Containers: make(map[string]bool), + } + + // Test adding containers + containerIDs := []string{"container-1", "container-2", "container-3"} + + for _, id := range containerIDs { + virtualPod.Containers[id] = true + } + + // Verify all containers are tracked + for _, id := range containerIDs { + if !virtualPod.Containers[id] { + t.Errorf("Container %s should be tracked", id) + } + } + + if len(virtualPod.Containers) != len(containerIDs) { + t.Errorf("Expected %d containers, got %d", len(containerIDs), len(virtualPod.Containers)) + } + + // Test removing containers + delete(virtualPod.Containers, containerIDs[0]) + if virtualPod.Containers[containerIDs[0]] { + t.Errorf("Container %s should have been removed", containerIDs[0]) + } + if len(virtualPod.Containers) != len(containerIDs)-1 { + t.Errorf("Expected %d containers after removal, got %d", len(containerIDs)-1, len(virtualPod.Containers)) + } +} + +func TestHost_InitializeVirtualPodSupport_ErrorCases(t *testing.T) { + host := &Host{} + + // Test with nil input + err := host.InitializeVirtualPodSupport(nil) + if err == nil { + t.Error("Expected error for nil input") + } + if err != nil && err.Error() != "no valid cgroup manager provided for virtual pod support" { + t.Errorf("Unexpected error message: %s", err.Error()) + } + + // Test with invalid interface + invalidMgr := "invalid manager" + err = host.InitializeVirtualPodSupport(invalidMgr) + if err == nil { + t.Error("Expected error for invalid manager") + } +} + +// Additional Virtual Pod Resource Management Tests +func TestVirtualPod_ResourceUpdates(t *testing.T) { + virtualPod := &VirtualPod{ + Containers: make(map[string]bool), + } + + // Test adding containers + containerID1 := "container-1" + containerID2 := "container-2" + + virtualPod.Containers[containerID1] = true + virtualPod.Containers[containerID2] = true + + if len(virtualPod.Containers) != 2 { + t.Errorf("Expected 2 containers, got %d", len(virtualPod.Containers)) + } + + // Test removing containers + delete(virtualPod.Containers, containerID1) + if len(virtualPod.Containers) != 1 { + t.Errorf("Expected 1 container after removal, got %d", len(virtualPod.Containers)) + } + if !virtualPod.Containers[containerID2] { + t.Error("Container 2 should still be present") + } +} + +func TestVirtualPod_CleanupV2(t *testing.T) { + virtualPod := &VirtualPod{ + Containers: map[string]bool{ + "container-1": true, + "container-2": true, + }, + } + + // Test cleanup operations + containerCount := len(virtualPod.Containers) + if containerCount != 2 { + t.Errorf("Expected 2 containers before cleanup, got %d", containerCount) + } + + // Simulate cleanup by clearing containers + virtualPod.Containers = make(map[string]bool) + if len(virtualPod.Containers) != 0 { + t.Error("Containers should be empty after cleanup") + } +} + +func TestVirtualPod_MixedV1V2Environment(t *testing.T) { + // Test virtual pod behavior in mixed cgroup environment + testCases := []struct { + name string + cgroupVersion int + expectedError bool + }{ + { + name: "CGroup V1", + cgroupVersion: 1, + expectedError: false, + }, + { + name: "CGroup V2", + cgroupVersion: 2, + expectedError: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + virtualPod := &VirtualPod{ + VirtualSandboxID: fmt.Sprintf("test-mixed-%d-123", tc.cgroupVersion), + CgroupPath: fmt.Sprintf("/containers/virtual-pods/test-mixed-%d-123", tc.cgroupVersion), + } + + if virtualPod.VirtualSandboxID == "" { + t.Error("VirtualSandboxID should not be empty") + } + if virtualPod.CgroupPath == "" { + t.Error("CgroupPath should not be empty") + } + }) + } +} + +func TestConvertSpecsToV2Resources_EdgeCases(t *testing.T) { + // Test with extremely large values + large := int64(^uint64(0) >> 1) // Max int64 + resources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &large, + }, + Pids: &oci.LinuxPids{ + Limit: large, + }, + } + + v2Resources := convertSpecsToV2Resources(resources) + if v2Resources == nil { + t.Fatal("v2Resources should not be nil") + } + if v2Resources.Memory == nil || *v2Resources.Memory.Max != large { + t.Error("Large memory limit should be preserved") + } + if v2Resources.Pids == nil || v2Resources.Pids.Max != large { + t.Error("Large PIDs limit should be preserved") + } + + // Test with zero values + zero := int64(0) + zeroResources := &oci.LinuxResources{ + Memory: &oci.LinuxMemory{ + Limit: &zero, + }, + Pids: &oci.LinuxPids{ + Limit: zero, + }, + } + + v2ZeroResources := convertSpecsToV2Resources(zeroResources) + if v2ZeroResources == nil { + t.Fatal("v2ZeroResources should not be nil") + } + if v2ZeroResources.Memory == nil || *v2ZeroResources.Memory.Max != zero { + t.Error("Zero memory limit should be preserved") + } + if v2ZeroResources.Pids != nil { + t.Error("Zero PIDs limit should not create PIDs resource") + } else { + t.Log("Zero PIDs limit correctly ignored") + } +} + +func TestVirtualPod_ConcurrentAccess(t *testing.T) { + virtualPod := &VirtualPod{ + Containers: make(map[string]bool), + } + + // Simulate concurrent container additions + go func() { + for i := 0; i < 10; i++ { + containerID := fmt.Sprintf("container-go1-%d", i) + virtualPod.Containers[containerID] = true + time.Sleep(1 * time.Millisecond) + } + }() + + go func() { + for i := 0; i < 10; i++ { + containerID := fmt.Sprintf("container-go2-%d", i) + virtualPod.Containers[containerID] = true + time.Sleep(1 * time.Millisecond) + } + }() + + // Wait for goroutines to complete + time.Sleep(50 * time.Millisecond) + + // In a real implementation, this would need proper synchronization + // This test demonstrates the need for thread-safe access + containerCount := len(virtualPod.Containers) + if containerCount == 0 { + t.Error("Should have containers after concurrent addition") + } + t.Logf("Final container count: %d", containerCount) +} diff --git a/test/go.mod b/test/go.mod index 7a713f8e14..3f72d4d9bd 100644 --- a/test/go.mod +++ b/test/go.mod @@ -43,6 +43,7 @@ require ( github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/checkpoint-restore/go-criu/v6 v6.3.0 // indirect + github.com/cilium/ebpf v0.17.3 // indirect github.com/containerd/console v1.0.5 // indirect github.com/containerd/continuity v0.4.5 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect diff --git a/test/go.sum b/test/go.sum index 8cd7517706..6a107b78d7 100644 --- a/test/go.sum +++ b/test/go.sum @@ -33,6 +33,8 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/checkpoint-restore/go-criu/v6 v6.3.0 h1:mIdrSO2cPNWQY1truPg6uHLXyKHk3Z5Odx4wjKOASzA= github.com/checkpoint-restore/go-criu/v6 v6.3.0/go.mod h1:rrRTN/uSwY2X+BPRl/gkulo9gsKOSAeVp9/K2tv7xZI= +github.com/cilium/ebpf v0.17.3 h1:FnP4r16PWYSE4ux6zN+//jMcW4nMVRvuTLVTvCjyyjg= +github.com/cilium/ebpf v0.17.3/go.mod h1:G5EDHij8yiLzaqn0WjyfJHvRa+3aDlReIaLVRMvOyJk= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/containerd/cgroups/v3 v3.0.5 h1:44na7Ud+VwyE7LIoJ8JTNQOa549a8543BmzaJHo6Bzo= @@ -116,6 +118,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= +github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= @@ -175,6 +179,10 @@ github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9 github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/josephspurrier/goversioninfo v1.5.0 h1:9TJtORoyf4YMoWSOo/cXFN9A/lB3PniJ91OxIH6e7Zg= github.com/josephspurrier/goversioninfo v1.5.0/go.mod h1:6MoTvFZ6GKJkzcdLnU5T/RGYUbHQbKpYeNP0AgQLd2o= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= @@ -200,6 +208,10 @@ github.com/linuxkit/virtsock v0.0.0-20241009230534-cb6a20cc0422 h1:XvRuyDDRvi+UD github.com/linuxkit/virtsock v0.0.0-20241009230534-cb6a20cc0422/go.mod h1:JLgfq4XMVbvfNlAXla/41lZnp21O72a/wWHGJefAvgQ= github.com/mattn/go-shellwords v1.0.12 h1:M2zGm7EW6UQJvDeQxo4T51eKPurbeFbe8WtebGE2xrk= github.com/mattn/go-shellwords v1.0.12/go.mod h1:EZzvwXDESEeg03EKmM+RmDnNOPKG4lLtQsUlTZDWQ8Y= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= +github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= github.com/miekg/dns v1.1.57 h1:Jzi7ApEIzwEPLHWRcafCN9LZSBbqQpxjt/wpgvg7wcM= github.com/miekg/dns v1.1.57/go.mod h1:uqRjCRUuEAA6qsOiJvDd+CFo/vW+y5WR6SNmHE55hZk= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= diff --git a/vendor/github.com/cilium/ebpf/.clang-format b/vendor/github.com/cilium/ebpf/.clang-format new file mode 100644 index 0000000000..0ff4257606 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.clang-format @@ -0,0 +1,25 @@ +--- +Language: Cpp +BasedOnStyle: LLVM +AlignAfterOpenBracket: DontAlign +AlignConsecutiveAssignments: true +AlignEscapedNewlines: DontAlign +# mkdocs annotations in source code are written as trailing comments +# and alignment pushes these really far away from the content. +AlignTrailingComments: false +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortFunctionsOnASingleLine: false +BreakBeforeBraces: Attach +IndentWidth: 4 +KeepEmptyLinesAtTheStartOfBlocks: false +TabWidth: 4 +UseTab: ForContinuationAndIndentation +ColumnLimit: 1000 +# Go compiler comments need to stay unindented. +CommentPragmas: '^go:.*' +# linux/bpf.h needs to be included before bpf/bpf_helpers.h for types like __u64 +# and sorting makes this impossible. +SortIncludes: false +... diff --git a/vendor/github.com/cilium/ebpf/.gitattributes b/vendor/github.com/cilium/ebpf/.gitattributes new file mode 100644 index 0000000000..113f97b980 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.gitattributes @@ -0,0 +1 @@ +internal/sys/types.go linguist-generated=false diff --git a/vendor/github.com/cilium/ebpf/.gitignore b/vendor/github.com/cilium/ebpf/.gitignore new file mode 100644 index 0000000000..b46162b8ec --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.gitignore @@ -0,0 +1,14 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib +*.o +!*_bpf*.o + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out diff --git a/vendor/github.com/cilium/ebpf/.golangci.yaml b/vendor/github.com/cilium/ebpf/.golangci.yaml new file mode 100644 index 0000000000..8298e59ed8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.golangci.yaml @@ -0,0 +1,31 @@ +--- +linters: + disable-all: true + enable: + - goimports + - gosimple + - govet + - ineffassign + - misspell + - staticcheck + - typecheck + - unused + - gofmt + - depguard +linters-settings: + goimports: + # A comma-separated list of prefixes, which, if set, checks import paths + # with the given prefixes are grouped after 3rd-party packages. + # Default: "" + local-prefixes: github.com/cilium/ebpf + depguard: + rules: + no-x-sys-unix: + files: + # Filenames are matched against absolute paths, include **/ at the start. + - '!**/internal/unix/*.go' + - '!**/examples/**/*.go' + - '!**/docs/**/*.go' + deny: + - pkg: golang.org/x/sys/unix + desc: use internal/unix instead diff --git a/vendor/github.com/cilium/ebpf/.vimto.toml b/vendor/github.com/cilium/ebpf/.vimto.toml new file mode 100644 index 0000000000..49a12dbc09 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.vimto.toml @@ -0,0 +1,12 @@ +kernel="ghcr.io/cilium/ci-kernels:stable" +smp="cpus=2" +memory="1G" +user="root" +setup=[ + "mount -t cgroup2 -o nosuid,noexec,nodev cgroup2 /sys/fs/cgroup", + "/bin/sh -c 'modprobe bpf_testmod || true'", + "dmesg --clear", +] +teardown=[ + "dmesg --read-clear", +] diff --git a/vendor/github.com/cilium/ebpf/CODEOWNERS b/vendor/github.com/cilium/ebpf/CODEOWNERS new file mode 100644 index 0000000000..0f76dce85c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/CODEOWNERS @@ -0,0 +1,13 @@ +* @cilium/ebpf-lib-maintainers + +features/ @rgo3 +link/ @mmat11 + +perf/ @florianl +ringbuf/ @florianl + +btf/ @dylandreimerink + +cmd/bpf2go/ @mejedi + +docs/ @ti-mo diff --git a/vendor/github.com/cilium/ebpf/CODE_OF_CONDUCT.md b/vendor/github.com/cilium/ebpf/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..8e42838c5a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at nathanjsweet at gmail dot com or i at lmb dot io. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/vendor/github.com/cilium/ebpf/CONTRIBUTING.md b/vendor/github.com/cilium/ebpf/CONTRIBUTING.md new file mode 100644 index 0000000000..673a9ac290 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contributing to ebpf-go + +Want to contribute to ebpf-go? There are a few things you need to know. + +We wrote a [contribution guide](https://ebpf-go.dev/contributing/) to help you get started. diff --git a/vendor/github.com/cilium/ebpf/LICENSE b/vendor/github.com/cilium/ebpf/LICENSE new file mode 100644 index 0000000000..c637ae99c2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/LICENSE @@ -0,0 +1,23 @@ +MIT License + +Copyright (c) 2017 Nathan Sweet +Copyright (c) 2018, 2019 Cloudflare +Copyright (c) 2019 Authors of Cilium + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/cilium/ebpf/MAINTAINERS.md b/vendor/github.com/cilium/ebpf/MAINTAINERS.md new file mode 100644 index 0000000000..a56a03e394 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/MAINTAINERS.md @@ -0,0 +1,3 @@ +# Maintainers + +Maintainers can be found in the [Cilium Maintainers file](https://github.com/cilium/community/blob/main/roles/Maintainers.md) diff --git a/vendor/github.com/cilium/ebpf/Makefile b/vendor/github.com/cilium/ebpf/Makefile new file mode 100644 index 0000000000..e0fe974920 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/Makefile @@ -0,0 +1,114 @@ +# The development version of clang is distributed as the 'clang' binary, +# while stable/released versions have a version number attached. +# Pin the default clang to a stable version. +CLANG ?= clang-17 +STRIP ?= llvm-strip-17 +OBJCOPY ?= llvm-objcopy-17 +CFLAGS := -O2 -g -Wall -Werror $(CFLAGS) + +CI_KERNEL_URL ?= https://github.com/cilium/ci-kernels/raw/master/ + +# Obtain an absolute path to the directory of the Makefile. +# Assume the Makefile is in the root of the repository. +REPODIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) +UIDGID := $(shell stat -c '%u:%g' ${REPODIR}) + +# Prefer podman if installed, otherwise use docker. +# Note: Setting the var at runtime will always override. +CONTAINER_ENGINE ?= $(if $(shell command -v podman), podman, docker) +CONTAINER_RUN_ARGS ?= $(if $(filter ${CONTAINER_ENGINE}, podman), --log-driver=none, --user "${UIDGID}") + +IMAGE := $(shell cat ${REPODIR}/testdata/docker/IMAGE) +VERSION := $(shell cat ${REPODIR}/testdata/docker/VERSION) + +TARGETS := \ + testdata/loader-clang-11 \ + testdata/loader-clang-14 \ + testdata/loader-$(CLANG) \ + testdata/manyprogs \ + testdata/btf_map_init \ + testdata/invalid_map \ + testdata/raw_tracepoint \ + testdata/invalid_map_static \ + testdata/invalid_btf_map_init \ + testdata/strings \ + testdata/freplace \ + testdata/fentry_fexit \ + testdata/iproute2_map_compat \ + testdata/map_spin_lock \ + testdata/subprog_reloc \ + testdata/fwd_decl \ + testdata/kconfig \ + testdata/ksym \ + testdata/kfunc \ + testdata/invalid-kfunc \ + testdata/kfunc-kmod \ + testdata/constants \ + testdata/errors \ + testdata/variables \ + btf/testdata/relocs \ + btf/testdata/relocs_read \ + btf/testdata/relocs_read_tgt \ + btf/testdata/relocs_enum \ + btf/testdata/tags \ + cmd/bpf2go/testdata/minimal + +.PHONY: all clean container-all container-shell generate + +.DEFAULT_TARGET = container-all + +# Build all ELF binaries using a containerized LLVM toolchain. +container-all: + +${CONTAINER_ENGINE} run --rm -ti ${CONTAINER_RUN_ARGS} \ + -v "${REPODIR}":/ebpf -w /ebpf --env MAKEFLAGS \ + --env HOME="/tmp" \ + --env BPF2GO_CC="$(CLANG)" \ + --env BPF2GO_FLAGS="-fdebug-prefix-map=/ebpf=. $(CFLAGS)" \ + "${IMAGE}:${VERSION}" \ + make all + +# (debug) Drop the user into a shell inside the container as root. +# Set BPF2GO_ envs to make 'make generate' just work. +container-shell: + ${CONTAINER_ENGINE} run --rm -ti \ + -v "${REPODIR}":/ebpf -w /ebpf \ + --env BPF2GO_CC="$(CLANG)" \ + --env BPF2GO_FLAGS="-fdebug-prefix-map=/ebpf=. $(CFLAGS)" \ + "${IMAGE}:${VERSION}" + +clean: + find "$(CURDIR)" -name "*.elf" -delete + find "$(CURDIR)" -name "*.o" -delete + +format: + find . -type f -name "*.c" | xargs clang-format -i + +all: format $(addsuffix -el.elf,$(TARGETS)) $(addsuffix -eb.elf,$(TARGETS)) generate + ln -srf testdata/loader-$(CLANG)-el.elf testdata/loader-el.elf + ln -srf testdata/loader-$(CLANG)-eb.elf testdata/loader-eb.elf + +generate: + go generate -run "internal/cmd/gentypes" ./... + go generate -skip "internal/cmd/gentypes" ./... + +testdata/loader-%-el.elf: testdata/loader.c + $* $(CFLAGS) -target bpfel -c $< -o $@ + $(STRIP) -g $@ + +testdata/loader-%-eb.elf: testdata/loader.c + $* $(CFLAGS) -target bpfeb -c $< -o $@ + $(STRIP) -g $@ + +%-el.elf: %.c + $(CLANG) $(CFLAGS) -target bpfel -c $< -o $@ + $(STRIP) -g $@ + +%-eb.elf : %.c + $(CLANG) $(CFLAGS) -target bpfeb -c $< -o $@ + $(STRIP) -g $@ + +.PHONY: update-kernel-deps +update-kernel-deps: export KERNEL_VERSION?=6.8 +update-kernel-deps: + ./testdata/sh/update-kernel-deps.sh + $(MAKE) container-all diff --git a/vendor/github.com/cilium/ebpf/README.md b/vendor/github.com/cilium/ebpf/README.md new file mode 100644 index 0000000000..8238256c8e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/README.md @@ -0,0 +1,73 @@ +# eBPF + +[![PkgGoDev](https://pkg.go.dev/badge/github.com/cilium/ebpf)](https://pkg.go.dev/github.com/cilium/ebpf) + +![HoneyGopher](docs/ebpf/ebpf-go.png) + +ebpf-go is a pure Go library that provides utilities for loading, compiling, and +debugging eBPF programs. It has minimal external dependencies and is intended to +be used in long running processes. + +See [ebpf.io](https://ebpf.io) for complementary projects from the wider eBPF +ecosystem. + +## Getting Started + +Please take a look at our [Getting Started] guide. + +[Contributions](https://ebpf-go.dev/contributing) are highly encouraged, as they highlight certain use cases of +eBPF and the library, and help shape the future of the project. + +## Getting Help + +The community actively monitors our [GitHub Discussions](https://github.com/cilium/ebpf/discussions) page. +Please search for existing threads before starting a new one. Refrain from +opening issues on the bug tracker if you're just starting out or if you're not +sure if something is a bug in the library code. + +Alternatively, [join](https://ebpf.io/slack) the +[#ebpf-go](https://cilium.slack.com/messages/ebpf-go) channel on Slack if you +have other questions regarding the project. Note that this channel is ephemeral +and has its history erased past a certain point, which is less helpful for +others running into the same problem later. + +## Packages + +This library includes the following packages: + +* [asm](https://pkg.go.dev/github.com/cilium/ebpf/asm) contains a basic + assembler, allowing you to write eBPF assembly instructions directly + within your Go code. (You don't need to use this if you prefer to write your eBPF program in C.) +* [cmd/bpf2go](https://pkg.go.dev/github.com/cilium/ebpf/cmd/bpf2go) allows + compiling and embedding eBPF programs written in C within Go code. As well as + compiling the C code, it auto-generates Go code for loading and manipulating + the eBPF program and map objects. +* [link](https://pkg.go.dev/github.com/cilium/ebpf/link) allows attaching eBPF + to various hooks +* [perf](https://pkg.go.dev/github.com/cilium/ebpf/perf) allows reading from a + `PERF_EVENT_ARRAY` +* [ringbuf](https://pkg.go.dev/github.com/cilium/ebpf/ringbuf) allows reading from a + `BPF_MAP_TYPE_RINGBUF` map +* [features](https://pkg.go.dev/github.com/cilium/ebpf/features) implements the equivalent + of `bpftool feature probe` for discovering BPF-related kernel features using native Go. +* [rlimit](https://pkg.go.dev/github.com/cilium/ebpf/rlimit) provides a convenient API to lift + the `RLIMIT_MEMLOCK` constraint on kernels before 5.11. +* [btf](https://pkg.go.dev/github.com/cilium/ebpf/btf) allows reading the BPF Type Format. +* [pin](https://pkg.go.dev/github.com/cilium/ebpf/pin) provides APIs for working with pinned objects on bpffs. + +## Requirements + +* A version of Go that is [supported by + upstream](https://golang.org/doc/devel/release.html#policy) +* CI is run against kernel.org LTS releases. >= 4.4 should work but EOL'ed versions + are not supported. + +## License + +MIT + +### eBPF Gopher + +The eBPF honeygopher is based on the Go gopher designed by Renee French. + +[Getting Started]: https://ebpf-go.dev/guides/getting-started/ diff --git a/vendor/github.com/cilium/ebpf/asm/alu.go b/vendor/github.com/cilium/ebpf/asm/alu.go new file mode 100644 index 0000000000..282233d327 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu.go @@ -0,0 +1,180 @@ +package asm + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -output alu_string.go -type=Source,Endianness,ALUOp + +// Source of ALU / ALU64 / Branch operations +// +// msb lsb +// +------------+-+---+ +// | op |S|cls| +// +------------+-+---+ +type Source uint16 + +const sourceMask OpCode = 0x0008 + +// Source bitmask +const ( + // InvalidSource is returned by getters when invoked + // on non ALU / branch OpCodes. + InvalidSource Source = 0xffff + // ImmSource src is from constant + ImmSource Source = 0x0000 + // RegSource src is from register + RegSource Source = 0x0008 +) + +// The Endianness of a byte swap instruction. +type Endianness uint8 + +const endianMask = sourceMask + +// Endian flags +const ( + InvalidEndian Endianness = 0xff + // Convert to little endian + LE Endianness = 0x00 + // Convert to big endian + BE Endianness = 0x08 +) + +// ALUOp are ALU / ALU64 operations +// +// msb lsb +// +-------+----+-+---+ +// | EXT | OP |s|cls| +// +-------+----+-+---+ +type ALUOp uint16 + +const aluMask OpCode = 0x3ff0 + +const ( + // InvalidALUOp is returned by getters when invoked + // on non ALU OpCodes + InvalidALUOp ALUOp = 0xffff + // Add - addition + Add ALUOp = 0x0000 + // Sub - subtraction + Sub ALUOp = 0x0010 + // Mul - multiplication + Mul ALUOp = 0x0020 + // Div - division + Div ALUOp = 0x0030 + // SDiv - signed division + SDiv ALUOp = Div + 0x0100 + // Or - bitwise or + Or ALUOp = 0x0040 + // And - bitwise and + And ALUOp = 0x0050 + // LSh - bitwise shift left + LSh ALUOp = 0x0060 + // RSh - bitwise shift right + RSh ALUOp = 0x0070 + // Neg - sign/unsign signing bit + Neg ALUOp = 0x0080 + // Mod - modulo + Mod ALUOp = 0x0090 + // SMod - signed modulo + SMod ALUOp = Mod + 0x0100 + // Xor - bitwise xor + Xor ALUOp = 0x00a0 + // Mov - move value from one place to another + Mov ALUOp = 0x00b0 + // MovSX8 - move lower 8 bits, sign extended upper bits of target + MovSX8 ALUOp = Mov + 0x0100 + // MovSX16 - move lower 16 bits, sign extended upper bits of target + MovSX16 ALUOp = Mov + 0x0200 + // MovSX32 - move lower 32 bits, sign extended upper bits of target + MovSX32 ALUOp = Mov + 0x0300 + // ArSh - arithmetic shift + ArSh ALUOp = 0x00c0 + // Swap - endian conversions + Swap ALUOp = 0x00d0 +) + +// HostTo converts from host to another endianness. +func HostTo(endian Endianness, dst Register, size Size) Instruction { + var imm int64 + switch size { + case Half: + imm = 16 + case Word: + imm = 32 + case DWord: + imm = 64 + default: + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(ALUClass).SetALUOp(Swap).SetSource(Source(endian)), + Dst: dst, + Constant: imm, + } +} + +// BSwap unconditionally reverses the order of bytes in a register. +func BSwap(dst Register, size Size) Instruction { + var imm int64 + switch size { + case Half: + imm = 16 + case Word: + imm = 32 + case DWord: + imm = 64 + default: + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: OpCode(ALU64Class).SetALUOp(Swap), + Dst: dst, + Constant: imm, + } +} + +// Op returns the OpCode for an ALU operation with a given source. +func (op ALUOp) Op(source Source) OpCode { + return OpCode(ALU64Class).SetALUOp(op).SetSource(source) +} + +// Reg emits `dst (op) src`. +func (op ALUOp) Reg(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm emits `dst (op) value`. +func (op ALUOp) Imm(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op(ImmSource), + Dst: dst, + Constant: int64(value), + } +} + +// Op32 returns the OpCode for a 32-bit ALU operation with a given source. +func (op ALUOp) Op32(source Source) OpCode { + return OpCode(ALUClass).SetALUOp(op).SetSource(source) +} + +// Reg32 emits `dst (op) src`, zeroing the upper 32 bit of dst. +func (op ALUOp) Reg32(dst, src Register) Instruction { + return Instruction{ + OpCode: op.Op32(RegSource), + Dst: dst, + Src: src, + } +} + +// Imm32 emits `dst (op) value`, zeroing the upper 32 bit of dst. +func (op ALUOp) Imm32(dst Register, value int32) Instruction { + return Instruction{ + OpCode: op.Op32(ImmSource), + Dst: dst, + Constant: int64(value), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/alu_string.go b/vendor/github.com/cilium/ebpf/asm/alu_string.go new file mode 100644 index 0000000000..35b406bf3f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/alu_string.go @@ -0,0 +1,117 @@ +// Code generated by "stringer -output alu_string.go -type=Source,Endianness,ALUOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSource-65535] + _ = x[ImmSource-0] + _ = x[RegSource-8] +} + +const ( + _Source_name_0 = "ImmSource" + _Source_name_1 = "RegSource" + _Source_name_2 = "InvalidSource" +) + +func (i Source) String() string { + switch { + case i == 0: + return _Source_name_0 + case i == 8: + return _Source_name_1 + case i == 65535: + return _Source_name_2 + default: + return "Source(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidEndian-255] + _ = x[LE-0] + _ = x[BE-8] +} + +const ( + _Endianness_name_0 = "LE" + _Endianness_name_1 = "BE" + _Endianness_name_2 = "InvalidEndian" +) + +func (i Endianness) String() string { + switch { + case i == 0: + return _Endianness_name_0 + case i == 8: + return _Endianness_name_1 + case i == 255: + return _Endianness_name_2 + default: + return "Endianness(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidALUOp-65535] + _ = x[Add-0] + _ = x[Sub-16] + _ = x[Mul-32] + _ = x[Div-48] + _ = x[SDiv-304] + _ = x[Or-64] + _ = x[And-80] + _ = x[LSh-96] + _ = x[RSh-112] + _ = x[Neg-128] + _ = x[Mod-144] + _ = x[SMod-400] + _ = x[Xor-160] + _ = x[Mov-176] + _ = x[MovSX8-432] + _ = x[MovSX16-688] + _ = x[MovSX32-944] + _ = x[ArSh-192] + _ = x[Swap-208] +} + +const _ALUOp_name = "AddSubMulDivOrAndLShRShNegModXorMovArShSwapSDivSModMovSX8MovSX16MovSX32InvalidALUOp" + +var _ALUOp_map = map[ALUOp]string{ + 0: _ALUOp_name[0:3], + 16: _ALUOp_name[3:6], + 32: _ALUOp_name[6:9], + 48: _ALUOp_name[9:12], + 64: _ALUOp_name[12:14], + 80: _ALUOp_name[14:17], + 96: _ALUOp_name[17:20], + 112: _ALUOp_name[20:23], + 128: _ALUOp_name[23:26], + 144: _ALUOp_name[26:29], + 160: _ALUOp_name[29:32], + 176: _ALUOp_name[32:35], + 192: _ALUOp_name[35:39], + 208: _ALUOp_name[39:43], + 304: _ALUOp_name[43:47], + 400: _ALUOp_name[47:51], + 432: _ALUOp_name[51:57], + 688: _ALUOp_name[57:64], + 944: _ALUOp_name[64:71], + 65535: _ALUOp_name[71:83], +} + +func (i ALUOp) String() string { + if str, ok := _ALUOp_map[i]; ok { + return str + } + return "ALUOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/doc.go b/vendor/github.com/cilium/ebpf/asm/doc.go new file mode 100644 index 0000000000..7031bdc276 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/doc.go @@ -0,0 +1,2 @@ +// Package asm is an assembler for eBPF bytecode. +package asm diff --git a/vendor/github.com/cilium/ebpf/asm/func.go b/vendor/github.com/cilium/ebpf/asm/func.go new file mode 100644 index 0000000000..7a59acf383 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func.go @@ -0,0 +1,244 @@ +package asm + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -output func_string.go -type=BuiltinFunc + +// BuiltinFunc is a built-in eBPF function. +type BuiltinFunc int32 + +// eBPF built-in functions +// +// You can regenerate this list using the following gawk script: +// +// /FN\(.+\),/ { +// match($1, /\(([a-z_0-9]+),/, r) +// split(r[1], p, "_") +// printf "Fn" +// for (i in p) { +// printf "%s%s", toupper(substr(p[i], 1, 1)), substr(p[i], 2) +// } +// print "" +// } +// +// The script expects include/uapi/linux/bpf.h as it's input. +const ( + FnUnspec BuiltinFunc = iota + FnMapLookupElem + FnMapUpdateElem + FnMapDeleteElem + FnProbeRead + FnKtimeGetNs + FnTracePrintk + FnGetPrandomU32 + FnGetSmpProcessorId + FnSkbStoreBytes + FnL3CsumReplace + FnL4CsumReplace + FnTailCall + FnCloneRedirect + FnGetCurrentPidTgid + FnGetCurrentUidGid + FnGetCurrentComm + FnGetCgroupClassid + FnSkbVlanPush + FnSkbVlanPop + FnSkbGetTunnelKey + FnSkbSetTunnelKey + FnPerfEventRead + FnRedirect + FnGetRouteRealm + FnPerfEventOutput + FnSkbLoadBytes + FnGetStackid + FnCsumDiff + FnSkbGetTunnelOpt + FnSkbSetTunnelOpt + FnSkbChangeProto + FnSkbChangeType + FnSkbUnderCgroup + FnGetHashRecalc + FnGetCurrentTask + FnProbeWriteUser + FnCurrentTaskUnderCgroup + FnSkbChangeTail + FnSkbPullData + FnCsumUpdate + FnSetHashInvalid + FnGetNumaNodeId + FnSkbChangeHead + FnXdpAdjustHead + FnProbeReadStr + FnGetSocketCookie + FnGetSocketUid + FnSetHash + FnSetsockopt + FnSkbAdjustRoom + FnRedirectMap + FnSkRedirectMap + FnSockMapUpdate + FnXdpAdjustMeta + FnPerfEventReadValue + FnPerfProgReadValue + FnGetsockopt + FnOverrideReturn + FnSockOpsCbFlagsSet + FnMsgRedirectMap + FnMsgApplyBytes + FnMsgCorkBytes + FnMsgPullData + FnBind + FnXdpAdjustTail + FnSkbGetXfrmState + FnGetStack + FnSkbLoadBytesRelative + FnFibLookup + FnSockHashUpdate + FnMsgRedirectHash + FnSkRedirectHash + FnLwtPushEncap + FnLwtSeg6StoreBytes + FnLwtSeg6AdjustSrh + FnLwtSeg6Action + FnRcRepeat + FnRcKeydown + FnSkbCgroupId + FnGetCurrentCgroupId + FnGetLocalStorage + FnSkSelectReuseport + FnSkbAncestorCgroupId + FnSkLookupTcp + FnSkLookupUdp + FnSkRelease + FnMapPushElem + FnMapPopElem + FnMapPeekElem + FnMsgPushData + FnMsgPopData + FnRcPointerRel + FnSpinLock + FnSpinUnlock + FnSkFullsock + FnTcpSock + FnSkbEcnSetCe + FnGetListenerSock + FnSkcLookupTcp + FnTcpCheckSyncookie + FnSysctlGetName + FnSysctlGetCurrentValue + FnSysctlGetNewValue + FnSysctlSetNewValue + FnStrtol + FnStrtoul + FnSkStorageGet + FnSkStorageDelete + FnSendSignal + FnTcpGenSyncookie + FnSkbOutput + FnProbeReadUser + FnProbeReadKernel + FnProbeReadUserStr + FnProbeReadKernelStr + FnTcpSendAck + FnSendSignalThread + FnJiffies64 + FnReadBranchRecords + FnGetNsCurrentPidTgid + FnXdpOutput + FnGetNetnsCookie + FnGetCurrentAncestorCgroupId + FnSkAssign + FnKtimeGetBootNs + FnSeqPrintf + FnSeqWrite + FnSkCgroupId + FnSkAncestorCgroupId + FnRingbufOutput + FnRingbufReserve + FnRingbufSubmit + FnRingbufDiscard + FnRingbufQuery + FnCsumLevel + FnSkcToTcp6Sock + FnSkcToTcpSock + FnSkcToTcpTimewaitSock + FnSkcToTcpRequestSock + FnSkcToUdp6Sock + FnGetTaskStack + FnLoadHdrOpt + FnStoreHdrOpt + FnReserveHdrOpt + FnInodeStorageGet + FnInodeStorageDelete + FnDPath + FnCopyFromUser + FnSnprintfBtf + FnSeqPrintfBtf + FnSkbCgroupClassid + FnRedirectNeigh + FnPerCpuPtr + FnThisCpuPtr + FnRedirectPeer + FnTaskStorageGet + FnTaskStorageDelete + FnGetCurrentTaskBtf + FnBprmOptsSet + FnKtimeGetCoarseNs + FnImaInodeHash + FnSockFromFile + FnCheckMtu + FnForEachMapElem + FnSnprintf + FnSysBpf + FnBtfFindByNameKind + FnSysClose + FnTimerInit + FnTimerSetCallback + FnTimerStart + FnTimerCancel + FnGetFuncIp + FnGetAttachCookie + FnTaskPtRegs + FnGetBranchSnapshot + FnTraceVprintk + FnSkcToUnixSock + FnKallsymsLookupName + FnFindVma + FnLoop + FnStrncmp + FnGetFuncArg + FnGetFuncRet + FnGetFuncArgCnt + FnGetRetval + FnSetRetval + FnXdpGetBuffLen + FnXdpLoadBytes + FnXdpStoreBytes + FnCopyFromUserTask + FnSkbSetTstamp + FnImaFileHash + FnKptrXchg + FnMapLookupPercpuElem + FnSkcToMptcpSock + FnDynptrFromMem + FnRingbufReserveDynptr + FnRingbufSubmitDynptr + FnRingbufDiscardDynptr + FnDynptrRead + FnDynptrWrite + FnDynptrData + FnTcpRawGenSyncookieIpv4 + FnTcpRawGenSyncookieIpv6 + FnTcpRawCheckSyncookieIpv4 + FnTcpRawCheckSyncookieIpv6 + FnKtimeGetTaiNs + FnUserRingbufDrain + FnCgrpStorageGet + FnCgrpStorageDelete +) + +// Call emits a function call. +func (fn BuiltinFunc) Call() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Constant: int64(fn), + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/func_string.go b/vendor/github.com/cilium/ebpf/asm/func_string.go new file mode 100644 index 0000000000..6c4ba37892 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/func_string.go @@ -0,0 +1,234 @@ +// Code generated by "stringer -output func_string.go -type=BuiltinFunc"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[FnUnspec-0] + _ = x[FnMapLookupElem-1] + _ = x[FnMapUpdateElem-2] + _ = x[FnMapDeleteElem-3] + _ = x[FnProbeRead-4] + _ = x[FnKtimeGetNs-5] + _ = x[FnTracePrintk-6] + _ = x[FnGetPrandomU32-7] + _ = x[FnGetSmpProcessorId-8] + _ = x[FnSkbStoreBytes-9] + _ = x[FnL3CsumReplace-10] + _ = x[FnL4CsumReplace-11] + _ = x[FnTailCall-12] + _ = x[FnCloneRedirect-13] + _ = x[FnGetCurrentPidTgid-14] + _ = x[FnGetCurrentUidGid-15] + _ = x[FnGetCurrentComm-16] + _ = x[FnGetCgroupClassid-17] + _ = x[FnSkbVlanPush-18] + _ = x[FnSkbVlanPop-19] + _ = x[FnSkbGetTunnelKey-20] + _ = x[FnSkbSetTunnelKey-21] + _ = x[FnPerfEventRead-22] + _ = x[FnRedirect-23] + _ = x[FnGetRouteRealm-24] + _ = x[FnPerfEventOutput-25] + _ = x[FnSkbLoadBytes-26] + _ = x[FnGetStackid-27] + _ = x[FnCsumDiff-28] + _ = x[FnSkbGetTunnelOpt-29] + _ = x[FnSkbSetTunnelOpt-30] + _ = x[FnSkbChangeProto-31] + _ = x[FnSkbChangeType-32] + _ = x[FnSkbUnderCgroup-33] + _ = x[FnGetHashRecalc-34] + _ = x[FnGetCurrentTask-35] + _ = x[FnProbeWriteUser-36] + _ = x[FnCurrentTaskUnderCgroup-37] + _ = x[FnSkbChangeTail-38] + _ = x[FnSkbPullData-39] + _ = x[FnCsumUpdate-40] + _ = x[FnSetHashInvalid-41] + _ = x[FnGetNumaNodeId-42] + _ = x[FnSkbChangeHead-43] + _ = x[FnXdpAdjustHead-44] + _ = x[FnProbeReadStr-45] + _ = x[FnGetSocketCookie-46] + _ = x[FnGetSocketUid-47] + _ = x[FnSetHash-48] + _ = x[FnSetsockopt-49] + _ = x[FnSkbAdjustRoom-50] + _ = x[FnRedirectMap-51] + _ = x[FnSkRedirectMap-52] + _ = x[FnSockMapUpdate-53] + _ = x[FnXdpAdjustMeta-54] + _ = x[FnPerfEventReadValue-55] + _ = x[FnPerfProgReadValue-56] + _ = x[FnGetsockopt-57] + _ = x[FnOverrideReturn-58] + _ = x[FnSockOpsCbFlagsSet-59] + _ = x[FnMsgRedirectMap-60] + _ = x[FnMsgApplyBytes-61] + _ = x[FnMsgCorkBytes-62] + _ = x[FnMsgPullData-63] + _ = x[FnBind-64] + _ = x[FnXdpAdjustTail-65] + _ = x[FnSkbGetXfrmState-66] + _ = x[FnGetStack-67] + _ = x[FnSkbLoadBytesRelative-68] + _ = x[FnFibLookup-69] + _ = x[FnSockHashUpdate-70] + _ = x[FnMsgRedirectHash-71] + _ = x[FnSkRedirectHash-72] + _ = x[FnLwtPushEncap-73] + _ = x[FnLwtSeg6StoreBytes-74] + _ = x[FnLwtSeg6AdjustSrh-75] + _ = x[FnLwtSeg6Action-76] + _ = x[FnRcRepeat-77] + _ = x[FnRcKeydown-78] + _ = x[FnSkbCgroupId-79] + _ = x[FnGetCurrentCgroupId-80] + _ = x[FnGetLocalStorage-81] + _ = x[FnSkSelectReuseport-82] + _ = x[FnSkbAncestorCgroupId-83] + _ = x[FnSkLookupTcp-84] + _ = x[FnSkLookupUdp-85] + _ = x[FnSkRelease-86] + _ = x[FnMapPushElem-87] + _ = x[FnMapPopElem-88] + _ = x[FnMapPeekElem-89] + _ = x[FnMsgPushData-90] + _ = x[FnMsgPopData-91] + _ = x[FnRcPointerRel-92] + _ = x[FnSpinLock-93] + _ = x[FnSpinUnlock-94] + _ = x[FnSkFullsock-95] + _ = x[FnTcpSock-96] + _ = x[FnSkbEcnSetCe-97] + _ = x[FnGetListenerSock-98] + _ = x[FnSkcLookupTcp-99] + _ = x[FnTcpCheckSyncookie-100] + _ = x[FnSysctlGetName-101] + _ = x[FnSysctlGetCurrentValue-102] + _ = x[FnSysctlGetNewValue-103] + _ = x[FnSysctlSetNewValue-104] + _ = x[FnStrtol-105] + _ = x[FnStrtoul-106] + _ = x[FnSkStorageGet-107] + _ = x[FnSkStorageDelete-108] + _ = x[FnSendSignal-109] + _ = x[FnTcpGenSyncookie-110] + _ = x[FnSkbOutput-111] + _ = x[FnProbeReadUser-112] + _ = x[FnProbeReadKernel-113] + _ = x[FnProbeReadUserStr-114] + _ = x[FnProbeReadKernelStr-115] + _ = x[FnTcpSendAck-116] + _ = x[FnSendSignalThread-117] + _ = x[FnJiffies64-118] + _ = x[FnReadBranchRecords-119] + _ = x[FnGetNsCurrentPidTgid-120] + _ = x[FnXdpOutput-121] + _ = x[FnGetNetnsCookie-122] + _ = x[FnGetCurrentAncestorCgroupId-123] + _ = x[FnSkAssign-124] + _ = x[FnKtimeGetBootNs-125] + _ = x[FnSeqPrintf-126] + _ = x[FnSeqWrite-127] + _ = x[FnSkCgroupId-128] + _ = x[FnSkAncestorCgroupId-129] + _ = x[FnRingbufOutput-130] + _ = x[FnRingbufReserve-131] + _ = x[FnRingbufSubmit-132] + _ = x[FnRingbufDiscard-133] + _ = x[FnRingbufQuery-134] + _ = x[FnCsumLevel-135] + _ = x[FnSkcToTcp6Sock-136] + _ = x[FnSkcToTcpSock-137] + _ = x[FnSkcToTcpTimewaitSock-138] + _ = x[FnSkcToTcpRequestSock-139] + _ = x[FnSkcToUdp6Sock-140] + _ = x[FnGetTaskStack-141] + _ = x[FnLoadHdrOpt-142] + _ = x[FnStoreHdrOpt-143] + _ = x[FnReserveHdrOpt-144] + _ = x[FnInodeStorageGet-145] + _ = x[FnInodeStorageDelete-146] + _ = x[FnDPath-147] + _ = x[FnCopyFromUser-148] + _ = x[FnSnprintfBtf-149] + _ = x[FnSeqPrintfBtf-150] + _ = x[FnSkbCgroupClassid-151] + _ = x[FnRedirectNeigh-152] + _ = x[FnPerCpuPtr-153] + _ = x[FnThisCpuPtr-154] + _ = x[FnRedirectPeer-155] + _ = x[FnTaskStorageGet-156] + _ = x[FnTaskStorageDelete-157] + _ = x[FnGetCurrentTaskBtf-158] + _ = x[FnBprmOptsSet-159] + _ = x[FnKtimeGetCoarseNs-160] + _ = x[FnImaInodeHash-161] + _ = x[FnSockFromFile-162] + _ = x[FnCheckMtu-163] + _ = x[FnForEachMapElem-164] + _ = x[FnSnprintf-165] + _ = x[FnSysBpf-166] + _ = x[FnBtfFindByNameKind-167] + _ = x[FnSysClose-168] + _ = x[FnTimerInit-169] + _ = x[FnTimerSetCallback-170] + _ = x[FnTimerStart-171] + _ = x[FnTimerCancel-172] + _ = x[FnGetFuncIp-173] + _ = x[FnGetAttachCookie-174] + _ = x[FnTaskPtRegs-175] + _ = x[FnGetBranchSnapshot-176] + _ = x[FnTraceVprintk-177] + _ = x[FnSkcToUnixSock-178] + _ = x[FnKallsymsLookupName-179] + _ = x[FnFindVma-180] + _ = x[FnLoop-181] + _ = x[FnStrncmp-182] + _ = x[FnGetFuncArg-183] + _ = x[FnGetFuncRet-184] + _ = x[FnGetFuncArgCnt-185] + _ = x[FnGetRetval-186] + _ = x[FnSetRetval-187] + _ = x[FnXdpGetBuffLen-188] + _ = x[FnXdpLoadBytes-189] + _ = x[FnXdpStoreBytes-190] + _ = x[FnCopyFromUserTask-191] + _ = x[FnSkbSetTstamp-192] + _ = x[FnImaFileHash-193] + _ = x[FnKptrXchg-194] + _ = x[FnMapLookupPercpuElem-195] + _ = x[FnSkcToMptcpSock-196] + _ = x[FnDynptrFromMem-197] + _ = x[FnRingbufReserveDynptr-198] + _ = x[FnRingbufSubmitDynptr-199] + _ = x[FnRingbufDiscardDynptr-200] + _ = x[FnDynptrRead-201] + _ = x[FnDynptrWrite-202] + _ = x[FnDynptrData-203] + _ = x[FnTcpRawGenSyncookieIpv4-204] + _ = x[FnTcpRawGenSyncookieIpv6-205] + _ = x[FnTcpRawCheckSyncookieIpv4-206] + _ = x[FnTcpRawCheckSyncookieIpv6-207] + _ = x[FnKtimeGetTaiNs-208] + _ = x[FnUserRingbufDrain-209] + _ = x[FnCgrpStorageGet-210] + _ = x[FnCgrpStorageDelete-211] +} + +const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookieFnSkbOutputFnProbeReadUserFnProbeReadKernelFnProbeReadUserStrFnProbeReadKernelStrFnTcpSendAckFnSendSignalThreadFnJiffies64FnReadBranchRecordsFnGetNsCurrentPidTgidFnXdpOutputFnGetNetnsCookieFnGetCurrentAncestorCgroupIdFnSkAssignFnKtimeGetBootNsFnSeqPrintfFnSeqWriteFnSkCgroupIdFnSkAncestorCgroupIdFnRingbufOutputFnRingbufReserveFnRingbufSubmitFnRingbufDiscardFnRingbufQueryFnCsumLevelFnSkcToTcp6SockFnSkcToTcpSockFnSkcToTcpTimewaitSockFnSkcToTcpRequestSockFnSkcToUdp6SockFnGetTaskStackFnLoadHdrOptFnStoreHdrOptFnReserveHdrOptFnInodeStorageGetFnInodeStorageDeleteFnDPathFnCopyFromUserFnSnprintfBtfFnSeqPrintfBtfFnSkbCgroupClassidFnRedirectNeighFnPerCpuPtrFnThisCpuPtrFnRedirectPeerFnTaskStorageGetFnTaskStorageDeleteFnGetCurrentTaskBtfFnBprmOptsSetFnKtimeGetCoarseNsFnImaInodeHashFnSockFromFileFnCheckMtuFnForEachMapElemFnSnprintfFnSysBpfFnBtfFindByNameKindFnSysCloseFnTimerInitFnTimerSetCallbackFnTimerStartFnTimerCancelFnGetFuncIpFnGetAttachCookieFnTaskPtRegsFnGetBranchSnapshotFnTraceVprintkFnSkcToUnixSockFnKallsymsLookupNameFnFindVmaFnLoopFnStrncmpFnGetFuncArgFnGetFuncRetFnGetFuncArgCntFnGetRetvalFnSetRetvalFnXdpGetBuffLenFnXdpLoadBytesFnXdpStoreBytesFnCopyFromUserTaskFnSkbSetTstampFnImaFileHashFnKptrXchgFnMapLookupPercpuElemFnSkcToMptcpSockFnDynptrFromMemFnRingbufReserveDynptrFnRingbufSubmitDynptrFnRingbufDiscardDynptrFnDynptrReadFnDynptrWriteFnDynptrDataFnTcpRawGenSyncookieIpv4FnTcpRawGenSyncookieIpv6FnTcpRawCheckSyncookieIpv4FnTcpRawCheckSyncookieIpv6FnKtimeGetTaiNsFnUserRingbufDrainFnCgrpStorageGetFnCgrpStorageDelete" + +var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632, 1643, 1658, 1675, 1693, 1713, 1725, 1743, 1754, 1773, 1794, 1805, 1821, 1849, 1859, 1875, 1886, 1896, 1908, 1928, 1943, 1959, 1974, 1990, 2004, 2015, 2030, 2044, 2066, 2087, 2102, 2116, 2128, 2141, 2156, 2173, 2193, 2200, 2214, 2227, 2241, 2259, 2274, 2285, 2297, 2311, 2327, 2346, 2365, 2378, 2396, 2410, 2424, 2434, 2450, 2460, 2468, 2487, 2497, 2508, 2526, 2538, 2551, 2562, 2579, 2591, 2610, 2624, 2639, 2659, 2668, 2674, 2683, 2695, 2707, 2722, 2733, 2744, 2759, 2773, 2788, 2806, 2820, 2833, 2843, 2864, 2880, 2895, 2917, 2938, 2960, 2972, 2985, 2997, 3021, 3045, 3071, 3097, 3112, 3130, 3146, 3165} + +func (i BuiltinFunc) String() string { + if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) { + return "BuiltinFunc(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _BuiltinFunc_name[_BuiltinFunc_index[i]:_BuiltinFunc_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/asm/instruction.go b/vendor/github.com/cilium/ebpf/asm/instruction.go new file mode 100644 index 0000000000..86b384c02a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/instruction.go @@ -0,0 +1,953 @@ +package asm + +import ( + "crypto/sha1" + "encoding/binary" + "encoding/hex" + "errors" + "fmt" + "io" + "math" + "sort" + "strings" + + "github.com/cilium/ebpf/internal/sys" +) + +// InstructionSize is the size of a BPF instruction in bytes +const InstructionSize = 8 + +// RawInstructionOffset is an offset in units of raw BPF instructions. +type RawInstructionOffset uint64 + +var ErrUnreferencedSymbol = errors.New("unreferenced symbol") +var ErrUnsatisfiedMapReference = errors.New("unsatisfied map reference") +var ErrUnsatisfiedProgramReference = errors.New("unsatisfied program reference") + +// Bytes returns the offset of an instruction in bytes. +func (rio RawInstructionOffset) Bytes() uint64 { + return uint64(rio) * InstructionSize +} + +// Instruction is a single eBPF instruction. +type Instruction struct { + OpCode OpCode + Dst Register + Src Register + Offset int16 + Constant int64 + + // Metadata contains optional metadata about this instruction. + Metadata Metadata +} + +// Unmarshal decodes a BPF instruction. +func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, error) { + data := make([]byte, InstructionSize) + if _, err := io.ReadFull(r, data); err != nil { + return 0, err + } + + ins.OpCode = OpCode(data[0]) + + regs := data[1] + switch bo { + case binary.LittleEndian: + ins.Dst, ins.Src = Register(regs&0xF), Register(regs>>4) + case binary.BigEndian: + ins.Dst, ins.Src = Register(regs>>4), Register(regs&0xf) + } + + ins.Offset = int16(bo.Uint16(data[2:4])) + + if ins.OpCode.Class().IsALU() { + switch ins.OpCode.ALUOp() { + case Div: + if ins.Offset == 1 { + ins.OpCode = ins.OpCode.SetALUOp(SDiv) + ins.Offset = 0 + } + case Mod: + if ins.Offset == 1 { + ins.OpCode = ins.OpCode.SetALUOp(SMod) + ins.Offset = 0 + } + case Mov: + switch ins.Offset { + case 8: + ins.OpCode = ins.OpCode.SetALUOp(MovSX8) + ins.Offset = 0 + case 16: + ins.OpCode = ins.OpCode.SetALUOp(MovSX16) + ins.Offset = 0 + case 32: + ins.OpCode = ins.OpCode.SetALUOp(MovSX32) + ins.Offset = 0 + } + } + } + + // Convert to int32 before widening to int64 + // to ensure the signed bit is carried over. + ins.Constant = int64(int32(bo.Uint32(data[4:8]))) + + if !ins.OpCode.IsDWordLoad() { + return InstructionSize, nil + } + + // Pull another instruction from the stream to retrieve the second + // half of the 64-bit immediate value. + if _, err := io.ReadFull(r, data); err != nil { + // No Wrap, to avoid io.EOF clash + return 0, errors.New("64bit immediate is missing second half") + } + + // Require that all fields other than the value are zero. + if bo.Uint32(data[0:4]) != 0 { + return 0, errors.New("64bit immediate has non-zero fields") + } + + cons1 := uint32(ins.Constant) + cons2 := int32(bo.Uint32(data[4:8])) + ins.Constant = int64(cons2)<<32 | int64(cons1) + + return 2 * InstructionSize, nil +} + +// Marshal encodes a BPF instruction. +func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) { + if ins.OpCode == InvalidOpCode { + return 0, errors.New("invalid opcode") + } + + isDWordLoad := ins.OpCode.IsDWordLoad() + + cons := int32(ins.Constant) + if isDWordLoad { + // Encode least significant 32bit first for 64bit operations. + cons = int32(uint32(ins.Constant)) + } + + regs, err := newBPFRegisters(ins.Dst, ins.Src, bo) + if err != nil { + return 0, fmt.Errorf("can't marshal registers: %s", err) + } + + if ins.OpCode.Class().IsALU() { + newOffset := int16(0) + switch ins.OpCode.ALUOp() { + case SDiv: + ins.OpCode = ins.OpCode.SetALUOp(Div) + newOffset = 1 + case SMod: + ins.OpCode = ins.OpCode.SetALUOp(Mod) + newOffset = 1 + case MovSX8: + ins.OpCode = ins.OpCode.SetALUOp(Mov) + newOffset = 8 + case MovSX16: + ins.OpCode = ins.OpCode.SetALUOp(Mov) + newOffset = 16 + case MovSX32: + ins.OpCode = ins.OpCode.SetALUOp(Mov) + newOffset = 32 + } + if newOffset != 0 && ins.Offset != 0 { + return 0, fmt.Errorf("extended ALU opcodes should have an .Offset of 0: %s", ins) + } + ins.Offset = newOffset + } + + op, err := ins.OpCode.bpfOpCode() + if err != nil { + return 0, err + } + + data := make([]byte, InstructionSize) + data[0] = op + data[1] = byte(regs) + bo.PutUint16(data[2:4], uint16(ins.Offset)) + bo.PutUint32(data[4:8], uint32(cons)) + if _, err := w.Write(data); err != nil { + return 0, err + } + + if !isDWordLoad { + return InstructionSize, nil + } + + // The first half of the second part of a double-wide instruction + // must be zero. The second half carries the value. + bo.PutUint32(data[0:4], 0) + bo.PutUint32(data[4:8], uint32(ins.Constant>>32)) + if _, err := w.Write(data); err != nil { + return 0, err + } + + return 2 * InstructionSize, nil +} + +// AssociateMap associates a Map with this Instruction. +// +// Implicitly clears the Instruction's Reference field. +// +// Returns an error if the Instruction is not a map load. +func (ins *Instruction) AssociateMap(m FDer) error { + if !ins.IsLoadFromMap() { + return errors.New("not a load from a map") + } + + ins.Metadata.Set(referenceMeta{}, nil) + ins.Metadata.Set(mapMeta{}, m) + + return nil +} + +// RewriteMapPtr changes an instruction to use a new map fd. +// +// Returns an error if the instruction doesn't load a map. +// +// Deprecated: use AssociateMap instead. If you cannot provide a Map, +// wrap an fd in a type implementing FDer. +func (ins *Instruction) RewriteMapPtr(fd int) error { + if !ins.IsLoadFromMap() { + return errors.New("not a load from a map") + } + + ins.encodeMapFD(fd) + + return nil +} + +func (ins *Instruction) encodeMapFD(fd int) { + // Preserve the offset value for direct map loads. + offset := uint64(ins.Constant) & (math.MaxUint32 << 32) + rawFd := uint64(uint32(fd)) + ins.Constant = int64(offset | rawFd) +} + +// MapPtr returns the map fd for this instruction. +// +// The result is undefined if the instruction is not a load from a map, +// see IsLoadFromMap. +// +// Deprecated: use Map() instead. +func (ins *Instruction) MapPtr() int { + // If there is a map associated with the instruction, return its FD. + if fd := ins.Metadata.Get(mapMeta{}); fd != nil { + return fd.(FDer).FD() + } + + // Fall back to the fd stored in the Constant field + return ins.mapFd() +} + +// mapFd returns the map file descriptor stored in the 32 least significant +// bits of ins' Constant field. +func (ins *Instruction) mapFd() int { + return int(int32(ins.Constant)) +} + +// RewriteMapOffset changes the offset of a direct load from a map. +// +// Returns an error if the instruction is not a direct load. +func (ins *Instruction) RewriteMapOffset(offset uint32) error { + if !ins.OpCode.IsDWordLoad() { + return fmt.Errorf("%s is not a 64 bit load", ins.OpCode) + } + + if ins.Src != PseudoMapValue { + return errors.New("not a direct load from a map") + } + + fd := uint64(ins.Constant) & math.MaxUint32 + ins.Constant = int64(uint64(offset)<<32 | fd) + return nil +} + +func (ins *Instruction) mapOffset() uint32 { + return uint32(uint64(ins.Constant) >> 32) +} + +// IsLoadFromMap returns true if the instruction loads from a map. +// +// This covers both loading the map pointer and direct map value loads. +func (ins *Instruction) IsLoadFromMap() bool { + return ins.OpCode == LoadImmOp(DWord) && (ins.Src == PseudoMapFD || ins.Src == PseudoMapValue) +} + +// IsFunctionCall returns true if the instruction calls another BPF function. +// +// This is not the same thing as a BPF helper call. +func (ins *Instruction) IsFunctionCall() bool { + return ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall +} + +// IsKfuncCall returns true if the instruction calls a kfunc. +// +// This is not the same thing as a BPF helper call. +func (ins *Instruction) IsKfuncCall() bool { + return ins.OpCode.JumpOp() == Call && ins.Src == PseudoKfuncCall +} + +// IsLoadOfFunctionPointer returns true if the instruction loads a function pointer. +func (ins *Instruction) IsLoadOfFunctionPointer() bool { + return ins.OpCode.IsDWordLoad() && ins.Src == PseudoFunc +} + +// IsFunctionReference returns true if the instruction references another BPF +// function, either by invoking a Call jump operation or by loading a function +// pointer. +func (ins *Instruction) IsFunctionReference() bool { + return ins.IsFunctionCall() || ins.IsLoadOfFunctionPointer() +} + +// IsBuiltinCall returns true if the instruction is a built-in call, i.e. BPF helper call. +func (ins *Instruction) IsBuiltinCall() bool { + return ins.OpCode.JumpOp() == Call && ins.Src == R0 && ins.Dst == R0 +} + +// IsConstantLoad returns true if the instruction loads a constant of the +// given size. +func (ins *Instruction) IsConstantLoad(size Size) bool { + return ins.OpCode == LoadImmOp(size) && ins.Src == R0 && ins.Offset == 0 +} + +// Format implements fmt.Formatter. +func (ins Instruction) Format(f fmt.State, c rune) { + if c != 'v' { + fmt.Fprintf(f, "{UNRECOGNIZED: %c}", c) + return + } + + op := ins.OpCode + + if op == InvalidOpCode { + fmt.Fprint(f, "INVALID") + return + } + + // Omit trailing space for Exit + if op.JumpOp() == Exit { + fmt.Fprint(f, op) + return + } + + if ins.IsLoadFromMap() { + fd := ins.mapFd() + m := ins.Map() + switch ins.Src { + case PseudoMapFD: + if m != nil { + fmt.Fprintf(f, "LoadMapPtr dst: %s map: %s", ins.Dst, m) + } else { + fmt.Fprintf(f, "LoadMapPtr dst: %s fd: %d", ins.Dst, fd) + } + + case PseudoMapValue: + if m != nil { + fmt.Fprintf(f, "LoadMapValue dst: %s, map: %s off: %d", ins.Dst, m, ins.mapOffset()) + } else { + fmt.Fprintf(f, "LoadMapValue dst: %s, fd: %d off: %d", ins.Dst, fd, ins.mapOffset()) + } + } + + goto ref + } + + switch cls := op.Class(); { + case cls.isLoadOrStore(): + fmt.Fprintf(f, "%v ", op) + switch op.Mode() { + case ImmMode: + fmt.Fprintf(f, "dst: %s imm: %d", ins.Dst, ins.Constant) + case AbsMode: + fmt.Fprintf(f, "imm: %d", ins.Constant) + case IndMode: + fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant) + case MemMode, MemSXMode: + fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant) + case XAddMode: + fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src) + } + + case cls.IsALU(): + fmt.Fprintf(f, "%v", op) + if op == Swap.Op(ImmSource) { + fmt.Fprintf(f, "%d", ins.Constant) + } + + fmt.Fprintf(f, " dst: %s ", ins.Dst) + switch { + case op.ALUOp() == Swap: + break + case op.Source() == ImmSource: + fmt.Fprintf(f, "imm: %d", ins.Constant) + default: + fmt.Fprintf(f, "src: %s", ins.Src) + } + + case cls.IsJump(): + fmt.Fprintf(f, "%v ", op) + switch jop := op.JumpOp(); jop { + case Call: + switch ins.Src { + case PseudoCall: + // bpf-to-bpf call + fmt.Fprint(f, ins.Constant) + case PseudoKfuncCall: + // kfunc call + fmt.Fprintf(f, "Kfunc(%d)", ins.Constant) + default: + fmt.Fprint(f, BuiltinFunc(ins.Constant)) + } + + case Ja: + if ins.OpCode.Class() == Jump32Class { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "off: %d", ins.Offset) + } + + default: + fmt.Fprintf(f, "dst: %s off: %d ", ins.Dst, ins.Offset) + if op.Source() == ImmSource { + fmt.Fprintf(f, "imm: %d", ins.Constant) + } else { + fmt.Fprintf(f, "src: %s", ins.Src) + } + } + default: + fmt.Fprintf(f, "%v ", op) + } + +ref: + if ins.Reference() != "" { + fmt.Fprintf(f, " <%s>", ins.Reference()) + } +} + +func (ins Instruction) equal(other Instruction) bool { + return ins.OpCode == other.OpCode && + ins.Dst == other.Dst && + ins.Src == other.Src && + ins.Offset == other.Offset && + ins.Constant == other.Constant +} + +// Size returns the amount of bytes ins would occupy in binary form. +func (ins Instruction) Size() uint64 { + return uint64(InstructionSize * ins.OpCode.rawInstructions()) +} + +// WithMetadata sets the given Metadata on the Instruction. e.g. to copy +// Metadata from another Instruction when replacing it. +func (ins Instruction) WithMetadata(meta Metadata) Instruction { + ins.Metadata = meta + return ins +} + +type symbolMeta struct{} + +// WithSymbol marks the Instruction as a Symbol, which other Instructions +// can point to using corresponding calls to WithReference. +func (ins Instruction) WithSymbol(name string) Instruction { + ins.Metadata.Set(symbolMeta{}, name) + return ins +} + +// Sym creates a symbol. +// +// Deprecated: use WithSymbol instead. +func (ins Instruction) Sym(name string) Instruction { + return ins.WithSymbol(name) +} + +// Symbol returns the value ins has been marked with using WithSymbol, +// otherwise returns an empty string. A symbol is often an Instruction +// at the start of a function body. +func (ins Instruction) Symbol() string { + sym, _ := ins.Metadata.Get(symbolMeta{}).(string) + return sym +} + +type referenceMeta struct{} + +// WithReference makes ins reference another Symbol or map by name. +func (ins Instruction) WithReference(ref string) Instruction { + ins.Metadata.Set(referenceMeta{}, ref) + return ins +} + +// Reference returns the Symbol or map name referenced by ins, if any. +func (ins Instruction) Reference() string { + ref, _ := ins.Metadata.Get(referenceMeta{}).(string) + return ref +} + +type mapMeta struct{} + +// Map returns the Map referenced by ins, if any. +// An Instruction will contain a Map if e.g. it references an existing, +// pinned map that was opened during ELF loading. +func (ins Instruction) Map() FDer { + fd, _ := ins.Metadata.Get(mapMeta{}).(FDer) + return fd +} + +type sourceMeta struct{} + +// WithSource adds source information about the Instruction. +func (ins Instruction) WithSource(src fmt.Stringer) Instruction { + ins.Metadata.Set(sourceMeta{}, src) + return ins +} + +// Source returns source information about the Instruction. The field is +// present when the compiler emits BTF line info about the Instruction and +// usually contains the line of source code responsible for it. +func (ins Instruction) Source() fmt.Stringer { + str, _ := ins.Metadata.Get(sourceMeta{}).(fmt.Stringer) + return str +} + +// A Comment can be passed to Instruction.WithSource to add a comment +// to an instruction. +type Comment string + +func (s Comment) String() string { + return string(s) +} + +// FDer represents a resource tied to an underlying file descriptor. +// Used as a stand-in for e.g. ebpf.Map since that type cannot be +// imported here and FD() is the only method we rely on. +type FDer interface { + FD() int +} + +// Instructions is an eBPF program. +type Instructions []Instruction + +// Unmarshal unmarshals an Instructions from a binary instruction stream. +// All instructions in insns are replaced by instructions decoded from r. +func (insns *Instructions) Unmarshal(r io.Reader, bo binary.ByteOrder) error { + if len(*insns) > 0 { + *insns = nil + } + + var offset uint64 + for { + var ins Instruction + n, err := ins.Unmarshal(r, bo) + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return fmt.Errorf("offset %d: %w", offset, err) + } + + *insns = append(*insns, ins) + offset += n + } + + return nil +} + +// Name returns the name of the function insns belongs to, if any. +func (insns Instructions) Name() string { + if len(insns) == 0 { + return "" + } + return insns[0].Symbol() +} + +func (insns Instructions) String() string { + return fmt.Sprint(insns) +} + +// Size returns the amount of bytes insns would occupy in binary form. +func (insns Instructions) Size() uint64 { + var sum uint64 + for _, ins := range insns { + sum += ins.Size() + } + return sum +} + +// AssociateMap updates all Instructions that Reference the given symbol +// to point to an existing Map m instead. +// +// Returns ErrUnreferencedSymbol error if no references to symbol are found +// in insns. If symbol is anything else than the symbol name of map (e.g. +// a bpf2bpf subprogram), an error is returned. +func (insns Instructions) AssociateMap(symbol string, m FDer) error { + if symbol == "" { + return errors.New("empty symbol") + } + + var found bool + for i := range insns { + ins := &insns[i] + if ins.Reference() != symbol { + continue + } + + if err := ins.AssociateMap(m); err != nil { + return err + } + + found = true + } + + if !found { + return fmt.Errorf("symbol %s: %w", symbol, ErrUnreferencedSymbol) + } + + return nil +} + +// RewriteMapPtr rewrites all loads of a specific map pointer to a new fd. +// +// Returns ErrUnreferencedSymbol if the symbol isn't used. +// +// Deprecated: use AssociateMap instead. +func (insns Instructions) RewriteMapPtr(symbol string, fd int) error { + if symbol == "" { + return errors.New("empty symbol") + } + + var found bool + for i := range insns { + ins := &insns[i] + if ins.Reference() != symbol { + continue + } + + if !ins.IsLoadFromMap() { + return errors.New("not a load from a map") + } + + ins.encodeMapFD(fd) + + found = true + } + + if !found { + return fmt.Errorf("symbol %s: %w", symbol, ErrUnreferencedSymbol) + } + + return nil +} + +// SymbolOffsets returns the set of symbols and their offset in +// the instructions. +func (insns Instructions) SymbolOffsets() (map[string]int, error) { + offsets := make(map[string]int) + + for i, ins := range insns { + if ins.Symbol() == "" { + continue + } + + if _, ok := offsets[ins.Symbol()]; ok { + return nil, fmt.Errorf("duplicate symbol %s", ins.Symbol()) + } + + offsets[ins.Symbol()] = i + } + + return offsets, nil +} + +// FunctionReferences returns a set of symbol names these Instructions make +// bpf-to-bpf calls to. +func (insns Instructions) FunctionReferences() []string { + calls := make(map[string]struct{}) + for _, ins := range insns { + if ins.Constant != -1 { + // BPF-to-BPF calls have -1 constants. + continue + } + + if ins.Reference() == "" { + continue + } + + if !ins.IsFunctionReference() { + continue + } + + calls[ins.Reference()] = struct{}{} + } + + result := make([]string, 0, len(calls)) + for call := range calls { + result = append(result, call) + } + + sort.Strings(result) + return result +} + +// ReferenceOffsets returns the set of references and their offset in +// the instructions. +func (insns Instructions) ReferenceOffsets() map[string][]int { + offsets := make(map[string][]int) + + for i, ins := range insns { + if ins.Reference() == "" { + continue + } + + offsets[ins.Reference()] = append(offsets[ins.Reference()], i) + } + + return offsets +} + +// Format implements fmt.Formatter. +// +// You can control indentation of symbols by +// specifying a width. Setting a precision controls the indentation of +// instructions. +// The default character is a tab, which can be overridden by specifying +// the ' ' space flag. +func (insns Instructions) Format(f fmt.State, c rune) { + if c != 's' && c != 'v' { + fmt.Fprintf(f, "{UNKNOWN FORMAT '%c'}", c) + return + } + + // Precision is better in this case, because it allows + // specifying 0 padding easily. + padding, ok := f.Precision() + if !ok { + padding = 1 + } + + indent := strings.Repeat("\t", padding) + if f.Flag(' ') { + indent = strings.Repeat(" ", padding) + } + + symPadding, ok := f.Width() + if !ok { + symPadding = padding - 1 + } + if symPadding < 0 { + symPadding = 0 + } + + symIndent := strings.Repeat("\t", symPadding) + if f.Flag(' ') { + symIndent = strings.Repeat(" ", symPadding) + } + + // Guess how many digits we need at most, by assuming that all instructions + // are double wide. + highestOffset := len(insns) * 2 + offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset)))) + + iter := insns.Iterate() + for iter.Next() { + if iter.Ins.Symbol() != "" { + fmt.Fprintf(f, "%s%s:\n", symIndent, iter.Ins.Symbol()) + } + if src := iter.Ins.Source(); src != nil { + line := strings.TrimSpace(src.String()) + if line != "" { + fmt.Fprintf(f, "%s%*s; %s\n", indent, offsetWidth, " ", line) + } + } + fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, iter.Offset, iter.Ins) + } +} + +// Marshal encodes a BPF program into the kernel format. +// +// insns may be modified if there are unresolved jumps or bpf2bpf calls. +// +// Returns ErrUnsatisfiedProgramReference if there is a Reference Instruction +// without a matching Symbol Instruction within insns. +func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error { + if err := insns.encodeFunctionReferences(); err != nil { + return err + } + + if err := insns.encodeMapPointers(); err != nil { + return err + } + + for i, ins := range insns { + if _, err := ins.Marshal(w, bo); err != nil { + return fmt.Errorf("instruction %d: %w", i, err) + } + } + return nil +} + +// Tag calculates the kernel tag for a series of instructions. +// +// It mirrors bpf_prog_calc_tag in the kernel and so can be compared +// to ProgramInfo.Tag to figure out whether a loaded program matches +// certain instructions. +func (insns Instructions) Tag(bo binary.ByteOrder) (string, error) { + h := sha1.New() + for i, ins := range insns { + if ins.IsLoadFromMap() { + ins.Constant = 0 + } + _, err := ins.Marshal(h, bo) + if err != nil { + return "", fmt.Errorf("instruction %d: %w", i, err) + } + } + return hex.EncodeToString(h.Sum(nil)[:sys.BPF_TAG_SIZE]), nil +} + +// encodeFunctionReferences populates the Offset (or Constant, depending on +// the instruction type) field of instructions with a Reference field to point +// to the offset of the corresponding instruction with a matching Symbol field. +// +// Only Reference Instructions that are either jumps or BPF function references +// (calls or function pointer loads) are populated. +// +// Returns ErrUnsatisfiedProgramReference if there is a Reference Instruction +// without at least one corresponding Symbol Instruction within insns. +func (insns Instructions) encodeFunctionReferences() error { + // Index the offsets of instructions tagged as a symbol. + symbolOffsets := make(map[string]RawInstructionOffset) + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + + if ins.Symbol() == "" { + continue + } + + if _, ok := symbolOffsets[ins.Symbol()]; ok { + return fmt.Errorf("duplicate symbol %s", ins.Symbol()) + } + + symbolOffsets[ins.Symbol()] = iter.Offset + } + + // Find all instructions tagged as references to other symbols. + // Depending on the instruction type, populate their constant or offset + // fields to point to the symbol they refer to within the insn stream. + iter = insns.Iterate() + for iter.Next() { + i := iter.Index + offset := iter.Offset + ins := iter.Ins + + if ins.Reference() == "" { + continue + } + + switch { + case ins.IsFunctionReference() && ins.Constant == -1, + ins.OpCode == Ja.opCode(Jump32Class, ImmSource) && ins.Constant == -1: + symOffset, ok := symbolOffsets[ins.Reference()] + if !ok { + return fmt.Errorf("%s at insn %d: symbol %q: %w", ins.OpCode, i, ins.Reference(), ErrUnsatisfiedProgramReference) + } + + ins.Constant = int64(symOffset - offset - 1) + + case ins.OpCode.Class().IsJump() && ins.Offset == -1: + symOffset, ok := symbolOffsets[ins.Reference()] + if !ok { + return fmt.Errorf("%s at insn %d: symbol %q: %w", ins.OpCode, i, ins.Reference(), ErrUnsatisfiedProgramReference) + } + + ins.Offset = int16(symOffset - offset - 1) + } + } + + return nil +} + +// encodeMapPointers finds all Map Instructions and encodes their FDs +// into their Constant fields. +func (insns Instructions) encodeMapPointers() error { + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + + if !ins.IsLoadFromMap() { + continue + } + + m := ins.Map() + if m == nil { + continue + } + + fd := m.FD() + if fd < 0 { + return fmt.Errorf("map %s: %w", m, sys.ErrClosedFd) + } + + ins.encodeMapFD(m.FD()) + } + + return nil +} + +// Iterate allows iterating a BPF program while keeping track of +// various offsets. +// +// Modifying the instruction slice will lead to undefined behaviour. +func (insns Instructions) Iterate() *InstructionIterator { + return &InstructionIterator{insns: insns} +} + +// InstructionIterator iterates over a BPF program. +type InstructionIterator struct { + insns Instructions + // The instruction in question. + Ins *Instruction + // The index of the instruction in the original instruction slice. + Index int + // The offset of the instruction in raw BPF instructions. This accounts + // for double-wide instructions. + Offset RawInstructionOffset +} + +// Next returns true as long as there are any instructions remaining. +func (iter *InstructionIterator) Next() bool { + if len(iter.insns) == 0 { + return false + } + + if iter.Ins != nil { + iter.Index++ + iter.Offset += RawInstructionOffset(iter.Ins.OpCode.rawInstructions()) + } + iter.Ins = &iter.insns[0] + iter.insns = iter.insns[1:] + return true +} + +type bpfRegisters uint8 + +func newBPFRegisters(dst, src Register, bo binary.ByteOrder) (bpfRegisters, error) { + switch bo { + case binary.LittleEndian: + return bpfRegisters((src << 4) | (dst & 0xF)), nil + case binary.BigEndian: + return bpfRegisters((dst << 4) | (src & 0xF)), nil + default: + return 0, fmt.Errorf("unrecognized ByteOrder %T", bo) + } +} + +// IsUnreferencedSymbol returns true if err was caused by +// an unreferenced symbol. +// +// Deprecated: use errors.Is(err, asm.ErrUnreferencedSymbol). +func IsUnreferencedSymbol(err error) bool { + return errors.Is(err, ErrUnreferencedSymbol) +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump.go b/vendor/github.com/cilium/ebpf/asm/jump.go new file mode 100644 index 0000000000..2738d736b2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump.go @@ -0,0 +1,135 @@ +package asm + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -output jump_string.go -type=JumpOp + +// JumpOp affect control flow. +// +// msb lsb +// +----+-+---+ +// |OP |s|cls| +// +----+-+---+ +type JumpOp uint8 + +const jumpMask OpCode = 0xf0 + +const ( + // InvalidJumpOp is returned by getters when invoked + // on non branch OpCodes + InvalidJumpOp JumpOp = 0xff + // Ja jumps by offset unconditionally + Ja JumpOp = 0x00 + // JEq jumps by offset if r == imm + JEq JumpOp = 0x10 + // JGT jumps by offset if r > imm + JGT JumpOp = 0x20 + // JGE jumps by offset if r >= imm + JGE JumpOp = 0x30 + // JSet jumps by offset if r & imm + JSet JumpOp = 0x40 + // JNE jumps by offset if r != imm + JNE JumpOp = 0x50 + // JSGT jumps by offset if signed r > signed imm + JSGT JumpOp = 0x60 + // JSGE jumps by offset if signed r >= signed imm + JSGE JumpOp = 0x70 + // Call builtin or user defined function from imm + Call JumpOp = 0x80 + // Exit ends execution, with value in r0 + Exit JumpOp = 0x90 + // JLT jumps by offset if r < imm + JLT JumpOp = 0xa0 + // JLE jumps by offset if r <= imm + JLE JumpOp = 0xb0 + // JSLT jumps by offset if signed r < signed imm + JSLT JumpOp = 0xc0 + // JSLE jumps by offset if signed r <= signed imm + JSLE JumpOp = 0xd0 +) + +// Return emits an exit instruction. +// +// Requires a return value in R0. +func Return() Instruction { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Exit), + } +} + +// Op returns the OpCode for a given jump source. +func (op JumpOp) Op(source Source) OpCode { + return OpCode(JumpClass).SetJumpOp(op).SetSource(source) +} + +// Imm compares 64 bit dst to 64 bit value (sign extended), and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Imm(dst Register, value int32, label string) Instruction { + return Instruction{ + OpCode: op.opCode(JumpClass, ImmSource), + Dst: dst, + Offset: -1, + Constant: int64(value), + }.WithReference(label) +} + +// Imm32 compares 32 bit dst to 32 bit value, and adjusts PC by offset if the condition is fulfilled. +// Requires kernel 5.1. +func (op JumpOp) Imm32(dst Register, value int32, label string) Instruction { + return Instruction{ + OpCode: op.opCode(Jump32Class, ImmSource), + Dst: dst, + Offset: -1, + Constant: int64(value), + }.WithReference(label) +} + +// Reg compares 64 bit dst to 64 bit src, and adjusts PC by offset if the condition is fulfilled. +func (op JumpOp) Reg(dst, src Register, label string) Instruction { + return Instruction{ + OpCode: op.opCode(JumpClass, RegSource), + Dst: dst, + Src: src, + Offset: -1, + }.WithReference(label) +} + +// Reg32 compares 32 bit dst to 32 bit src, and adjusts PC by offset if the condition is fulfilled. +// Requires kernel 5.1. +func (op JumpOp) Reg32(dst, src Register, label string) Instruction { + return Instruction{ + OpCode: op.opCode(Jump32Class, RegSource), + Dst: dst, + Src: src, + Offset: -1, + }.WithReference(label) +} + +func (op JumpOp) opCode(class Class, source Source) OpCode { + if op == Exit || op == Call { + return InvalidOpCode + } + + return OpCode(class).SetJumpOp(op).SetSource(source) +} + +// LongJump returns a jump always instruction with a range of [-2^31, 2^31 - 1]. +func LongJump(label string) Instruction { + return Instruction{ + OpCode: Ja.opCode(Jump32Class, ImmSource), + Constant: -1, + }.WithReference(label) +} + +// Label adjusts PC to the address of the label. +func (op JumpOp) Label(label string) Instruction { + if op == Call { + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(Call), + Src: PseudoCall, + Constant: -1, + }.WithReference(label) + } + + return Instruction{ + OpCode: OpCode(JumpClass).SetJumpOp(op), + Offset: -1, + }.WithReference(label) +} diff --git a/vendor/github.com/cilium/ebpf/asm/jump_string.go b/vendor/github.com/cilium/ebpf/asm/jump_string.go new file mode 100644 index 0000000000..85a4aaffa5 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/jump_string.go @@ -0,0 +1,53 @@ +// Code generated by "stringer -output jump_string.go -type=JumpOp"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidJumpOp-255] + _ = x[Ja-0] + _ = x[JEq-16] + _ = x[JGT-32] + _ = x[JGE-48] + _ = x[JSet-64] + _ = x[JNE-80] + _ = x[JSGT-96] + _ = x[JSGE-112] + _ = x[Call-128] + _ = x[Exit-144] + _ = x[JLT-160] + _ = x[JLE-176] + _ = x[JSLT-192] + _ = x[JSLE-208] +} + +const _JumpOp_name = "JaJEqJGTJGEJSetJNEJSGTJSGECallExitJLTJLEJSLTJSLEInvalidJumpOp" + +var _JumpOp_map = map[JumpOp]string{ + 0: _JumpOp_name[0:2], + 16: _JumpOp_name[2:5], + 32: _JumpOp_name[5:8], + 48: _JumpOp_name[8:11], + 64: _JumpOp_name[11:15], + 80: _JumpOp_name[15:18], + 96: _JumpOp_name[18:22], + 112: _JumpOp_name[22:26], + 128: _JumpOp_name[26:30], + 144: _JumpOp_name[30:34], + 160: _JumpOp_name[34:37], + 176: _JumpOp_name[37:40], + 192: _JumpOp_name[40:44], + 208: _JumpOp_name[44:48], + 255: _JumpOp_name[48:61], +} + +func (i JumpOp) String() string { + if str, ok := _JumpOp_map[i]; ok { + return str + } + return "JumpOp(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store.go b/vendor/github.com/cilium/ebpf/asm/load_store.go new file mode 100644 index 0000000000..cdb5c5cfa4 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store.go @@ -0,0 +1,225 @@ +package asm + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -output load_store_string.go -type=Mode,Size + +// Mode for load and store operations +// +// msb lsb +// +---+--+---+ +// |MDE|sz|cls| +// +---+--+---+ +type Mode uint8 + +const modeMask OpCode = 0xe0 + +const ( + // InvalidMode is returned by getters when invoked + // on non load / store OpCodes + InvalidMode Mode = 0xff + // ImmMode - immediate value + ImmMode Mode = 0x00 + // AbsMode - immediate value + offset + AbsMode Mode = 0x20 + // IndMode - indirect (imm+src) + IndMode Mode = 0x40 + // MemMode - load from memory + MemMode Mode = 0x60 + // MemSXMode - load from memory, sign extension + MemSXMode Mode = 0x80 + // XAddMode - add atomically across processors. + XAddMode Mode = 0xc0 +) + +// Size of load and store operations +// +// msb lsb +// +---+--+---+ +// |mde|SZ|cls| +// +---+--+---+ +type Size uint8 + +const sizeMask OpCode = 0x18 + +const ( + // InvalidSize is returned by getters when invoked + // on non load / store OpCodes + InvalidSize Size = 0xff + // DWord - double word; 64 bits + DWord Size = 0x18 + // Word - word; 32 bits + Word Size = 0x00 + // Half - half-word; 16 bits + Half Size = 0x08 + // Byte - byte; 8 bits + Byte Size = 0x10 +) + +// Sizeof returns the size in bytes. +func (s Size) Sizeof() int { + switch s { + case DWord: + return 8 + case Word: + return 4 + case Half: + return 2 + case Byte: + return 1 + default: + return -1 + } +} + +// LoadMemOp returns the OpCode to load a value of given size from memory. +func LoadMemOp(size Size) OpCode { + return OpCode(LdXClass).SetMode(MemMode).SetSize(size) +} + +// LoadMemSXOp returns the OpCode to load a value of given size from memory sign extended. +func LoadMemSXOp(size Size) OpCode { + return OpCode(LdXClass).SetMode(MemSXMode).SetSize(size) +} + +// LoadMem emits `dst = *(size *)(src + offset)`. +func LoadMem(dst, src Register, offset int16, size Size) Instruction { + return Instruction{ + OpCode: LoadMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// LoadMemSX emits `dst = *(size *)(src + offset)` but sign extends dst. +func LoadMemSX(dst, src Register, offset int16, size Size) Instruction { + if size == DWord { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: LoadMemSXOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// LoadImmOp returns the OpCode to load an immediate of given size. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImmOp(size Size) OpCode { + return OpCode(LdClass).SetMode(ImmMode).SetSize(size) +} + +// LoadImm emits `dst = (size)value`. +// +// As of kernel 4.20, only DWord size is accepted. +func LoadImm(dst Register, value int64, size Size) Instruction { + return Instruction{ + OpCode: LoadImmOp(size), + Dst: dst, + Constant: value, + } +} + +// LoadMapPtr stores a pointer to a map in dst. +func LoadMapPtr(dst Register, fd int) Instruction { + if fd < 0 { + return Instruction{OpCode: InvalidOpCode} + } + + return Instruction{ + OpCode: LoadImmOp(DWord), + Dst: dst, + Src: PseudoMapFD, + Constant: int64(uint32(fd)), + } +} + +// LoadMapValue stores a pointer to the value at a certain offset of a map. +func LoadMapValue(dst Register, fd int, offset uint32) Instruction { + if fd < 0 { + return Instruction{OpCode: InvalidOpCode} + } + + fdAndOffset := (uint64(offset) << 32) | uint64(uint32(fd)) + return Instruction{ + OpCode: LoadImmOp(DWord), + Dst: dst, + Src: PseudoMapValue, + Constant: int64(fdAndOffset), + } +} + +// LoadIndOp returns the OpCode for loading a value of given size from an sk_buff. +func LoadIndOp(size Size) OpCode { + return OpCode(LdClass).SetMode(IndMode).SetSize(size) +} + +// LoadInd emits `dst = ntoh(*(size *)(((sk_buff *)R6)->data + src + offset))`. +func LoadInd(dst, src Register, offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadIndOp(size), + Dst: dst, + Src: src, + Constant: int64(offset), + } +} + +// LoadAbsOp returns the OpCode for loading a value of given size from an sk_buff. +func LoadAbsOp(size Size) OpCode { + return OpCode(LdClass).SetMode(AbsMode).SetSize(size) +} + +// LoadAbs emits `r0 = ntoh(*(size *)(((sk_buff *)R6)->data + offset))`. +func LoadAbs(offset int32, size Size) Instruction { + return Instruction{ + OpCode: LoadAbsOp(size), + Dst: R0, + Constant: int64(offset), + } +} + +// StoreMemOp returns the OpCode for storing a register of given size in memory. +func StoreMemOp(size Size) OpCode { + return OpCode(StXClass).SetMode(MemMode).SetSize(size) +} + +// StoreMem emits `*(size *)(dst + offset) = src` +func StoreMem(dst Register, offset int16, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreMemOp(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// StoreImmOp returns the OpCode for storing an immediate of given size in memory. +func StoreImmOp(size Size) OpCode { + return OpCode(StClass).SetMode(MemMode).SetSize(size) +} + +// StoreImm emits `*(size *)(dst + offset) = value`. +func StoreImm(dst Register, offset int16, value int64, size Size) Instruction { + return Instruction{ + OpCode: StoreImmOp(size), + Dst: dst, + Offset: offset, + Constant: value, + } +} + +// StoreXAddOp returns the OpCode to atomically add a register to a value in memory. +func StoreXAddOp(size Size) OpCode { + return OpCode(StXClass).SetMode(XAddMode).SetSize(size) +} + +// StoreXAdd atomically adds src to *dst. +func StoreXAdd(dst, src Register, size Size) Instruction { + return Instruction{ + OpCode: StoreXAddOp(size), + Dst: dst, + Src: src, + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/load_store_string.go b/vendor/github.com/cilium/ebpf/asm/load_store_string.go new file mode 100644 index 0000000000..c48080327c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/load_store_string.go @@ -0,0 +1,84 @@ +// Code generated by "stringer -output load_store_string.go -type=Mode,Size"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidMode-255] + _ = x[ImmMode-0] + _ = x[AbsMode-32] + _ = x[IndMode-64] + _ = x[MemMode-96] + _ = x[MemSXMode-128] + _ = x[XAddMode-192] +} + +const ( + _Mode_name_0 = "ImmMode" + _Mode_name_1 = "AbsMode" + _Mode_name_2 = "IndMode" + _Mode_name_3 = "MemMode" + _Mode_name_4 = "MemSXMode" + _Mode_name_5 = "XAddMode" + _Mode_name_6 = "InvalidMode" +) + +func (i Mode) String() string { + switch { + case i == 0: + return _Mode_name_0 + case i == 32: + return _Mode_name_1 + case i == 64: + return _Mode_name_2 + case i == 96: + return _Mode_name_3 + case i == 128: + return _Mode_name_4 + case i == 192: + return _Mode_name_5 + case i == 255: + return _Mode_name_6 + default: + return "Mode(" + strconv.FormatInt(int64(i), 10) + ")" + } +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[InvalidSize-255] + _ = x[DWord-24] + _ = x[Word-0] + _ = x[Half-8] + _ = x[Byte-16] +} + +const ( + _Size_name_0 = "Word" + _Size_name_1 = "Half" + _Size_name_2 = "Byte" + _Size_name_3 = "DWord" + _Size_name_4 = "InvalidSize" +) + +func (i Size) String() string { + switch { + case i == 0: + return _Size_name_0 + case i == 8: + return _Size_name_1 + case i == 16: + return _Size_name_2 + case i == 24: + return _Size_name_3 + case i == 255: + return _Size_name_4 + default: + return "Size(" + strconv.FormatInt(int64(i), 10) + ")" + } +} diff --git a/vendor/github.com/cilium/ebpf/asm/metadata.go b/vendor/github.com/cilium/ebpf/asm/metadata.go new file mode 100644 index 0000000000..dd368a9360 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/metadata.go @@ -0,0 +1,80 @@ +package asm + +// Metadata contains metadata about an instruction. +type Metadata struct { + head *metaElement +} + +type metaElement struct { + next *metaElement + key, value interface{} +} + +// Find the element containing key. +// +// Returns nil if there is no such element. +func (m *Metadata) find(key interface{}) *metaElement { + for e := m.head; e != nil; e = e.next { + if e.key == key { + return e + } + } + return nil +} + +// Remove an element from the linked list. +// +// Copies as many elements of the list as necessary to remove r, but doesn't +// perform a full copy. +func (m *Metadata) remove(r *metaElement) { + current := &m.head + for e := m.head; e != nil; e = e.next { + if e == r { + // We've found the element we want to remove. + *current = e.next + + // No need to copy the tail. + return + } + + // There is another element in front of the one we want to remove. + // We have to copy it to be able to change metaElement.next. + cpy := &metaElement{key: e.key, value: e.value} + *current = cpy + current = &cpy.next + } +} + +// Set a key to a value. +// +// If value is nil, the key is removed. Avoids modifying old metadata by +// copying if necessary. +func (m *Metadata) Set(key, value interface{}) { + if e := m.find(key); e != nil { + if e.value == value { + // Key is present and the value is the same. Nothing to do. + return + } + + // Key is present with a different value. Create a copy of the list + // which doesn't have the element in it. + m.remove(e) + } + + // m.head is now a linked list that doesn't contain key. + if value == nil { + return + } + + m.head = &metaElement{key: key, value: value, next: m.head} +} + +// Get the value of a key. +// +// Returns nil if no value with the given key is present. +func (m *Metadata) Get(key interface{}) interface{} { + if e := m.find(key); e != nil { + return e.value + } + return nil +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode.go b/vendor/github.com/cilium/ebpf/asm/opcode.go new file mode 100644 index 0000000000..1dfd0b171a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode.go @@ -0,0 +1,303 @@ +package asm + +import ( + "fmt" + "strings" +) + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -output opcode_string.go -type=Class + +// Class of operations +// +// msb lsb +// +---+--+---+ +// | ?? |CLS| +// +---+--+---+ +type Class uint8 + +const classMask OpCode = 0x07 + +const ( + // LdClass loads immediate values into registers. + // Also used for non-standard load operations from cBPF. + LdClass Class = 0x00 + // LdXClass loads memory into registers. + LdXClass Class = 0x01 + // StClass stores immediate values to memory. + StClass Class = 0x02 + // StXClass stores registers to memory. + StXClass Class = 0x03 + // ALUClass describes arithmetic operators. + ALUClass Class = 0x04 + // JumpClass describes jump operators. + JumpClass Class = 0x05 + // Jump32Class describes jump operators with 32-bit comparisons. + // Requires kernel 5.1. + Jump32Class Class = 0x06 + // ALU64Class describes arithmetic operators in 64-bit mode. + ALU64Class Class = 0x07 +) + +// IsLoad checks if this is either LdClass or LdXClass. +func (cls Class) IsLoad() bool { + return cls == LdClass || cls == LdXClass +} + +// IsStore checks if this is either StClass or StXClass. +func (cls Class) IsStore() bool { + return cls == StClass || cls == StXClass +} + +func (cls Class) isLoadOrStore() bool { + return cls.IsLoad() || cls.IsStore() +} + +// IsALU checks if this is either ALUClass or ALU64Class. +func (cls Class) IsALU() bool { + return cls == ALUClass || cls == ALU64Class +} + +// IsJump checks if this is either JumpClass or Jump32Class. +func (cls Class) IsJump() bool { + return cls == JumpClass || cls == Jump32Class +} + +func (cls Class) isJumpOrALU() bool { + return cls.IsJump() || cls.IsALU() +} + +// OpCode represents a single operation. +// It is not a 1:1 mapping to real eBPF opcodes. +// +// The encoding varies based on a 3-bit Class: +// +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// ??? | CLS +// +// For ALUClass and ALUCLass32: +// +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// OPC |S| CLS +// +// For LdClass, LdXclass, StClass and StXClass: +// +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// 0 | MDE |SIZ| CLS +// +// For JumpClass, Jump32Class: +// +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// 0 | OPC |S| CLS +type OpCode uint16 + +// InvalidOpCode is returned by setters on OpCode +const InvalidOpCode OpCode = 0xffff + +// bpfOpCode returns the actual BPF opcode. +func (op OpCode) bpfOpCode() (byte, error) { + const opCodeMask = 0xff + + if !valid(op, opCodeMask) { + return 0, fmt.Errorf("invalid opcode %x", op) + } + + return byte(op & opCodeMask), nil +} + +// rawInstructions returns the number of BPF instructions required +// to encode this opcode. +func (op OpCode) rawInstructions() int { + if op.IsDWordLoad() { + return 2 + } + return 1 +} + +func (op OpCode) IsDWordLoad() bool { + return op == LoadImmOp(DWord) +} + +// Class returns the class of operation. +func (op OpCode) Class() Class { + return Class(op & classMask) +} + +// Mode returns the mode for load and store operations. +func (op OpCode) Mode() Mode { + if !op.Class().isLoadOrStore() { + return InvalidMode + } + return Mode(op & modeMask) +} + +// Size returns the size for load and store operations. +func (op OpCode) Size() Size { + if !op.Class().isLoadOrStore() { + return InvalidSize + } + return Size(op & sizeMask) +} + +// Source returns the source for branch and ALU operations. +func (op OpCode) Source() Source { + if !op.Class().isJumpOrALU() || op.ALUOp() == Swap { + return InvalidSource + } + return Source(op & sourceMask) +} + +// ALUOp returns the ALUOp. +func (op OpCode) ALUOp() ALUOp { + if !op.Class().IsALU() { + return InvalidALUOp + } + return ALUOp(op & aluMask) +} + +// Endianness returns the Endianness for a byte swap instruction. +func (op OpCode) Endianness() Endianness { + if op.ALUOp() != Swap { + return InvalidEndian + } + return Endianness(op & endianMask) +} + +// JumpOp returns the JumpOp. +// Returns InvalidJumpOp if it doesn't encode a jump. +func (op OpCode) JumpOp() JumpOp { + if !op.Class().IsJump() { + return InvalidJumpOp + } + + jumpOp := JumpOp(op & jumpMask) + + // Some JumpOps are only supported by JumpClass, not Jump32Class. + if op.Class() == Jump32Class && (jumpOp == Exit || jumpOp == Call) { + return InvalidJumpOp + } + + return jumpOp +} + +// SetMode sets the mode on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetMode(mode Mode) OpCode { + if !op.Class().isLoadOrStore() || !valid(OpCode(mode), modeMask) { + return InvalidOpCode + } + return (op & ^modeMask) | OpCode(mode) +} + +// SetSize sets the size on load and store operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSize(size Size) OpCode { + if !op.Class().isLoadOrStore() || !valid(OpCode(size), sizeMask) { + return InvalidOpCode + } + return (op & ^sizeMask) | OpCode(size) +} + +// SetSource sets the source on jump and ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetSource(source Source) OpCode { + if !op.Class().isJumpOrALU() || !valid(OpCode(source), sourceMask) { + return InvalidOpCode + } + return (op & ^sourceMask) | OpCode(source) +} + +// SetALUOp sets the ALUOp on ALU operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetALUOp(alu ALUOp) OpCode { + if !op.Class().IsALU() || !valid(OpCode(alu), aluMask) { + return InvalidOpCode + } + return (op & ^aluMask) | OpCode(alu) +} + +// SetJumpOp sets the JumpOp on jump operations. +// +// Returns InvalidOpCode if op is of the wrong class. +func (op OpCode) SetJumpOp(jump JumpOp) OpCode { + if !op.Class().IsJump() || !valid(OpCode(jump), jumpMask) { + return InvalidOpCode + } + + newOp := (op & ^jumpMask) | OpCode(jump) + + // Check newOp is legal. + if newOp.JumpOp() == InvalidJumpOp { + return InvalidOpCode + } + + return newOp +} + +func (op OpCode) String() string { + var f strings.Builder + + switch class := op.Class(); { + case class.isLoadOrStore(): + f.WriteString(strings.TrimSuffix(class.String(), "Class")) + + mode := op.Mode() + f.WriteString(strings.TrimSuffix(mode.String(), "Mode")) + + switch op.Size() { + case DWord: + f.WriteString("DW") + case Word: + f.WriteString("W") + case Half: + f.WriteString("H") + case Byte: + f.WriteString("B") + } + + case class.IsALU(): + if op.ALUOp() == Swap && op.Class() == ALU64Class { + // B to make BSwap, uncontitional byte swap + f.WriteString("B") + } + + f.WriteString(op.ALUOp().String()) + + if op.ALUOp() == Swap { + if op.Class() == ALUClass { + // Width for Endian is controlled by Constant + f.WriteString(op.Endianness().String()) + } + } else { + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + + if class == ALUClass { + f.WriteString("32") + } + } + + case class.IsJump(): + f.WriteString(op.JumpOp().String()) + + if class == Jump32Class { + f.WriteString("32") + } + + if jop := op.JumpOp(); jop != Exit && jop != Call && jop != Ja { + f.WriteString(strings.TrimSuffix(op.Source().String(), "Source")) + } + + default: + fmt.Fprintf(&f, "OpCode(%#x)", uint8(op)) + } + + return f.String() +} + +// valid returns true if all bits in value are covered by mask. +func valid(value, mask OpCode) bool { + return value & ^mask == 0 +} diff --git a/vendor/github.com/cilium/ebpf/asm/opcode_string.go b/vendor/github.com/cilium/ebpf/asm/opcode_string.go new file mode 100644 index 0000000000..58bc3e7e7f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/opcode_string.go @@ -0,0 +1,30 @@ +// Code generated by "stringer -output opcode_string.go -type=Class"; DO NOT EDIT. + +package asm + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[LdClass-0] + _ = x[LdXClass-1] + _ = x[StClass-2] + _ = x[StXClass-3] + _ = x[ALUClass-4] + _ = x[JumpClass-5] + _ = x[Jump32Class-6] + _ = x[ALU64Class-7] +} + +const _Class_name = "LdClassLdXClassStClassStXClassALUClassJumpClassJump32ClassALU64Class" + +var _Class_index = [...]uint8{0, 7, 15, 22, 30, 38, 47, 58, 68} + +func (i Class) String() string { + if i >= Class(len(_Class_index)-1) { + return "Class(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _Class_name[_Class_index[i]:_Class_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/asm/register.go b/vendor/github.com/cilium/ebpf/asm/register.go new file mode 100644 index 0000000000..457a3b8a88 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/asm/register.go @@ -0,0 +1,51 @@ +package asm + +import ( + "fmt" +) + +// Register is the source or destination of most operations. +type Register uint8 + +// R0 contains return values. +const R0 Register = 0 + +// Registers for function arguments. +const ( + R1 Register = R0 + 1 + iota + R2 + R3 + R4 + R5 +) + +// Callee saved registers preserved by function calls. +const ( + R6 Register = R5 + 1 + iota + R7 + R8 + R9 +) + +// Read-only frame pointer to access stack. +const ( + R10 Register = R9 + 1 + RFP = R10 +) + +// Pseudo registers used by 64bit loads and jumps +const ( + PseudoMapFD = R1 // BPF_PSEUDO_MAP_FD + PseudoMapValue = R2 // BPF_PSEUDO_MAP_VALUE + PseudoCall = R1 // BPF_PSEUDO_CALL + PseudoFunc = R4 // BPF_PSEUDO_FUNC + PseudoKfuncCall = R2 // BPF_PSEUDO_KFUNC_CALL +) + +func (r Register) String() string { + v := uint8(r) + if v == 10 { + return "rfp" + } + return fmt.Sprintf("r%d", v) +} diff --git a/vendor/github.com/cilium/ebpf/attachtype_string.go b/vendor/github.com/cilium/ebpf/attachtype_string.go new file mode 100644 index 0000000000..84445a3256 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/attachtype_string.go @@ -0,0 +1,80 @@ +// Code generated by "stringer -type AttachType -trimprefix Attach"; DO NOT EDIT. + +package ebpf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[AttachNone-0] + _ = x[AttachCGroupInetIngress-0] + _ = x[AttachCGroupInetEgress-1] + _ = x[AttachCGroupInetSockCreate-2] + _ = x[AttachCGroupSockOps-3] + _ = x[AttachSkSKBStreamParser-4] + _ = x[AttachSkSKBStreamVerdict-5] + _ = x[AttachCGroupDevice-6] + _ = x[AttachSkMsgVerdict-7] + _ = x[AttachCGroupInet4Bind-8] + _ = x[AttachCGroupInet6Bind-9] + _ = x[AttachCGroupInet4Connect-10] + _ = x[AttachCGroupInet6Connect-11] + _ = x[AttachCGroupInet4PostBind-12] + _ = x[AttachCGroupInet6PostBind-13] + _ = x[AttachCGroupUDP4Sendmsg-14] + _ = x[AttachCGroupUDP6Sendmsg-15] + _ = x[AttachLircMode2-16] + _ = x[AttachFlowDissector-17] + _ = x[AttachCGroupSysctl-18] + _ = x[AttachCGroupUDP4Recvmsg-19] + _ = x[AttachCGroupUDP6Recvmsg-20] + _ = x[AttachCGroupGetsockopt-21] + _ = x[AttachCGroupSetsockopt-22] + _ = x[AttachTraceRawTp-23] + _ = x[AttachTraceFEntry-24] + _ = x[AttachTraceFExit-25] + _ = x[AttachModifyReturn-26] + _ = x[AttachLSMMac-27] + _ = x[AttachTraceIter-28] + _ = x[AttachCgroupInet4GetPeername-29] + _ = x[AttachCgroupInet6GetPeername-30] + _ = x[AttachCgroupInet4GetSockname-31] + _ = x[AttachCgroupInet6GetSockname-32] + _ = x[AttachXDPDevMap-33] + _ = x[AttachCgroupInetSockRelease-34] + _ = x[AttachXDPCPUMap-35] + _ = x[AttachSkLookup-36] + _ = x[AttachXDP-37] + _ = x[AttachSkSKBVerdict-38] + _ = x[AttachSkReuseportSelect-39] + _ = x[AttachSkReuseportSelectOrMigrate-40] + _ = x[AttachPerfEvent-41] + _ = x[AttachTraceKprobeMulti-42] + _ = x[AttachTraceKprobeSession-56] + _ = x[AttachLSMCgroup-43] + _ = x[AttachStructOps-44] + _ = x[AttachNetfilter-45] + _ = x[AttachTCXIngress-46] + _ = x[AttachTCXEgress-47] + _ = x[AttachTraceUprobeMulti-48] + _ = x[AttachCgroupUnixConnect-49] + _ = x[AttachCgroupUnixSendmsg-50] + _ = x[AttachCgroupUnixRecvmsg-51] + _ = x[AttachCgroupUnixGetpeername-52] + _ = x[AttachCgroupUnixGetsockname-53] + _ = x[AttachNetkitPrimary-54] + _ = x[AttachNetkitPeer-55] +} + +const _AttachType_name = "NoneCGroupInetEgressCGroupInetSockCreateCGroupSockOpsSkSKBStreamParserSkSKBStreamVerdictCGroupDeviceSkMsgVerdictCGroupInet4BindCGroupInet6BindCGroupInet4ConnectCGroupInet6ConnectCGroupInet4PostBindCGroupInet6PostBindCGroupUDP4SendmsgCGroupUDP6SendmsgLircMode2FlowDissectorCGroupSysctlCGroupUDP4RecvmsgCGroupUDP6RecvmsgCGroupGetsockoptCGroupSetsockoptTraceRawTpTraceFEntryTraceFExitModifyReturnLSMMacTraceIterCgroupInet4GetPeernameCgroupInet6GetPeernameCgroupInet4GetSocknameCgroupInet6GetSocknameXDPDevMapCgroupInetSockReleaseXDPCPUMapSkLookupXDPSkSKBVerdictSkReuseportSelectSkReuseportSelectOrMigratePerfEventTraceKprobeMultiLSMCgroupStructOpsNetfilterTCXIngressTCXEgressTraceUprobeMultiCgroupUnixConnectCgroupUnixSendmsgCgroupUnixRecvmsgCgroupUnixGetpeernameCgroupUnixGetsocknameNetkitPrimaryNetkitPeerTraceKprobeSession" + +var _AttachType_index = [...]uint16{0, 4, 20, 40, 53, 70, 88, 100, 112, 127, 142, 160, 178, 197, 216, 233, 250, 259, 272, 284, 301, 318, 334, 350, 360, 371, 381, 393, 399, 408, 430, 452, 474, 496, 505, 526, 535, 543, 546, 558, 575, 601, 610, 626, 635, 644, 653, 663, 672, 688, 705, 722, 739, 760, 781, 794, 804, 822} + +func (i AttachType) String() string { + if i >= AttachType(len(_AttachType_index)-1) { + return "AttachType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _AttachType_name[_AttachType_index[i]:_AttachType_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/btf/btf.go b/vendor/github.com/cilium/ebpf/btf/btf.go new file mode 100644 index 0000000000..e8f80c1399 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/btf.go @@ -0,0 +1,713 @@ +package btf + +import ( + "bufio" + "debug/elf" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "os" + "reflect" + "sync" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" +) + +const btfMagic = 0xeB9F + +// Errors returned by BTF functions. +var ( + ErrNotSupported = internal.ErrNotSupported + ErrNotFound = errors.New("not found") + ErrNoExtendedInfo = errors.New("no extended info") + ErrMultipleMatches = errors.New("multiple matching types") +) + +// ID represents the unique ID of a BTF object. +type ID = sys.BTFID + +// immutableTypes is a set of types which musn't be changed. +type immutableTypes struct { + // All types contained by the spec, not including types from the base in + // case the spec was parsed from split BTF. + types []Type + + // Type IDs indexed by type. + typeIDs map[Type]TypeID + + // The ID of the first type in types. + firstTypeID TypeID + + // Types indexed by essential name. + // Includes all struct flavors and types with the same name. + namedTypes map[essentialName][]TypeID + + // Byte order of the types. This affects things like struct member order + // when using bitfields. + byteOrder binary.ByteOrder +} + +func (s *immutableTypes) typeByID(id TypeID) (Type, bool) { + if id < s.firstTypeID { + return nil, false + } + + index := int(id - s.firstTypeID) + if index >= len(s.types) { + return nil, false + } + + return s.types[index], true +} + +// mutableTypes is a set of types which may be changed. +type mutableTypes struct { + imm immutableTypes + mu sync.RWMutex // protects copies below + copies map[Type]Type // map[orig]copy + copiedTypeIDs map[Type]TypeID // map[copy]origID +} + +// add a type to the set of mutable types. +// +// Copies type and all of its children once. Repeated calls with the same type +// do not copy again. +func (mt *mutableTypes) add(typ Type, typeIDs map[Type]TypeID) Type { + mt.mu.RLock() + cpy, ok := mt.copies[typ] + mt.mu.RUnlock() + + if ok { + // Fast path: the type has been copied before. + return cpy + } + + // modifyGraphPreorder copies the type graph node by node, so we can't drop + // the lock in between. + mt.mu.Lock() + defer mt.mu.Unlock() + + return copyType(typ, typeIDs, mt.copies, mt.copiedTypeIDs) +} + +// copy a set of mutable types. +func (mt *mutableTypes) copy() *mutableTypes { + if mt == nil { + return nil + } + + // Prevent concurrent modification of mt.copiedTypeIDs. + mt.mu.RLock() + defer mt.mu.RUnlock() + + mtCopy := &mutableTypes{ + mt.imm, + sync.RWMutex{}, + make(map[Type]Type, len(mt.copies)), + make(map[Type]TypeID, len(mt.copiedTypeIDs)), + } + + copiesOfCopies := make(map[Type]Type, len(mt.copies)) + for orig, copy := range mt.copies { + // NB: We make a copy of copy, not orig, so that changes to mutable types + // are preserved. + copyOfCopy := copyType(copy, mt.copiedTypeIDs, copiesOfCopies, mtCopy.copiedTypeIDs) + mtCopy.copies[orig] = copyOfCopy + } + + return mtCopy +} + +func (mt *mutableTypes) typeID(typ Type) (TypeID, error) { + if _, ok := typ.(*Void); ok { + // Equality is weird for void, since it is a zero sized type. + return 0, nil + } + + mt.mu.RLock() + defer mt.mu.RUnlock() + + id, ok := mt.copiedTypeIDs[typ] + if !ok { + return 0, fmt.Errorf("no ID for type %s: %w", typ, ErrNotFound) + } + + return id, nil +} + +func (mt *mutableTypes) typeByID(id TypeID) (Type, bool) { + immT, ok := mt.imm.typeByID(id) + if !ok { + return nil, false + } + + return mt.add(immT, mt.imm.typeIDs), true +} + +func (mt *mutableTypes) anyTypesByName(name string) ([]Type, error) { + immTypes := mt.imm.namedTypes[newEssentialName(name)] + if len(immTypes) == 0 { + return nil, fmt.Errorf("type name %s: %w", name, ErrNotFound) + } + + // Return a copy to prevent changes to namedTypes. + result := make([]Type, 0, len(immTypes)) + for _, id := range immTypes { + immT, ok := mt.imm.typeByID(id) + if !ok { + return nil, fmt.Errorf("no type with ID %d", id) + } + + // Match against the full name, not just the essential one + // in case the type being looked up is a struct flavor. + if immT.TypeName() == name { + result = append(result, mt.add(immT, mt.imm.typeIDs)) + } + } + return result, nil +} + +// Spec allows querying a set of Types and loading the set into the +// kernel. +type Spec struct { + *mutableTypes + + // String table from ELF. + strings *stringTable +} + +// LoadSpec opens file and calls LoadSpecFromReader on it. +func LoadSpec(file string) (*Spec, error) { + fh, err := os.Open(file) + if err != nil { + return nil, err + } + defer fh.Close() + + return LoadSpecFromReader(fh) +} + +// LoadSpecFromReader reads from an ELF or a raw BTF blob. +// +// Returns ErrNotFound if reading from an ELF which contains no BTF. ExtInfos +// may be nil. +func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) { + file, err := internal.NewSafeELFFile(rd) + if err != nil { + if bo := guessRawBTFByteOrder(rd); bo != nil { + return loadRawSpec(io.NewSectionReader(rd, 0, math.MaxInt64), bo, nil) + } + + return nil, err + } + + return loadSpecFromELF(file) +} + +// LoadSpecAndExtInfosFromReader reads from an ELF. +// +// ExtInfos may be nil if the ELF doesn't contain section metadata. +// Returns ErrNotFound if the ELF contains no BTF. +func LoadSpecAndExtInfosFromReader(rd io.ReaderAt) (*Spec, *ExtInfos, error) { + file, err := internal.NewSafeELFFile(rd) + if err != nil { + return nil, nil, err + } + + spec, err := loadSpecFromELF(file) + if err != nil { + return nil, nil, err + } + + extInfos, err := loadExtInfosFromELF(file, spec) + if err != nil && !errors.Is(err, ErrNotFound) { + return nil, nil, err + } + + return spec, extInfos, nil +} + +// symbolOffsets extracts all symbols offsets from an ELF and indexes them by +// section and variable name. +// +// References to variables in BTF data sections carry unsigned 32-bit offsets. +// Some ELF symbols (e.g. in vmlinux) may point to virtual memory that is well +// beyond this range. Since these symbols cannot be described by BTF info, +// ignore them here. +func symbolOffsets(file *internal.SafeELFFile) (map[symbol]uint32, error) { + symbols, err := file.Symbols() + if err != nil { + return nil, fmt.Errorf("can't read symbols: %v", err) + } + + offsets := make(map[symbol]uint32) + for _, sym := range symbols { + if idx := sym.Section; idx >= elf.SHN_LORESERVE && idx <= elf.SHN_HIRESERVE { + // Ignore things like SHN_ABS + continue + } + + if sym.Value > math.MaxUint32 { + // VarSecinfo offset is u32, cannot reference symbols in higher regions. + continue + } + + if int(sym.Section) >= len(file.Sections) { + return nil, fmt.Errorf("symbol %s: invalid section %d", sym.Name, sym.Section) + } + + secName := file.Sections[sym.Section].Name + offsets[symbol{secName, sym.Name}] = uint32(sym.Value) + } + + return offsets, nil +} + +func loadSpecFromELF(file *internal.SafeELFFile) (*Spec, error) { + var ( + btfSection *elf.Section + sectionSizes = make(map[string]uint32) + ) + + for _, sec := range file.Sections { + switch sec.Name { + case ".BTF": + btfSection = sec + default: + if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS { + break + } + + if sec.Size > math.MaxUint32 { + return nil, fmt.Errorf("section %s exceeds maximum size", sec.Name) + } + + sectionSizes[sec.Name] = uint32(sec.Size) + } + } + + if btfSection == nil { + return nil, fmt.Errorf("btf: %w", ErrNotFound) + } + + offsets, err := symbolOffsets(file) + if err != nil { + return nil, err + } + + if btfSection.ReaderAt == nil { + return nil, fmt.Errorf("compressed BTF is not supported") + } + + spec, err := loadRawSpec(btfSection.ReaderAt, file.ByteOrder, nil) + if err != nil { + return nil, err + } + + err = fixupDatasec(spec.imm.types, sectionSizes, offsets) + if err != nil { + return nil, err + } + + return spec, nil +} + +func loadRawSpec(btf io.ReaderAt, bo binary.ByteOrder, base *Spec) (*Spec, error) { + var ( + baseStrings *stringTable + firstTypeID TypeID + err error + ) + + if base != nil { + if base.imm.firstTypeID != 0 { + return nil, fmt.Errorf("can't use split BTF as base") + } + + baseStrings = base.strings + + firstTypeID, err = base.nextTypeID() + if err != nil { + return nil, err + } + } + + types, rawStrings, err := parseBTF(btf, bo, baseStrings, base) + if err != nil { + return nil, err + } + + typeIDs, typesByName := indexTypes(types, firstTypeID) + + return &Spec{ + &mutableTypes{ + immutableTypes{ + types, + typeIDs, + firstTypeID, + typesByName, + bo, + }, + sync.RWMutex{}, + make(map[Type]Type), + make(map[Type]TypeID), + }, + rawStrings, + }, nil +} + +func indexTypes(types []Type, firstTypeID TypeID) (map[Type]TypeID, map[essentialName][]TypeID) { + namedTypes := 0 + for _, typ := range types { + if typ.TypeName() != "" { + // Do a pre-pass to figure out how big types by name has to be. + // Most types have unique names, so it's OK to ignore essentialName + // here. + namedTypes++ + } + } + + typeIDs := make(map[Type]TypeID, len(types)) + typesByName := make(map[essentialName][]TypeID, namedTypes) + + for i, typ := range types { + id := firstTypeID + TypeID(i) + typeIDs[typ] = id + + if name := newEssentialName(typ.TypeName()); name != "" { + typesByName[name] = append(typesByName[name], id) + } + } + + return typeIDs, typesByName +} + +func guessRawBTFByteOrder(r io.ReaderAt) binary.ByteOrder { + buf := new(bufio.Reader) + for _, bo := range []binary.ByteOrder{ + binary.LittleEndian, + binary.BigEndian, + } { + buf.Reset(io.NewSectionReader(r, 0, math.MaxInt64)) + if _, err := parseBTFHeader(buf, bo); err == nil { + return bo + } + } + + return nil +} + +// parseBTF reads a .BTF section into memory and parses it into a list of +// raw types and a string table. +func parseBTF(btf io.ReaderAt, bo binary.ByteOrder, baseStrings *stringTable, base *Spec) ([]Type, *stringTable, error) { + buf := internal.NewBufferedSectionReader(btf, 0, math.MaxInt64) + header, err := parseBTFHeader(buf, bo) + if err != nil { + return nil, nil, fmt.Errorf("parsing .BTF header: %v", err) + } + + rawStrings, err := readStringTable(io.NewSectionReader(btf, header.stringStart(), int64(header.StringLen)), + baseStrings) + if err != nil { + return nil, nil, fmt.Errorf("can't read type names: %w", err) + } + + buf.Reset(io.NewSectionReader(btf, header.typeStart(), int64(header.TypeLen))) + types, err := readAndInflateTypes(buf, bo, header.TypeLen, rawStrings, base) + if err != nil { + return nil, nil, err + } + + return types, rawStrings, nil +} + +type symbol struct { + section string + name string +} + +// fixupDatasec attempts to patch up missing info in Datasecs and its members by +// supplementing them with information from the ELF headers and symbol table. +func fixupDatasec(types []Type, sectionSizes map[string]uint32, offsets map[symbol]uint32) error { + for _, typ := range types { + ds, ok := typ.(*Datasec) + if !ok { + continue + } + + name := ds.Name + + // Some Datasecs are virtual and don't have corresponding ELF sections. + switch name { + case ".ksyms": + // .ksyms describes forward declarations of kfunc signatures, as well as + // references to kernel symbols. + // Nothing to fix up, all sizes and offsets are 0. + for _, vsi := range ds.Vars { + switch t := vsi.Type.(type) { + case *Func: + continue + case *Var: + if _, ok := t.Type.(*Void); !ok { + return fmt.Errorf("data section %s: expected %s to be *Void, not %T: %w", name, vsi.Type.TypeName(), vsi.Type, ErrNotSupported) + } + default: + return fmt.Errorf("data section %s: expected to be either *btf.Func or *btf.Var, not %T: %w", name, vsi.Type, ErrNotSupported) + } + } + + continue + case ".kconfig": + // .kconfig has a size of 0 and has all members' offsets set to 0. + // Fix up all offsets and set the Datasec's size. + if err := fixupDatasecLayout(ds); err != nil { + return err + } + + // Fix up extern to global linkage to avoid a BTF verifier error. + for _, vsi := range ds.Vars { + vsi.Type.(*Var).Linkage = GlobalVar + } + + continue + } + + if ds.Size != 0 { + continue + } + + ds.Size, ok = sectionSizes[name] + if !ok { + return fmt.Errorf("data section %s: missing size", name) + } + + for i := range ds.Vars { + symName := ds.Vars[i].Type.TypeName() + ds.Vars[i].Offset, ok = offsets[symbol{name, symName}] + if !ok { + return fmt.Errorf("data section %s: missing offset for symbol %s", name, symName) + } + } + } + + return nil +} + +// fixupDatasecLayout populates ds.Vars[].Offset according to var sizes and +// alignment. Calculate and set ds.Size. +func fixupDatasecLayout(ds *Datasec) error { + var off uint32 + + for i, vsi := range ds.Vars { + v, ok := vsi.Type.(*Var) + if !ok { + return fmt.Errorf("member %d: unsupported type %T", i, vsi.Type) + } + + size, err := Sizeof(v.Type) + if err != nil { + return fmt.Errorf("variable %s: getting size: %w", v.Name, err) + } + align, err := alignof(v.Type) + if err != nil { + return fmt.Errorf("variable %s: getting alignment: %w", v.Name, err) + } + + // Align the current member based on the offset of the end of the previous + // member and the alignment of the current member. + off = internal.Align(off, uint32(align)) + + ds.Vars[i].Offset = off + + off += uint32(size) + } + + ds.Size = off + + return nil +} + +// Copy creates a copy of Spec. +func (s *Spec) Copy() *Spec { + if s == nil { + return nil + } + + return &Spec{ + s.mutableTypes.copy(), + s.strings, + } +} + +type sliceWriter []byte + +func (sw sliceWriter) Write(p []byte) (int, error) { + if len(p) != len(sw) { + return 0, errors.New("size doesn't match") + } + + return copy(sw, p), nil +} + +// nextTypeID returns the next unallocated type ID or an error if there are no +// more type IDs. +func (s *Spec) nextTypeID() (TypeID, error) { + id := s.imm.firstTypeID + TypeID(len(s.imm.types)) + if id < s.imm.firstTypeID { + return 0, fmt.Errorf("no more type IDs") + } + return id, nil +} + +// TypeByID returns the BTF Type with the given type ID. +// +// Returns an error wrapping ErrNotFound if a Type with the given ID +// does not exist in the Spec. +func (s *Spec) TypeByID(id TypeID) (Type, error) { + typ, ok := s.typeByID(id) + if !ok { + return nil, fmt.Errorf("look up type with ID %d (first ID is %d): %w", id, s.imm.firstTypeID, ErrNotFound) + } + + return typ, nil +} + +// TypeID returns the ID for a given Type. +// +// Returns an error wrapping [ErrNotFound] if the type isn't part of the Spec. +func (s *Spec) TypeID(typ Type) (TypeID, error) { + return s.mutableTypes.typeID(typ) +} + +// AnyTypesByName returns a list of BTF Types with the given name. +// +// If the BTF blob describes multiple compilation units like vmlinux, multiple +// Types with the same name and kind can exist, but might not describe the same +// data structure. +// +// Returns an error wrapping ErrNotFound if no matching Type exists in the Spec. +func (s *Spec) AnyTypesByName(name string) ([]Type, error) { + return s.mutableTypes.anyTypesByName(name) +} + +// AnyTypeByName returns a Type with the given name. +// +// Returns an error if multiple types of that name exist. +func (s *Spec) AnyTypeByName(name string) (Type, error) { + types, err := s.AnyTypesByName(name) + if err != nil { + return nil, err + } + + if len(types) > 1 { + return nil, fmt.Errorf("found multiple types: %v", types) + } + + return types[0], nil +} + +// TypeByName searches for a Type with a specific name. Since multiple Types +// with the same name can exist, the parameter typ is taken to narrow down the +// search in case of a clash. +// +// typ must be a non-nil pointer to an implementation of a Type. On success, the +// address of the found Type will be copied to typ. +// +// Returns an error wrapping ErrNotFound if no matching Type exists in the Spec. +// Returns an error wrapping ErrMultipleTypes if multiple candidates are found. +func (s *Spec) TypeByName(name string, typ interface{}) error { + typeInterface := reflect.TypeOf((*Type)(nil)).Elem() + + // typ may be **T or *Type + typValue := reflect.ValueOf(typ) + if typValue.Kind() != reflect.Ptr { + return fmt.Errorf("%T is not a pointer", typ) + } + + typPtr := typValue.Elem() + if !typPtr.CanSet() { + return fmt.Errorf("%T cannot be set", typ) + } + + wanted := typPtr.Type() + if wanted == typeInterface { + // This is *Type. Unwrap the value's type. + wanted = typPtr.Elem().Type() + } + + if !wanted.AssignableTo(typeInterface) { + return fmt.Errorf("%T does not satisfy Type interface", typ) + } + + types, err := s.AnyTypesByName(name) + if err != nil { + return err + } + + var candidate Type + for _, typ := range types { + if reflect.TypeOf(typ) != wanted { + continue + } + + if candidate != nil { + return fmt.Errorf("type %s(%T): %w", name, typ, ErrMultipleMatches) + } + + candidate = typ + } + + if candidate == nil { + return fmt.Errorf("%s %s: %w", wanted, name, ErrNotFound) + } + + typPtr.Set(reflect.ValueOf(candidate)) + + return nil +} + +// LoadSplitSpecFromReader loads split BTF from a reader. +// +// Types from base are used to resolve references in the split BTF. +// The returned Spec only contains types from the split BTF, not from the base. +func LoadSplitSpecFromReader(r io.ReaderAt, base *Spec) (*Spec, error) { + return loadRawSpec(r, internal.NativeEndian, base) +} + +// TypesIterator iterates over types of a given spec. +type TypesIterator struct { + spec *Spec + id TypeID + done bool + // The last visited type in the spec. + Type Type +} + +// Iterate returns the types iterator. +func (s *Spec) Iterate() *TypesIterator { + return &TypesIterator{spec: s, id: s.imm.firstTypeID} +} + +// Next returns true as long as there are any remaining types. +func (iter *TypesIterator) Next() bool { + if iter.done { + return false + } + + var ok bool + iter.Type, ok = iter.spec.typeByID(iter.id) + iter.id++ + iter.done = !ok + if !iter.done { + // Skip declTags, during unmarshaling declTags become `Tags` fields of other types. + // We keep them in the spec to avoid holes in the ID space, but for the purposes of + // iteration, they are not useful to the user. + if _, ok := iter.Type.(*declTag); ok { + return iter.Next() + } + } + return !iter.done +} diff --git a/vendor/github.com/cilium/ebpf/btf/btf_types.go b/vendor/github.com/cilium/ebpf/btf/btf_types.go new file mode 100644 index 0000000000..320311b332 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/btf_types.go @@ -0,0 +1,520 @@ +package btf + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + "unsafe" + + "github.com/cilium/ebpf/internal" +) + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -linecomment -output=btf_types_string.go -type=FuncLinkage,VarLinkage,btfKind + +// btfKind describes a Type. +type btfKind uint8 + +// Equivalents of the BTF_KIND_* constants. +const ( + kindUnknown btfKind = iota // Unknown + kindInt // Int + kindPointer // Pointer + kindArray // Array + kindStruct // Struct + kindUnion // Union + kindEnum // Enum + kindForward // Forward + kindTypedef // Typedef + kindVolatile // Volatile + kindConst // Const + kindRestrict // Restrict + // Added ~4.20 + kindFunc // Func + kindFuncProto // FuncProto + // Added ~5.1 + kindVar // Var + kindDatasec // Datasec + // Added ~5.13 + kindFloat // Float + // Added 5.16 + kindDeclTag // DeclTag + // Added 5.17 + kindTypeTag // TypeTag + // Added 6.0 + kindEnum64 // Enum64 +) + +// FuncLinkage describes BTF function linkage metadata. +type FuncLinkage int + +// Equivalent of enum btf_func_linkage. +const ( + StaticFunc FuncLinkage = iota // static + GlobalFunc // global + ExternFunc // extern +) + +// VarLinkage describes BTF variable linkage metadata. +type VarLinkage int + +const ( + StaticVar VarLinkage = iota // static + GlobalVar // global + ExternVar // extern +) + +const ( + btfTypeKindShift = 24 + btfTypeKindLen = 5 + btfTypeVlenShift = 0 + btfTypeVlenMask = 16 + btfTypeKindFlagShift = 31 + btfTypeKindFlagMask = 1 +) + +var btfHeaderLen = binary.Size(&btfHeader{}) + +type btfHeader struct { + Magic uint16 + Version uint8 + Flags uint8 + HdrLen uint32 + + TypeOff uint32 + TypeLen uint32 + StringOff uint32 + StringLen uint32 +} + +// typeStart returns the offset from the beginning of the .BTF section +// to the start of its type entries. +func (h *btfHeader) typeStart() int64 { + return int64(h.HdrLen + h.TypeOff) +} + +// stringStart returns the offset from the beginning of the .BTF section +// to the start of its string table. +func (h *btfHeader) stringStart() int64 { + return int64(h.HdrLen + h.StringOff) +} + +// parseBTFHeader parses the header of the .BTF section. +func parseBTFHeader(r io.Reader, bo binary.ByteOrder) (*btfHeader, error) { + var header btfHeader + if err := binary.Read(r, bo, &header); err != nil { + return nil, fmt.Errorf("can't read header: %v", err) + } + + if header.Magic != btfMagic { + return nil, fmt.Errorf("incorrect magic value %v", header.Magic) + } + + if header.Version != 1 { + return nil, fmt.Errorf("unexpected version %v", header.Version) + } + + if header.Flags != 0 { + return nil, fmt.Errorf("unsupported flags %v", header.Flags) + } + + remainder := int64(header.HdrLen) - int64(binary.Size(&header)) + if remainder < 0 { + return nil, errors.New("header length shorter than btfHeader size") + } + + if _, err := io.CopyN(internal.DiscardZeroes{}, r, remainder); err != nil { + return nil, fmt.Errorf("header padding: %v", err) + } + + return &header, nil +} + +var btfTypeLen = binary.Size(btfType{}) + +// btfType is equivalent to struct btf_type in Documentation/bpf/btf.rst. +type btfType struct { + NameOff uint32 + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members), linkage + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd + */ + Info uint32 + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. + */ + SizeType uint32 +} + +var btfTypeSize = int(unsafe.Sizeof(btfType{})) + +func unmarshalBtfType(bt *btfType, b []byte, bo binary.ByteOrder) (int, error) { + if len(b) < btfTypeSize { + return 0, fmt.Errorf("not enough bytes to unmarshal btfType") + } + + bt.NameOff = bo.Uint32(b[0:]) + bt.Info = bo.Uint32(b[4:]) + bt.SizeType = bo.Uint32(b[8:]) + return btfTypeSize, nil +} + +func mask(len uint32) uint32 { + return (1 << len) - 1 +} + +func readBits(value, len, shift uint32) uint32 { + return (value >> shift) & mask(len) +} + +func writeBits(value, len, shift, new uint32) uint32 { + value &^= mask(len) << shift + value |= (new & mask(len)) << shift + return value +} + +func (bt *btfType) info(len, shift uint32) uint32 { + return readBits(bt.Info, len, shift) +} + +func (bt *btfType) setInfo(value, len, shift uint32) { + bt.Info = writeBits(bt.Info, len, shift, value) +} + +func (bt *btfType) Kind() btfKind { + return btfKind(bt.info(btfTypeKindLen, btfTypeKindShift)) +} + +func (bt *btfType) SetKind(kind btfKind) { + bt.setInfo(uint32(kind), btfTypeKindLen, btfTypeKindShift) +} + +func (bt *btfType) Vlen() int { + return int(bt.info(btfTypeVlenMask, btfTypeVlenShift)) +} + +func (bt *btfType) SetVlen(vlen int) { + bt.setInfo(uint32(vlen), btfTypeVlenMask, btfTypeVlenShift) +} + +func (bt *btfType) kindFlagBool() bool { + return bt.info(btfTypeKindFlagMask, btfTypeKindFlagShift) == 1 +} + +func (bt *btfType) setKindFlagBool(set bool) { + var value uint32 + if set { + value = 1 + } + bt.setInfo(value, btfTypeKindFlagMask, btfTypeKindFlagShift) +} + +// Bitfield returns true if the struct or union contain a bitfield. +func (bt *btfType) Bitfield() bool { + return bt.kindFlagBool() +} + +func (bt *btfType) SetBitfield(isBitfield bool) { + bt.setKindFlagBool(isBitfield) +} + +func (bt *btfType) FwdKind() FwdKind { + return FwdKind(bt.info(btfTypeKindFlagMask, btfTypeKindFlagShift)) +} + +func (bt *btfType) SetFwdKind(kind FwdKind) { + bt.setInfo(uint32(kind), btfTypeKindFlagMask, btfTypeKindFlagShift) +} + +func (bt *btfType) Signed() bool { + return bt.kindFlagBool() +} + +func (bt *btfType) SetSigned(signed bool) { + bt.setKindFlagBool(signed) +} + +func (bt *btfType) Linkage() FuncLinkage { + return FuncLinkage(bt.info(btfTypeVlenMask, btfTypeVlenShift)) +} + +func (bt *btfType) SetLinkage(linkage FuncLinkage) { + bt.setInfo(uint32(linkage), btfTypeVlenMask, btfTypeVlenShift) +} + +func (bt *btfType) Type() TypeID { + // TODO: Panic here if wrong kind? + return TypeID(bt.SizeType) +} + +func (bt *btfType) SetType(id TypeID) { + bt.SizeType = uint32(id) +} + +func (bt *btfType) Size() uint32 { + // TODO: Panic here if wrong kind? + return bt.SizeType +} + +func (bt *btfType) SetSize(size uint32) { + bt.SizeType = size +} + +func (bt *btfType) Marshal(w io.Writer, bo binary.ByteOrder) error { + buf := make([]byte, unsafe.Sizeof(*bt)) + bo.PutUint32(buf[0:], bt.NameOff) + bo.PutUint32(buf[4:], bt.Info) + bo.PutUint32(buf[8:], bt.SizeType) + _, err := w.Write(buf) + return err +} + +type rawType struct { + btfType + data interface{} +} + +func (rt *rawType) Marshal(w io.Writer, bo binary.ByteOrder) error { + if err := rt.btfType.Marshal(w, bo); err != nil { + return err + } + + if rt.data == nil { + return nil + } + + return binary.Write(w, bo, rt.data) +} + +// btfInt encodes additional data for integers. +// +// ? ? ? ? e e e e o o o o o o o o ? ? ? ? ? ? ? ? b b b b b b b b +// ? = undefined +// e = encoding +// o = offset (bitfields?) +// b = bits (bitfields) +type btfInt struct { + Raw uint32 +} + +const ( + btfIntEncodingLen = 4 + btfIntEncodingShift = 24 + btfIntOffsetLen = 8 + btfIntOffsetShift = 16 + btfIntBitsLen = 8 + btfIntBitsShift = 0 +) + +var btfIntLen = int(unsafe.Sizeof(btfInt{})) + +func unmarshalBtfInt(bi *btfInt, b []byte, bo binary.ByteOrder) (int, error) { + if len(b) < btfIntLen { + return 0, fmt.Errorf("not enough bytes to unmarshal btfInt") + } + + bi.Raw = bo.Uint32(b[0:]) + return btfIntLen, nil +} + +func (bi btfInt) Encoding() IntEncoding { + return IntEncoding(readBits(bi.Raw, btfIntEncodingLen, btfIntEncodingShift)) +} + +func (bi *btfInt) SetEncoding(e IntEncoding) { + bi.Raw = writeBits(uint32(bi.Raw), btfIntEncodingLen, btfIntEncodingShift, uint32(e)) +} + +func (bi btfInt) Offset() Bits { + return Bits(readBits(bi.Raw, btfIntOffsetLen, btfIntOffsetShift)) +} + +func (bi *btfInt) SetOffset(offset uint32) { + bi.Raw = writeBits(bi.Raw, btfIntOffsetLen, btfIntOffsetShift, offset) +} + +func (bi btfInt) Bits() Bits { + return Bits(readBits(bi.Raw, btfIntBitsLen, btfIntBitsShift)) +} + +func (bi *btfInt) SetBits(bits byte) { + bi.Raw = writeBits(bi.Raw, btfIntBitsLen, btfIntBitsShift, uint32(bits)) +} + +type btfArray struct { + Type TypeID + IndexType TypeID + Nelems uint32 +} + +var btfArrayLen = int(unsafe.Sizeof(btfArray{})) + +func unmarshalBtfArray(ba *btfArray, b []byte, bo binary.ByteOrder) (int, error) { + if len(b) < btfArrayLen { + return 0, fmt.Errorf("not enough bytes to unmarshal btfArray") + } + + ba.Type = TypeID(bo.Uint32(b[0:])) + ba.IndexType = TypeID(bo.Uint32(b[4:])) + ba.Nelems = bo.Uint32(b[8:]) + return btfArrayLen, nil +} + +type btfMember struct { + NameOff uint32 + Type TypeID + Offset uint32 +} + +var btfMemberLen = int(unsafe.Sizeof(btfMember{})) + +func unmarshalBtfMembers(members []btfMember, b []byte, bo binary.ByteOrder) (int, error) { + off := 0 + for i := range members { + if off+btfMemberLen > len(b) { + return 0, fmt.Errorf("not enough bytes to unmarshal btfMember %d", i) + } + + members[i].NameOff = bo.Uint32(b[off+0:]) + members[i].Type = TypeID(bo.Uint32(b[off+4:])) + members[i].Offset = bo.Uint32(b[off+8:]) + + off += btfMemberLen + } + + return off, nil +} + +type btfVarSecinfo struct { + Type TypeID + Offset uint32 + Size uint32 +} + +var btfVarSecinfoLen = int(unsafe.Sizeof(btfVarSecinfo{})) + +func unmarshalBtfVarSecInfos(secinfos []btfVarSecinfo, b []byte, bo binary.ByteOrder) (int, error) { + off := 0 + for i := range secinfos { + if off+btfVarSecinfoLen > len(b) { + return 0, fmt.Errorf("not enough bytes to unmarshal btfVarSecinfo %d", i) + } + + secinfos[i].Type = TypeID(bo.Uint32(b[off+0:])) + secinfos[i].Offset = bo.Uint32(b[off+4:]) + secinfos[i].Size = bo.Uint32(b[off+8:]) + + off += btfVarSecinfoLen + } + + return off, nil +} + +type btfVariable struct { + Linkage uint32 +} + +var btfVariableLen = int(unsafe.Sizeof(btfVariable{})) + +func unmarshalBtfVariable(bv *btfVariable, b []byte, bo binary.ByteOrder) (int, error) { + if len(b) < btfVariableLen { + return 0, fmt.Errorf("not enough bytes to unmarshal btfVariable") + } + + bv.Linkage = bo.Uint32(b[0:]) + return btfVariableLen, nil +} + +type btfEnum struct { + NameOff uint32 + Val uint32 +} + +var btfEnumLen = int(unsafe.Sizeof(btfEnum{})) + +func unmarshalBtfEnums(enums []btfEnum, b []byte, bo binary.ByteOrder) (int, error) { + off := 0 + for i := range enums { + if off+btfEnumLen > len(b) { + return 0, fmt.Errorf("not enough bytes to unmarshal btfEnum %d", i) + } + + enums[i].NameOff = bo.Uint32(b[off+0:]) + enums[i].Val = bo.Uint32(b[off+4:]) + + off += btfEnumLen + } + + return off, nil +} + +type btfEnum64 struct { + NameOff uint32 + ValLo32 uint32 + ValHi32 uint32 +} + +var btfEnum64Len = int(unsafe.Sizeof(btfEnum64{})) + +func unmarshalBtfEnums64(enums []btfEnum64, b []byte, bo binary.ByteOrder) (int, error) { + off := 0 + for i := range enums { + if off+btfEnum64Len > len(b) { + return 0, fmt.Errorf("not enough bytes to unmarshal btfEnum64 %d", i) + } + + enums[i].NameOff = bo.Uint32(b[off+0:]) + enums[i].ValLo32 = bo.Uint32(b[off+4:]) + enums[i].ValHi32 = bo.Uint32(b[off+8:]) + + off += btfEnum64Len + } + + return off, nil +} + +type btfParam struct { + NameOff uint32 + Type TypeID +} + +var btfParamLen = int(unsafe.Sizeof(btfParam{})) + +func unmarshalBtfParams(params []btfParam, b []byte, bo binary.ByteOrder) (int, error) { + off := 0 + for i := range params { + if off+btfParamLen > len(b) { + return 0, fmt.Errorf("not enough bytes to unmarshal btfParam %d", i) + } + + params[i].NameOff = bo.Uint32(b[off+0:]) + params[i].Type = TypeID(bo.Uint32(b[off+4:])) + + off += btfParamLen + } + + return off, nil +} + +type btfDeclTag struct { + ComponentIdx uint32 +} + +var btfDeclTagLen = int(unsafe.Sizeof(btfDeclTag{})) + +func unmarshalBtfDeclTag(bdt *btfDeclTag, b []byte, bo binary.ByteOrder) (int, error) { + if len(b) < btfDeclTagLen { + return 0, fmt.Errorf("not enough bytes to unmarshal btfDeclTag") + } + + bdt.ComponentIdx = bo.Uint32(b[0:]) + return btfDeclTagLen, nil +} diff --git a/vendor/github.com/cilium/ebpf/btf/btf_types_string.go b/vendor/github.com/cilium/ebpf/btf/btf_types_string.go new file mode 100644 index 0000000000..b7a1b80d15 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/btf_types_string.go @@ -0,0 +1,80 @@ +// Code generated by "stringer -linecomment -output=btf_types_string.go -type=FuncLinkage,VarLinkage,btfKind"; DO NOT EDIT. + +package btf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[StaticFunc-0] + _ = x[GlobalFunc-1] + _ = x[ExternFunc-2] +} + +const _FuncLinkage_name = "staticglobalextern" + +var _FuncLinkage_index = [...]uint8{0, 6, 12, 18} + +func (i FuncLinkage) String() string { + if i < 0 || i >= FuncLinkage(len(_FuncLinkage_index)-1) { + return "FuncLinkage(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _FuncLinkage_name[_FuncLinkage_index[i]:_FuncLinkage_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[StaticVar-0] + _ = x[GlobalVar-1] + _ = x[ExternVar-2] +} + +const _VarLinkage_name = "staticglobalextern" + +var _VarLinkage_index = [...]uint8{0, 6, 12, 18} + +func (i VarLinkage) String() string { + if i < 0 || i >= VarLinkage(len(_VarLinkage_index)-1) { + return "VarLinkage(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _VarLinkage_name[_VarLinkage_index[i]:_VarLinkage_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[kindUnknown-0] + _ = x[kindInt-1] + _ = x[kindPointer-2] + _ = x[kindArray-3] + _ = x[kindStruct-4] + _ = x[kindUnion-5] + _ = x[kindEnum-6] + _ = x[kindForward-7] + _ = x[kindTypedef-8] + _ = x[kindVolatile-9] + _ = x[kindConst-10] + _ = x[kindRestrict-11] + _ = x[kindFunc-12] + _ = x[kindFuncProto-13] + _ = x[kindVar-14] + _ = x[kindDatasec-15] + _ = x[kindFloat-16] + _ = x[kindDeclTag-17] + _ = x[kindTypeTag-18] + _ = x[kindEnum64-19] +} + +const _btfKind_name = "UnknownIntPointerArrayStructUnionEnumForwardTypedefVolatileConstRestrictFuncFuncProtoVarDatasecFloatDeclTagTypeTagEnum64" + +var _btfKind_index = [...]uint8{0, 7, 10, 17, 22, 28, 33, 37, 44, 51, 59, 64, 72, 76, 85, 88, 95, 100, 107, 114, 120} + +func (i btfKind) String() string { + if i >= btfKind(len(_btfKind_index)-1) { + return "btfKind(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _btfKind_name[_btfKind_index[i]:_btfKind_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/btf/core.go b/vendor/github.com/cilium/ebpf/btf/core.go new file mode 100644 index 0000000000..ee89f98331 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/core.go @@ -0,0 +1,1261 @@ +package btf + +import ( + "encoding/binary" + "errors" + "fmt" + "math" + "reflect" + "slices" + "strconv" + "strings" + + "github.com/cilium/ebpf/asm" +) + +// Code in this file is derived from libbpf, which is available under a BSD +// 2-Clause license. + +// A constant used when CO-RE relocation has to remove instructions. +// +// Taken from libbpf. +const COREBadRelocationSentinel = 0xbad2310 + +// COREFixup is the result of computing a CO-RE relocation for a target. +type COREFixup struct { + kind coreKind + local uint64 + target uint64 + // True if there is no valid fixup. The instruction is replaced with an + // invalid dummy. + poison bool + // True if the validation of the local value should be skipped. Used by + // some kinds of bitfield relocations. + skipLocalValidation bool +} + +func (f *COREFixup) equal(other COREFixup) bool { + return f.local == other.local && f.target == other.target +} + +func (f *COREFixup) String() string { + if f.poison { + return fmt.Sprintf("%s=poison", f.kind) + } + return fmt.Sprintf("%s=%d->%d", f.kind, f.local, f.target) +} + +func (f *COREFixup) Apply(ins *asm.Instruction) error { + if f.poison { + // Relocation is poisoned, replace the instruction with an invalid one. + if ins.OpCode.IsDWordLoad() { + // Replace a dword load with a invalid dword load to preserve instruction size. + *ins = asm.LoadImm(asm.R10, COREBadRelocationSentinel, asm.DWord) + } else { + // Replace all single size instruction with a invalid call instruction. + *ins = asm.BuiltinFunc(COREBadRelocationSentinel).Call() + } + + // Add context to the kernel verifier output. + if source := ins.Source(); source != nil { + *ins = ins.WithSource(asm.Comment(fmt.Sprintf("instruction poisoned by CO-RE: %s", source))) + } else { + *ins = ins.WithSource(asm.Comment("instruction poisoned by CO-RE")) + } + + return nil + } + + switch class := ins.OpCode.Class(); class { + case asm.LdXClass, asm.StClass, asm.StXClass: + if want := int16(f.local); !f.skipLocalValidation && want != ins.Offset { + return fmt.Errorf("invalid offset %d, expected %d", ins.Offset, f.local) + } + + if f.target > math.MaxInt16 { + return fmt.Errorf("offset %d exceeds MaxInt16", f.target) + } + + ins.Offset = int16(f.target) + + case asm.LdClass: + if !ins.IsConstantLoad(asm.DWord) { + return fmt.Errorf("not a dword-sized immediate load") + } + + if want := int64(f.local); !f.skipLocalValidation && want != ins.Constant { + return fmt.Errorf("invalid immediate %d, expected %d (fixup: %v)", ins.Constant, want, f) + } + + ins.Constant = int64(f.target) + + case asm.ALUClass: + if ins.OpCode.ALUOp() == asm.Swap { + return fmt.Errorf("relocation against swap") + } + + fallthrough + + case asm.ALU64Class: + if src := ins.OpCode.Source(); src != asm.ImmSource { + return fmt.Errorf("invalid source %s", src) + } + + if want := int64(f.local); !f.skipLocalValidation && want != ins.Constant { + return fmt.Errorf("invalid immediate %d, expected %d (fixup: %v, kind: %v, ins: %v)", ins.Constant, want, f, f.kind, ins) + } + + if f.target > math.MaxInt32 { + return fmt.Errorf("immediate %d exceeds MaxInt32", f.target) + } + + ins.Constant = int64(f.target) + + default: + return fmt.Errorf("invalid class %s", class) + } + + return nil +} + +func (f COREFixup) isNonExistant() bool { + return f.kind.checksForExistence() && f.target == 0 +} + +// coreKind is the type of CO-RE relocation as specified in BPF source code. +type coreKind uint32 + +const ( + reloFieldByteOffset coreKind = iota /* field byte offset */ + reloFieldByteSize /* field size in bytes */ + reloFieldExists /* field existence in target kernel */ + reloFieldSigned /* field signedness (0 - unsigned, 1 - signed) */ + reloFieldLShiftU64 /* bitfield-specific left bitshift */ + reloFieldRShiftU64 /* bitfield-specific right bitshift */ + reloTypeIDLocal /* type ID in local BPF object */ + reloTypeIDTarget /* type ID in target kernel */ + reloTypeExists /* type existence in target kernel */ + reloTypeSize /* type size in bytes */ + reloEnumvalExists /* enum value existence in target kernel */ + reloEnumvalValue /* enum value integer value */ + reloTypeMatches /* type matches kernel type */ +) + +func (k coreKind) checksForExistence() bool { + return k == reloEnumvalExists || k == reloTypeExists || k == reloFieldExists || k == reloTypeMatches +} + +func (k coreKind) String() string { + switch k { + case reloFieldByteOffset: + return "byte_off" + case reloFieldByteSize: + return "byte_sz" + case reloFieldExists: + return "field_exists" + case reloFieldSigned: + return "signed" + case reloFieldLShiftU64: + return "lshift_u64" + case reloFieldRShiftU64: + return "rshift_u64" + case reloTypeIDLocal: + return "local_type_id" + case reloTypeIDTarget: + return "target_type_id" + case reloTypeExists: + return "type_exists" + case reloTypeSize: + return "type_size" + case reloEnumvalExists: + return "enumval_exists" + case reloEnumvalValue: + return "enumval_value" + case reloTypeMatches: + return "type_matches" + default: + return fmt.Sprintf("unknown (%d)", k) + } +} + +// CORERelocate calculates changes needed to adjust eBPF instructions for differences +// in types. +// +// targets forms the set of types to relocate against. The first element has to be +// BTF for vmlinux, the following must be types for kernel modules. +// +// resolveLocalTypeID is called for each local type which requires a stable TypeID. +// Calling the function with the same type multiple times must produce the same +// result. It is the callers responsibility to ensure that the relocated instructions +// are loaded with matching BTF. +// +// Returns a list of fixups which can be applied to instructions to make them +// match the target type(s). +// +// Fixups are returned in the order of relos, e.g. fixup[i] is the solution +// for relos[i]. +func CORERelocate(relos []*CORERelocation, targets []*Spec, bo binary.ByteOrder, resolveLocalTypeID func(Type) (TypeID, error)) ([]COREFixup, error) { + if len(targets) == 0 { + // Explicitly check for nil here since the argument used to be optional. + return nil, fmt.Errorf("targets must be provided") + } + + // We can't encode type IDs that aren't for vmlinux into instructions at the + // moment. + resolveTargetTypeID := targets[0].TypeID + + for _, target := range targets { + if bo != target.imm.byteOrder { + return nil, fmt.Errorf("can't relocate %s against %s", bo, target.imm.byteOrder) + } + } + + type reloGroup struct { + relos []*CORERelocation + // Position of each relocation in relos. + indices []int + } + + // Split relocations into per Type lists. + relosByType := make(map[Type]*reloGroup) + result := make([]COREFixup, len(relos)) + for i, relo := range relos { + if relo.kind == reloTypeIDLocal { + // Filtering out reloTypeIDLocal here makes our lives a lot easier + // down the line, since it doesn't have a target at all. + if len(relo.accessor) > 1 || relo.accessor[0] != 0 { + return nil, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor) + } + + id, err := resolveLocalTypeID(relo.typ) + if err != nil { + return nil, fmt.Errorf("%s: get type id: %w", relo.kind, err) + } + + result[i] = COREFixup{ + kind: relo.kind, + local: uint64(relo.id), + target: uint64(id), + } + continue + } + + group, ok := relosByType[relo.typ] + if !ok { + group = &reloGroup{} + relosByType[relo.typ] = group + } + group.relos = append(group.relos, relo) + group.indices = append(group.indices, i) + } + + for localType, group := range relosByType { + localTypeName := localType.TypeName() + if localTypeName == "" { + return nil, fmt.Errorf("relocate unnamed or anonymous type %s: %w", localType, ErrNotSupported) + } + + essentialName := newEssentialName(localTypeName) + + var targetTypes []Type + for _, target := range targets { + namedTypeIDs := target.imm.namedTypes[essentialName] + targetTypes = slices.Grow(targetTypes, len(namedTypeIDs)) + for _, id := range namedTypeIDs { + typ, err := target.TypeByID(id) + if err != nil { + return nil, err + } + + targetTypes = append(targetTypes, typ) + } + } + + fixups, err := coreCalculateFixups(group.relos, targetTypes, bo, resolveTargetTypeID) + if err != nil { + return nil, fmt.Errorf("relocate %s: %w", localType, err) + } + + for j, index := range group.indices { + result[index] = fixups[j] + } + } + + return result, nil +} + +var errAmbiguousRelocation = errors.New("ambiguous relocation") +var errImpossibleRelocation = errors.New("impossible relocation") +var errIncompatibleTypes = errors.New("incompatible types") + +// coreCalculateFixups finds the target type that best matches all relocations. +// +// All relos must target the same type. +// +// The best target is determined by scoring: the less poisoning we have to do +// the better the target is. +func coreCalculateFixups(relos []*CORERelocation, targets []Type, bo binary.ByteOrder, resolveTargetTypeID func(Type) (TypeID, error)) ([]COREFixup, error) { + bestScore := len(relos) + var bestFixups []COREFixup + for _, target := range targets { + score := 0 // lower is better + fixups := make([]COREFixup, 0, len(relos)) + for _, relo := range relos { + fixup, err := coreCalculateFixup(relo, target, bo, resolveTargetTypeID) + if err != nil { + return nil, fmt.Errorf("target %s: %s: %w", target, relo.kind, err) + } + if fixup.poison || fixup.isNonExistant() { + score++ + } + fixups = append(fixups, fixup) + } + + if score > bestScore { + // We have a better target already, ignore this one. + continue + } + + if score < bestScore { + // This is the best target yet, use it. + bestScore = score + bestFixups = fixups + continue + } + + // Some other target has the same score as the current one. Make sure + // the fixups agree with each other. + for i, fixup := range bestFixups { + if !fixup.equal(fixups[i]) { + return nil, fmt.Errorf("%s: multiple types match: %w", fixup.kind, errAmbiguousRelocation) + } + } + } + + if bestFixups == nil { + // Nothing at all matched, probably because there are no suitable + // targets at all. + // + // Poison everything except checksForExistence. + bestFixups = make([]COREFixup, len(relos)) + for i, relo := range relos { + if relo.kind.checksForExistence() { + bestFixups[i] = COREFixup{kind: relo.kind, local: 1, target: 0} + } else { + bestFixups[i] = COREFixup{kind: relo.kind, poison: true} + } + } + } + + return bestFixups, nil +} + +var errNoSignedness = errors.New("no signedness") + +// coreCalculateFixup calculates the fixup given a relocation and a target type. +func coreCalculateFixup(relo *CORERelocation, target Type, bo binary.ByteOrder, resolveTargetTypeID func(Type) (TypeID, error)) (COREFixup, error) { + fixup := func(local, target uint64) (COREFixup, error) { + return COREFixup{kind: relo.kind, local: local, target: target}, nil + } + fixupWithoutValidation := func(local, target uint64) (COREFixup, error) { + return COREFixup{kind: relo.kind, local: local, target: target, skipLocalValidation: true}, nil + } + poison := func() (COREFixup, error) { + if relo.kind.checksForExistence() { + return fixup(1, 0) + } + return COREFixup{kind: relo.kind, poison: true}, nil + } + zero := COREFixup{} + + local := relo.typ + + switch relo.kind { + case reloTypeMatches: + if len(relo.accessor) > 1 || relo.accessor[0] != 0 { + return zero, fmt.Errorf("unexpected accessor %v", relo.accessor) + } + + err := coreTypesMatch(local, target, nil) + if errors.Is(err, errIncompatibleTypes) { + return poison() + } + if err != nil { + return zero, err + } + + return fixup(1, 1) + + case reloTypeIDTarget, reloTypeSize, reloTypeExists: + if len(relo.accessor) > 1 || relo.accessor[0] != 0 { + return zero, fmt.Errorf("unexpected accessor %v", relo.accessor) + } + + err := CheckTypeCompatibility(local, target) + if errors.Is(err, errIncompatibleTypes) { + return poison() + } + if err != nil { + return zero, err + } + + switch relo.kind { + case reloTypeExists: + return fixup(1, 1) + + case reloTypeIDTarget: + targetID, err := resolveTargetTypeID(target) + if errors.Is(err, ErrNotFound) { + // Probably a relocation trying to get the ID + // of a type from a kmod. + return poison() + } + if err != nil { + return zero, err + } + return fixup(uint64(relo.id), uint64(targetID)) + + case reloTypeSize: + localSize, err := Sizeof(local) + if err != nil { + return zero, err + } + + targetSize, err := Sizeof(target) + if err != nil { + return zero, err + } + + return fixup(uint64(localSize), uint64(targetSize)) + } + + case reloEnumvalValue, reloEnumvalExists: + localValue, targetValue, err := coreFindEnumValue(local, relo.accessor, target) + if errors.Is(err, errImpossibleRelocation) { + return poison() + } + if err != nil { + return zero, err + } + + switch relo.kind { + case reloEnumvalExists: + return fixup(1, 1) + + case reloEnumvalValue: + return fixup(localValue.Value, targetValue.Value) + } + + case reloFieldByteOffset, reloFieldByteSize, reloFieldExists, reloFieldLShiftU64, reloFieldRShiftU64, reloFieldSigned: + if _, ok := As[*Fwd](target); ok { + // We can't relocate fields using a forward declaration, so + // skip it. If a non-forward declaration is present in the BTF + // we'll find it in one of the other iterations. + return poison() + } + + localField, targetField, err := coreFindField(local, relo.accessor, target) + if errors.Is(err, errImpossibleRelocation) { + return poison() + } + if err != nil { + return zero, err + } + + maybeSkipValidation := func(f COREFixup, err error) (COREFixup, error) { + f.skipLocalValidation = localField.bitfieldSize > 0 + return f, err + } + + switch relo.kind { + case reloFieldExists: + return fixup(1, 1) + + case reloFieldByteOffset: + return maybeSkipValidation(fixup(uint64(localField.offset), uint64(targetField.offset))) + + case reloFieldByteSize: + localSize, err := Sizeof(localField.Type) + if err != nil { + return zero, err + } + + targetSize, err := Sizeof(targetField.Type) + if err != nil { + return zero, err + } + return maybeSkipValidation(fixup(uint64(localSize), uint64(targetSize))) + + case reloFieldLShiftU64: + var target uint64 + if bo == binary.LittleEndian { + targetSize, err := targetField.sizeBits() + if err != nil { + return zero, err + } + + target = uint64(64 - targetField.bitfieldOffset - targetSize) + } else { + loadWidth, err := Sizeof(targetField.Type) + if err != nil { + return zero, err + } + + target = uint64(64 - Bits(loadWidth*8) + targetField.bitfieldOffset) + } + return fixupWithoutValidation(0, target) + + case reloFieldRShiftU64: + targetSize, err := targetField.sizeBits() + if err != nil { + return zero, err + } + + return fixupWithoutValidation(0, uint64(64-targetSize)) + + case reloFieldSigned: + switch local := UnderlyingType(localField.Type).(type) { + case *Enum: + target, ok := As[*Enum](targetField.Type) + if !ok { + return zero, fmt.Errorf("target isn't *Enum but %T", targetField.Type) + } + + return fixup(boolToUint64(local.Signed), boolToUint64(target.Signed)) + case *Int: + target, ok := As[*Int](targetField.Type) + if !ok { + return zero, fmt.Errorf("target isn't *Int but %T", targetField.Type) + } + + return fixup( + uint64(local.Encoding&Signed), + uint64(target.Encoding&Signed), + ) + default: + return zero, fmt.Errorf("type %T: %w", local, errNoSignedness) + } + } + } + + return zero, ErrNotSupported +} + +func boolToUint64(val bool) uint64 { + if val { + return 1 + } + return 0 +} + +/* coreAccessor contains a path through a struct. It contains at least one index. + * + * The interpretation depends on the kind of the relocation. The following is + * taken from struct bpf_core_relo in libbpf_internal.h: + * + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * + * Example to provide a better feel. + * + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample s = ...; + * int x = &s->a; // encoded as "0:0" (a is field #0) + * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + */ +type coreAccessor []int + +func parseCOREAccessor(accessor string) (coreAccessor, error) { + if accessor == "" { + return nil, fmt.Errorf("empty accessor") + } + + parts := strings.Split(accessor, ":") + result := make(coreAccessor, 0, len(parts)) + for _, part := range parts { + // 31 bits to avoid overflowing int on 32 bit platforms. + index, err := strconv.ParseUint(part, 10, 31) + if err != nil { + return nil, fmt.Errorf("accessor index %q: %s", part, err) + } + + result = append(result, int(index)) + } + + return result, nil +} + +func (ca coreAccessor) String() string { + strs := make([]string, 0, len(ca)) + for _, i := range ca { + strs = append(strs, strconv.Itoa(i)) + } + return strings.Join(strs, ":") +} + +func (ca coreAccessor) enumValue(t Type) (*EnumValue, error) { + e, ok := As[*Enum](t) + if !ok { + return nil, fmt.Errorf("not an enum: %s", t) + } + + if len(ca) > 1 { + return nil, fmt.Errorf("invalid accessor %s for enum", ca) + } + + i := ca[0] + if i >= len(e.Values) { + return nil, fmt.Errorf("invalid index %d for %s", i, e) + } + + return &e.Values[i], nil +} + +// coreField represents the position of a "child" of a composite type from the +// start of that type. +// +// /- start of composite +// | offset * 8 | bitfieldOffset | bitfieldSize | ... | +// \- start of field end of field -/ +type coreField struct { + Type Type + + // The position of the field from the start of the composite type in bytes. + offset uint32 + + // The offset of the bitfield in bits from the start of the field. + bitfieldOffset Bits + + // The size of the bitfield in bits. + // + // Zero if the field is not a bitfield. + bitfieldSize Bits +} + +func (cf *coreField) adjustOffsetToNthElement(n int) error { + if n == 0 { + return nil + } + + size, err := Sizeof(cf.Type) + if err != nil { + return err + } + + cf.offset += uint32(n) * uint32(size) + return nil +} + +func (cf *coreField) adjustOffsetBits(offset Bits) error { + align, err := alignof(cf.Type) + if err != nil { + return err + } + + // We can compute the load offset by: + // 1) converting the bit offset to bytes with a flooring division. + // 2) dividing and multiplying that offset by the alignment, yielding the + // load size aligned offset. + offsetBytes := uint32(offset/8) / uint32(align) * uint32(align) + + // The number of bits remaining is the bit offset less the number of bits + // we can "skip" with the aligned offset. + cf.bitfieldOffset = offset - Bits(offsetBytes*8) + + // We know that cf.offset is aligned at to at least align since we get it + // from the compiler via BTF. Adding an aligned offsetBytes preserves the + // alignment. + cf.offset += offsetBytes + return nil +} + +func (cf *coreField) sizeBits() (Bits, error) { + if cf.bitfieldSize > 0 { + return cf.bitfieldSize, nil + } + + // Someone is trying to access a non-bitfield via a bit shift relocation. + // This happens when a field changes from a bitfield to a regular field + // between kernel versions. Synthesise the size to make the shifts work. + size, err := Sizeof(cf.Type) + if err != nil { + return 0, err + } + return Bits(size * 8), nil +} + +// coreFindField descends into the local type using the accessor and tries to +// find an equivalent field in target at each step. +// +// Returns the field and the offset of the field from the start of +// target in bits. +func coreFindField(localT Type, localAcc coreAccessor, targetT Type) (coreField, coreField, error) { + local := coreField{Type: localT} + target := coreField{Type: targetT} + + if err := coreAreMembersCompatible(local.Type, target.Type); err != nil { + return coreField{}, coreField{}, fmt.Errorf("fields: %w", err) + } + + // The first index is used to offset a pointer of the base type like + // when accessing an array. + if err := local.adjustOffsetToNthElement(localAcc[0]); err != nil { + return coreField{}, coreField{}, err + } + + if err := target.adjustOffsetToNthElement(localAcc[0]); err != nil { + return coreField{}, coreField{}, err + } + + var localMaybeFlex, targetMaybeFlex bool + for i, acc := range localAcc[1:] { + switch localType := UnderlyingType(local.Type).(type) { + case composite: + // For composite types acc is used to find the field in the local type, + // and then we try to find a field in target with the same name. + localMembers := localType.members() + if acc >= len(localMembers) { + return coreField{}, coreField{}, fmt.Errorf("invalid accessor %d for %s", acc, localType) + } + + localMember := localMembers[acc] + if localMember.Name == "" { + localMemberType, ok := As[composite](localMember.Type) + if !ok { + return coreField{}, coreField{}, fmt.Errorf("unnamed field with type %s: %s", localMember.Type, ErrNotSupported) + } + + // This is an anonymous struct or union, ignore it. + local = coreField{ + Type: localMemberType, + offset: local.offset + localMember.Offset.Bytes(), + } + localMaybeFlex = false + continue + } + + targetType, ok := As[composite](target.Type) + if !ok { + return coreField{}, coreField{}, fmt.Errorf("target not composite: %w", errImpossibleRelocation) + } + + targetMember, last, err := coreFindMember(targetType, localMember.Name) + if err != nil { + return coreField{}, coreField{}, err + } + + local = coreField{ + Type: localMember.Type, + offset: local.offset, + bitfieldSize: localMember.BitfieldSize, + } + localMaybeFlex = acc == len(localMembers)-1 + + target = coreField{ + Type: targetMember.Type, + offset: target.offset, + bitfieldSize: targetMember.BitfieldSize, + } + targetMaybeFlex = last + + if local.bitfieldSize == 0 && target.bitfieldSize == 0 { + local.offset += localMember.Offset.Bytes() + target.offset += targetMember.Offset.Bytes() + break + } + + // Either of the members is a bitfield. Make sure we're at the + // end of the accessor. + if next := i + 1; next < len(localAcc[1:]) { + return coreField{}, coreField{}, fmt.Errorf("can't descend into bitfield") + } + + if err := local.adjustOffsetBits(localMember.Offset); err != nil { + return coreField{}, coreField{}, err + } + + if err := target.adjustOffsetBits(targetMember.Offset); err != nil { + return coreField{}, coreField{}, err + } + + case *Array: + // For arrays, acc is the index in the target. + targetType, ok := As[*Array](target.Type) + if !ok { + return coreField{}, coreField{}, fmt.Errorf("target not array: %w", errImpossibleRelocation) + } + + if localType.Nelems == 0 && !localMaybeFlex { + return coreField{}, coreField{}, fmt.Errorf("local type has invalid flexible array") + } + if targetType.Nelems == 0 && !targetMaybeFlex { + return coreField{}, coreField{}, fmt.Errorf("target type has invalid flexible array") + } + + if localType.Nelems > 0 && acc >= int(localType.Nelems) { + return coreField{}, coreField{}, fmt.Errorf("invalid access of %s at index %d", localType, acc) + } + if targetType.Nelems > 0 && acc >= int(targetType.Nelems) { + return coreField{}, coreField{}, fmt.Errorf("out of bounds access of target: %w", errImpossibleRelocation) + } + + local = coreField{ + Type: localType.Type, + offset: local.offset, + } + localMaybeFlex = false + + if err := local.adjustOffsetToNthElement(acc); err != nil { + return coreField{}, coreField{}, err + } + + target = coreField{ + Type: targetType.Type, + offset: target.offset, + } + targetMaybeFlex = false + + if err := target.adjustOffsetToNthElement(acc); err != nil { + return coreField{}, coreField{}, err + } + + default: + return coreField{}, coreField{}, fmt.Errorf("relocate field of %T: %w", localType, ErrNotSupported) + } + + if err := coreAreMembersCompatible(local.Type, target.Type); err != nil { + return coreField{}, coreField{}, err + } + } + + return local, target, nil +} + +// coreFindMember finds a member in a composite type while handling anonymous +// structs and unions. +func coreFindMember(typ composite, name string) (Member, bool, error) { + if name == "" { + return Member{}, false, errors.New("can't search for anonymous member") + } + + type offsetTarget struct { + composite + offset Bits + } + + targets := []offsetTarget{{typ, 0}} + visited := make(map[composite]bool) + + for i := 0; i < len(targets); i++ { + target := targets[i] + + // Only visit targets once to prevent infinite recursion. + if visited[target] { + continue + } + if len(visited) >= maxResolveDepth { + // This check is different than libbpf, which restricts the entire + // path to BPF_CORE_SPEC_MAX_LEN items. + return Member{}, false, fmt.Errorf("type is nested too deep") + } + visited[target] = true + + members := target.members() + for j, member := range members { + if member.Name == name { + // NB: This is safe because member is a copy. + member.Offset += target.offset + return member, j == len(members)-1, nil + } + + // The names don't match, but this member could be an anonymous struct + // or union. + if member.Name != "" { + continue + } + + comp, ok := As[composite](member.Type) + if !ok { + return Member{}, false, fmt.Errorf("anonymous non-composite type %T not allowed", member.Type) + } + + targets = append(targets, offsetTarget{comp, target.offset + member.Offset}) + } + } + + return Member{}, false, fmt.Errorf("no matching member: %w", errImpossibleRelocation) +} + +// coreFindEnumValue follows localAcc to find the equivalent enum value in target. +func coreFindEnumValue(local Type, localAcc coreAccessor, target Type) (localValue, targetValue *EnumValue, _ error) { + localValue, err := localAcc.enumValue(local) + if err != nil { + return nil, nil, err + } + + targetEnum, ok := As[*Enum](target) + if !ok { + return nil, nil, errImpossibleRelocation + } + + localName := newEssentialName(localValue.Name) + for i, targetValue := range targetEnum.Values { + if newEssentialName(targetValue.Name) != localName { + continue + } + + return localValue, &targetEnum.Values[i], nil + } + + return nil, nil, errImpossibleRelocation +} + +// CheckTypeCompatibility checks local and target types for Compatibility according to CO-RE rules. +// +// Only layout compatibility is checked, ignoring names of the root type. +func CheckTypeCompatibility(localType Type, targetType Type) error { + return coreAreTypesCompatible(localType, targetType, nil) +} + +type pair struct { + A, B Type +} + +/* The comment below is from bpf_core_types_are_compat in libbpf.c: + * + * Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + * + * Returns errIncompatibleTypes if types are not compatible. + */ +func coreAreTypesCompatible(localType Type, targetType Type, visited map[pair]struct{}) error { + localType = UnderlyingType(localType) + targetType = UnderlyingType(targetType) + + if reflect.TypeOf(localType) != reflect.TypeOf(targetType) { + return fmt.Errorf("type mismatch between %v and %v: %w", localType, targetType, errIncompatibleTypes) + } + + if _, ok := visited[pair{localType, targetType}]; ok { + return nil + } + if visited == nil { + visited = make(map[pair]struct{}) + } + visited[pair{localType, targetType}] = struct{}{} + + switch lv := localType.(type) { + case *Void, *Struct, *Union, *Enum, *Fwd, *Int: + return nil + + case *Pointer: + tv := targetType.(*Pointer) + return coreAreTypesCompatible(lv.Target, tv.Target, visited) + + case *Array: + tv := targetType.(*Array) + if err := coreAreTypesCompatible(lv.Index, tv.Index, visited); err != nil { + return err + } + + return coreAreTypesCompatible(lv.Type, tv.Type, visited) + + case *FuncProto: + tv := targetType.(*FuncProto) + if err := coreAreTypesCompatible(lv.Return, tv.Return, visited); err != nil { + return err + } + + if len(lv.Params) != len(tv.Params) { + return fmt.Errorf("function param mismatch: %w", errIncompatibleTypes) + } + + for i, localParam := range lv.Params { + targetParam := tv.Params[i] + if err := coreAreTypesCompatible(localParam.Type, targetParam.Type, visited); err != nil { + return err + } + } + + return nil + + default: + return fmt.Errorf("unsupported type %T", localType) + } +} + +/* coreAreMembersCompatible checks two types for field-based relocation compatibility. + * + * The comment below is from bpf_core_fields_are_compat in libbpf.c: + * + * Check two types for compatibility for the purpose of field access + * relocation. const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * [ NB: coreAreMembersCompatible doesn't recurse, this check is done + * by coreFindField. ] + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + * + * Returns errImpossibleRelocation if the members are not compatible. + */ +func coreAreMembersCompatible(localType Type, targetType Type) error { + localType = UnderlyingType(localType) + targetType = UnderlyingType(targetType) + + _, lok := localType.(composite) + _, tok := targetType.(composite) + if lok && tok { + return nil + } + + if reflect.TypeOf(localType) != reflect.TypeOf(targetType) { + return fmt.Errorf("type mismatch: %w", errImpossibleRelocation) + } + + switch lv := localType.(type) { + case *Array, *Pointer, *Float, *Int: + return nil + + case *Enum: + tv := targetType.(*Enum) + if !coreEssentialNamesMatch(lv.Name, tv.Name) { + return fmt.Errorf("names %q and %q don't match: %w", lv.Name, tv.Name, errImpossibleRelocation) + } + + return nil + + case *Fwd: + tv := targetType.(*Fwd) + if !coreEssentialNamesMatch(lv.Name, tv.Name) { + return fmt.Errorf("names %q and %q don't match: %w", lv.Name, tv.Name, errImpossibleRelocation) + } + + return nil + + default: + return fmt.Errorf("type %s: %w", localType, ErrNotSupported) + } +} + +// coreEssentialNamesMatch compares two names while ignoring their flavour suffix. +// +// This should only be used on names which are in the global scope, like struct +// names, typedefs or enum values. +func coreEssentialNamesMatch(a, b string) bool { + if a == "" || b == "" { + // allow anonymous and named type to match + return true + } + + return newEssentialName(a) == newEssentialName(b) +} + +/* The comment below is from __bpf_core_types_match in relo_core.c: + * + * Check that two types "match". This function assumes that root types were + * already checked for name match. + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. + * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +func coreTypesMatch(localType Type, targetType Type, visited map[pair]struct{}) error { + localType = UnderlyingType(localType) + targetType = UnderlyingType(targetType) + + if !coreEssentialNamesMatch(localType.TypeName(), targetType.TypeName()) { + return fmt.Errorf("type name %q don't match %q: %w", localType.TypeName(), targetType.TypeName(), errIncompatibleTypes) + } + + if reflect.TypeOf(localType) != reflect.TypeOf(targetType) { + return fmt.Errorf("type mismatch between %v and %v: %w", localType, targetType, errIncompatibleTypes) + } + + if _, ok := visited[pair{localType, targetType}]; ok { + return nil + } + if visited == nil { + visited = make(map[pair]struct{}) + } + visited[pair{localType, targetType}] = struct{}{} + + switch lv := (localType).(type) { + case *Void: + + case *Fwd: + if targetType.(*Fwd).Kind != lv.Kind { + return fmt.Errorf("fwd kind mismatch between %v and %v: %w", localType, targetType, errIncompatibleTypes) + } + + case *Enum: + return coreEnumsMatch(lv, targetType.(*Enum)) + + case composite: + tv := targetType.(composite) + + if len(lv.members()) > len(tv.members()) { + return errIncompatibleTypes + } + + localMembers := lv.members() + targetMembers := map[string]Member{} + for _, member := range tv.members() { + targetMembers[member.Name] = member + } + + for _, localMember := range localMembers { + targetMember, found := targetMembers[localMember.Name] + if !found { + return fmt.Errorf("no field %q in %v: %w", localMember.Name, targetType, errIncompatibleTypes) + } + + err := coreTypesMatch(localMember.Type, targetMember.Type, visited) + if err != nil { + return err + } + } + + case *Int: + if !coreEncodingMatches(lv, targetType.(*Int)) { + return fmt.Errorf("int mismatch between %v and %v: %w", localType, targetType, errIncompatibleTypes) + } + + case *Pointer: + tv := targetType.(*Pointer) + + // Allow a pointer to a forward declaration to match a struct + // or union. + if fwd, ok := As[*Fwd](lv.Target); ok && fwd.matches(tv.Target) { + return nil + } + + if fwd, ok := As[*Fwd](tv.Target); ok && fwd.matches(lv.Target) { + return nil + } + + return coreTypesMatch(lv.Target, tv.Target, visited) + + case *Array: + tv := targetType.(*Array) + + if lv.Nelems != tv.Nelems { + return fmt.Errorf("array mismatch between %v and %v: %w", localType, targetType, errIncompatibleTypes) + } + + return coreTypesMatch(lv.Type, tv.Type, visited) + + case *FuncProto: + tv := targetType.(*FuncProto) + + if len(lv.Params) != len(tv.Params) { + return fmt.Errorf("function param mismatch: %w", errIncompatibleTypes) + } + + for i, lparam := range lv.Params { + if err := coreTypesMatch(lparam.Type, tv.Params[i].Type, visited); err != nil { + return err + } + } + + return coreTypesMatch(lv.Return, tv.Return, visited) + + default: + return fmt.Errorf("unsupported type %T", localType) + } + + return nil +} + +// coreEncodingMatches returns true if both ints have the same size and signedness. +// All encodings other than `Signed` are considered unsigned. +func coreEncodingMatches(local, target *Int) bool { + return local.Size == target.Size && (local.Encoding == Signed) == (target.Encoding == Signed) +} + +// coreEnumsMatch checks two enums match, which is considered to be the case if the following is true: +// - size has to match (but enum may match enum64 and vice versa) +// - local variants have to have a match in target by symbolic name (but not numeric value) +func coreEnumsMatch(local *Enum, target *Enum) error { + if local.Size != target.Size { + return fmt.Errorf("size mismatch between %v and %v: %w", local, target, errIncompatibleTypes) + } + + // If there are more values in the local than the target, there must be at least one value in the local + // that isn't in the target, and therefor the types are incompatible. + if len(local.Values) > len(target.Values) { + return fmt.Errorf("local has more values than target: %w", errIncompatibleTypes) + } + +outer: + for _, lv := range local.Values { + for _, rv := range target.Values { + if coreEssentialNamesMatch(lv.Name, rv.Name) { + continue outer + } + } + + return fmt.Errorf("no match for %v in %v: %w", lv, target, errIncompatibleTypes) + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/btf/doc.go b/vendor/github.com/cilium/ebpf/btf/doc.go new file mode 100644 index 0000000000..b1f4b1fc3e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/doc.go @@ -0,0 +1,5 @@ +// Package btf handles data encoded according to the BPF Type Format. +// +// The canonical documentation lives in the Linux kernel repository and is +// available at https://www.kernel.org/doc/html/latest/bpf/btf.html +package btf diff --git a/vendor/github.com/cilium/ebpf/btf/ext_info.go b/vendor/github.com/cilium/ebpf/btf/ext_info.go new file mode 100644 index 0000000000..71dbba8061 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/ext_info.go @@ -0,0 +1,832 @@ +package btf + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "sort" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" +) + +// ExtInfos contains ELF section metadata. +type ExtInfos struct { + // The slices are sorted by offset in ascending order. + funcInfos map[string]FuncOffsets + lineInfos map[string]LineOffsets + relocationInfos map[string]CORERelocationInfos +} + +// loadExtInfosFromELF parses ext infos from the .BTF.ext section in an ELF. +// +// Returns an error wrapping ErrNotFound if no ext infos are present. +func loadExtInfosFromELF(file *internal.SafeELFFile, spec *Spec) (*ExtInfos, error) { + section := file.Section(".BTF.ext") + if section == nil { + return nil, fmt.Errorf("btf ext infos: %w", ErrNotFound) + } + + if section.ReaderAt == nil { + return nil, fmt.Errorf("compressed ext_info is not supported") + } + + return loadExtInfos(section.ReaderAt, file.ByteOrder, spec) +} + +// loadExtInfos parses bare ext infos. +func loadExtInfos(r io.ReaderAt, bo binary.ByteOrder, spec *Spec) (*ExtInfos, error) { + // Open unbuffered section reader. binary.Read() calls io.ReadFull on + // the header structs, resulting in one syscall per header. + headerRd := io.NewSectionReader(r, 0, math.MaxInt64) + extHeader, err := parseBTFExtHeader(headerRd, bo) + if err != nil { + return nil, fmt.Errorf("parsing BTF extension header: %w", err) + } + + coreHeader, err := parseBTFExtCOREHeader(headerRd, bo, extHeader) + if err != nil { + return nil, fmt.Errorf("parsing BTF CO-RE header: %w", err) + } + + buf := internal.NewBufferedSectionReader(r, extHeader.funcInfoStart(), int64(extHeader.FuncInfoLen)) + btfFuncInfos, err := parseFuncInfos(buf, bo, spec.strings) + if err != nil { + return nil, fmt.Errorf("parsing BTF function info: %w", err) + } + + funcInfos := make(map[string]FuncOffsets, len(btfFuncInfos)) + for section, bfis := range btfFuncInfos { + funcInfos[section], err = newFuncOffsets(bfis, spec) + if err != nil { + return nil, fmt.Errorf("section %s: func infos: %w", section, err) + } + } + + buf = internal.NewBufferedSectionReader(r, extHeader.lineInfoStart(), int64(extHeader.LineInfoLen)) + btfLineInfos, err := parseLineInfos(buf, bo, spec.strings) + if err != nil { + return nil, fmt.Errorf("parsing BTF line info: %w", err) + } + + lineInfos := make(map[string]LineOffsets, len(btfLineInfos)) + for section, blis := range btfLineInfos { + lineInfos[section], err = newLineInfos(blis, spec.strings) + if err != nil { + return nil, fmt.Errorf("section %s: line infos: %w", section, err) + } + } + + if coreHeader == nil || coreHeader.COREReloLen == 0 { + return &ExtInfos{funcInfos, lineInfos, nil}, nil + } + + var btfCORERelos map[string][]bpfCORERelo + buf = internal.NewBufferedSectionReader(r, extHeader.coreReloStart(coreHeader), int64(coreHeader.COREReloLen)) + btfCORERelos, err = parseCORERelos(buf, bo, spec.strings) + if err != nil { + return nil, fmt.Errorf("parsing CO-RE relocation info: %w", err) + } + + coreRelos := make(map[string]CORERelocationInfos, len(btfCORERelos)) + for section, brs := range btfCORERelos { + coreRelos[section], err = newRelocationInfos(brs, spec, spec.strings) + if err != nil { + return nil, fmt.Errorf("section %s: CO-RE relocations: %w", section, err) + } + } + + return &ExtInfos{funcInfos, lineInfos, coreRelos}, nil +} + +type ( + funcInfoMeta struct{} + coreRelocationMeta struct{} +) + +// Assign per-section metadata from BTF to a section's instructions. +func (ei *ExtInfos) Assign(insns asm.Instructions, section string) { + funcInfos := ei.funcInfos[section] + lineInfos := ei.lineInfos[section] + reloInfos := ei.relocationInfos[section] + + AssignMetadataToInstructions(insns, funcInfos, lineInfos, reloInfos) +} + +// Assign per-instruction metadata to the instructions in insns. +func AssignMetadataToInstructions( + insns asm.Instructions, + funcInfos FuncOffsets, + lineInfos LineOffsets, + reloInfos CORERelocationInfos, +) { + iter := insns.Iterate() + for iter.Next() { + if len(funcInfos) > 0 && funcInfos[0].Offset == iter.Offset { + *iter.Ins = WithFuncMetadata(*iter.Ins, funcInfos[0].Func) + funcInfos = funcInfos[1:] + } + + if len(lineInfos) > 0 && lineInfos[0].Offset == iter.Offset { + *iter.Ins = iter.Ins.WithSource(lineInfos[0].Line) + lineInfos = lineInfos[1:] + } + + if len(reloInfos.infos) > 0 && reloInfos.infos[0].offset == iter.Offset { + iter.Ins.Metadata.Set(coreRelocationMeta{}, reloInfos.infos[0].relo) + reloInfos.infos = reloInfos.infos[1:] + } + } +} + +// MarshalExtInfos encodes function and line info embedded in insns into kernel +// wire format. +// +// If an instruction has an [asm.Comment], it will be synthesized into a mostly +// empty line info. +func MarshalExtInfos(insns asm.Instructions, b *Builder) (funcInfos, lineInfos []byte, _ error) { + iter := insns.Iterate() + for iter.Next() { + if iter.Ins.Source() != nil || FuncMetadata(iter.Ins) != nil { + goto marshal + } + } + + return nil, nil, nil + +marshal: + var fiBuf, liBuf bytes.Buffer + for { + if fn := FuncMetadata(iter.Ins); fn != nil { + fi := &FuncOffset{ + Func: fn, + Offset: iter.Offset, + } + if err := fi.marshal(&fiBuf, b); err != nil { + return nil, nil, fmt.Errorf("write func info: %w", err) + } + } + + if source := iter.Ins.Source(); source != nil { + var line *Line + if l, ok := source.(*Line); ok { + line = l + } else { + line = &Line{ + line: source.String(), + } + } + + li := &LineOffset{ + Offset: iter.Offset, + Line: line, + } + if err := li.marshal(&liBuf, b); err != nil { + return nil, nil, fmt.Errorf("write line info: %w", err) + } + } + + if !iter.Next() { + break + } + } + + return fiBuf.Bytes(), liBuf.Bytes(), nil +} + +// btfExtHeader is found at the start of the .BTF.ext section. +type btfExtHeader struct { + Magic uint16 + Version uint8 + Flags uint8 + + // HdrLen is larger than the size of struct btfExtHeader when it is + // immediately followed by a btfExtCOREHeader. + HdrLen uint32 + + FuncInfoOff uint32 + FuncInfoLen uint32 + LineInfoOff uint32 + LineInfoLen uint32 +} + +// parseBTFExtHeader parses the header of the .BTF.ext section. +func parseBTFExtHeader(r io.Reader, bo binary.ByteOrder) (*btfExtHeader, error) { + var header btfExtHeader + if err := binary.Read(r, bo, &header); err != nil { + return nil, fmt.Errorf("can't read header: %v", err) + } + + if header.Magic != btfMagic { + return nil, fmt.Errorf("incorrect magic value %v", header.Magic) + } + + if header.Version != 1 { + return nil, fmt.Errorf("unexpected version %v", header.Version) + } + + if header.Flags != 0 { + return nil, fmt.Errorf("unsupported flags %v", header.Flags) + } + + if int64(header.HdrLen) < int64(binary.Size(&header)) { + return nil, fmt.Errorf("header length shorter than btfExtHeader size") + } + + return &header, nil +} + +// funcInfoStart returns the offset from the beginning of the .BTF.ext section +// to the start of its func_info entries. +func (h *btfExtHeader) funcInfoStart() int64 { + return int64(h.HdrLen + h.FuncInfoOff) +} + +// lineInfoStart returns the offset from the beginning of the .BTF.ext section +// to the start of its line_info entries. +func (h *btfExtHeader) lineInfoStart() int64 { + return int64(h.HdrLen + h.LineInfoOff) +} + +// coreReloStart returns the offset from the beginning of the .BTF.ext section +// to the start of its CO-RE relocation entries. +func (h *btfExtHeader) coreReloStart(ch *btfExtCOREHeader) int64 { + return int64(h.HdrLen + ch.COREReloOff) +} + +// btfExtCOREHeader is found right after the btfExtHeader when its HdrLen +// field is larger than its size. +type btfExtCOREHeader struct { + COREReloOff uint32 + COREReloLen uint32 +} + +// parseBTFExtCOREHeader parses the tail of the .BTF.ext header. If additional +// header bytes are present, extHeader.HdrLen will be larger than the struct, +// indicating the presence of a CO-RE extension header. +func parseBTFExtCOREHeader(r io.Reader, bo binary.ByteOrder, extHeader *btfExtHeader) (*btfExtCOREHeader, error) { + extHdrSize := int64(binary.Size(&extHeader)) + remainder := int64(extHeader.HdrLen) - extHdrSize + + if remainder == 0 { + return nil, nil + } + + var coreHeader btfExtCOREHeader + if err := binary.Read(r, bo, &coreHeader); err != nil { + return nil, fmt.Errorf("can't read header: %v", err) + } + + return &coreHeader, nil +} + +type btfExtInfoSec struct { + SecNameOff uint32 + NumInfo uint32 +} + +// parseExtInfoSec parses a btf_ext_info_sec header within .BTF.ext, +// appearing within func_info and line_info sub-sections. +// These headers appear once for each program section in the ELF and are +// followed by one or more func/line_info records for the section. +func parseExtInfoSec(r io.Reader, bo binary.ByteOrder, strings *stringTable) (string, *btfExtInfoSec, error) { + var infoHeader btfExtInfoSec + if err := binary.Read(r, bo, &infoHeader); err != nil { + return "", nil, fmt.Errorf("read ext info header: %w", err) + } + + secName, err := strings.Lookup(infoHeader.SecNameOff) + if err != nil { + return "", nil, fmt.Errorf("get section name: %w", err) + } + if secName == "" { + return "", nil, fmt.Errorf("extinfo header refers to empty section name") + } + + if infoHeader.NumInfo == 0 { + return "", nil, fmt.Errorf("section %s has zero records", secName) + } + + return secName, &infoHeader, nil +} + +// parseExtInfoRecordSize parses the uint32 at the beginning of a func_infos +// or line_infos segment that describes the length of all extInfoRecords in +// that segment. +func parseExtInfoRecordSize(r io.Reader, bo binary.ByteOrder) (uint32, error) { + const maxRecordSize = 256 + + var recordSize uint32 + if err := binary.Read(r, bo, &recordSize); err != nil { + return 0, fmt.Errorf("can't read record size: %v", err) + } + + if recordSize < 4 { + // Need at least InsnOff worth of bytes per record. + return 0, errors.New("record size too short") + } + if recordSize > maxRecordSize { + return 0, fmt.Errorf("record size %v exceeds %v", recordSize, maxRecordSize) + } + + return recordSize, nil +} + +// FuncOffsets is a sorted slice of FuncOffset. +type FuncOffsets []FuncOffset + +// The size of a FuncInfo in BTF wire format. +var FuncInfoSize = uint32(binary.Size(bpfFuncInfo{})) + +// FuncOffset represents a [btf.Func] and its raw instruction offset within a +// BPF program. +type FuncOffset struct { + Offset asm.RawInstructionOffset + Func *Func +} + +type bpfFuncInfo struct { + // Instruction offset of the function within an ELF section. + InsnOff uint32 + TypeID TypeID +} + +func newFuncOffset(fi bpfFuncInfo, spec *Spec) (*FuncOffset, error) { + typ, err := spec.TypeByID(fi.TypeID) + if err != nil { + return nil, err + } + + fn, ok := typ.(*Func) + if !ok { + return nil, fmt.Errorf("type ID %d is a %T, but expected a Func", fi.TypeID, typ) + } + + // C doesn't have anonymous functions, but check just in case. + if fn.Name == "" { + return nil, fmt.Errorf("func with type ID %d doesn't have a name", fi.TypeID) + } + + return &FuncOffset{ + asm.RawInstructionOffset(fi.InsnOff), + fn, + }, nil +} + +func newFuncOffsets(bfis []bpfFuncInfo, spec *Spec) (FuncOffsets, error) { + fos := make(FuncOffsets, 0, len(bfis)) + + for _, bfi := range bfis { + fi, err := newFuncOffset(bfi, spec) + if err != nil { + return FuncOffsets{}, fmt.Errorf("offset %d: %w", bfi.InsnOff, err) + } + fos = append(fos, *fi) + } + sort.Slice(fos, func(i, j int) bool { + return fos[i].Offset <= fos[j].Offset + }) + return fos, nil +} + +// LoadFuncInfos parses BTF func info from kernel wire format into a +// [FuncOffsets], a sorted slice of [btf.Func]s of (sub)programs within a BPF +// program with their corresponding raw instruction offsets. +func LoadFuncInfos(reader io.Reader, bo binary.ByteOrder, recordNum uint32, spec *Spec) (FuncOffsets, error) { + fis, err := parseFuncInfoRecords( + reader, + bo, + FuncInfoSize, + recordNum, + false, + ) + if err != nil { + return FuncOffsets{}, fmt.Errorf("parsing BTF func info: %w", err) + } + + return newFuncOffsets(fis, spec) +} + +// marshal into the BTF wire format. +func (fi *FuncOffset) marshal(w *bytes.Buffer, b *Builder) error { + id, err := b.Add(fi.Func) + if err != nil { + return err + } + bfi := bpfFuncInfo{ + InsnOff: uint32(fi.Offset), + TypeID: id, + } + buf := make([]byte, FuncInfoSize) + internal.NativeEndian.PutUint32(buf, bfi.InsnOff) + internal.NativeEndian.PutUint32(buf[4:], uint32(bfi.TypeID)) + _, err = w.Write(buf) + return err +} + +// parseFuncInfos parses a func_info sub-section within .BTF.ext ito a map of +// func infos indexed by section name. +func parseFuncInfos(r io.Reader, bo binary.ByteOrder, strings *stringTable) (map[string][]bpfFuncInfo, error) { + recordSize, err := parseExtInfoRecordSize(r, bo) + if err != nil { + return nil, err + } + + result := make(map[string][]bpfFuncInfo) + for { + secName, infoHeader, err := parseExtInfoSec(r, bo, strings) + if errors.Is(err, io.EOF) { + return result, nil + } + if err != nil { + return nil, err + } + + records, err := parseFuncInfoRecords(r, bo, recordSize, infoHeader.NumInfo, true) + if err != nil { + return nil, fmt.Errorf("section %v: %w", secName, err) + } + + result[secName] = records + } +} + +// parseFuncInfoRecords parses a stream of func_infos into a funcInfos. +// These records appear after a btf_ext_info_sec header in the func_info +// sub-section of .BTF.ext. +func parseFuncInfoRecords(r io.Reader, bo binary.ByteOrder, recordSize uint32, recordNum uint32, offsetInBytes bool) ([]bpfFuncInfo, error) { + var out []bpfFuncInfo + var fi bpfFuncInfo + + if exp, got := FuncInfoSize, recordSize; exp != got { + // BTF blob's record size is longer than we know how to parse. + return nil, fmt.Errorf("expected FuncInfo record size %d, but BTF blob contains %d", exp, got) + } + + for i := uint32(0); i < recordNum; i++ { + if err := binary.Read(r, bo, &fi); err != nil { + return nil, fmt.Errorf("can't read function info: %v", err) + } + + if offsetInBytes { + if fi.InsnOff%asm.InstructionSize != 0 { + return nil, fmt.Errorf("offset %v is not aligned with instruction size", fi.InsnOff) + } + + // ELF tracks offset in bytes, the kernel expects raw BPF instructions. + // Convert as early as possible. + fi.InsnOff /= asm.InstructionSize + } + + out = append(out, fi) + } + + return out, nil +} + +var LineInfoSize = uint32(binary.Size(bpfLineInfo{})) + +// Line represents the location and contents of a single line of source +// code a BPF ELF was compiled from. +type Line struct { + fileName string + line string + lineNumber uint32 + lineColumn uint32 +} + +func (li *Line) FileName() string { + return li.fileName +} + +func (li *Line) Line() string { + return li.line +} + +func (li *Line) LineNumber() uint32 { + return li.lineNumber +} + +func (li *Line) LineColumn() uint32 { + return li.lineColumn +} + +func (li *Line) String() string { + return li.line +} + +// LineOffsets contains a sorted list of line infos. +type LineOffsets []LineOffset + +// LineOffset represents a line info and its raw instruction offset. +type LineOffset struct { + Offset asm.RawInstructionOffset + Line *Line +} + +// Constants for the format of bpfLineInfo.LineCol. +const ( + bpfLineShift = 10 + bpfLineMax = (1 << (32 - bpfLineShift)) - 1 + bpfColumnMax = (1 << bpfLineShift) - 1 +) + +type bpfLineInfo struct { + // Instruction offset of the line within the whole instruction stream, in instructions. + InsnOff uint32 + FileNameOff uint32 + LineOff uint32 + LineCol uint32 +} + +// LoadLineInfos parses BTF line info in kernel wire format. +func LoadLineInfos(reader io.Reader, bo binary.ByteOrder, recordNum uint32, spec *Spec) (LineOffsets, error) { + lis, err := parseLineInfoRecords( + reader, + bo, + LineInfoSize, + recordNum, + false, + ) + if err != nil { + return LineOffsets{}, fmt.Errorf("parsing BTF line info: %w", err) + } + + return newLineInfos(lis, spec.strings) +} + +func newLineInfo(li bpfLineInfo, strings *stringTable) (LineOffset, error) { + line, err := strings.Lookup(li.LineOff) + if err != nil { + return LineOffset{}, fmt.Errorf("lookup of line: %w", err) + } + + fileName, err := strings.Lookup(li.FileNameOff) + if err != nil { + return LineOffset{}, fmt.Errorf("lookup of filename: %w", err) + } + + lineNumber := li.LineCol >> bpfLineShift + lineColumn := li.LineCol & bpfColumnMax + + return LineOffset{ + asm.RawInstructionOffset(li.InsnOff), + &Line{ + fileName, + line, + lineNumber, + lineColumn, + }, + }, nil +} + +func newLineInfos(blis []bpfLineInfo, strings *stringTable) (LineOffsets, error) { + lis := make([]LineOffset, 0, len(blis)) + for _, bli := range blis { + li, err := newLineInfo(bli, strings) + if err != nil { + return LineOffsets{}, fmt.Errorf("offset %d: %w", bli.InsnOff, err) + } + lis = append(lis, li) + } + sort.Slice(lis, func(i, j int) bool { + return lis[i].Offset <= lis[j].Offset + }) + return lis, nil +} + +// marshal writes the binary representation of the LineInfo to w. +func (li *LineOffset) marshal(w *bytes.Buffer, b *Builder) error { + line := li.Line + if line.lineNumber > bpfLineMax { + return fmt.Errorf("line %d exceeds %d", line.lineNumber, bpfLineMax) + } + + if line.lineColumn > bpfColumnMax { + return fmt.Errorf("column %d exceeds %d", line.lineColumn, bpfColumnMax) + } + + fileNameOff, err := b.addString(line.fileName) + if err != nil { + return fmt.Errorf("file name %q: %w", line.fileName, err) + } + + lineOff, err := b.addString(line.line) + if err != nil { + return fmt.Errorf("line %q: %w", line.line, err) + } + + bli := bpfLineInfo{ + uint32(li.Offset), + fileNameOff, + lineOff, + (line.lineNumber << bpfLineShift) | line.lineColumn, + } + + buf := make([]byte, LineInfoSize) + internal.NativeEndian.PutUint32(buf, bli.InsnOff) + internal.NativeEndian.PutUint32(buf[4:], bli.FileNameOff) + internal.NativeEndian.PutUint32(buf[8:], bli.LineOff) + internal.NativeEndian.PutUint32(buf[12:], bli.LineCol) + _, err = w.Write(buf) + return err +} + +// parseLineInfos parses a line_info sub-section within .BTF.ext ito a map of +// line infos indexed by section name. +func parseLineInfos(r io.Reader, bo binary.ByteOrder, strings *stringTable) (map[string][]bpfLineInfo, error) { + recordSize, err := parseExtInfoRecordSize(r, bo) + if err != nil { + return nil, err + } + + result := make(map[string][]bpfLineInfo) + for { + secName, infoHeader, err := parseExtInfoSec(r, bo, strings) + if errors.Is(err, io.EOF) { + return result, nil + } + if err != nil { + return nil, err + } + + records, err := parseLineInfoRecords(r, bo, recordSize, infoHeader.NumInfo, true) + if err != nil { + return nil, fmt.Errorf("section %v: %w", secName, err) + } + + result[secName] = records + } +} + +// parseLineInfoRecords parses a stream of line_infos into a lineInfos. +// These records appear after a btf_ext_info_sec header in the line_info +// sub-section of .BTF.ext. +func parseLineInfoRecords(r io.Reader, bo binary.ByteOrder, recordSize uint32, recordNum uint32, offsetInBytes bool) ([]bpfLineInfo, error) { + if exp, got := uint32(binary.Size(bpfLineInfo{})), recordSize; exp != got { + // BTF blob's record size is longer than we know how to parse. + return nil, fmt.Errorf("expected LineInfo record size %d, but BTF blob contains %d", exp, got) + } + + out := make([]bpfLineInfo, recordNum) + if err := binary.Read(r, bo, out); err != nil { + return nil, fmt.Errorf("can't read line info: %v", err) + } + + if offsetInBytes { + for i := range out { + li := &out[i] + if li.InsnOff%asm.InstructionSize != 0 { + return nil, fmt.Errorf("offset %v is not aligned with instruction size", li.InsnOff) + } + + // ELF tracks offset in bytes, the kernel expects raw BPF instructions. + // Convert as early as possible. + li.InsnOff /= asm.InstructionSize + } + } + + return out, nil +} + +// bpfCORERelo matches the kernel's struct bpf_core_relo. +type bpfCORERelo struct { + InsnOff uint32 + TypeID TypeID + AccessStrOff uint32 + Kind coreKind +} + +type CORERelocation struct { + // The local type of the relocation, stripped of typedefs and qualifiers. + typ Type + accessor coreAccessor + kind coreKind + // The ID of the local type in the source BTF. + id TypeID +} + +func (cr *CORERelocation) String() string { + return fmt.Sprintf("CORERelocation(%s, %s[%s], local_id=%d)", cr.kind, cr.typ, cr.accessor, cr.id) +} + +func CORERelocationMetadata(ins *asm.Instruction) *CORERelocation { + relo, _ := ins.Metadata.Get(coreRelocationMeta{}).(*CORERelocation) + return relo +} + +// CORERelocationInfos contains a sorted list of co:re relocation infos. +type CORERelocationInfos struct { + infos []coreRelocationInfo +} + +type coreRelocationInfo struct { + relo *CORERelocation + offset asm.RawInstructionOffset +} + +func newRelocationInfo(relo bpfCORERelo, spec *Spec, strings *stringTable) (*coreRelocationInfo, error) { + typ, err := spec.TypeByID(relo.TypeID) + if err != nil { + return nil, err + } + + accessorStr, err := strings.Lookup(relo.AccessStrOff) + if err != nil { + return nil, err + } + + accessor, err := parseCOREAccessor(accessorStr) + if err != nil { + return nil, fmt.Errorf("accessor %q: %s", accessorStr, err) + } + + return &coreRelocationInfo{ + &CORERelocation{ + typ, + accessor, + relo.Kind, + relo.TypeID, + }, + asm.RawInstructionOffset(relo.InsnOff), + }, nil +} + +func newRelocationInfos(brs []bpfCORERelo, spec *Spec, strings *stringTable) (CORERelocationInfos, error) { + rs := CORERelocationInfos{ + infos: make([]coreRelocationInfo, 0, len(brs)), + } + for _, br := range brs { + relo, err := newRelocationInfo(br, spec, strings) + if err != nil { + return CORERelocationInfos{}, fmt.Errorf("offset %d: %w", br.InsnOff, err) + } + rs.infos = append(rs.infos, *relo) + } + sort.Slice(rs.infos, func(i, j int) bool { + return rs.infos[i].offset < rs.infos[j].offset + }) + return rs, nil +} + +var extInfoReloSize = binary.Size(bpfCORERelo{}) + +// parseCORERelos parses a core_relos sub-section within .BTF.ext ito a map of +// CO-RE relocations indexed by section name. +func parseCORERelos(r io.Reader, bo binary.ByteOrder, strings *stringTable) (map[string][]bpfCORERelo, error) { + recordSize, err := parseExtInfoRecordSize(r, bo) + if err != nil { + return nil, err + } + + if recordSize != uint32(extInfoReloSize) { + return nil, fmt.Errorf("expected record size %d, got %d", extInfoReloSize, recordSize) + } + + result := make(map[string][]bpfCORERelo) + for { + secName, infoHeader, err := parseExtInfoSec(r, bo, strings) + if errors.Is(err, io.EOF) { + return result, nil + } + if err != nil { + return nil, err + } + + records, err := parseCOREReloRecords(r, bo, infoHeader.NumInfo) + if err != nil { + return nil, fmt.Errorf("section %v: %w", secName, err) + } + + result[secName] = records + } +} + +// parseCOREReloRecords parses a stream of CO-RE relocation entries into a +// coreRelos. These records appear after a btf_ext_info_sec header in the +// core_relos sub-section of .BTF.ext. +func parseCOREReloRecords(r io.Reader, bo binary.ByteOrder, recordNum uint32) ([]bpfCORERelo, error) { + var out []bpfCORERelo + + var relo bpfCORERelo + for i := uint32(0); i < recordNum; i++ { + if err := binary.Read(r, bo, &relo); err != nil { + return nil, fmt.Errorf("can't read CO-RE relocation: %v", err) + } + + if relo.InsnOff%asm.InstructionSize != 0 { + return nil, fmt.Errorf("offset %v is not aligned with instruction size", relo.InsnOff) + } + + // ELF tracks offset in bytes, the kernel expects raw BPF instructions. + // Convert as early as possible. + relo.InsnOff /= asm.InstructionSize + + out = append(out, relo) + } + + return out, nil +} diff --git a/vendor/github.com/cilium/ebpf/btf/feature.go b/vendor/github.com/cilium/ebpf/btf/feature.go new file mode 100644 index 0000000000..e71c707fe4 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/feature.go @@ -0,0 +1,158 @@ +package btf + +import ( + "errors" + "math" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +// haveBTF attempts to load a BTF blob containing an Int. It should pass on any +// kernel that supports BPF_BTF_LOAD. +var haveBTF = internal.NewFeatureTest("BTF", func() error { + // 0-length anonymous integer + err := probeBTF(&Int{}) + if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) { + return internal.ErrNotSupported + } + return err +}, "4.18") + +// haveMapBTF attempts to load a minimal BTF blob containing a Var. It is +// used as a proxy for .bss, .data and .rodata map support, which generally +// come with a Var and Datasec. These were introduced in Linux 5.2. +var haveMapBTF = internal.NewFeatureTest("Map BTF (Var/Datasec)", func() error { + if err := haveBTF(); err != nil { + return err + } + + v := &Var{ + Name: "a", + Type: &Pointer{(*Void)(nil)}, + } + + err := probeBTF(v) + if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) { + // Treat both EINVAL and EPERM as not supported: creating the map may still + // succeed without Btf* attrs. + return internal.ErrNotSupported + } + return err +}, "5.2") + +// haveProgBTF attempts to load a BTF blob containing a Func and FuncProto. It +// is used as a proxy for ext_info (func_info) support, which depends on +// Func(Proto) by definition. +var haveProgBTF = internal.NewFeatureTest("Program BTF (func/line_info)", func() error { + if err := haveBTF(); err != nil { + return err + } + + fn := &Func{ + Name: "a", + Type: &FuncProto{Return: (*Void)(nil)}, + } + + err := probeBTF(fn) + if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) { + return internal.ErrNotSupported + } + return err +}, "5.0") + +var haveFuncLinkage = internal.NewFeatureTest("BTF func linkage", func() error { + if err := haveProgBTF(); err != nil { + return err + } + + fn := &Func{ + Name: "a", + Type: &FuncProto{Return: (*Void)(nil)}, + Linkage: GlobalFunc, + } + + err := probeBTF(fn) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + return err +}, "5.6") + +var haveDeclTags = internal.NewFeatureTest("BTF decl tags", func() error { + if err := haveBTF(); err != nil { + return err + } + + t := &Typedef{ + Name: "a", + Type: &Int{}, + Tags: []string{"a"}, + } + + err := probeBTF(t) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + return err +}, "5.16") + +var haveTypeTags = internal.NewFeatureTest("BTF type tags", func() error { + if err := haveBTF(); err != nil { + return err + } + + t := &TypeTag{ + Type: &Int{}, + Value: "a", + } + + err := probeBTF(t) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + return err +}, "5.17") + +var haveEnum64 = internal.NewFeatureTest("ENUM64", func() error { + if err := haveBTF(); err != nil { + return err + } + + enum := &Enum{ + Size: 8, + Values: []EnumValue{ + {"TEST", math.MaxUint32 + 1}, + }, + } + + err := probeBTF(enum) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + return err +}, "6.0") + +func probeBTF(typ Type) error { + b, err := NewBuilder([]Type{typ}) + if err != nil { + return err + } + + buf, err := b.Marshal(nil, nil) + if err != nil { + return err + } + + fd, err := sys.BtfLoad(&sys.BtfLoadAttr{ + Btf: sys.NewSlicePointer(buf), + BtfSize: uint32(len(buf)), + }) + + if err == nil { + fd.Close() + } + + return err +} diff --git a/vendor/github.com/cilium/ebpf/btf/format.go b/vendor/github.com/cilium/ebpf/btf/format.go new file mode 100644 index 0000000000..3e0dedaa2b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/format.go @@ -0,0 +1,353 @@ +package btf + +import ( + "errors" + "fmt" + "strings" +) + +var errNestedTooDeep = errors.New("nested too deep") + +// GoFormatter converts a Type to Go syntax. +// +// A zero GoFormatter is valid to use. +type GoFormatter struct { + w strings.Builder + + // Types present in this map are referred to using the given name if they + // are encountered when outputting another type. + Names map[Type]string + + // Identifier is called for each field of struct-like types. By default the + // field name is used as is. + Identifier func(string) string + + // EnumIdentifier is called for each element of an enum. By default the + // name of the enum type is concatenated with Identifier(element). + EnumIdentifier func(name, element string) string +} + +// TypeDeclaration generates a Go type declaration for a BTF type. +func (gf *GoFormatter) TypeDeclaration(name string, typ Type) (string, error) { + gf.w.Reset() + if err := gf.writeTypeDecl(name, typ); err != nil { + return "", err + } + return gf.w.String(), nil +} + +func (gf *GoFormatter) identifier(s string) string { + if gf.Identifier != nil { + return gf.Identifier(s) + } + + return s +} + +func (gf *GoFormatter) enumIdentifier(name, element string) string { + if gf.EnumIdentifier != nil { + return gf.EnumIdentifier(name, element) + } + + return name + gf.identifier(element) +} + +// writeTypeDecl outputs a declaration of the given type. +// +// It encodes https://golang.org/ref/spec#Type_declarations: +// +// type foo struct { bar uint32; } +// type bar int32 +func (gf *GoFormatter) writeTypeDecl(name string, typ Type) error { + if name == "" { + return fmt.Errorf("need a name for type %s", typ) + } + + typ = skipQualifiers(typ) + fmt.Fprintf(&gf.w, "type %s ", name) + if err := gf.writeTypeLit(typ, 0); err != nil { + return err + } + + e, ok := typ.(*Enum) + if !ok || len(e.Values) == 0 { + return nil + } + + gf.w.WriteString("; const ( ") + for _, ev := range e.Values { + id := gf.enumIdentifier(name, ev.Name) + var value any + if e.Signed { + value = int64(ev.Value) + } else { + value = ev.Value + } + fmt.Fprintf(&gf.w, "%s %s = %d; ", id, name, value) + } + gf.w.WriteString(")") + + return nil +} + +// writeType outputs the name of a named type or a literal describing the type. +// +// It encodes https://golang.org/ref/spec#Types. +// +// foo (if foo is a named type) +// uint32 +func (gf *GoFormatter) writeType(typ Type, depth int) error { + typ = skipQualifiers(typ) + + name := gf.Names[typ] + if name != "" { + gf.w.WriteString(name) + return nil + } + + return gf.writeTypeLit(typ, depth) +} + +// writeTypeLit outputs a literal describing the type. +// +// The function ignores named types. +// +// It encodes https://golang.org/ref/spec#TypeLit. +// +// struct { bar uint32; } +// uint32 +func (gf *GoFormatter) writeTypeLit(typ Type, depth int) error { + depth++ + if depth > maxResolveDepth { + return errNestedTooDeep + } + + var err error + switch v := skipQualifiers(typ).(type) { + case *Int: + err = gf.writeIntLit(v) + + case *Enum: + if !v.Signed { + gf.w.WriteRune('u') + } + switch v.Size { + case 1: + gf.w.WriteString("int8") + case 2: + gf.w.WriteString("int16") + case 4: + gf.w.WriteString("int32") + case 8: + gf.w.WriteString("int64") + default: + err = fmt.Errorf("invalid enum size %d", v.Size) + } + + case *Typedef: + err = gf.writeType(v.Type, depth) + + case *Array: + fmt.Fprintf(&gf.w, "[%d]", v.Nelems) + err = gf.writeType(v.Type, depth) + + case *Struct: + err = gf.writeStructLit(v.Size, v.Members, depth) + + case *Union: + // Always choose the first member to represent the union in Go. + err = gf.writeStructLit(v.Size, v.Members[:1], depth) + + case *Datasec: + err = gf.writeDatasecLit(v, depth) + + case *Var: + err = gf.writeTypeLit(v.Type, depth) + + default: + return fmt.Errorf("type %T: %w", v, ErrNotSupported) + } + + if err != nil { + return fmt.Errorf("%s: %w", typ, err) + } + + return nil +} + +func (gf *GoFormatter) writeIntLit(i *Int) error { + bits := i.Size * 8 + switch i.Encoding { + case Bool: + if i.Size != 1 { + return fmt.Errorf("bool with size %d", i.Size) + } + gf.w.WriteString("bool") + case Char: + if i.Size != 1 { + return fmt.Errorf("char with size %d", i.Size) + } + // BTF doesn't have a way to specify the signedness of a char. Assume + // we are dealing with unsigned, since this works nicely with []byte + // in Go code. + fallthrough + case Unsigned, Signed: + stem := "uint" + if i.Encoding == Signed { + stem = "int" + } + if i.Size > 8 { + fmt.Fprintf(&gf.w, "[%d]byte /* %s%d */", i.Size, stem, i.Size*8) + } else { + fmt.Fprintf(&gf.w, "%s%d", stem, bits) + } + default: + return fmt.Errorf("can't encode %s", i.Encoding) + } + return nil +} + +func (gf *GoFormatter) writeStructLit(size uint32, members []Member, depth int) error { + gf.w.WriteString("struct { ") + + prevOffset := uint32(0) + skippedBitfield := false + for i, m := range members { + if m.BitfieldSize > 0 { + skippedBitfield = true + continue + } + + offset := m.Offset.Bytes() + if n := offset - prevOffset; skippedBitfield && n > 0 { + fmt.Fprintf(&gf.w, "_ [%d]byte /* unsupported bitfield */; ", n) + } else { + gf.writePadding(n) + } + + fieldSize, err := Sizeof(m.Type) + if err != nil { + return fmt.Errorf("field %d: %w", i, err) + } + + prevOffset = offset + uint32(fieldSize) + if prevOffset > size { + return fmt.Errorf("field %d of size %d exceeds type size %d", i, fieldSize, size) + } + + if err := gf.writeStructField(m, depth); err != nil { + return fmt.Errorf("field %d: %w", i, err) + } + } + + gf.writePadding(size - prevOffset) + gf.w.WriteString("}") + return nil +} + +func (gf *GoFormatter) writeStructField(m Member, depth int) error { + if m.BitfieldSize > 0 { + return fmt.Errorf("bitfields are not supported") + } + if m.Offset%8 != 0 { + return fmt.Errorf("unsupported offset %d", m.Offset) + } + + if m.Name == "" { + // Special case a nested anonymous union like + // struct foo { union { int bar; int baz }; } + // by replacing the whole union with its first member. + union, ok := m.Type.(*Union) + if !ok { + return fmt.Errorf("anonymous fields are not supported") + + } + + if len(union.Members) == 0 { + return errors.New("empty anonymous union") + } + + depth++ + if depth > maxResolveDepth { + return errNestedTooDeep + } + + m := union.Members[0] + size, err := Sizeof(m.Type) + if err != nil { + return err + } + + if err := gf.writeStructField(m, depth); err != nil { + return err + } + + gf.writePadding(union.Size - uint32(size)) + return nil + + } + + fmt.Fprintf(&gf.w, "%s ", gf.identifier(m.Name)) + + if err := gf.writeType(m.Type, depth); err != nil { + return err + } + + gf.w.WriteString("; ") + return nil +} + +func (gf *GoFormatter) writeDatasecLit(ds *Datasec, depth int) error { + gf.w.WriteString("struct { ") + + prevOffset := uint32(0) + for i, vsi := range ds.Vars { + v, ok := vsi.Type.(*Var) + if !ok { + return fmt.Errorf("can't format %s as part of data section", vsi.Type) + } + + if v.Linkage != GlobalVar { + // Ignore static, extern, etc. for now. + continue + } + + if v.Name == "" { + return fmt.Errorf("variable %d: empty name", i) + } + + gf.writePadding(vsi.Offset - prevOffset) + prevOffset = vsi.Offset + vsi.Size + + fmt.Fprintf(&gf.w, "%s ", gf.identifier(v.Name)) + + if err := gf.writeType(v.Type, depth); err != nil { + return fmt.Errorf("variable %d: %w", i, err) + } + + gf.w.WriteString("; ") + } + + gf.writePadding(ds.Size - prevOffset) + gf.w.WriteString("}") + return nil +} + +func (gf *GoFormatter) writePadding(bytes uint32) { + if bytes > 0 { + fmt.Fprintf(&gf.w, "_ [%d]byte; ", bytes) + } +} + +func skipQualifiers(typ Type) Type { + result := typ + for depth := 0; depth <= maxResolveDepth; depth++ { + switch v := (result).(type) { + case qualifier: + result = v.qualify() + default: + return result + } + } + return &cycle{typ} +} diff --git a/vendor/github.com/cilium/ebpf/btf/handle.go b/vendor/github.com/cilium/ebpf/btf/handle.go new file mode 100644 index 0000000000..adfa6fed4b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/handle.go @@ -0,0 +1,317 @@ +package btf + +import ( + "bytes" + "errors" + "fmt" + "math" + "os" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +// Handle is a reference to BTF loaded into the kernel. +type Handle struct { + fd *sys.FD + + // Size of the raw BTF in bytes. + size uint32 + + needsKernelBase bool +} + +// NewHandle loads the contents of a [Builder] into the kernel. +// +// Returns an error wrapping ErrNotSupported if the kernel doesn't support BTF. +func NewHandle(b *Builder) (*Handle, error) { + small := getByteSlice() + defer putByteSlice(small) + + buf, err := b.Marshal(*small, KernelMarshalOptions()) + if err != nil { + return nil, fmt.Errorf("marshal BTF: %w", err) + } + + return NewHandleFromRawBTF(buf) +} + +// NewHandleFromRawBTF loads raw BTF into the kernel. +// +// Returns an error wrapping ErrNotSupported if the kernel doesn't support BTF. +func NewHandleFromRawBTF(btf []byte) (*Handle, error) { + const minLogSize = 64 * 1024 + + if uint64(len(btf)) > math.MaxUint32 { + return nil, errors.New("BTF exceeds the maximum size") + } + + attr := &sys.BtfLoadAttr{ + Btf: sys.NewSlicePointer(btf), + BtfSize: uint32(len(btf)), + } + + var ( + logBuf []byte + err error + ) + for { + var fd *sys.FD + fd, err = sys.BtfLoad(attr) + if err == nil { + return &Handle{fd, attr.BtfSize, false}, nil + } + + if attr.BtfLogTrueSize != 0 && attr.BtfLogSize >= attr.BtfLogTrueSize { + // The log buffer already has the correct size. + break + } + + if attr.BtfLogSize != 0 && !errors.Is(err, unix.ENOSPC) { + // Up until at least kernel 6.0, the BTF verifier does not return ENOSPC + // if there are other verification errors. ENOSPC is only returned when + // the BTF blob is correct, a log was requested, and the provided buffer + // is too small. We're therefore not sure whether we got the full + // log or not. + break + } + + // Make an educated guess how large the buffer should be. Start + // at a reasonable minimum and then double the size. + logSize := uint32(max(len(logBuf)*2, minLogSize)) + if int(logSize) < len(logBuf) { + return nil, errors.New("overflow while probing log buffer size") + } + + if attr.BtfLogTrueSize != 0 { + // The kernel has given us a hint how large the log buffer has to be. + logSize = attr.BtfLogTrueSize + } + + logBuf = make([]byte, logSize) + attr.BtfLogSize = logSize + attr.BtfLogBuf = sys.NewSlicePointer(logBuf) + attr.BtfLogLevel = 1 + } + + if err := haveBTF(); err != nil { + return nil, err + } + + return nil, internal.ErrorWithLog("load btf", err, logBuf) +} + +// NewHandleFromID returns the BTF handle for a given id. +// +// Prefer calling [ebpf.Program.Handle] or [ebpf.Map.Handle] if possible. +// +// Returns ErrNotExist, if there is no BTF with the given id. +// +// Requires CAP_SYS_ADMIN. +func NewHandleFromID(id ID) (*Handle, error) { + fd, err := sys.BtfGetFdById(&sys.BtfGetFdByIdAttr{ + Id: uint32(id), + }) + if err != nil { + return nil, fmt.Errorf("get FD for ID %d: %w", id, err) + } + + info, err := newHandleInfoFromFD(fd) + if err != nil { + _ = fd.Close() + return nil, err + } + + return &Handle{fd, info.size, info.IsModule()}, nil +} + +// Spec parses the kernel BTF into Go types. +// +// base must contain type information for vmlinux if the handle is for +// a kernel module. It may be nil otherwise. +func (h *Handle) Spec(base *Spec) (*Spec, error) { + var btfInfo sys.BtfInfo + btfBuffer := make([]byte, h.size) + btfInfo.Btf, btfInfo.BtfSize = sys.NewSlicePointerLen(btfBuffer) + + if err := sys.ObjInfo(h.fd, &btfInfo); err != nil { + return nil, err + } + + if h.needsKernelBase && base == nil { + return nil, fmt.Errorf("missing base types") + } + + return loadRawSpec(bytes.NewReader(btfBuffer), internal.NativeEndian, base) +} + +// Close destroys the handle. +// +// Subsequent calls to FD will return an invalid value. +func (h *Handle) Close() error { + if h == nil { + return nil + } + + return h.fd.Close() +} + +// FD returns the file descriptor for the handle. +func (h *Handle) FD() int { + return h.fd.Int() +} + +// Info returns metadata about the handle. +func (h *Handle) Info() (*HandleInfo, error) { + return newHandleInfoFromFD(h.fd) +} + +// HandleInfo describes a Handle. +type HandleInfo struct { + // ID of this handle in the kernel. The ID is only valid as long as the + // associated handle is kept alive. + ID ID + + // Name is an identifying name for the BTF, currently only used by the + // kernel. + Name string + + // IsKernel is true if the BTF originated with the kernel and not + // userspace. + IsKernel bool + + // Size of the raw BTF in bytes. + size uint32 +} + +func newHandleInfoFromFD(fd *sys.FD) (*HandleInfo, error) { + // We invoke the syscall once with a empty BTF and name buffers to get size + // information to allocate buffers. Then we invoke it a second time with + // buffers to receive the data. + var btfInfo sys.BtfInfo + if err := sys.ObjInfo(fd, &btfInfo); err != nil { + return nil, fmt.Errorf("get BTF info for fd %s: %w", fd, err) + } + + if btfInfo.NameLen > 0 { + // NameLen doesn't account for the terminating NUL. + btfInfo.NameLen++ + } + + // Don't pull raw BTF by default, since it may be quite large. + btfSize := btfInfo.BtfSize + btfInfo.BtfSize = 0 + + nameBuffer := make([]byte, btfInfo.NameLen) + btfInfo.Name, btfInfo.NameLen = sys.NewSlicePointerLen(nameBuffer) + if err := sys.ObjInfo(fd, &btfInfo); err != nil { + return nil, err + } + + return &HandleInfo{ + ID: ID(btfInfo.Id), + Name: unix.ByteSliceToString(nameBuffer), + IsKernel: btfInfo.KernelBtf != 0, + size: btfSize, + }, nil +} + +// IsVmlinux returns true if the BTF is for the kernel itself. +func (i *HandleInfo) IsVmlinux() bool { + return i.IsKernel && i.Name == "vmlinux" +} + +// IsModule returns true if the BTF is for a kernel module. +func (i *HandleInfo) IsModule() bool { + return i.IsKernel && i.Name != "vmlinux" +} + +// HandleIterator allows enumerating BTF blobs loaded into the kernel. +type HandleIterator struct { + // The ID of the current handle. Only valid after a call to Next. + ID ID + // The current Handle. Only valid until a call to Next. + // See Take if you want to retain the handle. + Handle *Handle + err error +} + +// Next retrieves a handle for the next BTF object. +// +// Returns true if another BTF object was found. Call [HandleIterator.Err] after +// the function returns false. +func (it *HandleIterator) Next() bool { + id := it.ID + for { + attr := &sys.BtfGetNextIdAttr{Id: id} + err := sys.BtfGetNextId(attr) + if errors.Is(err, os.ErrNotExist) { + // There are no more BTF objects. + break + } else if err != nil { + it.err = fmt.Errorf("get next BTF ID: %w", err) + break + } + + id = attr.NextId + handle, err := NewHandleFromID(id) + if errors.Is(err, os.ErrNotExist) { + // Try again with the next ID. + continue + } else if err != nil { + it.err = fmt.Errorf("retrieve handle for ID %d: %w", id, err) + break + } + + it.Handle.Close() + it.ID, it.Handle = id, handle + return true + } + + // No more handles or we encountered an error. + it.Handle.Close() + it.Handle = nil + return false +} + +// Take the ownership of the current handle. +// +// It's the callers responsibility to close the handle. +func (it *HandleIterator) Take() *Handle { + handle := it.Handle + it.Handle = nil + return handle +} + +// Err returns an error if iteration failed for some reason. +func (it *HandleIterator) Err() error { + return it.err +} + +// FindHandle returns the first handle for which predicate returns true. +// +// Requires CAP_SYS_ADMIN. +// +// Returns an error wrapping ErrNotFound if predicate never returns true or if +// there is no BTF loaded into the kernel. +func FindHandle(predicate func(info *HandleInfo) bool) (*Handle, error) { + it := new(HandleIterator) + defer it.Handle.Close() + + for it.Next() { + info, err := it.Handle.Info() + if err != nil { + return nil, fmt.Errorf("info for ID %d: %w", it.ID, err) + } + + if predicate(info) { + return it.Take(), nil + } + } + if err := it.Err(); err != nil { + return nil, fmt.Errorf("iterate handles: %w", err) + } + + return nil, fmt.Errorf("find handle: %w", ErrNotFound) +} diff --git a/vendor/github.com/cilium/ebpf/btf/kernel.go b/vendor/github.com/cilium/ebpf/btf/kernel.go new file mode 100644 index 0000000000..1a9321f054 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/kernel.go @@ -0,0 +1,157 @@ +package btf + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "sync" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/linux" +) + +var kernelBTF = struct { + sync.RWMutex + kernel *Spec + modules map[string]*Spec +}{ + modules: make(map[string]*Spec), +} + +// FlushKernelSpec removes any cached kernel type information. +func FlushKernelSpec() { + kernelBTF.Lock() + defer kernelBTF.Unlock() + + kernelBTF.kernel = nil + kernelBTF.modules = make(map[string]*Spec) +} + +// LoadKernelSpec returns the current kernel's BTF information. +// +// Defaults to /sys/kernel/btf/vmlinux and falls back to scanning the file system +// for vmlinux ELFs. Returns an error wrapping ErrNotSupported if BTF is not enabled. +func LoadKernelSpec() (*Spec, error) { + kernelBTF.RLock() + spec := kernelBTF.kernel + kernelBTF.RUnlock() + + if spec == nil { + kernelBTF.Lock() + defer kernelBTF.Unlock() + + spec = kernelBTF.kernel + } + + if spec != nil { + return spec.Copy(), nil + } + + spec, _, err := loadKernelSpec() + if err != nil { + return nil, err + } + + kernelBTF.kernel = spec + return spec.Copy(), nil +} + +// LoadKernelModuleSpec returns the BTF information for the named kernel module. +// +// Defaults to /sys/kernel/btf/. +// Returns an error wrapping ErrNotSupported if BTF is not enabled. +// Returns an error wrapping fs.ErrNotExist if BTF for the specific module doesn't exist. +func LoadKernelModuleSpec(module string) (*Spec, error) { + kernelBTF.RLock() + spec := kernelBTF.modules[module] + kernelBTF.RUnlock() + + if spec != nil { + return spec.Copy(), nil + } + + base, err := LoadKernelSpec() + if err != nil { + return nil, fmt.Errorf("load kernel spec: %w", err) + } + + kernelBTF.Lock() + defer kernelBTF.Unlock() + + if spec = kernelBTF.modules[module]; spec != nil { + return spec.Copy(), nil + } + + spec, err = loadKernelModuleSpec(module, base) + if err != nil { + return nil, err + } + + kernelBTF.modules[module] = spec + return spec.Copy(), nil +} + +func loadKernelSpec() (_ *Spec, fallback bool, _ error) { + fh, err := os.Open("/sys/kernel/btf/vmlinux") + if err == nil { + defer fh.Close() + + spec, err := loadRawSpec(fh, internal.NativeEndian, nil) + return spec, false, err + } + + file, err := findVMLinux() + if err != nil { + return nil, false, err + } + defer file.Close() + + spec, err := LoadSpecFromReader(file) + return spec, true, err +} + +func loadKernelModuleSpec(module string, base *Spec) (*Spec, error) { + dir, file := filepath.Split(module) + if dir != "" || filepath.Ext(file) != "" { + return nil, fmt.Errorf("invalid module name %q", module) + } + + fh, err := os.Open(filepath.Join("/sys/kernel/btf", module)) + if err != nil { + return nil, err + } + defer fh.Close() + + return loadRawSpec(fh, internal.NativeEndian, base) +} + +// findVMLinux scans multiple well-known paths for vmlinux kernel images. +func findVMLinux() (*os.File, error) { + release, err := linux.KernelRelease() + if err != nil { + return nil, err + } + + // use same list of locations as libbpf + // https://github.com/libbpf/libbpf/blob/9a3a42608dbe3731256a5682a125ac1e23bced8f/src/btf.c#L3114-L3122 + locations := []string{ + "/boot/vmlinux-%s", + "/lib/modules/%s/vmlinux-%[1]s", + "/lib/modules/%s/build/vmlinux", + "/usr/lib/modules/%s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%s", + "/usr/lib/debug/boot/vmlinux-%s.debug", + "/usr/lib/debug/lib/modules/%s/vmlinux", + } + + for _, loc := range locations { + file, err := os.Open(fmt.Sprintf(loc, release)) + if errors.Is(err, os.ErrNotExist) { + continue + } + return file, err + } + + return nil, fmt.Errorf("no BTF found for kernel version %s: %w", release, internal.ErrNotSupported) +} diff --git a/vendor/github.com/cilium/ebpf/btf/marshal.go b/vendor/github.com/cilium/ebpf/btf/marshal.go new file mode 100644 index 0000000000..d7204e6247 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/marshal.go @@ -0,0 +1,654 @@ +package btf + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "maps" + "math" + "slices" + "sync" + + "github.com/cilium/ebpf/internal" +) + +type MarshalOptions struct { + // Target byte order. Defaults to the system's native endianness. + Order binary.ByteOrder + // Remove function linkage information for compatibility with <5.6 kernels. + StripFuncLinkage bool + // Replace decl tags with a placeholder for compatibility with <5.16 kernels. + ReplaceDeclTags bool + // Replace TypeTags with a placeholder for compatibility with <5.17 kernels. + ReplaceTypeTags bool + // Replace Enum64 with a placeholder for compatibility with <6.0 kernels. + ReplaceEnum64 bool + // Prevent the "No type found" error when loading BTF without any types. + PreventNoTypeFound bool +} + +// KernelMarshalOptions will generate BTF suitable for the current kernel. +func KernelMarshalOptions() *MarshalOptions { + return &MarshalOptions{ + Order: internal.NativeEndian, + StripFuncLinkage: haveFuncLinkage() != nil, + ReplaceDeclTags: haveDeclTags() != nil, + ReplaceTypeTags: haveTypeTags() != nil, + ReplaceEnum64: haveEnum64() != nil, + PreventNoTypeFound: true, // All current kernels require this. + } +} + +// encoder turns Types into raw BTF. +type encoder struct { + MarshalOptions + + pending internal.Deque[Type] + buf *bytes.Buffer + strings *stringTableBuilder + ids map[Type]TypeID + visited map[Type]struct{} + lastID TypeID +} + +var bufferPool = sync.Pool{ + New: func() any { + buf := make([]byte, btfHeaderLen+128) + return &buf + }, +} + +func getByteSlice() *[]byte { + return bufferPool.Get().(*[]byte) +} + +func putByteSlice(buf *[]byte) { + *buf = (*buf)[:0] + bufferPool.Put(buf) +} + +// Builder turns Types into raw BTF. +// +// The default value may be used and represents an empty BTF blob. Void is +// added implicitly if necessary. +type Builder struct { + // Explicitly added types. + types []Type + // IDs for all added types which the user knows about. + stableIDs map[Type]TypeID + // Explicitly added strings. + strings *stringTableBuilder +} + +// NewBuilder creates a Builder from a list of types. +// +// It is more efficient than calling [Add] individually. +// +// Returns an error if adding any of the types fails. +func NewBuilder(types []Type) (*Builder, error) { + b := &Builder{ + make([]Type, 0, len(types)), + make(map[Type]TypeID, len(types)), + nil, + } + + for _, typ := range types { + _, err := b.Add(typ) + if err != nil { + return nil, fmt.Errorf("add %s: %w", typ, err) + } + } + + return b, nil +} + +// Empty returns true if neither types nor strings have been added. +func (b *Builder) Empty() bool { + return len(b.types) == 0 && (b.strings == nil || b.strings.Length() == 0) +} + +// Add a Type and allocate a stable ID for it. +// +// Adding the identical Type multiple times is valid and will return the same ID. +// +// See [Type] for details on identity. +func (b *Builder) Add(typ Type) (TypeID, error) { + if b.stableIDs == nil { + b.stableIDs = make(map[Type]TypeID) + } + + if _, ok := typ.(*Void); ok { + // Equality is weird for void, since it is a zero sized type. + return 0, nil + } + + if ds, ok := typ.(*Datasec); ok { + if err := datasecResolveWorkaround(b, ds); err != nil { + return 0, err + } + } + + id, ok := b.stableIDs[typ] + if ok { + return id, nil + } + + b.types = append(b.types, typ) + + id = TypeID(len(b.types)) + if int(id) != len(b.types) { + return 0, fmt.Errorf("no more type IDs") + } + + b.stableIDs[typ] = id + return id, nil +} + +// Marshal encodes all types in the Marshaler into BTF wire format. +// +// opts may be nil. +func (b *Builder) Marshal(buf []byte, opts *MarshalOptions) ([]byte, error) { + stb := b.strings + if stb == nil { + // Assume that most types are named. This makes encoding large BTF like + // vmlinux a lot cheaper. + stb = newStringTableBuilder(len(b.types)) + } else { + // Avoid modifying the Builder's string table. + stb = b.strings.Copy() + } + + if opts == nil { + opts = &MarshalOptions{Order: internal.NativeEndian} + } + + // Reserve space for the BTF header. + buf = slices.Grow(buf, btfHeaderLen)[:btfHeaderLen] + + w := internal.NewBuffer(buf) + defer internal.PutBuffer(w) + + e := encoder{ + MarshalOptions: *opts, + buf: w, + strings: stb, + lastID: TypeID(len(b.types)), + visited: make(map[Type]struct{}, len(b.types)), + ids: maps.Clone(b.stableIDs), + } + + if e.ids == nil { + e.ids = make(map[Type]TypeID) + } + + types := b.types + if len(types) == 0 && stb.Length() > 0 && opts.PreventNoTypeFound { + // We have strings that need to be written out, + // but no types (besides the implicit Void). + // Kernels as recent as v6.7 refuse to load such BTF + // with a "No type found" error in the log. + // Fix this by adding a dummy type. + types = []Type{&Int{Size: 0}} + } + + // Ensure that types are marshaled in the exact order they were Add()ed. + // Otherwise the ID returned from Add() won't match. + e.pending.Grow(len(types)) + for _, typ := range types { + e.pending.Push(typ) + } + + if err := e.deflatePending(); err != nil { + return nil, err + } + + length := e.buf.Len() + typeLen := uint32(length - btfHeaderLen) + + stringLen := e.strings.Length() + buf = e.strings.AppendEncoded(e.buf.Bytes()) + + // Fill out the header, and write it out. + header := &btfHeader{ + Magic: btfMagic, + Version: 1, + Flags: 0, + HdrLen: uint32(btfHeaderLen), + TypeOff: 0, + TypeLen: typeLen, + StringOff: typeLen, + StringLen: uint32(stringLen), + } + + err := binary.Write(sliceWriter(buf[:btfHeaderLen]), e.Order, header) + if err != nil { + return nil, fmt.Errorf("write header: %v", err) + } + + return buf, nil +} + +// addString adds a string to the resulting BTF. +// +// Adding the same string multiple times will return the same result. +// +// Returns an identifier into the string table or an error if the string +// contains invalid characters. +func (b *Builder) addString(str string) (uint32, error) { + if b.strings == nil { + b.strings = newStringTableBuilder(0) + } + + return b.strings.Add(str) +} + +func (e *encoder) allocateIDs(root Type) (err error) { + visitInPostorder(root, e.visited, func(typ Type) bool { + if _, ok := typ.(*Void); ok { + return true + } + + if _, ok := e.ids[typ]; ok { + return true + } + + id := e.lastID + 1 + if id < e.lastID { + err = errors.New("type ID overflow") + return false + } + + e.pending.Push(typ) + e.ids[typ] = id + e.lastID = id + return true + }) + return +} + +// id returns the ID for the given type or panics with an error. +func (e *encoder) id(typ Type) TypeID { + if _, ok := typ.(*Void); ok { + return 0 + } + + id, ok := e.ids[typ] + if !ok { + panic(fmt.Errorf("no ID for type %v", typ)) + } + + return id +} + +func (e *encoder) deflatePending() error { + // Declare root outside of the loop to avoid repeated heap allocations. + var root Type + + for !e.pending.Empty() { + root = e.pending.Shift() + + // Allocate IDs for all children of typ, including transitive dependencies. + if err := e.allocateIDs(root); err != nil { + return err + } + + if err := e.deflateType(root); err != nil { + id := e.ids[root] + return fmt.Errorf("deflate %v with ID %d: %w", root, id, err) + } + } + + return nil +} + +func (e *encoder) deflateType(typ Type) (err error) { + defer func() { + if r := recover(); r != nil { + var ok bool + err, ok = r.(error) + if !ok { + panic(r) + } + } + }() + + var raw rawType + raw.NameOff, err = e.strings.Add(typ.TypeName()) + if err != nil { + return err + } + + switch v := typ.(type) { + case *Void: + return errors.New("Void is implicit in BTF wire format") + + case *Int: + e.deflateInt(&raw, v) + + case *Pointer: + raw.SetKind(kindPointer) + raw.SetType(e.id(v.Target)) + + case *Array: + raw.SetKind(kindArray) + raw.data = &btfArray{ + e.id(v.Type), + e.id(v.Index), + v.Nelems, + } + + case *Struct: + raw.SetKind(kindStruct) + raw.SetSize(v.Size) + raw.data, err = e.convertMembers(&raw.btfType, v.Members) + + case *Union: + err = e.deflateUnion(&raw, v) + + case *Enum: + if v.Size == 8 { + err = e.deflateEnum64(&raw, v) + } else { + err = e.deflateEnum(&raw, v) + } + + case *Fwd: + raw.SetKind(kindForward) + raw.SetFwdKind(v.Kind) + + case *Typedef: + raw.SetKind(kindTypedef) + raw.SetType(e.id(v.Type)) + + case *Volatile: + raw.SetKind(kindVolatile) + raw.SetType(e.id(v.Type)) + + case *Const: + e.deflateConst(&raw, v) + + case *Restrict: + raw.SetKind(kindRestrict) + raw.SetType(e.id(v.Type)) + + case *Func: + raw.SetKind(kindFunc) + raw.SetType(e.id(v.Type)) + if !e.StripFuncLinkage { + raw.SetLinkage(v.Linkage) + } + + case *FuncProto: + raw.SetKind(kindFuncProto) + raw.SetType(e.id(v.Return)) + raw.SetVlen(len(v.Params)) + raw.data, err = e.deflateFuncParams(v.Params) + + case *Var: + raw.SetKind(kindVar) + raw.SetType(e.id(v.Type)) + raw.data = btfVariable{uint32(v.Linkage)} + + case *Datasec: + raw.SetKind(kindDatasec) + raw.SetSize(v.Size) + raw.SetVlen(len(v.Vars)) + raw.data = e.deflateVarSecinfos(v.Vars) + + case *Float: + raw.SetKind(kindFloat) + raw.SetSize(v.Size) + + case *declTag: + err = e.deflateDeclTag(&raw, v) + + case *TypeTag: + err = e.deflateTypeTag(&raw, v) + + default: + return fmt.Errorf("don't know how to deflate %T", v) + } + + if err != nil { + return err + } + + return raw.Marshal(e.buf, e.Order) +} + +func (e *encoder) deflateInt(raw *rawType, i *Int) { + raw.SetKind(kindInt) + raw.SetSize(i.Size) + + var bi btfInt + bi.SetEncoding(i.Encoding) + // We need to set bits in addition to size, since btf_type_int_is_regular + // otherwise flags this as a bitfield. + bi.SetBits(byte(i.Size) * 8) + raw.data = bi +} + +func (e *encoder) deflateDeclTag(raw *rawType, tag *declTag) (err error) { + // Replace a decl tag with an integer for compatibility with <5.16 kernels, + // following libbpf behaviour. + if e.ReplaceDeclTags { + typ := &Int{"decl_tag_placeholder", 1, Unsigned} + e.deflateInt(raw, typ) + + // Add the placeholder type name to the string table. The encoder added the + // original type name before this call. + raw.NameOff, err = e.strings.Add(typ.TypeName()) + return + } + + raw.SetKind(kindDeclTag) + raw.SetType(e.id(tag.Type)) + raw.data = &btfDeclTag{uint32(tag.Index)} + raw.NameOff, err = e.strings.Add(tag.Value) + return +} + +func (e *encoder) deflateConst(raw *rawType, c *Const) { + raw.SetKind(kindConst) + raw.SetType(e.id(c.Type)) +} + +func (e *encoder) deflateTypeTag(raw *rawType, tag *TypeTag) (err error) { + // Replace a type tag with a const qualifier for compatibility with <5.17 + // kernels, following libbpf behaviour. + if e.ReplaceTypeTags { + e.deflateConst(raw, &Const{tag.Type}) + return + } + + raw.SetKind(kindTypeTag) + raw.SetType(e.id(tag.Type)) + raw.NameOff, err = e.strings.Add(tag.Value) + return +} + +func (e *encoder) deflateUnion(raw *rawType, union *Union) (err error) { + raw.SetKind(kindUnion) + raw.SetSize(union.Size) + raw.data, err = e.convertMembers(&raw.btfType, union.Members) + return +} + +func (e *encoder) convertMembers(header *btfType, members []Member) ([]btfMember, error) { + bms := make([]btfMember, 0, len(members)) + isBitfield := false + for _, member := range members { + isBitfield = isBitfield || member.BitfieldSize > 0 + + offset := member.Offset + if isBitfield { + offset = member.BitfieldSize<<24 | (member.Offset & 0xffffff) + } + + nameOff, err := e.strings.Add(member.Name) + if err != nil { + return nil, err + } + + bms = append(bms, btfMember{ + nameOff, + e.id(member.Type), + uint32(offset), + }) + } + + header.SetVlen(len(members)) + header.SetBitfield(isBitfield) + return bms, nil +} + +func (e *encoder) deflateEnum(raw *rawType, enum *Enum) (err error) { + raw.SetKind(kindEnum) + raw.SetSize(enum.Size) + raw.SetVlen(len(enum.Values)) + // Signedness appeared together with ENUM64 support. + raw.SetSigned(enum.Signed && !e.ReplaceEnum64) + raw.data, err = e.deflateEnumValues(enum) + return +} + +func (e *encoder) deflateEnumValues(enum *Enum) ([]btfEnum, error) { + bes := make([]btfEnum, 0, len(enum.Values)) + for _, value := range enum.Values { + nameOff, err := e.strings.Add(value.Name) + if err != nil { + return nil, err + } + + if enum.Signed { + if signedValue := int64(value.Value); signedValue < math.MinInt32 || signedValue > math.MaxInt32 { + return nil, fmt.Errorf("value %d of enum %q exceeds 32 bits", signedValue, value.Name) + } + } else { + if value.Value > math.MaxUint32 { + return nil, fmt.Errorf("value %d of enum %q exceeds 32 bits", value.Value, value.Name) + } + } + + bes = append(bes, btfEnum{ + nameOff, + uint32(value.Value), + }) + } + + return bes, nil +} + +func (e *encoder) deflateEnum64(raw *rawType, enum *Enum) (err error) { + if e.ReplaceEnum64 { + // Replace the ENUM64 with a union of fields with the correct size. + // This matches libbpf behaviour on purpose. + placeholder := &Int{ + "enum64_placeholder", + enum.Size, + Unsigned, + } + if enum.Signed { + placeholder.Encoding = Signed + } + if err := e.allocateIDs(placeholder); err != nil { + return fmt.Errorf("add enum64 placeholder: %w", err) + } + + members := make([]Member, 0, len(enum.Values)) + for _, v := range enum.Values { + members = append(members, Member{ + Name: v.Name, + Type: placeholder, + }) + } + + return e.deflateUnion(raw, &Union{enum.Name, enum.Size, members, nil}) + } + + raw.SetKind(kindEnum64) + raw.SetSize(enum.Size) + raw.SetVlen(len(enum.Values)) + raw.SetSigned(enum.Signed) + raw.data, err = e.deflateEnum64Values(enum.Values) + return +} + +func (e *encoder) deflateEnum64Values(values []EnumValue) ([]btfEnum64, error) { + bes := make([]btfEnum64, 0, len(values)) + for _, value := range values { + nameOff, err := e.strings.Add(value.Name) + if err != nil { + return nil, err + } + + bes = append(bes, btfEnum64{ + nameOff, + uint32(value.Value), + uint32(value.Value >> 32), + }) + } + + return bes, nil +} + +func (e *encoder) deflateFuncParams(params []FuncParam) ([]btfParam, error) { + bps := make([]btfParam, 0, len(params)) + for _, param := range params { + nameOff, err := e.strings.Add(param.Name) + if err != nil { + return nil, err + } + + bps = append(bps, btfParam{ + nameOff, + e.id(param.Type), + }) + } + return bps, nil +} + +func (e *encoder) deflateVarSecinfos(vars []VarSecinfo) []btfVarSecinfo { + vsis := make([]btfVarSecinfo, 0, len(vars)) + for _, v := range vars { + vsis = append(vsis, btfVarSecinfo{ + e.id(v.Type), + v.Offset, + v.Size, + }) + } + return vsis +} + +// MarshalMapKV creates a BTF object containing a map key and value. +// +// The function is intended for the use of the ebpf package and may be removed +// at any point in time. +func MarshalMapKV(key, value Type) (_ *Handle, keyID, valueID TypeID, err error) { + var b Builder + + if key != nil { + keyID, err = b.Add(key) + if err != nil { + return nil, 0, 0, fmt.Errorf("add key type: %w", err) + } + } + + if value != nil { + valueID, err = b.Add(value) + if err != nil { + return nil, 0, 0, fmt.Errorf("add value type: %w", err) + } + } + + handle, err := NewHandle(&b) + if err != nil { + // Check for 'full' map BTF support, since kernels between 4.18 and 5.2 + // already support BTF blobs for maps without Var or Datasec just fine. + if err := haveMapBTF(); err != nil { + return nil, 0, 0, err + } + } + return handle, keyID, valueID, err +} diff --git a/vendor/github.com/cilium/ebpf/btf/strings.go b/vendor/github.com/cilium/ebpf/btf/strings.go new file mode 100644 index 0000000000..7c31461c30 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/strings.go @@ -0,0 +1,198 @@ +package btf + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "maps" + "slices" + "strings" +) + +type stringTable struct { + base *stringTable + offsets []uint32 + prevIdx int + strings []string +} + +// sizedReader is implemented by bytes.Reader, io.SectionReader, strings.Reader, etc. +type sizedReader interface { + io.Reader + Size() int64 +} + +func readStringTable(r sizedReader, base *stringTable) (*stringTable, error) { + // When parsing split BTF's string table, the first entry offset is derived + // from the last entry offset of the base BTF. + firstStringOffset := uint32(0) + if base != nil { + idx := len(base.offsets) - 1 + firstStringOffset = base.offsets[idx] + uint32(len(base.strings[idx])) + 1 + } + + // Derived from vmlinux BTF. + const averageStringLength = 16 + + n := int(r.Size() / averageStringLength) + offsets := make([]uint32, 0, n) + strings := make([]string, 0, n) + + offset := firstStringOffset + scanner := bufio.NewScanner(r) + scanner.Split(splitNull) + for scanner.Scan() { + str := scanner.Text() + offsets = append(offsets, offset) + strings = append(strings, str) + offset += uint32(len(str)) + 1 + } + if err := scanner.Err(); err != nil { + return nil, err + } + + if len(strings) == 0 { + return nil, errors.New("string table is empty") + } + + if firstStringOffset == 0 && strings[0] != "" { + return nil, errors.New("first item in string table is non-empty") + } + + return &stringTable{base, offsets, 0, strings}, nil +} + +func splitNull(data []byte, atEOF bool) (advance int, token []byte, err error) { + i := bytes.IndexByte(data, 0) + if i == -1 { + if atEOF && len(data) > 0 { + return 0, nil, errors.New("string table isn't null terminated") + } + return 0, nil, nil + } + + return i + 1, data[:i], nil +} + +func (st *stringTable) Lookup(offset uint32) (string, error) { + if st.base != nil && offset <= st.base.offsets[len(st.base.offsets)-1] { + return st.base.lookup(offset) + } + return st.lookup(offset) +} + +func (st *stringTable) lookup(offset uint32) (string, error) { + // Fast path: zero offset is the empty string, looked up frequently. + if offset == 0 && st.base == nil { + return "", nil + } + + // Accesses tend to be globally increasing, so check if the next string is + // the one we want. This skips the binary search in about 50% of cases. + if st.prevIdx+1 < len(st.offsets) && st.offsets[st.prevIdx+1] == offset { + st.prevIdx++ + return st.strings[st.prevIdx], nil + } + + i, found := slices.BinarySearch(st.offsets, offset) + if !found { + return "", fmt.Errorf("offset %d isn't start of a string", offset) + } + + // Set the new increment index, but only if its greater than the current. + if i > st.prevIdx+1 { + st.prevIdx = i + } + + return st.strings[i], nil +} + +// Num returns the number of strings in the table. +func (st *stringTable) Num() int { + return len(st.strings) +} + +// stringTableBuilder builds BTF string tables. +type stringTableBuilder struct { + length uint32 + strings map[string]uint32 +} + +// newStringTableBuilder creates a builder with the given capacity. +// +// capacity may be zero. +func newStringTableBuilder(capacity int) *stringTableBuilder { + var stb stringTableBuilder + + if capacity == 0 { + // Use the runtime's small default size. + stb.strings = make(map[string]uint32) + } else { + stb.strings = make(map[string]uint32, capacity) + } + + // Ensure that the empty string is at index 0. + stb.append("") + return &stb +} + +// Add a string to the table. +// +// Adding the same string multiple times will only store it once. +func (stb *stringTableBuilder) Add(str string) (uint32, error) { + if strings.IndexByte(str, 0) != -1 { + return 0, fmt.Errorf("string contains null: %q", str) + } + + offset, ok := stb.strings[str] + if ok { + return offset, nil + } + + return stb.append(str), nil +} + +func (stb *stringTableBuilder) append(str string) uint32 { + offset := stb.length + stb.length += uint32(len(str)) + 1 + stb.strings[str] = offset + return offset +} + +// Lookup finds the offset of a string in the table. +// +// Returns an error if str hasn't been added yet. +func (stb *stringTableBuilder) Lookup(str string) (uint32, error) { + offset, ok := stb.strings[str] + if !ok { + return 0, fmt.Errorf("string %q is not in table", str) + } + + return offset, nil +} + +// Length returns the length in bytes. +func (stb *stringTableBuilder) Length() int { + return int(stb.length) +} + +// AppendEncoded appends the string table to the end of the provided buffer. +func (stb *stringTableBuilder) AppendEncoded(buf []byte) []byte { + n := len(buf) + buf = append(buf, make([]byte, stb.Length())...) + strings := buf[n:] + for str, offset := range stb.strings { + copy(strings[offset:], str) + } + return buf +} + +// Copy the string table builder. +func (stb *stringTableBuilder) Copy() *stringTableBuilder { + return &stringTableBuilder{ + stb.length, + maps.Clone(stb.strings), + } +} diff --git a/vendor/github.com/cilium/ebpf/btf/traversal.go b/vendor/github.com/cilium/ebpf/btf/traversal.go new file mode 100644 index 0000000000..13647d931f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/traversal.go @@ -0,0 +1,159 @@ +package btf + +import ( + "fmt" +) + +// Functions to traverse a cyclic graph of types. The below was very useful: +// https://eli.thegreenplace.net/2015/directed-graph-traversal-orderings-and-applications-to-data-flow-analysis/#post-order-and-reverse-post-order + +// Visit all types reachable from root in postorder. +// +// Traversal stops if yield returns false. +// +// Returns false if traversal was aborted. +func visitInPostorder(root Type, visited map[Type]struct{}, yield func(typ Type) bool) bool { + if _, ok := visited[root]; ok { + return true + } + if visited == nil { + visited = make(map[Type]struct{}) + } + visited[root] = struct{}{} + + cont := children(root, func(child *Type) bool { + return visitInPostorder(*child, visited, yield) + }) + if !cont { + return false + } + + return yield(root) +} + +// children calls yield on each child of typ. +// +// Traversal stops if yield returns false. +// +// Returns false if traversal was aborted. +func children(typ Type, yield func(child *Type) bool) bool { + // Explicitly type switch on the most common types to allow the inliner to + // do its work. This avoids allocating intermediate slices from walk() on + // the heap. + var tags []string + switch v := typ.(type) { + case *Void, *Int, *Enum, *Fwd, *Float, *declTag: + // No children to traverse. + // declTags is declared as a leaf type since it's parsed into .Tags fields of other types + // during unmarshaling. + case *Pointer: + if !yield(&v.Target) { + return false + } + case *Array: + if !yield(&v.Index) { + return false + } + if !yield(&v.Type) { + return false + } + case *Struct: + for i := range v.Members { + if !yield(&v.Members[i].Type) { + return false + } + for _, t := range v.Members[i].Tags { + var tag Type = &declTag{v, t, i} + if !yield(&tag) { + return false + } + } + } + tags = v.Tags + case *Union: + for i := range v.Members { + if !yield(&v.Members[i].Type) { + return false + } + for _, t := range v.Members[i].Tags { + var tag Type = &declTag{v, t, i} + if !yield(&tag) { + return false + } + } + } + tags = v.Tags + case *Typedef: + if !yield(&v.Type) { + return false + } + tags = v.Tags + case *Volatile: + if !yield(&v.Type) { + return false + } + case *Const: + if !yield(&v.Type) { + return false + } + case *Restrict: + if !yield(&v.Type) { + return false + } + case *Func: + if !yield(&v.Type) { + return false + } + if fp, ok := v.Type.(*FuncProto); ok { + for i := range fp.Params { + if len(v.ParamTags) <= i { + continue + } + for _, t := range v.ParamTags[i] { + var tag Type = &declTag{v, t, i} + if !yield(&tag) { + return false + } + } + } + } + tags = v.Tags + case *FuncProto: + if !yield(&v.Return) { + return false + } + for i := range v.Params { + if !yield(&v.Params[i].Type) { + return false + } + } + case *Var: + if !yield(&v.Type) { + return false + } + tags = v.Tags + case *Datasec: + for i := range v.Vars { + if !yield(&v.Vars[i].Type) { + return false + } + } + case *TypeTag: + if !yield(&v.Type) { + return false + } + case *cycle: + // cycle has children, but we ignore them deliberately. + default: + panic(fmt.Sprintf("don't know how to walk Type %T", v)) + } + + for _, t := range tags { + var tag Type = &declTag{typ, t, -1} + if !yield(&tag) { + return false + } + } + + return true +} diff --git a/vendor/github.com/cilium/ebpf/btf/types.go b/vendor/github.com/cilium/ebpf/btf/types.go new file mode 100644 index 0000000000..dbcdf9dd7a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/types.go @@ -0,0 +1,1417 @@ +package btf + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "slices" + "strings" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" +) + +// Mirrors MAX_RESOLVE_DEPTH in libbpf. +// https://github.com/libbpf/libbpf/blob/e26b84dc330c9644c07428c271ab491b0f01f4e1/src/btf.c#L761 +const maxResolveDepth = 32 + +// TypeID identifies a type in a BTF section. +type TypeID = sys.TypeID + +// Type represents a type described by BTF. +// +// Identity of Type follows the [Go specification]: two Types are considered +// equal if they have the same concrete type and the same dynamic value, aka +// they point at the same location in memory. This means that the following +// Types are considered distinct even though they have the same "shape". +// +// a := &Int{Size: 1} +// b := &Int{Size: 1} +// a != b +// +// [Go specification]: https://go.dev/ref/spec#Comparison_operators +type Type interface { + // Type can be formatted using the %s and %v verbs. %s outputs only the + // identity of the type, without any detail. %v outputs additional detail. + // + // Use the '+' flag to include the address of the type. + // + // Use the width to specify how many levels of detail to output, for example + // %1v will output detail for the root type and a short description of its + // children. %2v would output details of the root type and its children + // as well as a short description of the grandchildren. + fmt.Formatter + + // Name of the type, empty for anonymous types and types that cannot + // carry a name, like Void and Pointer. + TypeName() string + + // Make a copy of the type, without copying Type members. + copy() Type + + // New implementations must update walkType. +} + +var ( + _ Type = (*Int)(nil) + _ Type = (*Struct)(nil) + _ Type = (*Union)(nil) + _ Type = (*Enum)(nil) + _ Type = (*Fwd)(nil) + _ Type = (*Func)(nil) + _ Type = (*Typedef)(nil) + _ Type = (*Var)(nil) + _ Type = (*Datasec)(nil) + _ Type = (*Float)(nil) + _ Type = (*declTag)(nil) + _ Type = (*TypeTag)(nil) + _ Type = (*cycle)(nil) +) + +// Void is the unit type of BTF. +type Void struct{} + +func (v *Void) Format(fs fmt.State, verb rune) { formatType(fs, verb, v) } +func (v *Void) TypeName() string { return "" } +func (v *Void) size() uint32 { return 0 } +func (v *Void) copy() Type { return (*Void)(nil) } + +type IntEncoding byte + +// Valid IntEncodings. +// +// These may look like they are flags, but they aren't. +const ( + Unsigned IntEncoding = 0 + Signed IntEncoding = 1 + Char IntEncoding = 2 + Bool IntEncoding = 4 +) + +func (ie IntEncoding) String() string { + switch ie { + case Char: + // NB: There is no way to determine signedness for char. + return "char" + case Bool: + return "bool" + case Signed: + return "signed" + case Unsigned: + return "unsigned" + default: + return fmt.Sprintf("IntEncoding(%d)", byte(ie)) + } +} + +// Int is an integer of a given length. +// +// See https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-int +type Int struct { + Name string + + // The size of the integer in bytes. + Size uint32 + Encoding IntEncoding +} + +func (i *Int) Format(fs fmt.State, verb rune) { + formatType(fs, verb, i, i.Encoding, "size=", i.Size) +} + +func (i *Int) TypeName() string { return i.Name } +func (i *Int) size() uint32 { return i.Size } +func (i *Int) copy() Type { + cpy := *i + return &cpy +} + +// Pointer is a pointer to another type. +type Pointer struct { + Target Type +} + +func (p *Pointer) Format(fs fmt.State, verb rune) { + formatType(fs, verb, p, "target=", p.Target) +} + +func (p *Pointer) TypeName() string { return "" } +func (p *Pointer) size() uint32 { return 8 } +func (p *Pointer) copy() Type { + cpy := *p + return &cpy +} + +// Array is an array with a fixed number of elements. +type Array struct { + Index Type + Type Type + Nelems uint32 +} + +func (arr *Array) Format(fs fmt.State, verb rune) { + formatType(fs, verb, arr, "index=", arr.Index, "type=", arr.Type, "n=", arr.Nelems) +} + +func (arr *Array) TypeName() string { return "" } + +func (arr *Array) copy() Type { + cpy := *arr + return &cpy +} + +// Struct is a compound type of consecutive members. +type Struct struct { + Name string + // The size of the struct including padding, in bytes + Size uint32 + Members []Member + Tags []string +} + +func (s *Struct) Format(fs fmt.State, verb rune) { + formatType(fs, verb, s, "fields=", len(s.Members)) +} + +func (s *Struct) TypeName() string { return s.Name } + +func (s *Struct) size() uint32 { return s.Size } + +func (s *Struct) copy() Type { + cpy := *s + cpy.Members = copyMembers(s.Members) + cpy.Tags = copyTags(cpy.Tags) + return &cpy +} + +func (s *Struct) members() []Member { + return s.Members +} + +// Union is a compound type where members occupy the same memory. +type Union struct { + Name string + // The size of the union including padding, in bytes. + Size uint32 + Members []Member + Tags []string +} + +func (u *Union) Format(fs fmt.State, verb rune) { + formatType(fs, verb, u, "fields=", len(u.Members)) +} + +func (u *Union) TypeName() string { return u.Name } + +func (u *Union) size() uint32 { return u.Size } + +func (u *Union) copy() Type { + cpy := *u + cpy.Members = copyMembers(u.Members) + cpy.Tags = copyTags(cpy.Tags) + return &cpy +} + +func (u *Union) members() []Member { + return u.Members +} + +func copyMembers(orig []Member) []Member { + cpy := make([]Member, len(orig)) + copy(cpy, orig) + for i, member := range cpy { + cpy[i].Tags = copyTags(member.Tags) + } + return cpy +} + +func copyTags(orig []string) []string { + if orig == nil { // preserve nil vs zero-len slice distinction + return nil + } + cpy := make([]string, len(orig)) + copy(cpy, orig) + return cpy +} + +type composite interface { + Type + members() []Member +} + +var ( + _ composite = (*Struct)(nil) + _ composite = (*Union)(nil) +) + +// A value in bits. +type Bits uint32 + +// Bytes converts a bit value into bytes. +func (b Bits) Bytes() uint32 { + return uint32(b / 8) +} + +// Member is part of a Struct or Union. +// +// It is not a valid Type. +type Member struct { + Name string + Type Type + Offset Bits + BitfieldSize Bits + Tags []string +} + +// Enum lists possible values. +type Enum struct { + Name string + // Size of the enum value in bytes. + Size uint32 + // True if the values should be interpreted as signed integers. + Signed bool + Values []EnumValue +} + +func (e *Enum) Format(fs fmt.State, verb rune) { + formatType(fs, verb, e, "size=", e.Size, "values=", len(e.Values)) +} + +func (e *Enum) TypeName() string { return e.Name } + +// EnumValue is part of an Enum +// +// Is is not a valid Type +type EnumValue struct { + Name string + Value uint64 +} + +func (e *Enum) size() uint32 { return e.Size } +func (e *Enum) copy() Type { + cpy := *e + cpy.Values = make([]EnumValue, len(e.Values)) + copy(cpy.Values, e.Values) + return &cpy +} + +// FwdKind is the type of forward declaration. +type FwdKind int + +// Valid types of forward declaration. +const ( + FwdStruct FwdKind = iota + FwdUnion +) + +func (fk FwdKind) String() string { + switch fk { + case FwdStruct: + return "struct" + case FwdUnion: + return "union" + default: + return fmt.Sprintf("%T(%d)", fk, int(fk)) + } +} + +// Fwd is a forward declaration of a Type. +type Fwd struct { + Name string + Kind FwdKind +} + +func (f *Fwd) Format(fs fmt.State, verb rune) { + formatType(fs, verb, f, f.Kind) +} + +func (f *Fwd) TypeName() string { return f.Name } + +func (f *Fwd) copy() Type { + cpy := *f + return &cpy +} + +func (f *Fwd) matches(typ Type) bool { + if _, ok := As[*Struct](typ); ok && f.Kind == FwdStruct { + return true + } + + if _, ok := As[*Union](typ); ok && f.Kind == FwdUnion { + return true + } + + return false +} + +// Typedef is an alias of a Type. +type Typedef struct { + Name string + Type Type + Tags []string +} + +func (td *Typedef) Format(fs fmt.State, verb rune) { + formatType(fs, verb, td, td.Type) +} + +func (td *Typedef) TypeName() string { return td.Name } + +func (td *Typedef) copy() Type { + cpy := *td + cpy.Tags = copyTags(td.Tags) + return &cpy +} + +// Volatile is a qualifier. +type Volatile struct { + Type Type +} + +func (v *Volatile) Format(fs fmt.State, verb rune) { + formatType(fs, verb, v, v.Type) +} + +func (v *Volatile) TypeName() string { return "" } + +func (v *Volatile) qualify() Type { return v.Type } +func (v *Volatile) copy() Type { + cpy := *v + return &cpy +} + +// Const is a qualifier. +type Const struct { + Type Type +} + +func (c *Const) Format(fs fmt.State, verb rune) { + formatType(fs, verb, c, c.Type) +} + +func (c *Const) TypeName() string { return "" } + +func (c *Const) qualify() Type { return c.Type } +func (c *Const) copy() Type { + cpy := *c + return &cpy +} + +// Restrict is a qualifier. +type Restrict struct { + Type Type +} + +func (r *Restrict) Format(fs fmt.State, verb rune) { + formatType(fs, verb, r, r.Type) +} + +func (r *Restrict) TypeName() string { return "" } + +func (r *Restrict) qualify() Type { return r.Type } +func (r *Restrict) copy() Type { + cpy := *r + return &cpy +} + +// Func is a function definition. +type Func struct { + Name string + Type Type + Linkage FuncLinkage + Tags []string + // ParamTags holds a list of tags for each parameter of the FuncProto to which `Type` points. + // If no tags are present for any param, the outer slice will be nil/len(ParamTags)==0. + // If at least 1 param has a tag, the outer slice will have the same length as the number of params. + // The inner slice contains the tags and may be nil/len(ParamTags[i])==0 if no tags are present for that param. + ParamTags [][]string +} + +func FuncMetadata(ins *asm.Instruction) *Func { + fn, _ := ins.Metadata.Get(funcInfoMeta{}).(*Func) + return fn +} + +// WithFuncMetadata adds a btf.Func to the Metadata of asm.Instruction. +func WithFuncMetadata(ins asm.Instruction, fn *Func) asm.Instruction { + ins.Metadata.Set(funcInfoMeta{}, fn) + return ins +} + +func (f *Func) Format(fs fmt.State, verb rune) { + formatType(fs, verb, f, f.Linkage, "proto=", f.Type) +} + +func (f *Func) TypeName() string { return f.Name } + +func (f *Func) copy() Type { + cpy := *f + cpy.Tags = copyTags(f.Tags) + if f.ParamTags != nil { // preserve nil vs zero-len slice distinction + ptCopy := make([][]string, len(f.ParamTags)) + for i, tags := range f.ParamTags { + ptCopy[i] = copyTags(tags) + } + cpy.ParamTags = ptCopy + } + return &cpy +} + +// FuncProto is a function declaration. +type FuncProto struct { + Return Type + Params []FuncParam +} + +func (fp *FuncProto) Format(fs fmt.State, verb rune) { + formatType(fs, verb, fp, "args=", len(fp.Params), "return=", fp.Return) +} + +func (fp *FuncProto) TypeName() string { return "" } + +func (fp *FuncProto) copy() Type { + cpy := *fp + cpy.Params = make([]FuncParam, len(fp.Params)) + copy(cpy.Params, fp.Params) + return &cpy +} + +type FuncParam struct { + Name string + Type Type +} + +// Var is a global variable. +type Var struct { + Name string + Type Type + Linkage VarLinkage + Tags []string +} + +func (v *Var) Format(fs fmt.State, verb rune) { + formatType(fs, verb, v, v.Linkage) +} + +func (v *Var) TypeName() string { return v.Name } + +func (v *Var) copy() Type { + cpy := *v + cpy.Tags = copyTags(v.Tags) + return &cpy +} + +// Datasec is a global program section containing data. +type Datasec struct { + Name string + Size uint32 + Vars []VarSecinfo +} + +func (ds *Datasec) Format(fs fmt.State, verb rune) { + formatType(fs, verb, ds) +} + +func (ds *Datasec) TypeName() string { return ds.Name } + +func (ds *Datasec) size() uint32 { return ds.Size } + +func (ds *Datasec) copy() Type { + cpy := *ds + cpy.Vars = make([]VarSecinfo, len(ds.Vars)) + copy(cpy.Vars, ds.Vars) + return &cpy +} + +// VarSecinfo describes variable in a Datasec. +// +// It is not a valid Type. +type VarSecinfo struct { + // Var or Func. + Type Type + Offset uint32 + Size uint32 +} + +// Float is a float of a given length. +type Float struct { + Name string + + // The size of the float in bytes. + Size uint32 +} + +func (f *Float) Format(fs fmt.State, verb rune) { + formatType(fs, verb, f, "size=", f.Size*8) +} + +func (f *Float) TypeName() string { return f.Name } +func (f *Float) size() uint32 { return f.Size } +func (f *Float) copy() Type { + cpy := *f + return &cpy +} + +// declTag associates metadata with a declaration. +type declTag struct { + Type Type + Value string + // The index this tag refers to in the target type. For composite types, + // a value of -1 indicates that the tag refers to the whole type. Otherwise + // it indicates which member or argument the tag applies to. + Index int +} + +func (dt *declTag) Format(fs fmt.State, verb rune) { + formatType(fs, verb, dt, "type=", dt.Type, "value=", dt.Value, "index=", dt.Index) +} + +func (dt *declTag) TypeName() string { return "" } +func (dt *declTag) copy() Type { + cpy := *dt + return &cpy +} + +// TypeTag associates metadata with a pointer type. Tag types act as a custom +// modifier(const, restrict, volatile) for the target type. Unlike declTags, +// TypeTags are ordered so the order in which they are added matters. +// +// One of their uses is to mark pointers as `__kptr` meaning a pointer points +// to kernel memory. Adding a `__kptr` to pointers in map values allows you +// to store pointers to kernel memory in maps. +type TypeTag struct { + Type Type + Value string +} + +func (tt *TypeTag) Format(fs fmt.State, verb rune) { + formatType(fs, verb, tt, "type=", tt.Type, "value=", tt.Value) +} + +func (tt *TypeTag) TypeName() string { return "" } +func (tt *TypeTag) qualify() Type { return tt.Type } +func (tt *TypeTag) copy() Type { + cpy := *tt + return &cpy +} + +// cycle is a type which had to be elided since it exceeded maxTypeDepth. +type cycle struct { + root Type +} + +func (c *cycle) ID() TypeID { return math.MaxUint32 } +func (c *cycle) Format(fs fmt.State, verb rune) { formatType(fs, verb, c, "root=", c.root) } +func (c *cycle) TypeName() string { return "" } +func (c *cycle) copy() Type { + cpy := *c + return &cpy +} + +type sizer interface { + size() uint32 +} + +var ( + _ sizer = (*Int)(nil) + _ sizer = (*Pointer)(nil) + _ sizer = (*Struct)(nil) + _ sizer = (*Union)(nil) + _ sizer = (*Enum)(nil) + _ sizer = (*Datasec)(nil) +) + +type qualifier interface { + qualify() Type +} + +var ( + _ qualifier = (*Const)(nil) + _ qualifier = (*Restrict)(nil) + _ qualifier = (*Volatile)(nil) + _ qualifier = (*TypeTag)(nil) +) + +var errUnsizedType = errors.New("type is unsized") + +// Sizeof returns the size of a type in bytes. +// +// Returns an error if the size can't be computed. +func Sizeof(typ Type) (int, error) { + var ( + n = int64(1) + elem int64 + ) + + for i := 0; i < maxResolveDepth; i++ { + switch v := typ.(type) { + case *Array: + if n > 0 && int64(v.Nelems) > math.MaxInt64/n { + return 0, fmt.Errorf("type %s: overflow", typ) + } + + // Arrays may be of zero length, which allows + // n to be zero as well. + n *= int64(v.Nelems) + typ = v.Type + continue + + case sizer: + elem = int64(v.size()) + + case *Typedef: + typ = v.Type + continue + + case qualifier: + typ = v.qualify() + continue + + default: + return 0, fmt.Errorf("type %T: %w", typ, errUnsizedType) + } + + if n > 0 && elem > math.MaxInt64/n { + return 0, fmt.Errorf("type %s: overflow", typ) + } + + size := n * elem + if int64(int(size)) != size { + return 0, fmt.Errorf("type %s: overflow", typ) + } + + return int(size), nil + } + + return 0, fmt.Errorf("type %s: exceeded type depth", typ) +} + +// alignof returns the alignment of a type. +// +// Returns an error if the Type can't be aligned, like an integer with an uneven +// size. Currently only supports the subset of types necessary for bitfield +// relocations. +func alignof(typ Type) (int, error) { + var n int + + switch t := UnderlyingType(typ).(type) { + case *Enum: + n = int(t.size()) + case *Int: + n = int(t.Size) + case *Array: + return alignof(t.Type) + default: + return 0, fmt.Errorf("can't calculate alignment of %T", t) + } + + if !internal.IsPow(n) { + return 0, fmt.Errorf("alignment value %d is not a power of two", n) + } + + return n, nil +} + +// Copy a Type recursively. +// +// typ may form a cycle. +func Copy(typ Type) Type { + return copyType(typ, nil, make(map[Type]Type), nil) +} + +func copyType(typ Type, ids map[Type]TypeID, copies map[Type]Type, copiedIDs map[Type]TypeID) Type { + if typ == nil { + return nil + } + + cpy, ok := copies[typ] + if ok { + // This has been copied previously, no need to continue. + return cpy + } + + cpy = typ.copy() + copies[typ] = cpy + + if id, ok := ids[typ]; ok { + copiedIDs[cpy] = id + } + + children(cpy, func(child *Type) bool { + *child = copyType(*child, ids, copies, copiedIDs) + return true + }) + + return cpy +} + +type typeDeque = internal.Deque[*Type] + +// readAndInflateTypes reads the raw btf type info and turns it into a graph +// of Types connected via pointers. +// +// If base is provided, then the types are considered to be of a split BTF +// (e.g., a kernel module). +// +// Returns a slice of types indexed by TypeID. Since BTF ignores compilation +// units, multiple types may share the same name. A Type may form a cyclic graph +// by pointing at itself. +func readAndInflateTypes(r io.Reader, bo binary.ByteOrder, typeLen uint32, rawStrings *stringTable, base *Spec) ([]Type, error) { + // because of the interleaving between types and struct members it is difficult to + // precompute the numbers of raw types this will parse + // this "guess" is a good first estimation + sizeOfbtfType := uintptr(btfTypeLen) + tyMaxCount := uintptr(typeLen) / sizeOfbtfType / 2 + types := make([]Type, 0, tyMaxCount) + + // Void is defined to always be type ID 0, and is thus omitted from BTF. + types = append(types, (*Void)(nil)) + + firstTypeID := TypeID(0) + if base != nil { + var err error + firstTypeID, err = base.nextTypeID() + if err != nil { + return nil, err + } + + // Split BTF doesn't contain Void. + types = types[:0] + } + + type fixupDef struct { + id TypeID + typ *Type + } + + var fixups []fixupDef + fixup := func(id TypeID, typ *Type) { + if id < firstTypeID { + if baseType, err := base.TypeByID(id); err == nil { + *typ = baseType + return + } + } + + idx := int(id - firstTypeID) + if idx < len(types) { + // We've already inflated this type, fix it up immediately. + *typ = types[idx] + return + } + + fixups = append(fixups, fixupDef{id, typ}) + } + + type bitfieldFixupDef struct { + id TypeID + m *Member + } + + var ( + legacyBitfields = make(map[TypeID][2]Bits) // offset, size + bitfieldFixups []bitfieldFixupDef + ) + convertMembers := func(raw []btfMember, kindFlag bool) ([]Member, error) { + // NB: The fixup below relies on pre-allocating this array to + // work, since otherwise append might re-allocate members. + members := make([]Member, 0, len(raw)) + for i, btfMember := range raw { + name, err := rawStrings.Lookup(btfMember.NameOff) + if err != nil { + return nil, fmt.Errorf("can't get name for member %d: %w", i, err) + } + + members = append(members, Member{ + Name: name, + Offset: Bits(btfMember.Offset), + }) + + m := &members[i] + fixup(raw[i].Type, &m.Type) + + if kindFlag { + m.BitfieldSize = Bits(btfMember.Offset >> 24) + m.Offset &= 0xffffff + // We ignore legacy bitfield definitions if the current composite + // is a new-style bitfield. This is kind of safe since offset and + // size on the type of the member must be zero if kindFlat is set + // according to spec. + continue + } + + // This may be a legacy bitfield, try to fix it up. + data, ok := legacyBitfields[raw[i].Type] + if ok { + // Bingo! + m.Offset += data[0] + m.BitfieldSize = data[1] + continue + } + + if m.Type != nil { + // We couldn't find a legacy bitfield, but we know that the member's + // type has already been inflated. Hence we know that it can't be + // a legacy bitfield and there is nothing left to do. + continue + } + + // We don't have fixup data, and the type we're pointing + // at hasn't been inflated yet. No choice but to defer + // the fixup. + bitfieldFixups = append(bitfieldFixups, bitfieldFixupDef{ + raw[i].Type, + m, + }) + } + return members, nil + } + + var ( + buf = make([]byte, 1024) + header btfType + bInt btfInt + bArr btfArray + bMembers []btfMember + bEnums []btfEnum + bParams []btfParam + bVariable btfVariable + bSecInfos []btfVarSecinfo + bDeclTag btfDeclTag + bEnums64 []btfEnum64 + ) + + var declTags []*declTag + for { + var ( + id = firstTypeID + TypeID(len(types)) + typ Type + ) + + if _, err := io.ReadFull(r, buf[:btfTypeLen]); err == io.EOF { + break + } else if err != nil { + return nil, fmt.Errorf("can't read type info for id %v: %v", id, err) + } + + if _, err := unmarshalBtfType(&header, buf[:btfTypeLen], bo); err != nil { + return nil, fmt.Errorf("can't unmarshal type info for id %v: %v", id, err) + } + + if id < firstTypeID { + return nil, fmt.Errorf("no more type IDs") + } + + name, err := rawStrings.Lookup(header.NameOff) + if err != nil { + return nil, fmt.Errorf("get name for type id %d: %w", id, err) + } + + switch header.Kind() { + case kindInt: + size := header.Size() + buf = buf[:btfIntLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfInt, id: %d: %w", id, err) + } + if _, err := unmarshalBtfInt(&bInt, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfInt, id: %d: %w", id, err) + } + if bInt.Offset() > 0 || bInt.Bits().Bytes() != size { + legacyBitfields[id] = [2]Bits{bInt.Offset(), bInt.Bits()} + } + typ = &Int{name, header.Size(), bInt.Encoding()} + + case kindPointer: + ptr := &Pointer{nil} + fixup(header.Type(), &ptr.Target) + typ = ptr + + case kindArray: + buf = buf[:btfArrayLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfArray, id: %d: %w", id, err) + } + if _, err := unmarshalBtfArray(&bArr, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfArray, id: %d: %w", id, err) + } + + arr := &Array{nil, nil, bArr.Nelems} + fixup(bArr.IndexType, &arr.Index) + fixup(bArr.Type, &arr.Type) + typ = arr + + case kindStruct: + vlen := header.Vlen() + bMembers = slices.Grow(bMembers[:0], vlen)[:vlen] + buf = slices.Grow(buf[:0], vlen*btfMemberLen)[:vlen*btfMemberLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfMembers, id: %d: %w", id, err) + } + if _, err := unmarshalBtfMembers(bMembers, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfMembers, id: %d: %w", id, err) + } + + members, err := convertMembers(bMembers, header.Bitfield()) + if err != nil { + return nil, fmt.Errorf("struct %s (id %d): %w", name, id, err) + } + typ = &Struct{name, header.Size(), members, nil} + + case kindUnion: + vlen := header.Vlen() + bMembers = slices.Grow(bMembers[:0], vlen)[:vlen] + buf = slices.Grow(buf[:0], vlen*btfMemberLen)[:vlen*btfMemberLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfMembers, id: %d: %w", id, err) + } + if _, err := unmarshalBtfMembers(bMembers, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfMembers, id: %d: %w", id, err) + } + + members, err := convertMembers(bMembers, header.Bitfield()) + if err != nil { + return nil, fmt.Errorf("union %s (id %d): %w", name, id, err) + } + typ = &Union{name, header.Size(), members, nil} + + case kindEnum: + vlen := header.Vlen() + bEnums = slices.Grow(bEnums[:0], vlen)[:vlen] + buf = slices.Grow(buf[:0], vlen*btfEnumLen)[:vlen*btfEnumLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfEnums, id: %d: %w", id, err) + } + if _, err := unmarshalBtfEnums(bEnums, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfEnums, id: %d: %w", id, err) + } + + vals := make([]EnumValue, 0, vlen) + signed := header.Signed() + for i, btfVal := range bEnums { + name, err := rawStrings.Lookup(btfVal.NameOff) + if err != nil { + return nil, fmt.Errorf("get name for enum value %d: %s", i, err) + } + value := uint64(btfVal.Val) + if signed { + // Sign extend values to 64 bit. + value = uint64(int32(btfVal.Val)) + } + vals = append(vals, EnumValue{name, value}) + } + typ = &Enum{name, header.Size(), signed, vals} + + case kindForward: + typ = &Fwd{name, header.FwdKind()} + + case kindTypedef: + typedef := &Typedef{name, nil, nil} + fixup(header.Type(), &typedef.Type) + typ = typedef + + case kindVolatile: + volatile := &Volatile{nil} + fixup(header.Type(), &volatile.Type) + typ = volatile + + case kindConst: + cnst := &Const{nil} + fixup(header.Type(), &cnst.Type) + typ = cnst + + case kindRestrict: + restrict := &Restrict{nil} + fixup(header.Type(), &restrict.Type) + typ = restrict + + case kindFunc: + fn := &Func{name, nil, header.Linkage(), nil, nil} + fixup(header.Type(), &fn.Type) + typ = fn + + case kindFuncProto: + vlen := header.Vlen() + bParams = slices.Grow(bParams[:0], vlen)[:vlen] + buf = slices.Grow(buf[:0], vlen*btfParamLen)[:vlen*btfParamLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfParams, id: %d: %w", id, err) + } + if _, err := unmarshalBtfParams(bParams, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfParams, id: %d: %w", id, err) + } + + params := make([]FuncParam, 0, vlen) + for i, param := range bParams { + name, err := rawStrings.Lookup(param.NameOff) + if err != nil { + return nil, fmt.Errorf("get name for func proto parameter %d: %s", i, err) + } + params = append(params, FuncParam{ + Name: name, + }) + } + for i := range params { + fixup(bParams[i].Type, ¶ms[i].Type) + } + + fp := &FuncProto{nil, params} + fixup(header.Type(), &fp.Return) + typ = fp + + case kindVar: + buf = buf[:btfVariableLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfVariable, id: %d: %w", id, err) + } + if _, err := unmarshalBtfVariable(&bVariable, buf, bo); err != nil { + return nil, fmt.Errorf("can't read btfVariable, id: %d: %w", id, err) + } + + v := &Var{name, nil, VarLinkage(bVariable.Linkage), nil} + fixup(header.Type(), &v.Type) + typ = v + + case kindDatasec: + vlen := header.Vlen() + bSecInfos = slices.Grow(bSecInfos[:0], vlen)[:vlen] + buf = slices.Grow(buf[:0], vlen*btfVarSecinfoLen)[:vlen*btfVarSecinfoLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfVarSecInfos, id: %d: %w", id, err) + } + if _, err := unmarshalBtfVarSecInfos(bSecInfos, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfVarSecInfos, id: %d: %w", id, err) + } + + vars := make([]VarSecinfo, 0, vlen) + for _, btfVar := range bSecInfos { + vars = append(vars, VarSecinfo{ + Offset: btfVar.Offset, + Size: btfVar.Size, + }) + } + for i := range vars { + fixup(bSecInfos[i].Type, &vars[i].Type) + } + typ = &Datasec{name, header.Size(), vars} + + case kindFloat: + typ = &Float{name, header.Size()} + + case kindDeclTag: + buf = buf[:btfDeclTagLen] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfDeclTag, id: %d: %w", id, err) + } + if _, err := unmarshalBtfDeclTag(&bDeclTag, buf, bo); err != nil { + return nil, fmt.Errorf("can't read btfDeclTag, id: %d: %w", id, err) + } + + btfIndex := bDeclTag.ComponentIdx + if uint64(btfIndex) > math.MaxInt { + return nil, fmt.Errorf("type id %d: index exceeds int", id) + } + + dt := &declTag{nil, name, int(int32(btfIndex))} + fixup(header.Type(), &dt.Type) + typ = dt + + declTags = append(declTags, dt) + + case kindTypeTag: + tt := &TypeTag{nil, name} + fixup(header.Type(), &tt.Type) + typ = tt + + case kindEnum64: + vlen := header.Vlen() + bEnums64 = slices.Grow(bEnums64[:0], vlen)[:vlen] + buf = slices.Grow(buf[:0], vlen*btfEnum64Len)[:vlen*btfEnum64Len] + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("can't read btfEnum64s, id: %d: %w", id, err) + } + if _, err := unmarshalBtfEnums64(bEnums64, buf, bo); err != nil { + return nil, fmt.Errorf("can't unmarshal btfEnum64s, id: %d: %w", id, err) + } + + vals := make([]EnumValue, 0, vlen) + for i, btfVal := range bEnums64 { + name, err := rawStrings.Lookup(btfVal.NameOff) + if err != nil { + return nil, fmt.Errorf("get name for enum64 value %d: %s", i, err) + } + value := (uint64(btfVal.ValHi32) << 32) | uint64(btfVal.ValLo32) + vals = append(vals, EnumValue{name, value}) + } + typ = &Enum{name, header.Size(), header.Signed(), vals} + + default: + return nil, fmt.Errorf("type id %d: unknown kind: %v", id, header.Kind()) + } + + types = append(types, typ) + } + + for _, fixup := range fixups { + if fixup.id < firstTypeID { + return nil, fmt.Errorf("fixup for base type id %d is not expected", fixup.id) + } + + idx := int(fixup.id - firstTypeID) + if idx >= len(types) { + return nil, fmt.Errorf("reference to invalid type id: %d", fixup.id) + } + + *fixup.typ = types[idx] + } + + for _, bitfieldFixup := range bitfieldFixups { + if bitfieldFixup.id < firstTypeID { + return nil, fmt.Errorf("bitfield fixup from split to base types is not expected") + } + + data, ok := legacyBitfields[bitfieldFixup.id] + if ok { + // This is indeed a legacy bitfield, fix it up. + bitfieldFixup.m.Offset += data[0] + bitfieldFixup.m.BitfieldSize = data[1] + } + } + + for _, dt := range declTags { + switch t := dt.Type.(type) { + case *Var: + if dt.Index != -1 { + return nil, fmt.Errorf("type %s: component idx %d is not -1", dt, dt.Index) + } + t.Tags = append(t.Tags, dt.Value) + + case *Typedef: + if dt.Index != -1 { + return nil, fmt.Errorf("type %s: component idx %d is not -1", dt, dt.Index) + } + t.Tags = append(t.Tags, dt.Value) + + case composite: + if dt.Index >= 0 { + members := t.members() + if dt.Index >= len(members) { + return nil, fmt.Errorf("type %s: component idx %d exceeds members of %s", dt, dt.Index, t) + } + + members[dt.Index].Tags = append(members[dt.Index].Tags, dt.Value) + continue + } + + if dt.Index == -1 { + switch t2 := t.(type) { + case *Struct: + t2.Tags = append(t2.Tags, dt.Value) + case *Union: + t2.Tags = append(t2.Tags, dt.Value) + } + + continue + } + + return nil, fmt.Errorf("type %s: decl tag for type %s has invalid component idx", dt, t) + + case *Func: + fp, ok := t.Type.(*FuncProto) + if !ok { + return nil, fmt.Errorf("type %s: %s is not a FuncProto", dt, t.Type) + } + + // Ensure the number of argument tag lists equals the number of arguments + if len(t.ParamTags) == 0 { + t.ParamTags = make([][]string, len(fp.Params)) + } + + if dt.Index >= 0 { + if dt.Index >= len(fp.Params) { + return nil, fmt.Errorf("type %s: component idx %d exceeds params of %s", dt, dt.Index, t) + } + + t.ParamTags[dt.Index] = append(t.ParamTags[dt.Index], dt.Value) + continue + } + + if dt.Index == -1 { + t.Tags = append(t.Tags, dt.Value) + continue + } + + return nil, fmt.Errorf("type %s: decl tag for type %s has invalid component idx", dt, t) + + default: + return nil, fmt.Errorf("type %s: decl tag for type %s is not supported", dt, t) + } + } + + return types, nil +} + +// essentialName represents the name of a BTF type stripped of any flavor +// suffixes after a ___ delimiter. +type essentialName string + +// newEssentialName returns name without a ___ suffix. +// +// CO-RE has the concept of 'struct flavors', which are used to deal with +// changes in kernel data structures. Anything after three underscores +// in a type name is ignored for the purpose of finding a candidate type +// in the kernel's BTF. +func newEssentialName(name string) essentialName { + if name == "" { + return "" + } + lastIdx := strings.LastIndex(name, "___") + if lastIdx > 0 { + return essentialName(name[:lastIdx]) + } + return essentialName(name) +} + +// UnderlyingType skips qualifiers and Typedefs. +func UnderlyingType(typ Type) Type { + result := typ + for depth := 0; depth <= maxResolveDepth; depth++ { + switch v := (result).(type) { + case qualifier: + result = v.qualify() + case *Typedef: + result = v.Type + default: + return result + } + } + return &cycle{typ} +} + +// QualifiedType returns the type with all qualifiers removed. +func QualifiedType(typ Type) Type { + result := typ + for depth := 0; depth <= maxResolveDepth; depth++ { + switch v := (result).(type) { + case qualifier: + result = v.qualify() + default: + return result + } + } + return &cycle{typ} +} + +// As returns typ if is of type T. Otherwise it peels qualifiers and Typedefs +// until it finds a T. +// +// Returns the zero value and false if there is no T or if the type is nested +// too deeply. +func As[T Type](typ Type) (T, bool) { + // NB: We can't make this function return (*T) since then + // we can't assert that a type matches an interface which + // embeds Type: as[composite](T). + for depth := 0; depth <= maxResolveDepth; depth++ { + switch v := (typ).(type) { + case T: + return v, true + case qualifier: + typ = v.qualify() + case *Typedef: + typ = v.Type + default: + goto notFound + } + } +notFound: + var zero T + return zero, false +} + +type formatState struct { + fmt.State + depth int +} + +// formattableType is a subset of Type, to ease unit testing of formatType. +type formattableType interface { + fmt.Formatter + TypeName() string +} + +// formatType formats a type in a canonical form. +// +// Handles cyclical types by only printing cycles up to a certain depth. Elements +// in extra are separated by spaces unless the preceding element is a string +// ending in '='. +func formatType(f fmt.State, verb rune, t formattableType, extra ...interface{}) { + if verb != 'v' && verb != 's' { + fmt.Fprintf(f, "{UNRECOGNIZED: %c}", verb) + return + } + + _, _ = io.WriteString(f, internal.GoTypeName(t)) + + if name := t.TypeName(); name != "" { + // Output BTF type name if present. + fmt.Fprintf(f, ":%q", name) + } + + if f.Flag('+') { + // Output address if requested. + fmt.Fprintf(f, ":%#p", t) + } + + if verb == 's' { + // %s omits details. + return + } + + var depth int + if ps, ok := f.(*formatState); ok { + depth = ps.depth + f = ps.State + } + + maxDepth, ok := f.Width() + if !ok { + maxDepth = 0 + } + + if depth > maxDepth { + // We've reached the maximum depth. This avoids infinite recursion even + // for cyclical types. + return + } + + if len(extra) == 0 { + return + } + + wantSpace := false + _, _ = io.WriteString(f, "[") + for _, arg := range extra { + if wantSpace { + _, _ = io.WriteString(f, " ") + } + + switch v := arg.(type) { + case string: + _, _ = io.WriteString(f, v) + wantSpace = len(v) > 0 && v[len(v)-1] != '=' + continue + + case formattableType: + v.Format(&formatState{f, depth + 1}, verb) + + default: + fmt.Fprint(f, arg) + } + + wantSpace = true + } + _, _ = io.WriteString(f, "]") +} diff --git a/vendor/github.com/cilium/ebpf/btf/workarounds.go b/vendor/github.com/cilium/ebpf/btf/workarounds.go new file mode 100644 index 0000000000..eb09047fb3 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/btf/workarounds.go @@ -0,0 +1,26 @@ +package btf + +// datasecResolveWorkaround ensures that certain vars in a Datasec are added +// to a Spec before the Datasec. This avoids a bug in kernel BTF validation. +// +// See https://lore.kernel.org/bpf/20230302123440.1193507-1-lmb@isovalent.com/ +func datasecResolveWorkaround(b *Builder, ds *Datasec) error { + for _, vsi := range ds.Vars { + v, ok := vsi.Type.(*Var) + if !ok { + continue + } + + switch v.Type.(type) { + case *Typedef, *Volatile, *Const, *Restrict, *TypeTag: + // NB: We must never call Add on a Datasec, otherwise we risk + // infinite recursion. + _, err := b.Add(v.Type) + if err != nil { + return err + } + } + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go new file mode 100644 index 0000000000..a7b6cb31bf --- /dev/null +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -0,0 +1,1039 @@ +package ebpf + +import ( + "encoding/binary" + "errors" + "fmt" + "reflect" + "strings" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/kallsyms" + "github.com/cilium/ebpf/internal/kconfig" + "github.com/cilium/ebpf/internal/linux" + "github.com/cilium/ebpf/internal/sys" +) + +// CollectionOptions control loading a collection into the kernel. +// +// Maps and Programs are passed to NewMapWithOptions and NewProgramsWithOptions. +type CollectionOptions struct { + Maps MapOptions + Programs ProgramOptions + + // MapReplacements takes a set of Maps that will be used instead of + // creating new ones when loading the CollectionSpec. + // + // For each given Map, there must be a corresponding MapSpec in + // CollectionSpec.Maps, and its type, key/value size, max entries and flags + // must match the values of the MapSpec. + // + // The given Maps are Clone()d before being used in the Collection, so the + // caller can Close() them freely when they are no longer needed. + MapReplacements map[string]*Map +} + +// CollectionSpec describes a collection. +type CollectionSpec struct { + Maps map[string]*MapSpec + Programs map[string]*ProgramSpec + + // Variables refer to global variables declared in the ELF. They can be read + // and modified freely before loading the Collection. Modifying them after + // loading has no effect on a running eBPF program. + Variables map[string]*VariableSpec + + // Types holds type information about Maps and Programs. + // Modifications to Types are currently undefined behaviour. + Types *btf.Spec + + // ByteOrder specifies whether the ELF was compiled for + // big-endian or little-endian architectures. + ByteOrder binary.ByteOrder +} + +// Copy returns a recursive copy of the spec. +func (cs *CollectionSpec) Copy() *CollectionSpec { + if cs == nil { + return nil + } + + cpy := CollectionSpec{ + Maps: make(map[string]*MapSpec, len(cs.Maps)), + Programs: make(map[string]*ProgramSpec, len(cs.Programs)), + Variables: make(map[string]*VariableSpec, len(cs.Variables)), + ByteOrder: cs.ByteOrder, + Types: cs.Types.Copy(), + } + + for name, spec := range cs.Maps { + cpy.Maps[name] = spec.Copy() + } + + for name, spec := range cs.Programs { + cpy.Programs[name] = spec.Copy() + } + + for name, spec := range cs.Variables { + cpy.Variables[name] = spec.copy(&cpy) + } + + return &cpy +} + +// RewriteMaps replaces all references to specific maps. +// +// Use this function to use pre-existing maps instead of creating new ones +// when calling NewCollection. Any named maps are removed from CollectionSpec.Maps. +// +// Returns an error if a named map isn't used in at least one program. +// +// Deprecated: Pass CollectionOptions.MapReplacements when loading the Collection +// instead. +func (cs *CollectionSpec) RewriteMaps(maps map[string]*Map) error { + for symbol, m := range maps { + // have we seen a program that uses this symbol / map + seen := false + for progName, progSpec := range cs.Programs { + err := progSpec.Instructions.AssociateMap(symbol, m) + + switch { + case err == nil: + seen = true + + case errors.Is(err, asm.ErrUnreferencedSymbol): + // Not all programs need to use the map + + default: + return fmt.Errorf("program %s: %w", progName, err) + } + } + + if !seen { + return fmt.Errorf("map %s not referenced by any programs", symbol) + } + + // Prevent NewCollection from creating rewritten maps + delete(cs.Maps, symbol) + } + + return nil +} + +// MissingConstantsError is returned by [CollectionSpec.RewriteConstants]. +type MissingConstantsError struct { + // The constants missing from .rodata. + Constants []string +} + +func (m *MissingConstantsError) Error() string { + return fmt.Sprintf("some constants are missing from .rodata: %s", strings.Join(m.Constants, ", ")) +} + +// RewriteConstants replaces the value of multiple constants. +// +// The constant must be defined like so in the C program: +// +// volatile const type foobar; +// volatile const type foobar = default; +// +// Replacement values must be of the same length as the C sizeof(type). +// If necessary, they are marshalled according to the same rules as +// map values. +// +// From Linux 5.5 the verifier will use constants to eliminate dead code. +// +// Returns an error wrapping [MissingConstantsError] if a constant doesn't exist. +// +// Deprecated: Use [CollectionSpec.Variables] to interact with constants instead. +// RewriteConstants is now a wrapper around the VariableSpec API. +func (cs *CollectionSpec) RewriteConstants(consts map[string]interface{}) error { + var missing []string + for n, c := range consts { + v, ok := cs.Variables[n] + if !ok { + missing = append(missing, n) + continue + } + + if !v.Constant() { + return fmt.Errorf("variable %s is not a constant", n) + } + + if err := v.Set(c); err != nil { + return fmt.Errorf("rewriting constant %s: %w", n, err) + } + } + + if len(missing) != 0 { + return fmt.Errorf("rewrite constants: %w", &MissingConstantsError{Constants: missing}) + } + + return nil +} + +// Assign the contents of a CollectionSpec to a struct. +// +// This function is a shortcut to manually checking the presence +// of maps and programs in a CollectionSpec. Consider using bpf2go +// if this sounds useful. +// +// 'to' must be a pointer to a struct. A field of the +// struct is updated with values from Programs, Maps or Variables if it +// has an `ebpf` tag and its type is *ProgramSpec, *MapSpec or *VariableSpec. +// The tag's value specifies the name of the program or map as +// found in the CollectionSpec. +// +// struct { +// Foo *ebpf.ProgramSpec `ebpf:"xdp_foo"` +// Bar *ebpf.MapSpec `ebpf:"bar_map"` +// Var *ebpf.VariableSpec `ebpf:"some_var"` +// Ignored int +// } +// +// Returns an error if any of the eBPF objects can't be found, or +// if the same Spec is assigned multiple times. +func (cs *CollectionSpec) Assign(to interface{}) error { + getValue := func(typ reflect.Type, name string) (interface{}, error) { + switch typ { + case reflect.TypeOf((*ProgramSpec)(nil)): + if p := cs.Programs[name]; p != nil { + return p, nil + } + return nil, fmt.Errorf("missing program %q", name) + + case reflect.TypeOf((*MapSpec)(nil)): + if m := cs.Maps[name]; m != nil { + return m, nil + } + return nil, fmt.Errorf("missing map %q", name) + + case reflect.TypeOf((*VariableSpec)(nil)): + if v := cs.Variables[name]; v != nil { + return v, nil + } + return nil, fmt.Errorf("missing variable %q", name) + + default: + return nil, fmt.Errorf("unsupported type %s", typ) + } + } + + return assignValues(to, getValue) +} + +// LoadAndAssign loads Maps and Programs into the kernel and assigns them +// to a struct. +// +// Omitting Map/Program.Close() during application shutdown is an error. +// See the package documentation for details around Map and Program lifecycle. +// +// This function is a shortcut to manually checking the presence +// of maps and programs in a CollectionSpec. Consider using bpf2go +// if this sounds useful. +// +// 'to' must be a pointer to a struct. A field of the struct is updated with +// a Program or Map if it has an `ebpf` tag and its type is *Program or *Map. +// The tag's value specifies the name of the program or map as found in the +// CollectionSpec. Before updating the struct, the requested objects and their +// dependent resources are loaded into the kernel and populated with values if +// specified. +// +// struct { +// Foo *ebpf.Program `ebpf:"xdp_foo"` +// Bar *ebpf.Map `ebpf:"bar_map"` +// Ignored int +// } +// +// opts may be nil. +// +// Returns an error if any of the fields can't be found, or +// if the same Map or Program is assigned multiple times. +func (cs *CollectionSpec) LoadAndAssign(to interface{}, opts *CollectionOptions) error { + loader, err := newCollectionLoader(cs, opts) + if err != nil { + return err + } + defer loader.close() + + // Support assigning Programs and Maps, lazy-loading the required objects. + assignedMaps := make(map[string]bool) + assignedProgs := make(map[string]bool) + assignedVars := make(map[string]bool) + + getValue := func(typ reflect.Type, name string) (interface{}, error) { + switch typ { + + case reflect.TypeOf((*Program)(nil)): + assignedProgs[name] = true + return loader.loadProgram(name) + + case reflect.TypeOf((*Map)(nil)): + assignedMaps[name] = true + return loader.loadMap(name) + + case reflect.TypeOf((*Variable)(nil)): + assignedVars[name] = true + return loader.loadVariable(name) + + default: + return nil, fmt.Errorf("unsupported type %s", typ) + } + } + + // Load the Maps and Programs requested by the annotated struct. + if err := assignValues(to, getValue); err != nil { + return err + } + + // Populate the requested maps. Has a chance of lazy-loading other dependent maps. + if err := loader.populateDeferredMaps(); err != nil { + return err + } + + // Evaluate the loader's objects after all (lazy)loading has taken place. + for n, m := range loader.maps { + switch m.typ { + case ProgramArray: + // Require all lazy-loaded ProgramArrays to be assigned to the given object. + // The kernel empties a ProgramArray once the last user space reference + // to it closes, which leads to failed tail calls. Combined with the library + // closing map fds via GC finalizers this can lead to surprising behaviour. + // Only allow unassigned ProgramArrays when the library hasn't pre-populated + // any entries from static value declarations. At this point, we know the map + // is empty and there's no way for the caller to interact with the map going + // forward. + if !assignedMaps[n] && len(cs.Maps[n].Contents) > 0 { + return fmt.Errorf("ProgramArray %s must be assigned to prevent missed tail calls", n) + } + } + } + + // Prevent loader.cleanup() from closing assigned Maps and Programs. + for m := range assignedMaps { + delete(loader.maps, m) + } + for p := range assignedProgs { + delete(loader.programs, p) + } + for p := range assignedVars { + delete(loader.vars, p) + } + + return nil +} + +// Collection is a collection of live BPF resources present in the kernel. +type Collection struct { + Programs map[string]*Program + Maps map[string]*Map + + // Variables contains global variables used by the Collection's program(s). On + // kernels older than 5.5, most interactions with Variables return + // [ErrNotSupported]. + Variables map[string]*Variable +} + +// NewCollection creates a Collection from the given spec, creating and +// loading its declared resources into the kernel. +// +// Omitting Collection.Close() during application shutdown is an error. +// See the package documentation for details around Map and Program lifecycle. +func NewCollection(spec *CollectionSpec) (*Collection, error) { + return NewCollectionWithOptions(spec, CollectionOptions{}) +} + +// NewCollectionWithOptions creates a Collection from the given spec using +// options, creating and loading its declared resources into the kernel. +// +// Omitting Collection.Close() during application shutdown is an error. +// See the package documentation for details around Map and Program lifecycle. +func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) { + loader, err := newCollectionLoader(spec, &opts) + if err != nil { + return nil, err + } + defer loader.close() + + // Create maps first, as their fds need to be linked into programs. + for mapName := range spec.Maps { + if _, err := loader.loadMap(mapName); err != nil { + return nil, err + } + } + + for progName, prog := range spec.Programs { + if prog.Type == UnspecifiedProgram { + continue + } + + if _, err := loader.loadProgram(progName); err != nil { + return nil, err + } + } + + for varName := range spec.Variables { + if _, err := loader.loadVariable(varName); err != nil { + return nil, err + } + } + + // Maps can contain Program and Map stubs, so populate them after + // all Maps and Programs have been successfully loaded. + if err := loader.populateDeferredMaps(); err != nil { + return nil, err + } + + // Prevent loader.cleanup from closing maps, programs and vars. + maps, progs, vars := loader.maps, loader.programs, loader.vars + loader.maps, loader.programs, loader.vars = nil, nil, nil + + return &Collection{ + progs, + maps, + vars, + }, nil +} + +type collectionLoader struct { + coll *CollectionSpec + opts *CollectionOptions + maps map[string]*Map + programs map[string]*Program + vars map[string]*Variable +} + +func newCollectionLoader(coll *CollectionSpec, opts *CollectionOptions) (*collectionLoader, error) { + if opts == nil { + opts = &CollectionOptions{} + } + + // Check for existing MapSpecs in the CollectionSpec for all provided replacement maps. + for name := range opts.MapReplacements { + if _, ok := coll.Maps[name]; !ok { + return nil, fmt.Errorf("replacement map %s not found in CollectionSpec", name) + } + } + + if err := populateKallsyms(coll.Programs); err != nil { + return nil, fmt.Errorf("populating kallsyms caches: %w", err) + } + + return &collectionLoader{ + coll, + opts, + make(map[string]*Map), + make(map[string]*Program), + make(map[string]*Variable), + }, nil +} + +// populateKallsyms populates kallsyms caches, making lookups cheaper later on +// during individual program loading. Since we have less context available +// at those stages, we batch the lookups here instead to avoid redundant work. +func populateKallsyms(progs map[string]*ProgramSpec) error { + // Look up associated kernel modules for all symbols referenced by + // ProgramSpec.AttachTo for program types that support attaching to kmods. + mods := make(map[string]string) + for _, p := range progs { + if p.AttachTo != "" && p.targetsKernelModule() { + mods[p.AttachTo] = "" + } + } + if len(mods) != 0 { + if err := kallsyms.AssignModules(mods); err != nil { + return fmt.Errorf("getting modules from kallsyms: %w", err) + } + } + + // Look up addresses of all kernel symbols referenced by all programs. + addrs := make(map[string]uint64) + for _, p := range progs { + iter := p.Instructions.Iterate() + for iter.Next() { + ins := iter.Ins + meta, _ := ins.Metadata.Get(ksymMetaKey{}).(*ksymMeta) + if meta != nil { + addrs[meta.Name] = 0 + } + } + } + if len(addrs) != 0 { + if err := kallsyms.AssignAddresses(addrs); err != nil { + return fmt.Errorf("getting addresses from kallsyms: %w", err) + } + } + + return nil +} + +// close all resources left over in the collectionLoader. +func (cl *collectionLoader) close() { + for _, m := range cl.maps { + m.Close() + } + for _, p := range cl.programs { + p.Close() + } +} + +func (cl *collectionLoader) loadMap(mapName string) (*Map, error) { + if m := cl.maps[mapName]; m != nil { + return m, nil + } + + mapSpec := cl.coll.Maps[mapName] + if mapSpec == nil { + return nil, fmt.Errorf("missing map %s", mapName) + } + + mapSpec = mapSpec.Copy() + + // Defer setting the mmapable flag on maps until load time. This avoids the + // MapSpec having different flags on some kernel versions. Also avoid running + // syscalls during ELF loading, so platforms like wasm can also parse an ELF. + if isDataSection(mapSpec.Name) && haveMmapableMaps() == nil { + mapSpec.Flags |= sys.BPF_F_MMAPABLE + } + + if replaceMap, ok := cl.opts.MapReplacements[mapName]; ok { + // Check compatibility with the replacement map after setting + // feature-dependent map flags. + if err := mapSpec.Compatible(replaceMap); err != nil { + return nil, fmt.Errorf("using replacement map %s: %w", mapSpec.Name, err) + } + + // Clone the map to avoid closing user's map later on. + m, err := replaceMap.Clone() + if err != nil { + return nil, err + } + + cl.maps[mapName] = m + return m, nil + } + + m, err := newMapWithOptions(mapSpec, cl.opts.Maps) + if err != nil { + return nil, fmt.Errorf("map %s: %w", mapName, err) + } + + // Finalize 'scalar' maps that don't refer to any other eBPF resources + // potentially pending creation. This is needed for frozen maps like .rodata + // that need to be finalized before invoking the verifier. + if !mapSpec.Type.canStoreMapOrProgram() { + if err := m.finalize(mapSpec); err != nil { + return nil, fmt.Errorf("finalizing map %s: %w", mapName, err) + } + } + + cl.maps[mapName] = m + return m, nil +} + +func (cl *collectionLoader) loadProgram(progName string) (*Program, error) { + if prog := cl.programs[progName]; prog != nil { + return prog, nil + } + + progSpec := cl.coll.Programs[progName] + if progSpec == nil { + return nil, fmt.Errorf("unknown program %s", progName) + } + + // Bail out early if we know the kernel is going to reject the program. + // This skips loading map dependencies, saving some cleanup work later. + if progSpec.Type == UnspecifiedProgram { + return nil, fmt.Errorf("cannot load program %s: program type is unspecified", progName) + } + + progSpec = progSpec.Copy() + + // Rewrite any reference to a valid map in the program's instructions, + // which includes all of its dependencies. + for i := range progSpec.Instructions { + ins := &progSpec.Instructions[i] + + if !ins.IsLoadFromMap() || ins.Reference() == "" { + continue + } + + // Don't overwrite map loads containing non-zero map fd's, + // they can be manually included by the caller. + // Map FDs/IDs are placed in the lower 32 bits of Constant. + if int32(ins.Constant) > 0 { + continue + } + + m, err := cl.loadMap(ins.Reference()) + if err != nil { + return nil, fmt.Errorf("program %s: %w", progName, err) + } + + if err := ins.AssociateMap(m); err != nil { + return nil, fmt.Errorf("program %s: map %s: %w", progName, ins.Reference(), err) + } + } + + prog, err := newProgramWithOptions(progSpec, cl.opts.Programs) + if err != nil { + return nil, fmt.Errorf("program %s: %w", progName, err) + } + + cl.programs[progName] = prog + return prog, nil +} + +func (cl *collectionLoader) loadVariable(varName string) (*Variable, error) { + if v := cl.vars[varName]; v != nil { + return v, nil + } + + varSpec := cl.coll.Variables[varName] + if varSpec == nil { + return nil, fmt.Errorf("unknown variable %s", varName) + } + + // Get the key of the VariableSpec's MapSpec in the CollectionSpec. + var mapName string + for n, ms := range cl.coll.Maps { + if ms == varSpec.m { + mapName = n + break + } + } + if mapName == "" { + return nil, fmt.Errorf("variable %s: underlying MapSpec %s was removed from CollectionSpec", varName, varSpec.m.Name) + } + + m, err := cl.loadMap(mapName) + if err != nil { + return nil, fmt.Errorf("variable %s: %w", varName, err) + } + + // If the kernel is too old or the underlying map was created without + // BPF_F_MMAPABLE, [Map.Memory] will return ErrNotSupported. In this case, + // emit a Variable with a nil Memory. This keeps Collection{Spec}.Variables + // consistent across systems with different feature sets without breaking + // LoadAndAssign. + mm, err := m.Memory() + if err != nil && !errors.Is(err, ErrNotSupported) { + return nil, fmt.Errorf("variable %s: getting memory for map %s: %w", varName, mapName, err) + } + + v, err := newVariable( + varSpec.name, + varSpec.offset, + varSpec.size, + varSpec.t, + mm, + ) + if err != nil { + return nil, fmt.Errorf("variable %s: %w", varName, err) + } + + cl.vars[varName] = v + return v, nil +} + +// populateDeferredMaps iterates maps holding programs or other maps and loads +// any dependencies. Populates all maps in cl and freezes them if specified. +func (cl *collectionLoader) populateDeferredMaps() error { + for mapName, m := range cl.maps { + mapSpec, ok := cl.coll.Maps[mapName] + if !ok { + return fmt.Errorf("missing map spec %s", mapName) + } + + // Scalar maps without Map or Program references are finalized during + // creation. Don't finalize them again. + if !mapSpec.Type.canStoreMapOrProgram() { + continue + } + + mapSpec = mapSpec.Copy() + + // MapSpecs that refer to inner maps or programs within the same + // CollectionSpec do so using strings. These strings are used as the key + // to look up the respective object in the Maps or Programs fields. + // Resolve those references to actual Map or Program resources that + // have been loaded into the kernel. + for i, kv := range mapSpec.Contents { + objName, ok := kv.Value.(string) + if !ok { + continue + } + + switch t := mapSpec.Type; { + case t.canStoreProgram(): + // loadProgram is idempotent and could return an existing Program. + prog, err := cl.loadProgram(objName) + if err != nil { + return fmt.Errorf("loading program %s, for map %s: %w", objName, mapName, err) + } + mapSpec.Contents[i] = MapKV{kv.Key, prog} + + case t.canStoreMap(): + // loadMap is idempotent and could return an existing Map. + innerMap, err := cl.loadMap(objName) + if err != nil { + return fmt.Errorf("loading inner map %s, for map %s: %w", objName, mapName, err) + } + mapSpec.Contents[i] = MapKV{kv.Key, innerMap} + } + } + + // Populate and freeze the map if specified. + if err := m.finalize(mapSpec); err != nil { + return fmt.Errorf("populating map %s: %w", mapName, err) + } + } + + return nil +} + +// resolveKconfig resolves all variables declared in .kconfig and populates +// m.Contents. Does nothing if the given m.Contents is non-empty. +func resolveKconfig(m *MapSpec) error { + ds, ok := m.Value.(*btf.Datasec) + if !ok { + return errors.New("map value is not a Datasec") + } + + type configInfo struct { + offset uint32 + size uint32 + typ btf.Type + } + + configs := make(map[string]configInfo) + + data := make([]byte, ds.Size) + for _, vsi := range ds.Vars { + v := vsi.Type.(*btf.Var) + n := v.TypeName() + + switch n { + case "LINUX_KERNEL_VERSION": + if integer, ok := v.Type.(*btf.Int); !ok || integer.Size != 4 { + return fmt.Errorf("variable %s must be a 32 bits integer, got %s", n, v.Type) + } + + kv, err := linux.KernelVersion() + if err != nil { + return fmt.Errorf("getting kernel version: %w", err) + } + internal.NativeEndian.PutUint32(data[vsi.Offset:], kv.Kernel()) + + case "LINUX_HAS_SYSCALL_WRAPPER": + integer, ok := v.Type.(*btf.Int) + if !ok { + return fmt.Errorf("variable %s must be an integer, got %s", n, v.Type) + } + var value uint64 = 1 + if err := haveSyscallWrapper(); errors.Is(err, ErrNotSupported) { + value = 0 + } else if err != nil { + return fmt.Errorf("unable to derive a value for LINUX_HAS_SYSCALL_WRAPPER: %w", err) + } + + if err := kconfig.PutInteger(data[vsi.Offset:], integer, value); err != nil { + return fmt.Errorf("set LINUX_HAS_SYSCALL_WRAPPER: %w", err) + } + + default: // Catch CONFIG_*. + configs[n] = configInfo{ + offset: vsi.Offset, + size: vsi.Size, + typ: v.Type, + } + } + } + + // We only parse kconfig file if a CONFIG_* variable was found. + if len(configs) > 0 { + f, err := linux.FindKConfig() + if err != nil { + return fmt.Errorf("cannot find a kconfig file: %w", err) + } + defer f.Close() + + filter := make(map[string]struct{}, len(configs)) + for config := range configs { + filter[config] = struct{}{} + } + + kernelConfig, err := kconfig.Parse(f, filter) + if err != nil { + return fmt.Errorf("cannot parse kconfig file: %w", err) + } + + for n, info := range configs { + value, ok := kernelConfig[n] + if !ok { + return fmt.Errorf("config option %q does not exist on this kernel", n) + } + + err := kconfig.PutValue(data[info.offset:info.offset+info.size], info.typ, value) + if err != nil { + return fmt.Errorf("problem adding value for %s: %w", n, err) + } + } + } + + m.Contents = []MapKV{{uint32(0), data}} + + return nil +} + +// LoadCollection reads an object file and creates and loads its declared +// resources into the kernel. +// +// Omitting Collection.Close() during application shutdown is an error. +// See the package documentation for details around Map and Program lifecycle. +func LoadCollection(file string) (*Collection, error) { + spec, err := LoadCollectionSpec(file) + if err != nil { + return nil, err + } + return NewCollection(spec) +} + +// Assign the contents of a Collection to a struct. +// +// This function bridges functionality between bpf2go generated +// code and any functionality better implemented in Collection. +// +// 'to' must be a pointer to a struct. A field of the +// struct is updated with values from Programs or Maps if it +// has an `ebpf` tag and its type is *Program or *Map. +// The tag's value specifies the name of the program or map as +// found in the CollectionSpec. +// +// struct { +// Foo *ebpf.Program `ebpf:"xdp_foo"` +// Bar *ebpf.Map `ebpf:"bar_map"` +// Ignored int +// } +// +// Returns an error if any of the eBPF objects can't be found, or +// if the same Map or Program is assigned multiple times. +// +// Ownership and Close()ing responsibility is transferred to `to` +// for any successful assigns. On error `to` is left in an undefined state. +func (coll *Collection) Assign(to interface{}) error { + assignedMaps := make(map[string]bool) + assignedProgs := make(map[string]bool) + assignedVars := make(map[string]bool) + + // Assign() only transfers already-loaded Maps and Programs. No extra + // loading is done. + getValue := func(typ reflect.Type, name string) (interface{}, error) { + switch typ { + + case reflect.TypeOf((*Program)(nil)): + if p := coll.Programs[name]; p != nil { + assignedProgs[name] = true + return p, nil + } + return nil, fmt.Errorf("missing program %q", name) + + case reflect.TypeOf((*Map)(nil)): + if m := coll.Maps[name]; m != nil { + assignedMaps[name] = true + return m, nil + } + return nil, fmt.Errorf("missing map %q", name) + + case reflect.TypeOf((*Variable)(nil)): + if v := coll.Variables[name]; v != nil { + assignedVars[name] = true + return v, nil + } + return nil, fmt.Errorf("missing variable %q", name) + + default: + return nil, fmt.Errorf("unsupported type %s", typ) + } + } + + if err := assignValues(to, getValue); err != nil { + return err + } + + // Finalize ownership transfer + for p := range assignedProgs { + delete(coll.Programs, p) + } + for m := range assignedMaps { + delete(coll.Maps, m) + } + for s := range assignedVars { + delete(coll.Variables, s) + } + + return nil +} + +// Close frees all maps and programs associated with the collection. +// +// The collection mustn't be used afterwards. +func (coll *Collection) Close() { + for _, prog := range coll.Programs { + prog.Close() + } + for _, m := range coll.Maps { + m.Close() + } +} + +// DetachMap removes the named map from the Collection. +// +// This means that a later call to Close() will not affect this map. +// +// Returns nil if no map of that name exists. +func (coll *Collection) DetachMap(name string) *Map { + m := coll.Maps[name] + delete(coll.Maps, name) + return m +} + +// DetachProgram removes the named program from the Collection. +// +// This means that a later call to Close() will not affect this program. +// +// Returns nil if no program of that name exists. +func (coll *Collection) DetachProgram(name string) *Program { + p := coll.Programs[name] + delete(coll.Programs, name) + return p +} + +// structField represents a struct field containing the ebpf struct tag. +type structField struct { + reflect.StructField + value reflect.Value +} + +// ebpfFields extracts field names tagged with 'ebpf' from a struct type. +// Keep track of visited types to avoid infinite recursion. +func ebpfFields(structVal reflect.Value, visited map[reflect.Type]bool) ([]structField, error) { + if visited == nil { + visited = make(map[reflect.Type]bool) + } + + structType := structVal.Type() + if structType.Kind() != reflect.Struct { + return nil, fmt.Errorf("%s is not a struct", structType) + } + + if visited[structType] { + return nil, fmt.Errorf("recursion on type %s", structType) + } + + fields := make([]structField, 0, structType.NumField()) + for i := 0; i < structType.NumField(); i++ { + field := structField{structType.Field(i), structVal.Field(i)} + + // If the field is tagged, gather it and move on. + name := field.Tag.Get("ebpf") + if name != "" { + fields = append(fields, field) + continue + } + + // If the field does not have an ebpf tag, but is a struct or a pointer + // to a struct, attempt to gather its fields as well. + var v reflect.Value + switch field.Type.Kind() { + case reflect.Ptr: + if field.Type.Elem().Kind() != reflect.Struct { + continue + } + + if field.value.IsNil() { + return nil, fmt.Errorf("nil pointer to %s", structType) + } + + // Obtain the destination type of the pointer. + v = field.value.Elem() + + case reflect.Struct: + // Reference the value's type directly. + v = field.value + + default: + continue + } + + inner, err := ebpfFields(v, visited) + if err != nil { + return nil, fmt.Errorf("field %s: %w", field.Name, err) + } + + fields = append(fields, inner...) + } + + return fields, nil +} + +// assignValues attempts to populate all fields of 'to' tagged with 'ebpf'. +// +// getValue is called for every tagged field of 'to' and must return the value +// to be assigned to the field with the given typ and name. +func assignValues(to interface{}, + getValue func(typ reflect.Type, name string) (interface{}, error)) error { + + toValue := reflect.ValueOf(to) + if toValue.Type().Kind() != reflect.Ptr { + return fmt.Errorf("%T is not a pointer to struct", to) + } + + if toValue.IsNil() { + return fmt.Errorf("nil pointer to %T", to) + } + + fields, err := ebpfFields(toValue.Elem(), nil) + if err != nil { + return err + } + + type elem struct { + // Either *Map or *Program + typ reflect.Type + name string + } + + assigned := make(map[elem]string) + for _, field := range fields { + // Get string value the field is tagged with. + tag := field.Tag.Get("ebpf") + if strings.Contains(tag, ",") { + return fmt.Errorf("field %s: ebpf tag contains a comma", field.Name) + } + + // Check if the eBPF object with the requested + // type and tag was already assigned elsewhere. + e := elem{field.Type, tag} + if af := assigned[e]; af != "" { + return fmt.Errorf("field %s: object %q was already assigned to %s", field.Name, tag, af) + } + + // Get the eBPF object referred to by the tag. + value, err := getValue(field.Type, tag) + if err != nil { + return fmt.Errorf("field %s: %w", field.Name, err) + } + + if !field.value.CanSet() { + return fmt.Errorf("field %s: can't set value", field.Name) + } + field.value.Set(reflect.ValueOf(value)) + + assigned[e] = field.Name + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/cpu.go b/vendor/github.com/cilium/ebpf/cpu.go new file mode 100644 index 0000000000..07e959efdc --- /dev/null +++ b/vendor/github.com/cilium/ebpf/cpu.go @@ -0,0 +1,66 @@ +package ebpf + +import ( + "fmt" + "os" + "strings" + "sync" +) + +var possibleCPU = sync.OnceValues(func() (int, error) { + return parseCPUsFromFile("/sys/devices/system/cpu/possible") +}) + +// PossibleCPU returns the max number of CPUs a system may possibly have +// Logical CPU numbers must be of the form 0-n +func PossibleCPU() (int, error) { + return possibleCPU() +} + +// MustPossibleCPU is a helper that wraps a call to PossibleCPU and panics if +// the error is non-nil. +func MustPossibleCPU() int { + cpus, err := PossibleCPU() + if err != nil { + panic(err) + } + return cpus +} + +func parseCPUsFromFile(path string) (int, error) { + spec, err := os.ReadFile(path) + if err != nil { + return 0, err + } + + n, err := parseCPUs(string(spec)) + if err != nil { + return 0, fmt.Errorf("can't parse %s: %v", path, err) + } + + return n, nil +} + +// parseCPUs parses the number of cpus from a string produced +// by bitmap_list_string() in the Linux kernel. +// Multiple ranges are rejected, since they can't be unified +// into a single number. +// This is the format of /sys/devices/system/cpu/possible, it +// is not suitable for /sys/devices/system/cpu/online, etc. +func parseCPUs(spec string) (int, error) { + if strings.Trim(spec, "\n") == "0" { + return 1, nil + } + + var low, high int + n, err := fmt.Sscanf(spec, "%d-%d\n", &low, &high) + if n != 2 || err != nil { + return 0, fmt.Errorf("invalid format: %s", spec) + } + if low != 0 { + return 0, fmt.Errorf("CPU spec doesn't start at zero: %s", spec) + } + + // cpus is 0 indexed + return high + 1, nil +} diff --git a/vendor/github.com/cilium/ebpf/doc.go b/vendor/github.com/cilium/ebpf/doc.go new file mode 100644 index 0000000000..396b3394d3 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/doc.go @@ -0,0 +1,25 @@ +// Package ebpf is a toolkit for working with eBPF programs. +// +// eBPF programs are small snippets of code which are executed directly +// in a VM in the Linux kernel, which makes them very fast and flexible. +// Many Linux subsystems now accept eBPF programs. This makes it possible +// to implement highly application specific logic inside the kernel, +// without having to modify the actual kernel itself. +// +// This package is designed for long-running processes which +// want to use eBPF to implement part of their application logic. It has no +// run-time dependencies outside of the library and the Linux kernel itself. +// eBPF code should be compiled ahead of time using clang, and shipped with +// your application as any other resource. +// +// Use the link subpackage to attach a loaded program to a hook in the kernel. +// +// Note that losing all references to Map and Program resources will cause +// their underlying file descriptors to be closed, potentially removing those +// objects from the kernel. Always retain a reference by e.g. deferring a +// Close() of a Collection or LoadAndAssign object until application exit. +// +// Special care needs to be taken when handling maps of type ProgramArray, +// as the kernel erases its contents when the last userspace or bpffs +// reference disappears, regardless of the map being in active use. +package ebpf diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go new file mode 100644 index 0000000000..9e8dbc7ae5 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -0,0 +1,1457 @@ +package ebpf + +import ( + "bufio" + "bytes" + "debug/elf" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "os" + "strings" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" +) + +type kconfigMetaKey struct{} + +type kconfigMeta struct { + Map *MapSpec + Offset uint32 +} + +type kfuncMetaKey struct{} + +type kfuncMeta struct { + Binding elf.SymBind + Func *btf.Func +} + +type ksymMetaKey struct{} + +type ksymMeta struct { + Binding elf.SymBind + Name string +} + +// elfCode is a convenience to reduce the amount of arguments that have to +// be passed around explicitly. You should treat its contents as immutable. +type elfCode struct { + *internal.SafeELFFile + sections map[elf.SectionIndex]*elfSection + license string + version uint32 + btf *btf.Spec + extInfo *btf.ExtInfos + maps map[string]*MapSpec + vars map[string]*VariableSpec + kfuncs map[string]*btf.Func + ksyms map[string]struct{} + kconfig *MapSpec +} + +// LoadCollectionSpec parses an ELF file into a CollectionSpec. +func LoadCollectionSpec(file string) (*CollectionSpec, error) { + f, err := os.Open(file) + if err != nil { + return nil, err + } + defer f.Close() + + spec, err := LoadCollectionSpecFromReader(f) + if err != nil { + return nil, fmt.Errorf("file %s: %w", file, err) + } + return spec, nil +} + +// LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec. +func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) { + f, err := internal.NewSafeELFFile(rd) + if err != nil { + return nil, err + } + + // Checks if the ELF file is for BPF data. + // Old LLVM versions set e_machine to EM_NONE. + if f.File.Machine != elf.EM_NONE && f.File.Machine != elf.EM_BPF { + return nil, fmt.Errorf("unexpected machine type for BPF ELF: %s", f.File.Machine) + } + + var ( + licenseSection *elf.Section + versionSection *elf.Section + sections = make(map[elf.SectionIndex]*elfSection) + relSections = make(map[elf.SectionIndex]*elf.Section) + ) + + // This is the target of relocations generated by inline assembly. + sections[elf.SHN_UNDEF] = newElfSection(new(elf.Section), undefSection) + + // Collect all the sections we're interested in. This includes relocations + // which we parse later. + // + // Keep the documentation at docs/ebpf/loading/elf-sections.md up-to-date. + for i, sec := range f.Sections { + idx := elf.SectionIndex(i) + + switch { + case strings.HasPrefix(sec.Name, "license"): + licenseSection = sec + case strings.HasPrefix(sec.Name, "version"): + versionSection = sec + case strings.HasPrefix(sec.Name, "maps"): + sections[idx] = newElfSection(sec, mapSection) + case sec.Name == ".maps": + sections[idx] = newElfSection(sec, btfMapSection) + case isDataSection(sec.Name): + sections[idx] = newElfSection(sec, dataSection) + case sec.Type == elf.SHT_REL: + // Store relocations under the section index of the target + relSections[elf.SectionIndex(sec.Info)] = sec + case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0: + sections[idx] = newElfSection(sec, programSection) + } + } + + license, err := loadLicense(licenseSection) + if err != nil { + return nil, fmt.Errorf("load license: %w", err) + } + + version, err := loadVersion(versionSection, f.ByteOrder) + if err != nil { + return nil, fmt.Errorf("load version: %w", err) + } + + btfSpec, btfExtInfo, err := btf.LoadSpecAndExtInfosFromReader(rd) + if err != nil && !errors.Is(err, btf.ErrNotFound) { + return nil, fmt.Errorf("load BTF: %w", err) + } + + ec := &elfCode{ + SafeELFFile: f, + sections: sections, + license: license, + version: version, + btf: btfSpec, + extInfo: btfExtInfo, + maps: make(map[string]*MapSpec), + vars: make(map[string]*VariableSpec), + kfuncs: make(map[string]*btf.Func), + ksyms: make(map[string]struct{}), + } + + symbols, err := f.Symbols() + if err != nil { + return nil, fmt.Errorf("load symbols: %v", err) + } + + ec.assignSymbols(symbols) + + if err := ec.loadRelocations(relSections, symbols); err != nil { + return nil, fmt.Errorf("load relocations: %w", err) + } + + if err := ec.loadMaps(); err != nil { + return nil, fmt.Errorf("load maps: %w", err) + } + + if err := ec.loadBTFMaps(); err != nil { + return nil, fmt.Errorf("load BTF maps: %w", err) + } + + if err := ec.loadDataSections(); err != nil { + return nil, fmt.Errorf("load data sections: %w", err) + } + + if err := ec.loadKconfigSection(); err != nil { + return nil, fmt.Errorf("load virtual .kconfig section: %w", err) + } + + if err := ec.loadKsymsSection(); err != nil { + return nil, fmt.Errorf("load virtual .ksyms section: %w", err) + } + + // Finally, collect programs and link them. + progs, err := ec.loadProgramSections() + if err != nil { + return nil, fmt.Errorf("load programs: %w", err) + } + + return &CollectionSpec{ec.maps, progs, ec.vars, btfSpec, ec.ByteOrder}, nil +} + +func loadLicense(sec *elf.Section) (string, error) { + if sec == nil { + return "", nil + } + + data, err := sec.Data() + if err != nil { + return "", fmt.Errorf("section %s: %v", sec.Name, err) + } + return string(bytes.TrimRight(data, "\000")), nil +} + +func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) { + if sec == nil { + return 0, nil + } + + var version uint32 + if err := binary.Read(sec.Open(), bo, &version); err != nil { + return 0, fmt.Errorf("section %s: %v", sec.Name, err) + } + return version, nil +} + +func isDataSection(name string) bool { + return name == ".bss" || strings.HasPrefix(name, ".data") || strings.HasPrefix(name, ".rodata") +} + +func isConstantDataSection(name string) bool { + return strings.HasPrefix(name, ".rodata") +} + +func isKconfigSection(name string) bool { + return name == ".kconfig" +} + +type elfSectionKind int + +const ( + undefSection elfSectionKind = iota + mapSection + btfMapSection + programSection + dataSection +) + +type elfSection struct { + *elf.Section + kind elfSectionKind + // Offset from the start of the section to a symbol + symbols map[uint64]elf.Symbol + // Offset from the start of the section to a relocation, which points at + // a symbol in another section. + relocations map[uint64]elf.Symbol + // The number of relocations pointing at this section. + references int +} + +func newElfSection(section *elf.Section, kind elfSectionKind) *elfSection { + return &elfSection{ + section, + kind, + make(map[uint64]elf.Symbol), + make(map[uint64]elf.Symbol), + 0, + } +} + +// assignSymbols takes a list of symbols and assigns them to their +// respective sections, indexed by name. +func (ec *elfCode) assignSymbols(symbols []elf.Symbol) { + for _, symbol := range symbols { + symType := elf.ST_TYPE(symbol.Info) + symSection := ec.sections[symbol.Section] + if symSection == nil { + continue + } + + // Anonymous symbols only occur in debug sections which we don't process + // relocations for. Anonymous symbols are not referenced from other sections. + if symbol.Name == "" { + continue + } + + // Older versions of LLVM don't tag symbols correctly, so keep + // all NOTYPE ones. + switch symSection.kind { + case mapSection, btfMapSection, dataSection: + if symType != elf.STT_NOTYPE && symType != elf.STT_OBJECT { + continue + } + case programSection: + if symType != elf.STT_NOTYPE && symType != elf.STT_FUNC { + continue + } + // LLVM emits LBB_ (Local Basic Block) symbols that seem to be jump + // targets within sections, but BPF has no use for them. + if symType == elf.STT_NOTYPE && elf.ST_BIND(symbol.Info) == elf.STB_LOCAL && + strings.HasPrefix(symbol.Name, "LBB") { + continue + } + // Only collect symbols that occur in program/maps/data sections. + default: + continue + } + + symSection.symbols[symbol.Value] = symbol + } +} + +// loadRelocations iterates .rel* sections and extracts relocation entries for +// sections of interest. Makes sure relocations point at valid sections. +func (ec *elfCode) loadRelocations(relSections map[elf.SectionIndex]*elf.Section, symbols []elf.Symbol) error { + for idx, relSection := range relSections { + section := ec.sections[idx] + if section == nil { + continue + } + + rels, err := ec.loadSectionRelocations(relSection, symbols) + if err != nil { + return fmt.Errorf("relocation for section %q: %w", section.Name, err) + } + + for _, rel := range rels { + target := ec.sections[rel.Section] + if target == nil { + return fmt.Errorf("section %q: reference to %q in section %s: %w", section.Name, rel.Name, rel.Section, ErrNotSupported) + } + + target.references++ + } + + section.relocations = rels + } + + return nil +} + +// loadProgramSections iterates ec's sections and emits a ProgramSpec +// for each function it finds. +// +// The resulting map is indexed by function name. +func (ec *elfCode) loadProgramSections() (map[string]*ProgramSpec, error) { + + progs := make(map[string]*ProgramSpec) + + // Generate a ProgramSpec for each function found in each program section. + var export []string + for _, sec := range ec.sections { + if sec.kind != programSection { + continue + } + + if len(sec.symbols) == 0 { + return nil, fmt.Errorf("section %v: missing symbols", sec.Name) + } + + funcs, err := ec.loadFunctions(sec) + if err != nil { + return nil, fmt.Errorf("section %v: %w", sec.Name, err) + } + + progType, attachType, progFlags, attachTo := getProgType(sec.Name) + + for name, insns := range funcs { + spec := &ProgramSpec{ + Name: name, + Type: progType, + Flags: progFlags, + AttachType: attachType, + AttachTo: attachTo, + SectionName: sec.Name, + License: ec.license, + KernelVersion: ec.version, + Instructions: insns, + ByteOrder: ec.ByteOrder, + } + + // Function names must be unique within a single ELF blob. + if progs[name] != nil { + return nil, fmt.Errorf("duplicate program name %s", name) + } + progs[name] = spec + + if spec.SectionName != ".text" { + export = append(export, name) + } + } + } + + flattenPrograms(progs, export) + + // Hide programs (e.g. library functions) that were not explicitly emitted + // to an ELF section. These could be exposed in a separate CollectionSpec + // field later to allow them to be modified. + for n, p := range progs { + if p.SectionName == ".text" { + delete(progs, n) + } + } + + return progs, nil +} + +// loadFunctions extracts instruction streams from the given program section +// starting at each symbol in the section. The section's symbols must already +// be narrowed down to STT_NOTYPE (emitted by clang <8) or STT_FUNC. +// +// The resulting map is indexed by function name. +func (ec *elfCode) loadFunctions(section *elfSection) (map[string]asm.Instructions, error) { + r := bufio.NewReader(section.Open()) + + // Decode the section's instruction stream. + insns := make(asm.Instructions, 0, section.Size/asm.InstructionSize) + if err := insns.Unmarshal(r, ec.ByteOrder); err != nil { + return nil, fmt.Errorf("decoding instructions for section %s: %w", section.Name, err) + } + if len(insns) == 0 { + return nil, fmt.Errorf("no instructions found in section %s", section.Name) + } + + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + offset := iter.Offset.Bytes() + + // Tag Symbol Instructions. + if sym, ok := section.symbols[offset]; ok { + *ins = ins.WithSymbol(sym.Name) + } + + // Apply any relocations for the current instruction. + // If no relocation is present, resolve any section-relative function calls. + if rel, ok := section.relocations[offset]; ok { + if err := ec.relocateInstruction(ins, rel); err != nil { + return nil, fmt.Errorf("offset %d: relocating instruction: %w", offset, err) + } + } else { + if err := referenceRelativeJump(ins, offset, section.symbols); err != nil { + return nil, fmt.Errorf("offset %d: resolving relative jump: %w", offset, err) + } + } + } + + if ec.extInfo != nil { + ec.extInfo.Assign(insns, section.Name) + } + + return splitSymbols(insns) +} + +// referenceRelativeJump turns a relative jump to another bpf subprogram within +// the same ELF section into a Reference Instruction. +// +// Up to LLVM 9, calls to subprograms within the same ELF section are sometimes +// encoded using relative jumps instead of relocation entries. These jumps go +// out of bounds of the current program, so their targets must be memoized +// before the section's instruction stream is split. +// +// The relative jump Constant is blinded to -1 and the target Symbol is set as +// the Instruction's Reference so it can be resolved by the linker. +func referenceRelativeJump(ins *asm.Instruction, offset uint64, symbols map[uint64]elf.Symbol) error { + if !ins.IsFunctionReference() || ins.Constant == -1 { + return nil + } + + tgt := jumpTarget(offset, *ins) + sym := symbols[tgt].Name + if sym == "" { + return fmt.Errorf("no jump target found at offset %d", tgt) + } + + *ins = ins.WithReference(sym) + ins.Constant = -1 + + return nil +} + +// jumpTarget takes ins' offset within an instruction stream (in bytes) +// and returns its absolute jump destination (in bytes) within the +// instruction stream. +func jumpTarget(offset uint64, ins asm.Instruction) uint64 { + // A relative jump instruction describes the amount of raw BPF instructions + // to jump, convert the offset into bytes. + dest := ins.Constant * asm.InstructionSize + + // The starting point of the jump is the end of the current instruction. + dest += int64(offset + asm.InstructionSize) + + if dest < 0 { + return 0 + } + + return uint64(dest) +} + +var errUnsupportedBinding = errors.New("unsupported binding") + +func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) error { + var ( + typ = elf.ST_TYPE(rel.Info) + bind = elf.ST_BIND(rel.Info) + name = rel.Name + ) + + target := ec.sections[rel.Section] + + switch target.kind { + case mapSection, btfMapSection: + if bind == elf.STB_LOCAL { + return fmt.Errorf("possible erroneous static qualifier on map definition: found reference to %q", name) + } + + if bind != elf.STB_GLOBAL { + return fmt.Errorf("map %q: %w: %s", name, errUnsupportedBinding, bind) + } + + if typ != elf.STT_OBJECT && typ != elf.STT_NOTYPE { + // STT_NOTYPE is generated on clang < 8 which doesn't tag + // relocations appropriately. + return fmt.Errorf("map load: incorrect relocation type %v", typ) + } + + ins.Src = asm.PseudoMapFD + + case dataSection: + var offset uint32 + switch typ { + case elf.STT_SECTION: + if bind != elf.STB_LOCAL { + return fmt.Errorf("direct load: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + // This is really a reference to a static symbol, which clang doesn't + // emit a symbol table entry for. Instead it encodes the offset in + // the instruction itself. + offset = uint32(uint64(ins.Constant)) + + case elf.STT_OBJECT: + // LLVM 9 emits OBJECT-LOCAL symbols for anonymous constants. + if bind != elf.STB_GLOBAL && bind != elf.STB_LOCAL && bind != elf.STB_WEAK { + return fmt.Errorf("direct load: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + offset = uint32(rel.Value) + + case elf.STT_NOTYPE: + // LLVM 7 emits NOTYPE-LOCAL symbols for anonymous constants. + if bind != elf.STB_LOCAL { + return fmt.Errorf("direct load: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + offset = uint32(rel.Value) + + default: + return fmt.Errorf("incorrect relocation type %v for direct map load", typ) + } + + // We rely on using the name of the data section as the reference. It + // would be nicer to keep the real name in case of an STT_OBJECT, but + // it's not clear how to encode that into Instruction. + name = target.Name + + // The kernel expects the offset in the second basic BPF instruction. + ins.Constant = int64(uint64(offset) << 32) + ins.Src = asm.PseudoMapValue + + case programSection: + switch opCode := ins.OpCode; { + case opCode.JumpOp() == asm.Call: + if ins.Src != asm.PseudoCall { + return fmt.Errorf("call: %s: incorrect source register", name) + } + + switch typ { + case elf.STT_NOTYPE, elf.STT_FUNC: + if bind != elf.STB_GLOBAL { + return fmt.Errorf("call: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + case elf.STT_SECTION: + if bind != elf.STB_LOCAL { + return fmt.Errorf("call: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + // The function we want to call is in the indicated section, + // at the offset encoded in the instruction itself. Reverse + // the calculation to find the real function we're looking for. + // A value of -1 references the first instruction in the section. + offset := int64(int32(ins.Constant)+1) * asm.InstructionSize + sym, ok := target.symbols[uint64(offset)] + if !ok { + return fmt.Errorf("call: no symbol at offset %d", offset) + } + + name = sym.Name + ins.Constant = -1 + + default: + return fmt.Errorf("call: %s: invalid symbol type %s", name, typ) + } + case opCode.IsDWordLoad(): + switch typ { + case elf.STT_FUNC: + if bind != elf.STB_GLOBAL { + return fmt.Errorf("load: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + case elf.STT_SECTION: + if bind != elf.STB_LOCAL { + return fmt.Errorf("load: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + // ins.Constant already contains the offset in bytes from the + // start of the section. This is different than a call to a + // static function. + + default: + return fmt.Errorf("load: %s: invalid symbol type %s", name, typ) + } + + sym, ok := target.symbols[uint64(ins.Constant)] + if !ok { + return fmt.Errorf("load: no symbol at offset %d", ins.Constant) + } + + name = sym.Name + ins.Constant = -1 + ins.Src = asm.PseudoFunc + + default: + return fmt.Errorf("neither a call nor a load instruction: %v", ins) + } + + // The Undefined section is used for 'virtual' symbols that aren't backed by + // an ELF section. This includes symbol references from inline asm, forward + // function declarations, as well as extern kfunc declarations using __ksym + // and extern kconfig variables declared using __kconfig. + case undefSection: + if bind != elf.STB_GLOBAL && bind != elf.STB_WEAK { + return fmt.Errorf("asm relocation: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + if typ != elf.STT_NOTYPE { + return fmt.Errorf("asm relocation: %s: unsupported type %s", name, typ) + } + + kf := ec.kfuncs[name] + _, ks := ec.ksyms[name] + + switch { + // If a Call / DWordLoad instruction is found and the datasec has a btf.Func with a Name + // that matches the symbol name we mark the instruction as a referencing a kfunc. + case kf != nil && ins.OpCode.JumpOp() == asm.Call: + ins.Metadata.Set(kfuncMetaKey{}, &kfuncMeta{ + Func: kf, + Binding: bind, + }) + + ins.Src = asm.PseudoKfuncCall + ins.Constant = -1 + + case kf != nil && ins.OpCode.IsDWordLoad(): + ins.Metadata.Set(kfuncMetaKey{}, &kfuncMeta{ + Func: kf, + Binding: bind, + }) + + ins.Constant = 0 + + case ks && ins.OpCode.IsDWordLoad(): + if bind != elf.STB_GLOBAL && bind != elf.STB_WEAK { + return fmt.Errorf("asm relocation: %s: %w: %s", name, errUnsupportedBinding, bind) + } + ins.Metadata.Set(ksymMetaKey{}, &ksymMeta{ + Binding: bind, + Name: name, + }) + + // If no kconfig map is found, this must be a symbol reference from inline + // asm (see testdata/loader.c:asm_relocation()) or a call to a forward + // function declaration (see testdata/fwd_decl.c). Don't interfere, These + // remain standard symbol references. + // extern __kconfig reads are represented as dword loads that need to be + // rewritten to pseudo map loads from .kconfig. If the map is present, + // require it to contain the symbol to disambiguate between inline asm + // relos and kconfigs. + case ec.kconfig != nil && ins.OpCode.IsDWordLoad(): + if bind != elf.STB_GLOBAL { + return fmt.Errorf("asm relocation: %s: %w: %s", name, errUnsupportedBinding, bind) + } + + for _, vsi := range ec.kconfig.Value.(*btf.Datasec).Vars { + if vsi.Type.(*btf.Var).Name != rel.Name { + continue + } + + ins.Src = asm.PseudoMapValue + ins.Metadata.Set(kconfigMetaKey{}, &kconfigMeta{ec.kconfig, vsi.Offset}) + return nil + } + + return fmt.Errorf("kconfig %s not found in .kconfig", rel.Name) + } + + default: + return fmt.Errorf("relocation to %q: %w", target.Name, ErrNotSupported) + } + + *ins = ins.WithReference(name) + return nil +} + +func (ec *elfCode) loadMaps() error { + for _, sec := range ec.sections { + if sec.kind != mapSection { + continue + } + + nSym := len(sec.symbols) + if nSym == 0 { + return fmt.Errorf("section %v: no symbols", sec.Name) + } + + if sec.Size%uint64(nSym) != 0 { + return fmt.Errorf("section %v: map descriptors are not of equal size", sec.Name) + } + + var ( + r = bufio.NewReader(sec.Open()) + size = sec.Size / uint64(nSym) + ) + for i, offset := 0, uint64(0); i < nSym; i, offset = i+1, offset+size { + mapSym, ok := sec.symbols[offset] + if !ok { + return fmt.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset) + } + + mapName := mapSym.Name + if ec.maps[mapName] != nil { + return fmt.Errorf("section %v: map %v already exists", sec.Name, mapSym) + } + + lr := io.LimitReader(r, int64(size)) + + spec := MapSpec{ + Name: SanitizeName(mapName, -1), + } + switch { + case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil: + return fmt.Errorf("map %s: missing type", mapName) + case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil: + return fmt.Errorf("map %s: missing key size", mapName) + case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil: + return fmt.Errorf("map %s: missing value size", mapName) + case binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil: + return fmt.Errorf("map %s: missing max entries", mapName) + case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil: + return fmt.Errorf("map %s: missing flags", mapName) + } + + extra, err := io.ReadAll(lr) + if err != nil { + return fmt.Errorf("map %s: reading map tail: %w", mapName, err) + } + if len(extra) > 0 { + spec.Extra = bytes.NewReader(extra) + } + + ec.maps[mapName] = &spec + } + } + + return nil +} + +// loadBTFMaps iterates over all ELF sections marked as BTF map sections +// (like .maps) and parses them into MapSpecs. Dump the .maps section and +// any relocations with `readelf -x .maps -r `. +func (ec *elfCode) loadBTFMaps() error { + for _, sec := range ec.sections { + if sec.kind != btfMapSection { + continue + } + + if ec.btf == nil { + return fmt.Errorf("missing BTF") + } + + // Each section must appear as a DataSec in the ELF's BTF blob. + var ds *btf.Datasec + if err := ec.btf.TypeByName(sec.Name, &ds); err != nil { + return fmt.Errorf("cannot find section '%s' in BTF: %w", sec.Name, err) + } + + // Open a Reader to the ELF's raw section bytes so we can assert that all + // of them are zero on a per-map (per-Var) basis. For now, the section's + // sole purpose is to receive relocations, so all must be zero. + rs := sec.Open() + + for _, vs := range ds.Vars { + // BPF maps are declared as and assigned to global variables, + // so iterate over each Var in the DataSec and validate their types. + v, ok := vs.Type.(*btf.Var) + if !ok { + return fmt.Errorf("section %v: unexpected type %s", sec.Name, vs.Type) + } + name := string(v.Name) + + // The BTF metadata for each Var contains the full length of the map + // declaration, so read the corresponding amount of bytes from the ELF. + // This way, we can pinpoint which map declaration contains unexpected + // (and therefore unsupported) data. + _, err := io.Copy(internal.DiscardZeroes{}, io.LimitReader(rs, int64(vs.Size))) + if err != nil { + return fmt.Errorf("section %v: map %s: initializing BTF map definitions: %w", sec.Name, name, internal.ErrNotSupported) + } + + if ec.maps[name] != nil { + return fmt.Errorf("section %v: map %s already exists", sec.Name, name) + } + + // Each Var representing a BTF map definition contains a Struct. + mapStruct, ok := btf.UnderlyingType(v.Type).(*btf.Struct) + if !ok { + return fmt.Errorf("expected struct, got %s", v.Type) + } + + mapSpec, err := mapSpecFromBTF(sec, &vs, mapStruct, ec.btf, name, false) + if err != nil { + return fmt.Errorf("map %v: %w", name, err) + } + + ec.maps[name] = mapSpec + } + + // Drain the ELF section reader to make sure all bytes are accounted for + // with BTF metadata. + i, err := io.Copy(io.Discard, rs) + if err != nil { + return fmt.Errorf("section %v: unexpected error reading remainder of ELF section: %w", sec.Name, err) + } + if i > 0 { + return fmt.Errorf("section %v: %d unexpected remaining bytes in ELF section, invalid BTF?", sec.Name, i) + } + } + + return nil +} + +// mapSpecFromBTF produces a MapSpec based on a btf.Struct def representing +// a BTF map definition. The name and spec arguments will be copied to the +// resulting MapSpec, and inner must be true on any recursive invocations. +func mapSpecFromBTF(es *elfSection, vs *btf.VarSecinfo, def *btf.Struct, spec *btf.Spec, name string, inner bool) (*MapSpec, error) { + var ( + key, value btf.Type + keySize, valueSize uint32 + mapType MapType + flags, maxEntries uint32 + pinType PinType + innerMapSpec *MapSpec + contents []MapKV + err error + ) + + for i, member := range def.Members { + switch member.Name { + case "type": + mt, err := uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get type: %w", err) + } + mapType = MapType(mt) + + case "map_flags": + flags, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF map flags: %w", err) + } + + case "max_entries": + maxEntries, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF map max entries: %w", err) + } + + case "key": + if keySize != 0 { + return nil, errors.New("both key and key_size given") + } + + pk, ok := member.Type.(*btf.Pointer) + if !ok { + return nil, fmt.Errorf("key type is not a pointer: %T", member.Type) + } + + key = pk.Target + + size, err := btf.Sizeof(pk.Target) + if err != nil { + return nil, fmt.Errorf("can't get size of BTF key: %w", err) + } + + keySize = uint32(size) + + case "value": + if valueSize != 0 { + return nil, errors.New("both value and value_size given") + } + + vk, ok := member.Type.(*btf.Pointer) + if !ok { + return nil, fmt.Errorf("value type is not a pointer: %T", member.Type) + } + + value = vk.Target + + size, err := btf.Sizeof(vk.Target) + if err != nil { + return nil, fmt.Errorf("can't get size of BTF value: %w", err) + } + + valueSize = uint32(size) + + case "key_size": + // Key needs to be nil and keySize needs to be 0 for key_size to be + // considered a valid member. + if key != nil || keySize != 0 { + return nil, errors.New("both key and key_size given") + } + + keySize, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF key size: %w", err) + } + + case "value_size": + // Value needs to be nil and valueSize needs to be 0 for value_size to be + // considered a valid member. + if value != nil || valueSize != 0 { + return nil, errors.New("both value and value_size given") + } + + valueSize, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF value size: %w", err) + } + + case "pinning": + if inner { + return nil, errors.New("inner maps can't be pinned") + } + + pinning, err := uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get pinning: %w", err) + } + + pinType = PinType(pinning) + + case "values": + // The 'values' field in BTF map definitions is used for declaring map + // value types that are references to other BPF objects, like other maps + // or programs. It is always expected to be an array of pointers. + if i != len(def.Members)-1 { + return nil, errors.New("'values' must be the last member in a BTF map definition") + } + + if valueSize != 0 && valueSize != 4 { + return nil, errors.New("value_size must be 0 or 4") + } + valueSize = 4 + + valueType, err := resolveBTFArrayMacro(member.Type) + if err != nil { + return nil, fmt.Errorf("can't resolve type of member 'values': %w", err) + } + + switch t := valueType.(type) { + case *btf.Struct: + // The values member pointing to an array of structs means we're expecting + // a map-in-map declaration. + if mapType != ArrayOfMaps && mapType != HashOfMaps { + return nil, errors.New("outer map needs to be an array or a hash of maps") + } + if inner { + return nil, fmt.Errorf("nested inner maps are not supported") + } + + // This inner map spec is used as a map template, but it needs to be + // created as a traditional map before it can be used to do so. + // libbpf names the inner map template '.inner', but we + // opted for _inner to simplify validation logic. (dots only supported + // on kernels 5.2 and up) + // Pass the BTF spec from the parent object, since both parent and + // child must be created from the same BTF blob (on kernels that support BTF). + innerMapSpec, err = mapSpecFromBTF(es, vs, t, spec, name+"_inner", true) + if err != nil { + return nil, fmt.Errorf("can't parse BTF map definition of inner map: %w", err) + } + + case *btf.FuncProto: + // The values member contains an array of function pointers, meaning an + // autopopulated PROG_ARRAY. + if mapType != ProgramArray { + return nil, errors.New("map needs to be a program array") + } + + default: + return nil, fmt.Errorf("unsupported value type %q in 'values' field", t) + } + + contents, err = resolveBTFValuesContents(es, vs, member) + if err != nil { + return nil, fmt.Errorf("resolving values contents: %w", err) + } + + case "map_extra": + return nil, fmt.Errorf("BTF map definition: field %s: %w", member.Name, ErrNotSupported) + + default: + return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name) + } + } + + // Some maps don't support value sizes, but annotating their map definitions + // with __type macros can still be useful, especially to let bpf2go generate + // type definitions for them. + if value != nil && !mapType.canHaveValueSize() { + valueSize = 0 + } + + return &MapSpec{ + Name: SanitizeName(name, -1), + Type: MapType(mapType), + KeySize: keySize, + ValueSize: valueSize, + MaxEntries: maxEntries, + Flags: flags, + Key: key, + Value: value, + Pinning: pinType, + InnerMap: innerMapSpec, + Contents: contents, + }, nil +} + +// uintFromBTF resolves the __uint macro, which is a pointer to a sized +// array, e.g. for int (*foo)[10], this function will return 10. +func uintFromBTF(typ btf.Type) (uint32, error) { + ptr, ok := typ.(*btf.Pointer) + if !ok { + return 0, fmt.Errorf("not a pointer: %v", typ) + } + + arr, ok := ptr.Target.(*btf.Array) + if !ok { + return 0, fmt.Errorf("not a pointer to array: %v", typ) + } + + return arr.Nelems, nil +} + +// resolveBTFArrayMacro resolves the __array macro, which declares an array +// of pointers to a given type. This function returns the target Type of +// the pointers in the array. +func resolveBTFArrayMacro(typ btf.Type) (btf.Type, error) { + arr, ok := typ.(*btf.Array) + if !ok { + return nil, fmt.Errorf("not an array: %v", typ) + } + + ptr, ok := arr.Type.(*btf.Pointer) + if !ok { + return nil, fmt.Errorf("not an array of pointers: %v", typ) + } + + return ptr.Target, nil +} + +// resolveBTFValuesContents resolves relocations into ELF sections belonging +// to btf.VarSecinfo's. This can be used on the 'values' member in BTF map +// definitions to extract static declarations of map contents. +func resolveBTFValuesContents(es *elfSection, vs *btf.VarSecinfo, member btf.Member) ([]MapKV, error) { + // The elements of a .values pointer array are not encoded in BTF. + // Instead, relocations are generated into each array index. + // However, it's possible to leave certain array indices empty, so all + // indices' offsets need to be checked for emitted relocations. + + // The offset of the 'values' member within the _struct_ (in bits) + // is the starting point of the array. Convert to bytes. Add VarSecinfo + // offset to get the absolute position in the ELF blob. + start := member.Offset.Bytes() + vs.Offset + // 'values' is encoded in BTF as a zero (variable) length struct + // member, and its contents run until the end of the VarSecinfo. + // Add VarSecinfo offset to get the absolute position in the ELF blob. + end := vs.Size + vs.Offset + // The size of an address in this section. This determines the width of + // an index in the array. + align := uint32(es.SectionHeader.Addralign) + + // Check if variable-length section is aligned. + if (end-start)%align != 0 { + return nil, errors.New("unaligned static values section") + } + elems := (end - start) / align + + if elems == 0 { + return nil, nil + } + + contents := make([]MapKV, 0, elems) + + // k is the array index, off is its corresponding ELF section offset. + for k, off := uint32(0), start; k < elems; k, off = k+1, off+align { + r, ok := es.relocations[uint64(off)] + if !ok { + continue + } + + // Relocation exists for the current offset in the ELF section. + // Emit a value stub based on the type of relocation to be replaced by + // a real fd later in the pipeline before populating the map. + // Map keys are encoded in MapKV entries, so empty array indices are + // skipped here. + switch t := elf.ST_TYPE(r.Info); t { + case elf.STT_FUNC: + contents = append(contents, MapKV{uint32(k), r.Name}) + case elf.STT_OBJECT: + contents = append(contents, MapKV{uint32(k), r.Name}) + default: + return nil, fmt.Errorf("unknown relocation type %v for symbol %s", t, r.Name) + } + } + + return contents, nil +} + +func (ec *elfCode) loadDataSections() error { + for _, sec := range ec.sections { + if sec.kind != dataSection { + continue + } + + // If a section has no references, it will be freed as soon as the + // Collection closes, so creating and populating it is wasteful. If it has + // no symbols, it is likely an ephemeral section used during compilation + // that wasn't sanitized by the bpf linker. (like .rodata.str1.1) + // + // No symbols means no VariableSpecs can be generated from it, making it + // pointless to emit a data section for. + if sec.references == 0 && len(sec.symbols) == 0 { + continue + } + + if sec.Size > math.MaxUint32 { + return fmt.Errorf("data section %s: contents exceed maximum size", sec.Name) + } + + mapSpec := &MapSpec{ + Name: SanitizeName(sec.Name, -1), + Type: Array, + KeySize: 4, + ValueSize: uint32(sec.Size), + MaxEntries: 1, + } + + if isConstantDataSection(sec.Name) { + mapSpec.Flags = sys.BPF_F_RDONLY_PROG + } + + switch sec.Type { + // Only open the section if we know there's actual data to be read. + case elf.SHT_PROGBITS: + data, err := sec.Data() + if err != nil { + return fmt.Errorf("data section %s: can't get contents: %w", sec.Name, err) + } + mapSpec.Contents = []MapKV{{uint32(0), data}} + + case elf.SHT_NOBITS: + // NOBITS sections like .bss contain only zeroes and are not allocated in + // the ELF. Since data sections are Arrays, the kernel can preallocate + // them. Don't attempt reading zeroes from the ELF, instead allocate the + // zeroed memory to support getting and setting VariableSpecs for sections + // like .bss. + mapSpec.Contents = []MapKV{{uint32(0), make([]byte, sec.Size)}} + + default: + return fmt.Errorf("data section %s: unknown section type %s", sec.Name, sec.Type) + } + + for off, sym := range sec.symbols { + // Skip symbols marked with the 'hidden' attribute. + if elf.ST_VISIBILITY(sym.Other) == elf.STV_HIDDEN || + elf.ST_VISIBILITY(sym.Other) == elf.STV_INTERNAL { + continue + } + + // Only accept symbols with global or weak bindings. The common + // alternative is STB_LOCAL, which are either function-scoped or declared + // 'static'. + if elf.ST_BIND(sym.Info) != elf.STB_GLOBAL && + elf.ST_BIND(sym.Info) != elf.STB_WEAK { + continue + } + + if ec.vars[sym.Name] != nil { + return fmt.Errorf("data section %s: duplicate variable %s", sec.Name, sym.Name) + } + + // Skip symbols starting with a dot, they are compiler-internal symbols + // emitted by clang 11 and earlier and are not cleaned up by the bpf + // compiler backend (e.g. symbols named .Lconstinit.1 in sections like + // .rodata.cst32). Variables in C cannot start with a dot, so filter these + // out. + if strings.HasPrefix(sym.Name, ".") { + continue + } + + ec.vars[sym.Name] = &VariableSpec{ + name: sym.Name, + offset: off, + size: sym.Size, + m: mapSpec, + } + } + + // It is possible for a data section to exist without a corresponding BTF Datasec + // if it only contains anonymous values like macro-defined arrays. + if ec.btf != nil { + var ds *btf.Datasec + if ec.btf.TypeByName(sec.Name, &ds) == nil { + // Assign the spec's key and BTF only if the Datasec lookup was successful. + mapSpec.Key = &btf.Void{} + mapSpec.Value = ds + + // Populate VariableSpecs with type information, if available. + for _, v := range ds.Vars { + name := v.Type.TypeName() + if name == "" { + return fmt.Errorf("data section %s: anonymous variable %v", sec.Name, v) + } + + vt, ok := v.Type.(*btf.Var) + if !ok { + return fmt.Errorf("data section %s: unexpected type %T for variable %s", sec.Name, v.Type, name) + } + + ev := ec.vars[name] + if ev == nil { + // Hidden symbols appear in the BTF Datasec but don't receive a VariableSpec. + continue + } + + if uint64(v.Offset) != ev.offset { + return fmt.Errorf("data section %s: variable %s datasec offset (%d) doesn't match ELF symbol offset (%d)", sec.Name, name, v.Offset, ev.offset) + } + + if uint64(v.Size) != ev.size { + return fmt.Errorf("data section %s: variable %s size in datasec (%d) doesn't match ELF symbol size (%d)", sec.Name, name, v.Size, ev.size) + } + + // Decouple the Var in the VariableSpec from the underlying DataSec in + // the MapSpec to avoid modifications from affecting map loads later on. + ev.t = btf.Copy(vt).(*btf.Var) + } + } + } + + ec.maps[sec.Name] = mapSpec + } + + return nil +} + +// loadKconfigSection handles the 'virtual' Datasec .kconfig that doesn't +// have a corresponding ELF section and exist purely in BTF. +func (ec *elfCode) loadKconfigSection() error { + if ec.btf == nil { + return nil + } + + var ds *btf.Datasec + err := ec.btf.TypeByName(".kconfig", &ds) + if errors.Is(err, btf.ErrNotFound) { + return nil + } + if err != nil { + return err + } + + if ds.Size == 0 { + return errors.New("zero-length .kconfig") + } + + ec.kconfig = &MapSpec{ + Name: ".kconfig", + Type: Array, + KeySize: uint32(4), + ValueSize: ds.Size, + MaxEntries: 1, + Flags: sys.BPF_F_RDONLY_PROG, + Key: &btf.Int{Size: 4}, + Value: ds, + } + + return nil +} + +// loadKsymsSection handles the 'virtual' Datasec .ksyms that doesn't +// have a corresponding ELF section and exist purely in BTF. +func (ec *elfCode) loadKsymsSection() error { + if ec.btf == nil { + return nil + } + + var ds *btf.Datasec + err := ec.btf.TypeByName(".ksyms", &ds) + if errors.Is(err, btf.ErrNotFound) { + return nil + } + if err != nil { + return err + } + + for _, v := range ds.Vars { + switch t := v.Type.(type) { + case *btf.Func: + ec.kfuncs[t.TypeName()] = t + case *btf.Var: + ec.ksyms[t.TypeName()] = struct{}{} + default: + return fmt.Errorf("unexpected variable type in .ksyms: %T", v) + } + } + + return nil +} + +type libbpfElfSectionDef struct { + pattern string + programType sys.ProgType + attachType sys.AttachType + flags libbpfElfSectionFlag +} + +type libbpfElfSectionFlag uint32 + +// The values correspond to enum sec_def_flags in libbpf. +const ( + _SEC_NONE libbpfElfSectionFlag = 0 + + _SEC_EXP_ATTACH_OPT libbpfElfSectionFlag = 1 << (iota - 1) + _SEC_ATTACHABLE + _SEC_ATTACH_BTF + _SEC_SLEEPABLE + _SEC_XDP_FRAGS + _SEC_USDT + + // Ignore any present extra in order to preserve backwards compatibility + // with earlier versions of the library. + ignoreExtra + + _SEC_ATTACHABLE_OPT = _SEC_ATTACHABLE | _SEC_EXP_ATTACH_OPT +) + +func init() { + // Compatibility with older versions of the library. + // We prepend libbpf definitions since they contain a prefix match + // for "xdp". + elfSectionDefs = append([]libbpfElfSectionDef{ + {"xdp.frags/", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP, _SEC_XDP_FRAGS | ignoreExtra}, + {"xdp.frags_devmap/", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_DEVMAP, _SEC_XDP_FRAGS}, + {"xdp_devmap/", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_DEVMAP, 0}, + {"xdp.frags_cpumap/", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_CPUMAP, _SEC_XDP_FRAGS}, + {"xdp_cpumap/", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_CPUMAP, 0}, + // This has been in the library since the beginning of time. Not sure + // where it came from. + {"seccomp", sys.BPF_PROG_TYPE_SOCKET_FILTER, 0, _SEC_NONE}, + }, elfSectionDefs...) +} + +func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) { + // Skip optional program marking for now. + sectionName = strings.TrimPrefix(sectionName, "?") + + for _, t := range elfSectionDefs { + extra, ok := matchSectionName(sectionName, t.pattern) + if !ok { + continue + } + + programType := ProgramType(t.programType) + attachType := AttachType(t.attachType) + + var flags uint32 + if t.flags&_SEC_SLEEPABLE > 0 { + flags |= sys.BPF_F_SLEEPABLE + } + if t.flags&_SEC_XDP_FRAGS > 0 { + flags |= sys.BPF_F_XDP_HAS_FRAGS + } + if t.flags&_SEC_EXP_ATTACH_OPT > 0 { + if programType == XDP { + // The library doesn't yet have code to fallback to not specifying + // attach type. Only do this for XDP since we've enforced correct + // attach type for all other program types. + attachType = AttachNone + } + } + if t.flags&ignoreExtra > 0 { + extra = "" + } + + return programType, attachType, flags, extra + } + + return UnspecifiedProgram, AttachNone, 0, "" +} + +// matchSectionName checks a section name against a pattern. +// +// It's behaviour mirrors that of libbpf's sec_def_matches. +func matchSectionName(sectionName, pattern string) (extra string, found bool) { + have, extra, found := strings.Cut(sectionName, "/") + want := strings.TrimRight(pattern, "+/") + + if strings.HasSuffix(pattern, "/") { + // Section name must have a slash and extra may be empty. + return extra, have == want && found + } else if strings.HasSuffix(pattern, "+") { + // Section name may have a slash and extra may be empty. + return extra, have == want + } + + // Section name must have a prefix. extra is ignored. + return "", strings.HasPrefix(sectionName, pattern) +} + +func (ec *elfCode) loadSectionRelocations(sec *elf.Section, symbols []elf.Symbol) (map[uint64]elf.Symbol, error) { + rels := make(map[uint64]elf.Symbol) + + if sec.Entsize < 16 { + return nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name) + } + + r := bufio.NewReader(sec.Open()) + for off := uint64(0); off < sec.Size; off += sec.Entsize { + ent := io.LimitReader(r, int64(sec.Entsize)) + + var rel elf.Rel64 + if binary.Read(ent, ec.ByteOrder, &rel) != nil { + return nil, fmt.Errorf("can't parse relocation at offset %v", off) + } + + symNo := int(elf.R_SYM64(rel.Info) - 1) + if symNo >= len(symbols) { + return nil, fmt.Errorf("offset %d: symbol %d doesn't exist", off, symNo) + } + + symbol := symbols[symNo] + rels[rel.Off] = symbol + } + + return rels, nil +} diff --git a/vendor/github.com/cilium/ebpf/elf_sections.go b/vendor/github.com/cilium/ebpf/elf_sections.go new file mode 100644 index 0000000000..43dcfb103e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/elf_sections.go @@ -0,0 +1,111 @@ +// Code generated by internal/cmd/gensections.awk; DO NOT EDIT. + +package ebpf + +// Code in this file is derived from libbpf, available under BSD-2-Clause. + +import "github.com/cilium/ebpf/internal/sys" + +var elfSectionDefs = []libbpfElfSectionDef{ + {"socket", sys.BPF_PROG_TYPE_SOCKET_FILTER, 0, _SEC_NONE}, + {"sk_reuseport/migrate", sys.BPF_PROG_TYPE_SK_REUSEPORT, sys.BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, _SEC_ATTACHABLE}, + {"sk_reuseport", sys.BPF_PROG_TYPE_SK_REUSEPORT, sys.BPF_SK_REUSEPORT_SELECT, _SEC_ATTACHABLE}, + {"kprobe+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_NONE}, + {"uprobe+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_NONE}, + {"uprobe.s+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_SLEEPABLE}, + {"kretprobe+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_NONE}, + {"uretprobe+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_NONE}, + {"uretprobe.s+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_SLEEPABLE}, + {"kprobe.multi+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_KPROBE_MULTI, _SEC_NONE}, + {"kretprobe.multi+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_KPROBE_MULTI, _SEC_NONE}, + {"kprobe.session+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_KPROBE_SESSION, _SEC_NONE}, + {"uprobe.multi+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_UPROBE_MULTI, _SEC_NONE}, + {"uretprobe.multi+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_UPROBE_MULTI, _SEC_NONE}, + {"uprobe.multi.s+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_UPROBE_MULTI, _SEC_SLEEPABLE}, + {"uretprobe.multi.s+", sys.BPF_PROG_TYPE_KPROBE, sys.BPF_TRACE_UPROBE_MULTI, _SEC_SLEEPABLE}, + {"ksyscall+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_NONE}, + {"kretsyscall+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_NONE}, + {"usdt+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_USDT}, + {"usdt.s+", sys.BPF_PROG_TYPE_KPROBE, 0, _SEC_USDT | _SEC_SLEEPABLE}, + {"tc/ingress", sys.BPF_PROG_TYPE_SCHED_CLS, sys.BPF_TCX_INGRESS, _SEC_NONE}, + {"tc/egress", sys.BPF_PROG_TYPE_SCHED_CLS, sys.BPF_TCX_EGRESS, _SEC_NONE}, + {"tcx/ingress", sys.BPF_PROG_TYPE_SCHED_CLS, sys.BPF_TCX_INGRESS, _SEC_NONE}, + {"tcx/egress", sys.BPF_PROG_TYPE_SCHED_CLS, sys.BPF_TCX_EGRESS, _SEC_NONE}, + {"tc", sys.BPF_PROG_TYPE_SCHED_CLS, 0, _SEC_NONE}, + {"classifier", sys.BPF_PROG_TYPE_SCHED_CLS, 0, _SEC_NONE}, + {"action", sys.BPF_PROG_TYPE_SCHED_ACT, 0, _SEC_NONE}, + {"netkit/primary", sys.BPF_PROG_TYPE_SCHED_CLS, sys.BPF_NETKIT_PRIMARY, _SEC_NONE}, + {"netkit/peer", sys.BPF_PROG_TYPE_SCHED_CLS, sys.BPF_NETKIT_PEER, _SEC_NONE}, + {"tracepoint+", sys.BPF_PROG_TYPE_TRACEPOINT, 0, _SEC_NONE}, + {"tp+", sys.BPF_PROG_TYPE_TRACEPOINT, 0, _SEC_NONE}, + {"raw_tracepoint+", sys.BPF_PROG_TYPE_RAW_TRACEPOINT, 0, _SEC_NONE}, + {"raw_tp+", sys.BPF_PROG_TYPE_RAW_TRACEPOINT, 0, _SEC_NONE}, + {"raw_tracepoint.w+", sys.BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, 0, _SEC_NONE}, + {"raw_tp.w+", sys.BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, 0, _SEC_NONE}, + {"tp_btf+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_RAW_TP, _SEC_ATTACH_BTF}, + {"fentry+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_FENTRY, _SEC_ATTACH_BTF}, + {"fmod_ret+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_MODIFY_RETURN, _SEC_ATTACH_BTF}, + {"fexit+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_FEXIT, _SEC_ATTACH_BTF}, + {"fentry.s+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_FENTRY, _SEC_ATTACH_BTF | _SEC_SLEEPABLE}, + {"fmod_ret.s+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_MODIFY_RETURN, _SEC_ATTACH_BTF | _SEC_SLEEPABLE}, + {"fexit.s+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_FEXIT, _SEC_ATTACH_BTF | _SEC_SLEEPABLE}, + {"freplace+", sys.BPF_PROG_TYPE_EXT, 0, _SEC_ATTACH_BTF}, + {"lsm+", sys.BPF_PROG_TYPE_LSM, sys.BPF_LSM_MAC, _SEC_ATTACH_BTF}, + {"lsm.s+", sys.BPF_PROG_TYPE_LSM, sys.BPF_LSM_MAC, _SEC_ATTACH_BTF | _SEC_SLEEPABLE}, + {"lsm_cgroup+", sys.BPF_PROG_TYPE_LSM, sys.BPF_LSM_CGROUP, _SEC_ATTACH_BTF}, + {"iter+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_ITER, _SEC_ATTACH_BTF}, + {"iter.s+", sys.BPF_PROG_TYPE_TRACING, sys.BPF_TRACE_ITER, _SEC_ATTACH_BTF | _SEC_SLEEPABLE}, + {"syscall", sys.BPF_PROG_TYPE_SYSCALL, 0, _SEC_SLEEPABLE}, + {"xdp.frags/devmap", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_DEVMAP, _SEC_XDP_FRAGS}, + {"xdp/devmap", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_DEVMAP, _SEC_ATTACHABLE}, + {"xdp.frags/cpumap", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_CPUMAP, _SEC_XDP_FRAGS}, + {"xdp/cpumap", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP_CPUMAP, _SEC_ATTACHABLE}, + {"xdp.frags", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP, _SEC_XDP_FRAGS}, + {"xdp", sys.BPF_PROG_TYPE_XDP, sys.BPF_XDP, _SEC_ATTACHABLE_OPT}, + {"perf_event", sys.BPF_PROG_TYPE_PERF_EVENT, 0, _SEC_NONE}, + {"lwt_in", sys.BPF_PROG_TYPE_LWT_IN, 0, _SEC_NONE}, + {"lwt_out", sys.BPF_PROG_TYPE_LWT_OUT, 0, _SEC_NONE}, + {"lwt_xmit", sys.BPF_PROG_TYPE_LWT_XMIT, 0, _SEC_NONE}, + {"lwt_seg6local", sys.BPF_PROG_TYPE_LWT_SEG6LOCAL, 0, _SEC_NONE}, + {"sockops", sys.BPF_PROG_TYPE_SOCK_OPS, sys.BPF_CGROUP_SOCK_OPS, _SEC_ATTACHABLE_OPT}, + {"sk_skb/stream_parser", sys.BPF_PROG_TYPE_SK_SKB, sys.BPF_SK_SKB_STREAM_PARSER, _SEC_ATTACHABLE_OPT}, + {"sk_skb/stream_verdict", sys.BPF_PROG_TYPE_SK_SKB, sys.BPF_SK_SKB_STREAM_VERDICT, _SEC_ATTACHABLE_OPT}, + {"sk_skb/verdict", sys.BPF_PROG_TYPE_SK_SKB, sys.BPF_SK_SKB_VERDICT, _SEC_ATTACHABLE_OPT}, + {"sk_skb", sys.BPF_PROG_TYPE_SK_SKB, 0, _SEC_NONE}, + {"sk_msg", sys.BPF_PROG_TYPE_SK_MSG, sys.BPF_SK_MSG_VERDICT, _SEC_ATTACHABLE_OPT}, + {"lirc_mode2", sys.BPF_PROG_TYPE_LIRC_MODE2, sys.BPF_LIRC_MODE2, _SEC_ATTACHABLE_OPT}, + {"flow_dissector", sys.BPF_PROG_TYPE_FLOW_DISSECTOR, sys.BPF_FLOW_DISSECTOR, _SEC_ATTACHABLE_OPT}, + {"cgroup_skb/ingress", sys.BPF_PROG_TYPE_CGROUP_SKB, sys.BPF_CGROUP_INET_INGRESS, _SEC_ATTACHABLE_OPT}, + {"cgroup_skb/egress", sys.BPF_PROG_TYPE_CGROUP_SKB, sys.BPF_CGROUP_INET_EGRESS, _SEC_ATTACHABLE_OPT}, + {"cgroup/skb", sys.BPF_PROG_TYPE_CGROUP_SKB, 0, _SEC_NONE}, + {"cgroup/sock_create", sys.BPF_PROG_TYPE_CGROUP_SOCK, sys.BPF_CGROUP_INET_SOCK_CREATE, _SEC_ATTACHABLE}, + {"cgroup/sock_release", sys.BPF_PROG_TYPE_CGROUP_SOCK, sys.BPF_CGROUP_INET_SOCK_RELEASE, _SEC_ATTACHABLE}, + {"cgroup/sock", sys.BPF_PROG_TYPE_CGROUP_SOCK, sys.BPF_CGROUP_INET_SOCK_CREATE, _SEC_ATTACHABLE_OPT}, + {"cgroup/post_bind4", sys.BPF_PROG_TYPE_CGROUP_SOCK, sys.BPF_CGROUP_INET4_POST_BIND, _SEC_ATTACHABLE}, + {"cgroup/post_bind6", sys.BPF_PROG_TYPE_CGROUP_SOCK, sys.BPF_CGROUP_INET6_POST_BIND, _SEC_ATTACHABLE}, + {"cgroup/bind4", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET4_BIND, _SEC_ATTACHABLE}, + {"cgroup/bind6", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET6_BIND, _SEC_ATTACHABLE}, + {"cgroup/connect4", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET4_CONNECT, _SEC_ATTACHABLE}, + {"cgroup/connect6", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET6_CONNECT, _SEC_ATTACHABLE}, + {"cgroup/connect_unix", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UNIX_CONNECT, _SEC_ATTACHABLE}, + {"cgroup/sendmsg4", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UDP4_SENDMSG, _SEC_ATTACHABLE}, + {"cgroup/sendmsg6", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UDP6_SENDMSG, _SEC_ATTACHABLE}, + {"cgroup/sendmsg_unix", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UNIX_SENDMSG, _SEC_ATTACHABLE}, + {"cgroup/recvmsg4", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UDP4_RECVMSG, _SEC_ATTACHABLE}, + {"cgroup/recvmsg6", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UDP6_RECVMSG, _SEC_ATTACHABLE}, + {"cgroup/recvmsg_unix", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UNIX_RECVMSG, _SEC_ATTACHABLE}, + {"cgroup/getpeername4", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET4_GETPEERNAME, _SEC_ATTACHABLE}, + {"cgroup/getpeername6", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET6_GETPEERNAME, _SEC_ATTACHABLE}, + {"cgroup/getpeername_unix", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UNIX_GETPEERNAME, _SEC_ATTACHABLE}, + {"cgroup/getsockname4", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET4_GETSOCKNAME, _SEC_ATTACHABLE}, + {"cgroup/getsockname6", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_INET6_GETSOCKNAME, _SEC_ATTACHABLE}, + {"cgroup/getsockname_unix", sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR, sys.BPF_CGROUP_UNIX_GETSOCKNAME, _SEC_ATTACHABLE}, + {"cgroup/sysctl", sys.BPF_PROG_TYPE_CGROUP_SYSCTL, sys.BPF_CGROUP_SYSCTL, _SEC_ATTACHABLE}, + {"cgroup/getsockopt", sys.BPF_PROG_TYPE_CGROUP_SOCKOPT, sys.BPF_CGROUP_GETSOCKOPT, _SEC_ATTACHABLE}, + {"cgroup/setsockopt", sys.BPF_PROG_TYPE_CGROUP_SOCKOPT, sys.BPF_CGROUP_SETSOCKOPT, _SEC_ATTACHABLE}, + {"cgroup/dev", sys.BPF_PROG_TYPE_CGROUP_DEVICE, sys.BPF_CGROUP_DEVICE, _SEC_ATTACHABLE_OPT}, + {"struct_ops+", sys.BPF_PROG_TYPE_STRUCT_OPS, 0, _SEC_NONE}, + {"struct_ops.s+", sys.BPF_PROG_TYPE_STRUCT_OPS, 0, _SEC_SLEEPABLE}, + {"sk_lookup", sys.BPF_PROG_TYPE_SK_LOOKUP, sys.BPF_SK_LOOKUP, _SEC_ATTACHABLE}, + {"netfilter", sys.BPF_PROG_TYPE_NETFILTER, sys.BPF_NETFILTER, _SEC_NONE}, +} diff --git a/vendor/github.com/cilium/ebpf/info.go b/vendor/github.com/cilium/ebpf/info.go new file mode 100644 index 0000000000..676e8fa6e7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/info.go @@ -0,0 +1,812 @@ +package ebpf + +import ( + "bufio" + "bytes" + "encoding/hex" + "errors" + "fmt" + "io" + "os" + "reflect" + "strings" + "syscall" + "time" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +// The *Info structs expose metadata about a program or map. Most +// fields are exposed via a getter: +// +// func (*MapInfo) ID() (MapID, bool) +// +// This is because the metadata available changes based on kernel version. +// The second boolean return value indicates whether a particular field is +// available on the current kernel. +// +// Always add new metadata as such a getter, unless you can somehow get the +// value of the field on all supported kernels. Also document which version +// a particular field first appeared in. +// +// Some metadata is a buffer which needs additional parsing. In this case, +// store the undecoded data in the Info struct and provide a getter which +// decodes it when necessary. See ProgramInfo.Instructions for an example. + +// MapInfo describes a map. +type MapInfo struct { + // Type of the map. + Type MapType + // KeySize is the size of the map key in bytes. + KeySize uint32 + // ValueSize is the size of the map value in bytes. + ValueSize uint32 + // MaxEntries is the maximum number of entries the map can hold. Its meaning + // is map-specific. + MaxEntries uint32 + // Flags used during map creation. + Flags uint32 + // Name as supplied by user space at load time. Available from 4.15. + Name string + + id MapID + btf btf.ID + mapExtra uint64 + memlock uint64 + frozen bool +} + +// newMapInfoFromFd queries map information about the given fd. [sys.ObjInfo] is +// attempted first, supplementing any missing values with information from +// /proc/self/fdinfo. Ignores EINVAL from ObjInfo as well as ErrNotSupported +// from reading fdinfo (indicating the file exists, but no fields of interest +// were found). If both fail, an error is always returned. +func newMapInfoFromFd(fd *sys.FD) (*MapInfo, error) { + var info sys.MapInfo + err1 := sys.ObjInfo(fd, &info) + // EINVAL means the kernel doesn't support BPF_OBJ_GET_INFO_BY_FD. Continue + // with fdinfo if that's the case. + if err1 != nil && !errors.Is(err1, unix.EINVAL) { + return nil, fmt.Errorf("getting object info: %w", err1) + } + + mi := &MapInfo{ + MapType(info.Type), + info.KeySize, + info.ValueSize, + info.MaxEntries, + uint32(info.MapFlags), + unix.ByteSliceToString(info.Name[:]), + MapID(info.Id), + btf.ID(info.BtfId), + info.MapExtra, + 0, + false, + } + + // Supplement OBJ_INFO with data from /proc/self/fdinfo. It contains fields + // like memlock and frozen that are not present in OBJ_INFO. + err2 := readMapInfoFromProc(fd, mi) + if err2 != nil && !errors.Is(err2, ErrNotSupported) { + return nil, fmt.Errorf("getting map info from fdinfo: %w", err2) + } + + if err1 != nil && err2 != nil { + return nil, fmt.Errorf("ObjInfo and fdinfo both failed: objinfo: %w, fdinfo: %w", err1, err2) + } + + return mi, nil +} + +// readMapInfoFromProc queries map information about the given fd from +// /proc/self/fdinfo. It only writes data into fields that have a zero value. +func readMapInfoFromProc(fd *sys.FD, mi *MapInfo) error { + return scanFdInfo(fd, map[string]interface{}{ + "map_type": &mi.Type, + "map_id": &mi.id, + "key_size": &mi.KeySize, + "value_size": &mi.ValueSize, + "max_entries": &mi.MaxEntries, + "map_flags": &mi.Flags, + "map_extra": &mi.mapExtra, + "memlock": &mi.memlock, + "frozen": &mi.frozen, + }) +} + +// ID returns the map ID. +// +// Available from 4.13. +// +// The bool return value indicates whether this optional field is available. +func (mi *MapInfo) ID() (MapID, bool) { + return mi.id, mi.id > 0 +} + +// BTFID returns the BTF ID associated with the Map. +// +// The ID is only valid as long as the associated Map is kept alive. +// Available from 4.18. +// +// The bool return value indicates whether this optional field is available and +// populated. (The field may be available but not populated if the kernel +// supports the field but the Map was loaded without BTF information.) +func (mi *MapInfo) BTFID() (btf.ID, bool) { + return mi.btf, mi.btf > 0 +} + +// MapExtra returns an opaque field whose meaning is map-specific. +// +// Available from 5.16. +// +// The bool return value indicates whether this optional field is available and +// populated, if it was specified during Map creation. +func (mi *MapInfo) MapExtra() (uint64, bool) { + return mi.mapExtra, mi.mapExtra > 0 +} + +// Memlock returns an approximate number of bytes allocated to this map. +// +// Available from 4.10. +// +// The bool return value indicates whether this optional field is available. +func (mi *MapInfo) Memlock() (uint64, bool) { + return mi.memlock, mi.memlock > 0 +} + +// Frozen indicates whether [Map.Freeze] was called on this map. If true, +// modifications from user space are not allowed. +// +// Available from 5.2. Requires access to procfs. +// +// If the kernel doesn't support map freezing, this field will always be false. +func (mi *MapInfo) Frozen() bool { + return mi.frozen +} + +// programStats holds statistics of a program. +type programStats struct { + // Total accumulated runtime of the program ins ns. + runtime time.Duration + // Total number of times the program was called. + runCount uint64 + // Total number of times the programm was NOT called. + // Added in commit 9ed9e9ba2337 ("bpf: Count the number of times recursion was prevented"). + recursionMisses uint64 +} + +// programJitedInfo holds information about JITed info of a program. +type programJitedInfo struct { + // ksyms holds the ksym addresses of the BPF program, including those of its + // subprograms. + // + // Available from 4.18. + ksyms []uint64 + numKsyms uint32 + + // insns holds the JITed machine native instructions of the program, + // including those of its subprograms. + // + // Available from 4.13. + insns []byte + numInsns uint32 + + // lineInfos holds the JITed line infos, which are kernel addresses. + // + // Available from 5.0. + lineInfos []uint64 + numLineInfos uint32 + + // lineInfoRecSize is the size of a single line info record. + // + // Available from 5.0. + lineInfoRecSize uint32 + + // funcLens holds the insns length of each function. + // + // Available from 4.18. + funcLens []uint32 + numFuncLens uint32 +} + +// ProgramInfo describes a program. +type ProgramInfo struct { + Type ProgramType + id ProgramID + // Truncated hash of the BPF bytecode. Available from 4.13. + Tag string + // Name as supplied by user space at load time. Available from 4.15. + Name string + + createdByUID uint32 + haveCreatedByUID bool + btf btf.ID + stats *programStats + loadTime time.Duration + + maps []MapID + insns []byte + jitedSize uint32 + verifiedInstructions uint32 + + jitedInfo programJitedInfo + + lineInfos []byte + numLineInfos uint32 + funcInfos []byte + numFuncInfos uint32 +} + +func newProgramInfoFromFd(fd *sys.FD) (*ProgramInfo, error) { + var info sys.ProgInfo + err := sys.ObjInfo(fd, &info) + if errors.Is(err, syscall.EINVAL) { + return newProgramInfoFromProc(fd) + } + if err != nil { + return nil, err + } + + pi := ProgramInfo{ + Type: ProgramType(info.Type), + id: ProgramID(info.Id), + Tag: hex.EncodeToString(info.Tag[:]), + Name: unix.ByteSliceToString(info.Name[:]), + btf: btf.ID(info.BtfId), + stats: &programStats{ + runtime: time.Duration(info.RunTimeNs), + runCount: info.RunCnt, + recursionMisses: info.RecursionMisses, + }, + jitedSize: info.JitedProgLen, + loadTime: time.Duration(info.LoadTime), + verifiedInstructions: info.VerifiedInsns, + } + + // Start with a clean struct for the second call, otherwise we may get EFAULT. + var info2 sys.ProgInfo + + makeSecondCall := false + + if info.NrMapIds > 0 { + pi.maps = make([]MapID, info.NrMapIds) + info2.NrMapIds = info.NrMapIds + info2.MapIds = sys.NewSlicePointer(pi.maps) + makeSecondCall = true + } else if haveProgramInfoMapIDs() == nil { + // This program really has no associated maps. + pi.maps = make([]MapID, 0) + } else { + // The kernel doesn't report associated maps. + pi.maps = nil + } + + // createdByUID and NrMapIds were introduced in the same kernel version. + if pi.maps != nil { + pi.createdByUID = info.CreatedByUid + pi.haveCreatedByUID = true + } + + if info.XlatedProgLen > 0 { + pi.insns = make([]byte, info.XlatedProgLen) + info2.XlatedProgLen = info.XlatedProgLen + info2.XlatedProgInsns = sys.NewSlicePointer(pi.insns) + makeSecondCall = true + } + + if info.NrLineInfo > 0 { + pi.lineInfos = make([]byte, btf.LineInfoSize*info.NrLineInfo) + info2.LineInfo = sys.NewSlicePointer(pi.lineInfos) + info2.LineInfoRecSize = btf.LineInfoSize + info2.NrLineInfo = info.NrLineInfo + pi.numLineInfos = info.NrLineInfo + makeSecondCall = true + } + + if info.NrFuncInfo > 0 { + pi.funcInfos = make([]byte, btf.FuncInfoSize*info.NrFuncInfo) + info2.FuncInfo = sys.NewSlicePointer(pi.funcInfos) + info2.FuncInfoRecSize = btf.FuncInfoSize + info2.NrFuncInfo = info.NrFuncInfo + pi.numFuncInfos = info.NrFuncInfo + makeSecondCall = true + } + + pi.jitedInfo.lineInfoRecSize = info.JitedLineInfoRecSize + if info.JitedProgLen > 0 { + pi.jitedInfo.numInsns = info.JitedProgLen + pi.jitedInfo.insns = make([]byte, info.JitedProgLen) + info2.JitedProgLen = info.JitedProgLen + info2.JitedProgInsns = sys.NewSlicePointer(pi.jitedInfo.insns) + makeSecondCall = true + } + + if info.NrJitedFuncLens > 0 { + pi.jitedInfo.numFuncLens = info.NrJitedFuncLens + pi.jitedInfo.funcLens = make([]uint32, info.NrJitedFuncLens) + info2.NrJitedFuncLens = info.NrJitedFuncLens + info2.JitedFuncLens = sys.NewSlicePointer(pi.jitedInfo.funcLens) + makeSecondCall = true + } + + if info.NrJitedLineInfo > 0 { + pi.jitedInfo.numLineInfos = info.NrJitedLineInfo + pi.jitedInfo.lineInfos = make([]uint64, info.NrJitedLineInfo) + info2.NrJitedLineInfo = info.NrJitedLineInfo + info2.JitedLineInfo = sys.NewSlicePointer(pi.jitedInfo.lineInfos) + info2.JitedLineInfoRecSize = info.JitedLineInfoRecSize + makeSecondCall = true + } + + if info.NrJitedKsyms > 0 { + pi.jitedInfo.numKsyms = info.NrJitedKsyms + pi.jitedInfo.ksyms = make([]uint64, info.NrJitedKsyms) + info2.JitedKsyms = sys.NewSlicePointer(pi.jitedInfo.ksyms) + info2.NrJitedKsyms = info.NrJitedKsyms + makeSecondCall = true + } + + if makeSecondCall { + if err := sys.ObjInfo(fd, &info2); err != nil { + return nil, err + } + } + + return &pi, nil +} + +func newProgramInfoFromProc(fd *sys.FD) (*ProgramInfo, error) { + var info ProgramInfo + err := scanFdInfo(fd, map[string]interface{}{ + "prog_type": &info.Type, + "prog_tag": &info.Tag, + }) + if errors.Is(err, ErrNotSupported) { + return nil, &internal.UnsupportedFeatureError{ + Name: "reading program info from /proc/self/fdinfo", + MinimumVersion: internal.Version{4, 10, 0}, + } + } + if err != nil { + return nil, err + } + + return &info, nil +} + +// ID returns the program ID. +// +// Available from 4.13. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) ID() (ProgramID, bool) { + return pi.id, pi.id > 0 +} + +// CreatedByUID returns the Uid that created the program. +// +// Available from 4.15. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) CreatedByUID() (uint32, bool) { + return pi.createdByUID, pi.haveCreatedByUID +} + +// BTFID returns the BTF ID associated with the program. +// +// The ID is only valid as long as the associated program is kept alive. +// Available from 5.0. +// +// The bool return value indicates whether this optional field is available and +// populated. (The field may be available but not populated if the kernel +// supports the field but the program was loaded without BTF information.) +func (pi *ProgramInfo) BTFID() (btf.ID, bool) { + return pi.btf, pi.btf > 0 +} + +// RunCount returns the total number of times the program was called. +// +// Can return 0 if the collection of statistics is not enabled. See EnableStats(). +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) RunCount() (uint64, bool) { + if pi.stats != nil { + return pi.stats.runCount, true + } + return 0, false +} + +// Runtime returns the total accumulated runtime of the program. +// +// Can return 0 if the collection of statistics is not enabled. See EnableStats(). +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) Runtime() (time.Duration, bool) { + if pi.stats != nil { + return pi.stats.runtime, true + } + return time.Duration(0), false +} + +// RecursionMisses returns the total number of times the program was NOT called. +// This can happen when another bpf program is already running on the cpu, which +// is likely to happen for example when you interrupt bpf program execution. +func (pi *ProgramInfo) RecursionMisses() (uint64, bool) { + if pi.stats != nil { + return pi.stats.recursionMisses, true + } + return 0, false +} + +// btfSpec returns the BTF spec associated with the program. +func (pi *ProgramInfo) btfSpec() (*btf.Spec, error) { + id, ok := pi.BTFID() + if !ok { + return nil, fmt.Errorf("program created without BTF or unsupported kernel: %w", ErrNotSupported) + } + + h, err := btf.NewHandleFromID(id) + if err != nil { + return nil, fmt.Errorf("get BTF handle: %w", err) + } + defer h.Close() + + spec, err := h.Spec(nil) + if err != nil { + return nil, fmt.Errorf("get BTF spec: %w", err) + } + + return spec, nil +} + +// LineInfos returns the BTF line information of the program. +// +// Available from 5.0. +// +// Requires CAP_SYS_ADMIN or equivalent for reading BTF information. Returns +// ErrNotSupported if the program was created without BTF or if the kernel +// doesn't support the field. +func (pi *ProgramInfo) LineInfos() (btf.LineOffsets, error) { + if len(pi.lineInfos) == 0 { + return nil, fmt.Errorf("insufficient permissions or unsupported kernel: %w", ErrNotSupported) + } + + spec, err := pi.btfSpec() + if err != nil { + return nil, err + } + + return btf.LoadLineInfos( + bytes.NewReader(pi.lineInfos), + internal.NativeEndian, + pi.numLineInfos, + spec, + ) +} + +// Instructions returns the 'xlated' instruction stream of the program +// after it has been verified and rewritten by the kernel. These instructions +// cannot be loaded back into the kernel as-is, this is mainly used for +// inspecting loaded programs for troubleshooting, dumping, etc. +// +// For example, map accesses are made to reference their kernel map IDs, +// not the FDs they had when the program was inserted. Note that before +// the introduction of bpf_insn_prepare_dump in kernel 4.16, xlated +// instructions were not sanitized, making the output even less reusable +// and less likely to round-trip or evaluate to the same program Tag. +// +// The first instruction is marked as a symbol using the Program's name. +// +// If available, the instructions will be annotated with metadata from the +// BTF. This includes line information and function information. Reading +// this metadata requires CAP_SYS_ADMIN or equivalent. If capability is +// unavailable, the instructions will be returned without metadata. +// +// Available from 4.13. Requires CAP_BPF or equivalent for plain instructions. +// Requires CAP_SYS_ADMIN for instructions with metadata. +func (pi *ProgramInfo) Instructions() (asm.Instructions, error) { + // If the calling process is not BPF-capable or if the kernel doesn't + // support getting xlated instructions, the field will be zero. + if len(pi.insns) == 0 { + return nil, fmt.Errorf("insufficient permissions or unsupported kernel: %w", ErrNotSupported) + } + + r := bytes.NewReader(pi.insns) + var insns asm.Instructions + if err := insns.Unmarshal(r, internal.NativeEndian); err != nil { + return nil, fmt.Errorf("unmarshaling instructions: %w", err) + } + + if pi.btf != 0 { + btfh, err := btf.NewHandleFromID(pi.btf) + if err != nil { + // Getting a BTF handle requires CAP_SYS_ADMIN, if not available we get an -EPERM. + // Ignore it and fall back to instructions without metadata. + if !errors.Is(err, unix.EPERM) { + return nil, fmt.Errorf("unable to get BTF handle: %w", err) + } + } + + // If we have a BTF handle, we can use it to assign metadata to the instructions. + if btfh != nil { + defer btfh.Close() + + spec, err := btfh.Spec(nil) + if err != nil { + return nil, fmt.Errorf("unable to get BTF spec: %w", err) + } + + lineInfos, err := btf.LoadLineInfos( + bytes.NewReader(pi.lineInfos), + internal.NativeEndian, + pi.numLineInfos, + spec, + ) + if err != nil { + return nil, fmt.Errorf("parse line info: %w", err) + } + + funcInfos, err := btf.LoadFuncInfos( + bytes.NewReader(pi.funcInfos), + internal.NativeEndian, + pi.numFuncInfos, + spec, + ) + if err != nil { + return nil, fmt.Errorf("parse func info: %w", err) + } + + btf.AssignMetadataToInstructions(insns, funcInfos, lineInfos, btf.CORERelocationInfos{}) + } + } + + fn := btf.FuncMetadata(&insns[0]) + name := pi.Name + if fn != nil { + name = fn.Name + } + insns[0] = insns[0].WithSymbol(name) + + return insns, nil +} + +// JitedSize returns the size of the program's JIT-compiled machine code in bytes, which is the +// actual code executed on the host's CPU. This field requires the BPF JIT compiler to be enabled. +// +// Available from 4.13. Reading this metadata requires CAP_BPF or equivalent. +func (pi *ProgramInfo) JitedSize() (uint32, error) { + if pi.jitedSize == 0 { + return 0, fmt.Errorf("insufficient permissions, unsupported kernel, or JIT compiler disabled: %w", ErrNotSupported) + } + return pi.jitedSize, nil +} + +// TranslatedSize returns the size of the program's translated instructions in bytes, after it has +// been verified and rewritten by the kernel. +// +// Available from 4.13. Reading this metadata requires CAP_BPF or equivalent. +func (pi *ProgramInfo) TranslatedSize() (int, error) { + insns := len(pi.insns) + if insns == 0 { + return 0, fmt.Errorf("insufficient permissions or unsupported kernel: %w", ErrNotSupported) + } + return insns, nil +} + +// MapIDs returns the maps related to the program. +// +// Available from 4.15. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) MapIDs() ([]MapID, bool) { + return pi.maps, pi.maps != nil +} + +// LoadTime returns when the program was loaded since boot time. +// +// Available from 4.15. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) LoadTime() (time.Duration, bool) { + // loadTime and NrMapIds were introduced in the same kernel version. + return pi.loadTime, pi.loadTime > 0 +} + +// VerifiedInstructions returns the number verified instructions in the program. +// +// Available from 5.16. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) VerifiedInstructions() (uint32, bool) { + return pi.verifiedInstructions, pi.verifiedInstructions > 0 +} + +// JitedKsymAddrs returns the ksym addresses of the BPF program, including its +// subprograms. The addresses correspond to their symbols in /proc/kallsyms. +// +// Available from 4.18. Note that before 5.x, this field can be empty for +// programs without subprograms (bpf2bpf calls). +// +// The bool return value indicates whether this optional field is available. +// +// When a kernel address can't fit into uintptr (which is usually the case when +// running 32 bit program on a 64 bit kernel), this returns an empty slice and +// a false. +func (pi *ProgramInfo) JitedKsymAddrs() ([]uintptr, bool) { + ksyms := make([]uintptr, 0, len(pi.jitedInfo.ksyms)) + if cap(ksyms) == 0 { + return ksyms, false + } + // Check if a kernel address fits into uintptr (it might not when + // using a 32 bit binary on a 64 bit kernel). This check should work + // with any kernel address, since they have 1s at the highest bits. + if a := pi.jitedInfo.ksyms[0]; uint64(uintptr(a)) != a { + return nil, false + } + for _, ksym := range pi.jitedInfo.ksyms { + ksyms = append(ksyms, uintptr(ksym)) + } + return ksyms, true +} + +// JitedInsns returns the JITed machine native instructions of the program. +// +// Available from 4.13. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) JitedInsns() ([]byte, bool) { + return pi.jitedInfo.insns, len(pi.jitedInfo.insns) > 0 +} + +// JitedLineInfos returns the JITed line infos of the program. +// +// Available from 5.0. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) JitedLineInfos() ([]uint64, bool) { + return pi.jitedInfo.lineInfos, len(pi.jitedInfo.lineInfos) > 0 +} + +// JitedFuncLens returns the insns length of each function in the JITed program. +// +// Available from 4.18. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) JitedFuncLens() ([]uint32, bool) { + return pi.jitedInfo.funcLens, len(pi.jitedInfo.funcLens) > 0 +} + +// FuncInfos returns the offset and function information of all (sub)programs in +// a BPF program. +// +// Available from 5.0. +// +// Requires CAP_SYS_ADMIN or equivalent for reading BTF information. Returns +// ErrNotSupported if the program was created without BTF or if the kernel +// doesn't support the field. +func (pi *ProgramInfo) FuncInfos() (btf.FuncOffsets, error) { + if len(pi.funcInfos) == 0 { + return nil, fmt.Errorf("insufficient permissions or unsupported kernel: %w", ErrNotSupported) + } + + spec, err := pi.btfSpec() + if err != nil { + return nil, err + } + + return btf.LoadFuncInfos( + bytes.NewReader(pi.funcInfos), + internal.NativeEndian, + pi.numFuncInfos, + spec, + ) +} + +func scanFdInfo(fd *sys.FD, fields map[string]interface{}) error { + fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", fd.Int())) + if err != nil { + return err + } + defer fh.Close() + + if err := scanFdInfoReader(fh, fields); err != nil { + return fmt.Errorf("%s: %w", fh.Name(), err) + } + return nil +} + +func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error { + var ( + scanner = bufio.NewScanner(r) + scanned int + ) + + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), "\t", 2) + if len(parts) != 2 { + continue + } + + name := strings.TrimSuffix(parts[0], ":") + field, ok := fields[string(name)] + if !ok { + continue + } + + // If field already contains a non-zero value, don't overwrite it with fdinfo. + if zero(field) { + if n, err := fmt.Sscanln(parts[1], field); err != nil || n != 1 { + return fmt.Errorf("can't parse field %s: %v", name, err) + } + } + + scanned++ + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("scanning fdinfo: %w", err) + } + + if len(fields) > 0 && scanned == 0 { + return ErrNotSupported + } + + return nil +} + +func zero(arg any) bool { + v := reflect.ValueOf(arg) + + // Unwrap pointers and interfaces. + for v.Kind() == reflect.Pointer || + v.Kind() == reflect.Interface { + v = v.Elem() + } + + return v.IsZero() +} + +// EnableStats starts the measuring of the runtime +// and run counts of eBPF programs. +// +// Collecting statistics can have an impact on the performance. +// +// Requires at least 5.8. +func EnableStats(which uint32) (io.Closer, error) { + fd, err := sys.EnableStats(&sys.EnableStatsAttr{ + Type: which, + }) + if err != nil { + return nil, err + } + return fd, nil +} + +var haveProgramInfoMapIDs = internal.NewFeatureTest("map IDs in program info", func() error { + prog, err := progLoad(asm.Instructions{ + asm.LoadImm(asm.R0, 0, asm.DWord), + asm.Return(), + }, SocketFilter, "MIT") + if err != nil { + return err + } + defer prog.Close() + + err = sys.ObjInfo(prog, &sys.ProgInfo{ + // NB: Don't need to allocate MapIds since the program isn't using + // any maps. + NrMapIds: 1, + }) + if errors.Is(err, unix.EINVAL) { + // Most likely the syscall doesn't exist. + return internal.ErrNotSupported + } + if errors.Is(err, unix.E2BIG) { + // We've hit check_uarg_tail_zero on older kernels. + return internal.ErrNotSupported + } + + return err +}, "4.15") diff --git a/vendor/github.com/cilium/ebpf/internal/buffer.go b/vendor/github.com/cilium/ebpf/internal/buffer.go new file mode 100644 index 0000000000..81c6544330 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/buffer.go @@ -0,0 +1,31 @@ +package internal + +import ( + "bytes" + "sync" +) + +var bytesBufferPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, +} + +// NewBuffer retrieves a [bytes.Buffer] from a pool an re-initialises it. +// +// The returned buffer should be passed to [PutBuffer]. +func NewBuffer(buf []byte) *bytes.Buffer { + wr := bytesBufferPool.Get().(*bytes.Buffer) + // Reinitialize the Buffer with a new backing slice since it is returned to + // the caller by wr.Bytes() below. Pooling is faster despite calling + // NewBuffer. The pooled alloc is still reused, it only needs to be zeroed. + *wr = *bytes.NewBuffer(buf) + return wr +} + +// PutBuffer releases a buffer to the pool. +func PutBuffer(buf *bytes.Buffer) { + // Release reference to the backing buffer. + *buf = *bytes.NewBuffer(nil) + bytesBufferPool.Put(buf) +} diff --git a/vendor/github.com/cilium/ebpf/internal/deque.go b/vendor/github.com/cilium/ebpf/internal/deque.go new file mode 100644 index 0000000000..e3a3050215 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/deque.go @@ -0,0 +1,91 @@ +package internal + +import "math/bits" + +// Deque implements a double ended queue. +type Deque[T any] struct { + elems []T + read, write uint64 + mask uint64 +} + +// Reset clears the contents of the deque while retaining the backing buffer. +func (dq *Deque[T]) Reset() { + var zero T + + for i := dq.read; i < dq.write; i++ { + dq.elems[i&dq.mask] = zero + } + + dq.read, dq.write = 0, 0 +} + +func (dq *Deque[T]) Empty() bool { + return dq.read == dq.write +} + +// Push adds an element to the end. +func (dq *Deque[T]) Push(e T) { + dq.Grow(1) + dq.elems[dq.write&dq.mask] = e + dq.write++ +} + +// Shift returns the first element or the zero value. +func (dq *Deque[T]) Shift() T { + var zero T + + if dq.Empty() { + return zero + } + + index := dq.read & dq.mask + t := dq.elems[index] + dq.elems[index] = zero + dq.read++ + return t +} + +// Pop returns the last element or the zero value. +func (dq *Deque[T]) Pop() T { + var zero T + + if dq.Empty() { + return zero + } + + dq.write-- + index := dq.write & dq.mask + t := dq.elems[index] + dq.elems[index] = zero + return t +} + +// Grow the deque's capacity, if necessary, to guarantee space for another n +// elements. +func (dq *Deque[T]) Grow(n int) { + have := dq.write - dq.read + need := have + uint64(n) + if need < have { + panic("overflow") + } + if uint64(len(dq.elems)) >= need { + return + } + + // Round up to the new power of two which is at least 8. + // See https://jameshfisher.com/2018/03/30/round-up-power-2/ + capacity := 1 << (64 - bits.LeadingZeros64(need-1)) + if capacity < 8 { + capacity = 8 + } + + elems := make([]T, have, capacity) + pivot := dq.read & dq.mask + copied := copy(elems, dq.elems[pivot:]) + copy(elems[copied:], dq.elems[:pivot]) + + dq.elems = elems[:capacity] + dq.mask = uint64(capacity) - 1 + dq.read, dq.write = 0, have +} diff --git a/vendor/github.com/cilium/ebpf/internal/elf.go b/vendor/github.com/cilium/ebpf/internal/elf.go new file mode 100644 index 0000000000..011581938d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/elf.go @@ -0,0 +1,102 @@ +package internal + +import ( + "debug/elf" + "fmt" + "io" +) + +type SafeELFFile struct { + *elf.File +} + +// NewSafeELFFile reads an ELF safely. +// +// Any panic during parsing is turned into an error. This is necessary since +// there are a bunch of unfixed bugs in debug/elf. +// +// https://github.com/golang/go/issues?q=is%3Aissue+is%3Aopen+debug%2Felf+in%3Atitle +func NewSafeELFFile(r io.ReaderAt) (safe *SafeELFFile, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + safe = nil + err = fmt.Errorf("reading ELF file panicked: %s", r) + }() + + file, err := elf.NewFile(r) + if err != nil { + return nil, err + } + + return &SafeELFFile{file}, nil +} + +// OpenSafeELFFile reads an ELF from a file. +// +// It works like NewSafeELFFile, with the exception that safe.Close will +// close the underlying file. +func OpenSafeELFFile(path string) (safe *SafeELFFile, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + safe = nil + err = fmt.Errorf("reading ELF file panicked: %s", r) + }() + + file, err := elf.Open(path) + if err != nil { + return nil, err + } + + return &SafeELFFile{file}, nil +} + +// Symbols is the safe version of elf.File.Symbols. +func (se *SafeELFFile) Symbols() (syms []elf.Symbol, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + syms = nil + err = fmt.Errorf("reading ELF symbols panicked: %s", r) + }() + + syms, err = se.File.Symbols() + return +} + +// DynamicSymbols is the safe version of elf.File.DynamicSymbols. +func (se *SafeELFFile) DynamicSymbols() (syms []elf.Symbol, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + syms = nil + err = fmt.Errorf("reading ELF dynamic symbols panicked: %s", r) + }() + + syms, err = se.File.DynamicSymbols() + return +} + +// SectionsByType returns all sections in the file with the specified section type. +func (se *SafeELFFile) SectionsByType(typ elf.SectionType) []*elf.Section { + sections := make([]*elf.Section, 0, 1) + for _, section := range se.Sections { + if section.Type == typ { + sections = append(sections, section) + } + } + return sections +} diff --git a/vendor/github.com/cilium/ebpf/internal/endian_be.go b/vendor/github.com/cilium/ebpf/internal/endian_be.go new file mode 100644 index 0000000000..a37777f21f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/endian_be.go @@ -0,0 +1,9 @@ +//go:build armbe || arm64be || mips || mips64 || mips64p32 || ppc64 || s390 || s390x || sparc || sparc64 + +package internal + +import "encoding/binary" + +// NativeEndian is set to either binary.BigEndian or binary.LittleEndian, +// depending on the host's endianness. +var NativeEndian = binary.BigEndian diff --git a/vendor/github.com/cilium/ebpf/internal/endian_le.go b/vendor/github.com/cilium/ebpf/internal/endian_le.go new file mode 100644 index 0000000000..6dcd916d5d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/endian_le.go @@ -0,0 +1,9 @@ +//go:build 386 || amd64 || amd64p32 || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || ppc64le || riscv64 + +package internal + +import "encoding/binary" + +// NativeEndian is set to either binary.BigEndian or binary.LittleEndian, +// depending on the host's endianness. +var NativeEndian = binary.LittleEndian diff --git a/vendor/github.com/cilium/ebpf/internal/errors.go b/vendor/github.com/cilium/ebpf/internal/errors.go new file mode 100644 index 0000000000..19d5294ca0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/errors.go @@ -0,0 +1,179 @@ +package internal + +import ( + "bytes" + "fmt" + "io" + "strings" +) + +// ErrorWithLog wraps err in a VerifierError that includes the parsed verifier +// log buffer. +// +// The default error output is a summary of the full log. The latter can be +// accessed via VerifierError.Log or by formatting the error, see Format. +func ErrorWithLog(source string, err error, log []byte) *VerifierError { + const whitespace = "\t\r\v\n " + + // Convert verifier log C string by truncating it on the first 0 byte + // and trimming trailing whitespace before interpreting as a Go string. + if i := bytes.IndexByte(log, 0); i != -1 { + log = log[:i] + } + + log = bytes.Trim(log, whitespace) + if len(log) == 0 { + return &VerifierError{source, err, nil} + } + + logLines := bytes.Split(log, []byte{'\n'}) + lines := make([]string, 0, len(logLines)) + for _, line := range logLines { + // Don't remove leading white space on individual lines. We rely on it + // when outputting logs. + lines = append(lines, string(bytes.TrimRight(line, whitespace))) + } + + return &VerifierError{source, err, lines} +} + +// VerifierError includes information from the eBPF verifier. +// +// It summarises the log output, see Format if you want to output the full contents. +type VerifierError struct { + source string + // The error which caused this error. + Cause error + // The verifier output split into lines. + Log []string +} + +func (le *VerifierError) Unwrap() error { + return le.Cause +} + +func (le *VerifierError) Error() string { + log := le.Log + if n := len(log); n > 0 && strings.HasPrefix(log[n-1], "processed ") { + // Get rid of "processed 39 insns (limit 1000000) ..." from summary. + log = log[:n-1] + } + + var b strings.Builder + fmt.Fprintf(&b, "%s: %s", le.source, le.Cause.Error()) + + n := len(log) + if n == 0 { + return b.String() + } + + lines := log[n-1:] + if n >= 2 && includePreviousLine(log[n-1]) { + // Add one more line of context if it aids understanding the error. + lines = log[n-2:] + } + + for _, line := range lines { + b.WriteString(": ") + b.WriteString(strings.TrimSpace(line)) + } + + omitted := len(le.Log) - len(lines) + if omitted > 0 { + fmt.Fprintf(&b, " (%d line(s) omitted)", omitted) + } + + return b.String() +} + +// includePreviousLine returns true if the given line likely is better +// understood with additional context from the preceding line. +func includePreviousLine(line string) bool { + // We need to find a good trade off between understandable error messages + // and too much complexity here. Checking the string prefix is ok, requiring + // regular expressions to do it is probably overkill. + + if strings.HasPrefix(line, "\t") { + // [13] STRUCT drm_rect size=16 vlen=4 + // \tx1 type_id=2 + return true + } + + if len(line) >= 2 && line[0] == 'R' && line[1] >= '0' && line[1] <= '9' { + // 0: (95) exit + // R0 !read_ok + return true + } + + if strings.HasPrefix(line, "invalid bpf_context access") { + // 0: (79) r6 = *(u64 *)(r1 +0) + // func '__x64_sys_recvfrom' arg0 type FWD is not a struct + // invalid bpf_context access off=0 size=8 + return true + } + + return false +} + +// Format the error. +// +// Understood verbs are %s and %v, which are equivalent to calling Error(). %v +// allows outputting additional information using the following flags: +// +// %+v: Output the first lines, or all lines if no width is given. +// %-v: Output the last lines, or all lines if no width is given. +// +// Use width to specify how many lines to output. Use the '-' flag to output +// lines from the end of the log instead of the beginning. +func (le *VerifierError) Format(f fmt.State, verb rune) { + switch verb { + case 's': + _, _ = io.WriteString(f, le.Error()) + + case 'v': + n, haveWidth := f.Width() + if !haveWidth || n > len(le.Log) { + n = len(le.Log) + } + + if !f.Flag('+') && !f.Flag('-') { + if haveWidth { + _, _ = io.WriteString(f, "%!v(BADWIDTH)") + return + } + + _, _ = io.WriteString(f, le.Error()) + return + } + + if f.Flag('+') && f.Flag('-') { + _, _ = io.WriteString(f, "%!v(BADFLAG)") + return + } + + fmt.Fprintf(f, "%s: %s:", le.source, le.Cause.Error()) + + omitted := len(le.Log) - n + lines := le.Log[:n] + if f.Flag('-') { + // Print last instead of first lines. + lines = le.Log[len(le.Log)-n:] + if omitted > 0 { + fmt.Fprintf(f, "\n\t(%d line(s) omitted)", omitted) + } + } + + for _, line := range lines { + fmt.Fprintf(f, "\n\t%s", line) + } + + if !f.Flag('-') { + if omitted > 0 { + fmt.Fprintf(f, "\n\t(%d line(s) omitted)", omitted) + } + } + + default: + fmt.Fprintf(f, "%%!%c(BADVERB)", verb) + } +} diff --git a/vendor/github.com/cilium/ebpf/internal/feature.go b/vendor/github.com/cilium/ebpf/internal/feature.go new file mode 100644 index 0000000000..6f64822e11 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/feature.go @@ -0,0 +1,227 @@ +package internal + +import ( + "errors" + "fmt" + "runtime" + "strings" + "sync" +) + +// ErrNotSupported indicates that a feature is not supported. +var ErrNotSupported = errors.New("not supported") + +// ErrNotSupportedOnOS indicates that a feature is not supported on the current +// operating system. +var ErrNotSupportedOnOS = fmt.Errorf("%w on %s", ErrNotSupported, runtime.GOOS) + +// UnsupportedFeatureError is returned by FeatureTest() functions. +type UnsupportedFeatureError struct { + // The minimum version required for this feature. + // + // On Linux this refers to the mainline kernel version, on other platforms + // to the version of the runtime. + // + // Used for the error string, and for sanity checking during testing. + MinimumVersion Version + + // The name of the feature that isn't supported. + Name string +} + +func (ufe *UnsupportedFeatureError) Error() string { + if ufe.MinimumVersion.Unspecified() { + return fmt.Sprintf("%s not supported", ufe.Name) + } + return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion) +} + +// Is indicates that UnsupportedFeatureError is ErrNotSupported. +func (ufe *UnsupportedFeatureError) Is(target error) bool { + return target == ErrNotSupported +} + +// FeatureTest caches the result of a [FeatureTestFn]. +// +// Fields should not be modified after creation. +type FeatureTest struct { + // The name of the feature being detected. + Name string + // Version in the form Major.Minor[.Patch]. + Version string + // The feature test itself. + Fn FeatureTestFn + + mu sync.RWMutex + done bool + result error +} + +// FeatureTestFn is used to determine whether the kernel supports +// a certain feature. +// +// The return values have the following semantics: +// +// err == ErrNotSupported: the feature is not available +// err == nil: the feature is available +// err != nil: the test couldn't be executed +type FeatureTestFn func() error + +// NewFeatureTest is a convenient way to create a single [FeatureTest]. +// +// versions specifies in which version of a BPF runtime a feature appeared. +// The format is "GOOS:Major.Minor[.Patch]". GOOS may be omitted when targeting +// Linux. Returns [ErrNotSupportedOnOS] if there is no version specified for the +// current OS. +func NewFeatureTest(name string, fn FeatureTestFn, versions ...string) func() error { + const nativePrefix = runtime.GOOS + ":" + + if len(versions) == 0 { + return func() error { + return fmt.Errorf("feature test %q: no versions specified", name) + } + } + + ft := &FeatureTest{ + Name: name, + Fn: fn, + } + + for _, version := range versions { + if strings.HasPrefix(version, nativePrefix) { + ft.Version = strings.TrimPrefix(version, nativePrefix) + break + } + + if OnLinux && !strings.ContainsRune(version, ':') { + // Allow version numbers without a GOOS prefix on Linux. + ft.Version = version + break + } + } + + if ft.Version == "" { + return func() error { + // We don't return an UnsupportedFeatureError here, since that will + // trigger version checks which don't make sense. + return fmt.Errorf("%s: %w", name, ErrNotSupportedOnOS) + } + } + + return ft.execute +} + +// execute the feature test. +// +// The result is cached if the test is conclusive. +// +// See [FeatureTestFn] for the meaning of the returned error. +func (ft *FeatureTest) execute() error { + ft.mu.RLock() + result, done := ft.result, ft.done + ft.mu.RUnlock() + + if done { + return result + } + + ft.mu.Lock() + defer ft.mu.Unlock() + + // The test may have been executed by another caller while we were + // waiting to acquire ft.mu. + if ft.done { + return ft.result + } + + err := ft.Fn() + if err == nil { + ft.done = true + return nil + } + + if errors.Is(err, ErrNotSupported) { + var v Version + if ft.Version != "" { + v, err = NewVersion(ft.Version) + if err != nil { + return fmt.Errorf("feature %s: %w", ft.Name, err) + } + } + + ft.done = true + ft.result = &UnsupportedFeatureError{ + MinimumVersion: v, + Name: ft.Name, + } + + return ft.result + } + + // We couldn't execute the feature test to a point + // where it could make a determination. + // Don't cache the result, just return it. + return fmt.Errorf("detect support for %s: %w", ft.Name, err) +} + +// FeatureMatrix groups multiple related feature tests into a map. +// +// Useful when there is a small number of discrete features which are known +// at compile time. +// +// It must not be modified concurrently with calling [FeatureMatrix.Result]. +type FeatureMatrix[K comparable] map[K]*FeatureTest + +// Result returns the outcome of the feature test for the given key. +// +// It's safe to call this function concurrently. +func (fm FeatureMatrix[K]) Result(key K) error { + ft, ok := fm[key] + if !ok { + return fmt.Errorf("no feature probe for %v", key) + } + + return ft.execute() +} + +// FeatureCache caches a potentially unlimited number of feature probes. +// +// Useful when there is a high cardinality for a feature test. +type FeatureCache[K comparable] struct { + mu sync.RWMutex + newTest func(K) *FeatureTest + features map[K]*FeatureTest +} + +func NewFeatureCache[K comparable](newTest func(K) *FeatureTest) *FeatureCache[K] { + return &FeatureCache[K]{ + newTest: newTest, + features: make(map[K]*FeatureTest), + } +} + +func (fc *FeatureCache[K]) Result(key K) error { + // NB: Executing the feature test happens without fc.mu taken. + return fc.retrieve(key).execute() +} + +func (fc *FeatureCache[K]) retrieve(key K) *FeatureTest { + fc.mu.RLock() + ft := fc.features[key] + fc.mu.RUnlock() + + if ft != nil { + return ft + } + + fc.mu.Lock() + defer fc.mu.Unlock() + + if ft := fc.features[key]; ft != nil { + return ft + } + + ft = fc.newTest(key) + fc.features[key] = ft + return ft +} diff --git a/vendor/github.com/cilium/ebpf/internal/goos.go b/vendor/github.com/cilium/ebpf/internal/goos.go new file mode 100644 index 0000000000..0569d5ce0b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/goos.go @@ -0,0 +1,7 @@ +package internal + +import "runtime" + +const ( + OnLinux = runtime.GOOS == "linux" +) diff --git a/vendor/github.com/cilium/ebpf/internal/io.go b/vendor/github.com/cilium/ebpf/internal/io.go new file mode 100644 index 0000000000..1eaf4775ad --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/io.go @@ -0,0 +1,128 @@ +package internal + +import ( + "bufio" + "bytes" + "compress/gzip" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "sync" +) + +// NewBufferedSectionReader wraps an io.ReaderAt in an appropriately-sized +// buffered reader. It is a convenience function for reading subsections of +// ELF sections while minimizing the amount of read() syscalls made. +// +// Syscall overhead is non-negligible in continuous integration context +// where ELFs might be accessed over virtual filesystems with poor random +// access performance. Buffering reads makes sense because (sub)sections +// end up being read completely anyway. +// +// Use instead of the r.Seek() + io.LimitReader() pattern. +func NewBufferedSectionReader(ra io.ReaderAt, off, n int64) *bufio.Reader { + // Clamp the size of the buffer to one page to avoid slurping large parts + // of a file into memory. bufio.NewReader uses a hardcoded default buffer + // of 4096. Allow arches with larger pages to allocate more, but don't + // allocate a fixed 4k buffer if we only need to read a small segment. + buf := n + if ps := int64(os.Getpagesize()); n > ps { + buf = ps + } + + return bufio.NewReaderSize(io.NewSectionReader(ra, off, n), int(buf)) +} + +// DiscardZeroes makes sure that all written bytes are zero +// before discarding them. +type DiscardZeroes struct{} + +func (DiscardZeroes) Write(p []byte) (int, error) { + for _, b := range p { + if b != 0 { + return 0, errors.New("encountered non-zero byte") + } + } + return len(p), nil +} + +// ReadAllCompressed decompresses a gzipped file into memory. +func ReadAllCompressed(file string) ([]byte, error) { + fh, err := os.Open(file) + if err != nil { + return nil, err + } + defer fh.Close() + + gz, err := gzip.NewReader(fh) + if err != nil { + return nil, err + } + defer gz.Close() + + return io.ReadAll(gz) +} + +// ReadUint64FromFile reads a uint64 from a file. +// +// format specifies the contents of the file in fmt.Scanf syntax. +func ReadUint64FromFile(format string, path ...string) (uint64, error) { + filename := filepath.Join(path...) + data, err := os.ReadFile(filename) + if err != nil { + return 0, fmt.Errorf("reading file %q: %w", filename, err) + } + + var value uint64 + n, err := fmt.Fscanf(bytes.NewReader(data), format, &value) + if err != nil { + return 0, fmt.Errorf("parsing file %q: %w", filename, err) + } + if n != 1 { + return 0, fmt.Errorf("parsing file %q: expected 1 item, got %d", filename, n) + } + + return value, nil +} + +type uint64FromFileKey struct { + format, path string +} + +var uint64FromFileCache = struct { + sync.RWMutex + values map[uint64FromFileKey]uint64 +}{ + values: map[uint64FromFileKey]uint64{}, +} + +// ReadUint64FromFileOnce is like readUint64FromFile but memoizes the result. +func ReadUint64FromFileOnce(format string, path ...string) (uint64, error) { + filename := filepath.Join(path...) + key := uint64FromFileKey{format, filename} + + uint64FromFileCache.RLock() + if value, ok := uint64FromFileCache.values[key]; ok { + uint64FromFileCache.RUnlock() + return value, nil + } + uint64FromFileCache.RUnlock() + + value, err := ReadUint64FromFile(format, filename) + if err != nil { + return 0, err + } + + uint64FromFileCache.Lock() + defer uint64FromFileCache.Unlock() + + if value, ok := uint64FromFileCache.values[key]; ok { + // Someone else got here before us, use what is cached. + return value, nil + } + + uint64FromFileCache.values[key] = value + return value, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/kallsyms/cache.go b/vendor/github.com/cilium/ebpf/internal/kallsyms/cache.go new file mode 100644 index 0000000000..b7f3e0b781 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/kallsyms/cache.go @@ -0,0 +1,20 @@ +package kallsyms + +import "sync" + +type cache[K, V comparable] struct { + m sync.Map +} + +func (c *cache[K, V]) Load(key K) (value V, _ bool) { + v, ok := c.m.Load(key) + if !ok { + return value, false + } + value = v.(V) + return value, true +} + +func (c *cache[K, V]) Store(key K, value V) { + c.m.Store(key, value) +} diff --git a/vendor/github.com/cilium/ebpf/internal/kallsyms/kallsyms.go b/vendor/github.com/cilium/ebpf/internal/kallsyms/kallsyms.go new file mode 100644 index 0000000000..2cdfb56a58 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/kallsyms/kallsyms.go @@ -0,0 +1,289 @@ +package kallsyms + +import ( + "errors" + "fmt" + "io" + "os" + "slices" + "strconv" + "strings" + + "github.com/cilium/ebpf/internal" +) + +var errAmbiguousKsym = errors.New("multiple kernel symbols with the same name") + +var symAddrs cache[string, uint64] +var symModules cache[string, string] + +// Module returns the kernel module providing the given symbol in the kernel, if +// any. Returns an empty string and no error if the symbol is not present in the +// kernel. Only function symbols are considered. Returns an error if multiple +// symbols with the same name were found. +// +// Consider [AssignModules] if you need to resolve multiple symbols, as it will +// only perform one iteration over /proc/kallsyms. +func Module(name string) (string, error) { + if name == "" { + return "", nil + } + + if mod, ok := symModules.Load(name); ok { + return mod, nil + } + + request := map[string]string{name: ""} + if err := AssignModules(request); err != nil { + return "", err + } + + return request[name], nil +} + +// AssignModules looks up the kernel module providing each given symbol, if any, +// and assigns them to their corresponding values in the symbols map. Only +// function symbols are considered. Results of all lookups are cached, +// successful or otherwise. +// +// Any symbols missing in the kernel are ignored. Returns an error if multiple +// symbols with a given name were found. +func AssignModules(symbols map[string]string) error { + if !internal.OnLinux { + return fmt.Errorf("read /proc/kallsyms: %w", internal.ErrNotSupportedOnOS) + } + + if len(symbols) == 0 { + return nil + } + + // Attempt to fetch symbols from cache. + request := make(map[string]string) + for name := range symbols { + if mod, ok := symModules.Load(name); ok { + symbols[name] = mod + continue + } + + // Mark the symbol to be read from /proc/kallsyms. + request[name] = "" + } + if len(request) == 0 { + // All symbols satisfied from cache. + return nil + } + + f, err := os.Open("/proc/kallsyms") + if err != nil { + return err + } + defer f.Close() + + if err := assignModules(f, request); err != nil { + return fmt.Errorf("assigning symbol modules: %w", err) + } + + // Update the cache with the new symbols. Cache all requested symbols, even if + // they're missing or don't belong to a module. + for name, mod := range request { + symModules.Store(name, mod) + symbols[name] = mod + } + + return nil +} + +// assignModules assigns kernel symbol modules read from f to values requested +// by symbols. Always scans the whole input to make sure the user didn't request +// an ambiguous symbol. +func assignModules(f io.Reader, symbols map[string]string) error { + if len(symbols) == 0 { + return nil + } + + found := make(map[string]struct{}) + r := newReader(f) + for r.Line() { + // Only look for function symbols in the kernel's text section (tT). + s, err, skip := parseSymbol(r, []rune{'t', 'T'}) + if err != nil { + return fmt.Errorf("parsing kallsyms line: %w", err) + } + if skip { + continue + } + + if _, requested := symbols[s.name]; !requested { + continue + } + + if _, ok := found[s.name]; ok { + // We've already seen this symbol. Return an error to avoid silently + // attaching to a symbol in the wrong module. libbpf also rejects + // referring to ambiguous symbols. + // + // We can't simply check if we already have a value for the given symbol, + // since many won't have an associated kernel module. + return fmt.Errorf("symbol %s: duplicate found at address 0x%x (module %q): %w", + s.name, s.addr, s.mod, errAmbiguousKsym) + } + + symbols[s.name] = s.mod + found[s.name] = struct{}{} + } + if err := r.Err(); err != nil { + return fmt.Errorf("reading kallsyms: %w", err) + } + + return nil +} + +// Address returns the address of the given symbol in the kernel. Returns 0 and +// no error if the symbol is not present. Returns an error if multiple addresses +// were found for a symbol. +// +// Consider [AssignAddresses] if you need to resolve multiple symbols, as it +// will only perform one iteration over /proc/kallsyms. +func Address(symbol string) (uint64, error) { + if symbol == "" { + return 0, nil + } + + if addr, ok := symAddrs.Load(symbol); ok { + return addr, nil + } + + request := map[string]uint64{symbol: 0} + if err := AssignAddresses(request); err != nil { + return 0, err + } + + return request[symbol], nil +} + +// AssignAddresses looks up the addresses of the requested symbols in the kernel +// and assigns them to their corresponding values in the symbols map. Results +// of all lookups are cached, successful or otherwise. +// +// Any symbols missing in the kernel are ignored. Returns an error if multiple +// addresses were found for a symbol. +func AssignAddresses(symbols map[string]uint64) error { + if !internal.OnLinux { + return fmt.Errorf("read /proc/kallsyms: %w", internal.ErrNotSupportedOnOS) + } + + if len(symbols) == 0 { + return nil + } + + // Attempt to fetch symbols from cache. + request := make(map[string]uint64) + for name := range symbols { + if addr, ok := symAddrs.Load(name); ok { + symbols[name] = addr + continue + } + + // Mark the symbol to be read from /proc/kallsyms. + request[name] = 0 + } + if len(request) == 0 { + // All symbols satisfied from cache. + return nil + } + + f, err := os.Open("/proc/kallsyms") + if err != nil { + return err + } + defer f.Close() + + if err := assignAddresses(f, request); err != nil { + return fmt.Errorf("loading symbol addresses: %w", err) + } + + // Update the cache with the new symbols. Cache all requested symbols even if + // they weren't found, to avoid repeated lookups. + for name, addr := range request { + symAddrs.Store(name, addr) + symbols[name] = addr + } + + return nil +} + +// assignAddresses assigns kernel symbol addresses read from f to values +// requested by symbols. Always scans the whole input to make sure the user +// didn't request an ambiguous symbol. +func assignAddresses(f io.Reader, symbols map[string]uint64) error { + if len(symbols) == 0 { + return nil + } + r := newReader(f) + for r.Line() { + s, err, skip := parseSymbol(r, nil) + if err != nil { + return fmt.Errorf("parsing kallsyms line: %w", err) + } + if skip { + continue + } + + existing, requested := symbols[s.name] + if existing != 0 { + // Multiple addresses for a symbol have been found. Return a friendly + // error to avoid silently attaching to the wrong symbol. libbpf also + // rejects referring to ambiguous symbols. + return fmt.Errorf("symbol %s(0x%x): duplicate found at address 0x%x: %w", s.name, existing, s.addr, errAmbiguousKsym) + } + if requested { + symbols[s.name] = s.addr + } + } + if err := r.Err(); err != nil { + return fmt.Errorf("reading kallsyms: %w", err) + } + + return nil +} + +type ksym struct { + addr uint64 + name string + mod string +} + +// parseSymbol parses a line from /proc/kallsyms into an address, type, name and +// module. Skip will be true if the symbol doesn't match any of the given symbol +// types. See `man 1 nm` for all available types. +// +// Example line: `ffffffffc1682010 T nf_nat_init [nf_nat]` +func parseSymbol(r *reader, types []rune) (s ksym, err error, skip bool) { + for i := 0; r.Word(); i++ { + switch i { + // Address of the symbol. + case 0: + s.addr, err = strconv.ParseUint(r.Text(), 16, 64) + if err != nil { + return s, fmt.Errorf("parsing address: %w", err), false + } + // Type of the symbol. Assume the character is ASCII-encoded by converting + // it directly to a rune, since it's a fixed field controlled by the kernel. + case 1: + if len(types) > 0 && !slices.Contains(types, rune(r.Bytes()[0])) { + return s, nil, true + } + // Name of the symbol. + case 2: + s.name = r.Text() + // Kernel module the symbol is provided by. + case 3: + s.mod = strings.Trim(r.Text(), "[]") + // Ignore any future fields. + default: + break + } + } + + return +} diff --git a/vendor/github.com/cilium/ebpf/internal/kallsyms/reader.go b/vendor/github.com/cilium/ebpf/internal/kallsyms/reader.go new file mode 100644 index 0000000000..2bd4f8eafc --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/kallsyms/reader.go @@ -0,0 +1,118 @@ +package kallsyms + +import ( + "bufio" + "io" + "unicode" + "unicode/utf8" +) + +// reader is a line and word-oriented reader built for reading /proc/kallsyms. +// It takes an io.Reader and iterates its contents line by line, then word by +// word. +// +// It's designed to allow partial reading of lines without paying the cost of +// allocating objects that will never be accessed, resulting in less work for +// the garbage collector. +type reader struct { + s *bufio.Scanner + line []byte + word []byte + + err error +} + +func newReader(r io.Reader) *reader { + return &reader{ + s: bufio.NewScanner(r), + } +} + +// Bytes returns the current word as a byte slice. +func (r *reader) Bytes() []byte { + return r.word +} + +// Text returns the output of Bytes as a string. +func (r *reader) Text() string { + return string(r.Bytes()) +} + +// Line advances the reader to the next line in the input. Calling Line resets +// the current word, making [reader.Bytes] and [reader.Text] return empty +// values. Follow this up with a call to [reader.Word]. +// +// Like [bufio.Scanner], [reader.Err] needs to be checked after Line returns +// false to determine if an error occurred during reading. +// +// Returns true if Line can be called again. Returns false if all lines in the +// input have been read. +func (r *reader) Line() bool { + for r.s.Scan() { + line := r.s.Bytes() + if len(line) == 0 { + continue + } + + r.line = line + r.word = nil + + return true + } + if err := r.s.Err(); err != nil { + r.err = err + } + + return false +} + +// Word advances the reader to the next word in the current line. +// +// Returns true if a word is found and Word should be called again. Returns +// false when all words on the line have been read. +func (r *reader) Word() bool { + if len(r.line) == 0 { + return false + } + + // Find next word start, skipping leading spaces. + start := 0 + for width := 0; start < len(r.line); start += width { + var c rune + c, width = utf8.DecodeRune(r.line[start:]) + if !unicode.IsSpace(c) { + break + } + } + + // Whitespace scanning reached the end of the line due to trailing whitespace, + // meaning there are no more words to read + if start == len(r.line) { + return false + } + + // Find next word end. + for width, i := 0, start; i < len(r.line); i += width { + var c rune + c, width = utf8.DecodeRune(r.line[i:]) + if unicode.IsSpace(c) { + r.word = r.line[start:i] + r.line = r.line[i:] + return true + } + } + + // The line contains data, but no end-of-word boundary was found. This is the + // last, unterminated word in the line. + if len(r.line) > start { + r.word = r.line[start:] + r.line = nil + return true + } + + return false +} + +func (r *reader) Err() error { + return r.err +} diff --git a/vendor/github.com/cilium/ebpf/internal/kconfig/kconfig.go b/vendor/github.com/cilium/ebpf/internal/kconfig/kconfig.go new file mode 100644 index 0000000000..29c62b6266 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/kconfig/kconfig.go @@ -0,0 +1,274 @@ +// Package kconfig implements a parser for the format of Linux's .config file. +package kconfig + +import ( + "bufio" + "bytes" + "compress/gzip" + "fmt" + "io" + "math" + "strconv" + "strings" + + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" +) + +// Parse parses the kconfig file for which a reader is given. +// All the CONFIG_* which are in filter and which are set set will be +// put in the returned map as key with their corresponding value as map value. +// If filter is nil, no filtering will occur. +// If the kconfig file is not valid, error will be returned. +func Parse(source io.ReaderAt, filter map[string]struct{}) (map[string]string, error) { + var r io.Reader + zr, err := gzip.NewReader(io.NewSectionReader(source, 0, math.MaxInt64)) + if err != nil { + r = io.NewSectionReader(source, 0, math.MaxInt64) + } else { + // Source is gzip compressed, transparently decompress. + r = zr + } + + ret := make(map[string]string, len(filter)) + + s := bufio.NewScanner(r) + + for s.Scan() { + line := s.Bytes() + err = processKconfigLine(line, ret, filter) + if err != nil { + return nil, fmt.Errorf("cannot parse line: %w", err) + } + + if filter != nil && len(ret) == len(filter) { + break + } + } + + if err := s.Err(); err != nil { + return nil, fmt.Errorf("cannot parse: %w", err) + } + + if zr != nil { + return ret, zr.Close() + } + + return ret, nil +} + +// Golang translation of libbpf bpf_object__process_kconfig_line(): +// https://github.com/libbpf/libbpf/blob/fbd60dbff51c870f5e80a17c4f2fd639eb80af90/src/libbpf.c#L1874 +// It does the same checks but does not put the data inside the BPF map. +func processKconfigLine(line []byte, m map[string]string, filter map[string]struct{}) error { + // Ignore empty lines and "# CONFIG_* is not set". + if !bytes.HasPrefix(line, []byte("CONFIG_")) { + return nil + } + + key, value, found := bytes.Cut(line, []byte{'='}) + if !found { + return fmt.Errorf("line %q does not contain separator '='", line) + } + + if len(value) == 0 { + return fmt.Errorf("line %q has no value", line) + } + + if filter != nil { + // NB: map[string(key)] gets special optimisation help from the compiler + // and doesn't allocate. Don't turn this into a variable. + _, ok := filter[string(key)] + if !ok { + return nil + } + } + + // This can seem odd, but libbpf only sets the value the first time the key is + // met: + // https://github.com/torvalds/linux/blob/0d85b27b0cc6/tools/lib/bpf/libbpf.c#L1906-L1908 + _, ok := m[string(key)] + if !ok { + m[string(key)] = string(value) + } + + return nil +} + +// PutValue translates the value given as parameter depending on the BTF +// type, the translated value is then written to the byte array. +func PutValue(data []byte, typ btf.Type, value string) error { + typ = btf.UnderlyingType(typ) + + switch value { + case "y", "n", "m": + return putValueTri(data, typ, value) + } + + if strings.HasPrefix(value, `"`) { + return putValueString(data, typ, value) + } + + return putValueNumber(data, typ, value) +} + +// Golang translation of libbpf_tristate enum: +// https://github.com/libbpf/libbpf/blob/fbd60dbff51c870f5e80a17c4f2fd639eb80af90/src/bpf_helpers.h#L169 +type triState int + +const ( + TriNo triState = 0 + TriYes triState = 1 + TriModule triState = 2 +) + +func putValueTri(data []byte, typ btf.Type, value string) error { + switch v := typ.(type) { + case *btf.Int: + if v.Encoding != btf.Bool { + return fmt.Errorf("cannot add tri value, expected btf.Bool, got: %v", v.Encoding) + } + + if v.Size != 1 { + return fmt.Errorf("cannot add tri value, expected size of 1 byte, got: %d", v.Size) + } + + switch value { + case "y": + data[0] = 1 + case "n": + data[0] = 0 + default: + return fmt.Errorf("cannot use %q for btf.Bool", value) + } + case *btf.Enum: + if v.Name != "libbpf_tristate" { + return fmt.Errorf("cannot use enum %q, only libbpf_tristate is supported", v.Name) + } + + if len(data) != 4 { + return fmt.Errorf("expected enum value to occupy 4 bytes in datasec, got: %d", len(data)) + } + + var tri triState + switch value { + case "y": + tri = TriYes + case "m": + tri = TriModule + case "n": + tri = TriNo + default: + return fmt.Errorf("value %q is not supported for libbpf_tristate", value) + } + + internal.NativeEndian.PutUint32(data, uint32(tri)) + default: + return fmt.Errorf("cannot add number value, expected btf.Int or btf.Enum, got: %T", v) + } + + return nil +} + +func putValueString(data []byte, typ btf.Type, value string) error { + array, ok := typ.(*btf.Array) + if !ok { + return fmt.Errorf("cannot add string value, expected btf.Array, got %T", array) + } + + contentType, ok := btf.UnderlyingType(array.Type).(*btf.Int) + if !ok { + return fmt.Errorf("cannot add string value, expected array of btf.Int, got %T", contentType) + } + + // Any Int, which is not bool, of one byte could be used to store char: + // https://github.com/torvalds/linux/blob/1a5304fecee5/tools/lib/bpf/libbpf.c#L3637-L3638 + if contentType.Size != 1 && contentType.Encoding != btf.Bool { + return fmt.Errorf("cannot add string value, expected array of btf.Int of size 1, got array of btf.Int of size: %v", contentType.Size) + } + + if !strings.HasPrefix(value, `"`) || !strings.HasSuffix(value, `"`) { + return fmt.Errorf(`value %q must start and finish with '"'`, value) + } + + str := strings.Trim(value, `"`) + + // We need to trim string if the bpf array is smaller. + if uint32(len(str)) >= array.Nelems { + str = str[:array.Nelems] + } + + // Write the string content to .kconfig. + copy(data, str) + + return nil +} + +func putValueNumber(data []byte, typ btf.Type, value string) error { + integer, ok := typ.(*btf.Int) + if !ok { + return fmt.Errorf("cannot add number value, expected *btf.Int, got: %T", integer) + } + + size := integer.Size + sizeInBits := size * 8 + + var n uint64 + var err error + if integer.Encoding == btf.Signed { + parsed, e := strconv.ParseInt(value, 0, int(sizeInBits)) + + n = uint64(parsed) + err = e + } else { + parsed, e := strconv.ParseUint(value, 0, int(sizeInBits)) + + n = uint64(parsed) + err = e + } + + if err != nil { + return fmt.Errorf("cannot parse value: %w", err) + } + + return PutInteger(data, integer, n) +} + +// PutInteger writes n into data. +// +// integer determines how much is written into data and what the valid values +// are. +func PutInteger(data []byte, integer *btf.Int, n uint64) error { + // This function should match set_kcfg_value_num in libbpf. + if integer.Encoding == btf.Bool && n > 1 { + return fmt.Errorf("invalid boolean value: %d", n) + } + + if len(data) < int(integer.Size) { + return fmt.Errorf("can't fit an integer of size %d into a byte slice of length %d", integer.Size, len(data)) + } + + switch integer.Size { + case 1: + if integer.Encoding == btf.Signed && (int64(n) > math.MaxInt8 || int64(n) < math.MinInt8) { + return fmt.Errorf("can't represent %d as a signed integer of size %d", int64(n), integer.Size) + } + data[0] = byte(n) + case 2: + if integer.Encoding == btf.Signed && (int64(n) > math.MaxInt16 || int64(n) < math.MinInt16) { + return fmt.Errorf("can't represent %d as a signed integer of size %d", int64(n), integer.Size) + } + internal.NativeEndian.PutUint16(data, uint16(n)) + case 4: + if integer.Encoding == btf.Signed && (int64(n) > math.MaxInt32 || int64(n) < math.MinInt32) { + return fmt.Errorf("can't represent %d as a signed integer of size %d", int64(n), integer.Size) + } + internal.NativeEndian.PutUint32(data, uint32(n)) + case 8: + internal.NativeEndian.PutUint64(data, uint64(n)) + default: + return fmt.Errorf("size (%d) is not valid, expected: 1, 2, 4 or 8", integer.Size) + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/linux/auxv.go b/vendor/github.com/cilium/ebpf/internal/linux/auxv.go new file mode 100644 index 0000000000..10508fd8fc --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/auxv.go @@ -0,0 +1,62 @@ +package linux + +import ( + "fmt" + "io" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +type auxvPairReader interface { + Close() error + ReadAuxvPair() (uint64, uint64, error) +} + +// See https://elixir.bootlin.com/linux/v6.5.5/source/include/uapi/linux/auxvec.h +const ( + _AT_NULL = 0 // End of vector + _AT_SYSINFO_EHDR = 33 // Offset to vDSO blob in process image +) + +type auxvRuntimeReader struct { + data [][2]uintptr + index int +} + +func (r *auxvRuntimeReader) Close() error { + return nil +} + +func (r *auxvRuntimeReader) ReadAuxvPair() (uint64, uint64, error) { + if r.index >= len(r.data)+2 { + return 0, 0, io.EOF + } + + // we manually add the (_AT_NULL, _AT_NULL) pair at the end + // that is not provided by the go runtime + var tag, value uintptr + if r.index < len(r.data) { + tag, value = r.data[r.index][0], r.data[r.index][1] + } else { + tag, value = _AT_NULL, _AT_NULL + } + r.index += 1 + return uint64(tag), uint64(value), nil +} + +func newAuxvRuntimeReader() (auxvPairReader, error) { + if !internal.OnLinux { + return nil, fmt.Errorf("read auxv from runtime: %w", internal.ErrNotSupportedOnOS) + } + + data, err := unix.Auxv() + if err != nil { + return nil, fmt.Errorf("read auxv from runtime: %w", err) + } + + return &auxvRuntimeReader{ + data: data, + index: 0, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/linux/doc.go b/vendor/github.com/cilium/ebpf/internal/linux/doc.go new file mode 100644 index 0000000000..064e75437d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/doc.go @@ -0,0 +1,2 @@ +// Package linux contains OS specific wrappers around package unix. +package linux diff --git a/vendor/github.com/cilium/ebpf/internal/linux/kconfig.go b/vendor/github.com/cilium/ebpf/internal/linux/kconfig.go new file mode 100644 index 0000000000..1488ecb35c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/kconfig.go @@ -0,0 +1,31 @@ +package linux + +import ( + "fmt" + "os" +) + +// FindKConfig searches for a kconfig file on the host. +// +// It first reads from /boot/config- of the current running kernel and tries +// /proc/config.gz if nothing was found in /boot. +// If none of the file provide a kconfig, it returns an error. +func FindKConfig() (*os.File, error) { + kernelRelease, err := KernelRelease() + if err != nil { + return nil, fmt.Errorf("cannot get kernel release: %w", err) + } + + path := "/boot/config-" + kernelRelease + f, err := os.Open(path) + if err == nil { + return f, nil + } + + f, err = os.Open("/proc/config.gz") + if err == nil { + return f, nil + } + + return nil, fmt.Errorf("neither %s nor /proc/config.gz provide a kconfig", path) +} diff --git a/vendor/github.com/cilium/ebpf/internal/linux/platform.go b/vendor/github.com/cilium/ebpf/internal/linux/platform.go new file mode 100644 index 0000000000..39bdcc51f9 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/platform.go @@ -0,0 +1,43 @@ +package linux + +import ( + "runtime" +) + +// PlatformPrefix returns the platform-dependent syscall wrapper prefix used by +// the linux kernel. +// +// Based on https://github.com/golang/go/blob/master/src/go/build/syslist.go +// and https://github.com/libbpf/libbpf/blob/master/src/libbpf.c#L10047 +func PlatformPrefix() string { + switch runtime.GOARCH { + case "386": + return "__ia32_" + case "amd64", "amd64p32": + return "__x64_" + + case "arm", "armbe": + return "__arm_" + case "arm64", "arm64be": + return "__arm64_" + + case "mips", "mipsle", "mips64", "mips64le", "mips64p32", "mips64p32le": + return "__mips_" + + case "s390": + return "__s390_" + case "s390x": + return "__s390x_" + + case "riscv", "riscv64": + return "__riscv_" + + case "ppc": + return "__powerpc_" + case "ppc64", "ppc64le": + return "__powerpc64_" + + default: + return "" + } +} diff --git a/vendor/github.com/cilium/ebpf/internal/linux/statfs.go b/vendor/github.com/cilium/ebpf/internal/linux/statfs.go new file mode 100644 index 0000000000..e268c06fab --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/statfs.go @@ -0,0 +1,23 @@ +package linux + +import ( + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +func FSType(path string) (int64, error) { + var statfs unix.Statfs_t + if err := unix.Statfs(path, &statfs); err != nil { + return 0, err + } + + fsType := int64(statfs.Type) + if unsafe.Sizeof(statfs.Type) == 4 { + // We're on a 32 bit arch, where statfs.Type is int32. bpfFSType is a + // negative number when interpreted as int32 so we need to cast via + // uint32 to avoid sign extension. + fsType = int64(uint32(statfs.Type)) + } + return fsType, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/linux/vdso.go b/vendor/github.com/cilium/ebpf/internal/linux/vdso.go new file mode 100644 index 0000000000..1d8d0ef6b1 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/vdso.go @@ -0,0 +1,144 @@ +package linux + +import ( + "debug/elf" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "os" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +var ( + errAuxvNoVDSO = errors.New("no vdso address found in auxv") +) + +// vdsoVersion returns the LINUX_VERSION_CODE embedded in the vDSO library +// linked into the current process image. +func vdsoVersion() (uint32, error) { + av, err := newAuxvRuntimeReader() + if err != nil { + return 0, err + } + + defer av.Close() + + vdsoAddr, err := vdsoMemoryAddress(av) + if err != nil { + return 0, fmt.Errorf("finding vDSO memory address: %w", err) + } + + // Use /proc/self/mem rather than unsafe.Pointer tricks. + mem, err := os.Open("/proc/self/mem") + if err != nil { + return 0, fmt.Errorf("opening mem: %w", err) + } + defer mem.Close() + + // Open ELF at provided memory address, as offset into /proc/self/mem. + c, err := vdsoLinuxVersionCode(io.NewSectionReader(mem, int64(vdsoAddr), math.MaxInt64)) + if err != nil { + return 0, fmt.Errorf("reading linux version code: %w", err) + } + + return c, nil +} + +// vdsoMemoryAddress returns the memory address of the vDSO library +// linked into the current process image. r is an io.Reader into an auxv blob. +func vdsoMemoryAddress(r auxvPairReader) (uintptr, error) { + // Loop through all tag/value pairs in auxv until we find `AT_SYSINFO_EHDR`, + // the address of a page containing the virtual Dynamic Shared Object (vDSO). + for { + tag, value, err := r.ReadAuxvPair() + if err != nil { + return 0, err + } + + switch tag { + case _AT_SYSINFO_EHDR: + if value != 0 { + return uintptr(value), nil + } + return 0, fmt.Errorf("invalid vDSO address in auxv") + // _AT_NULL is always the last tag/val pair in the aux vector + // and can be treated like EOF. + case _AT_NULL: + return 0, errAuxvNoVDSO + } + } +} + +// format described at https://www.man7.org/linux/man-pages/man5/elf.5.html in section 'Notes (Nhdr)' +type elfNoteHeader struct { + NameSize int32 + DescSize int32 + Type int32 +} + +// vdsoLinuxVersionCode returns the LINUX_VERSION_CODE embedded in +// the ELF notes section of the binary provided by the reader. +func vdsoLinuxVersionCode(r io.ReaderAt) (uint32, error) { + hdr, err := internal.NewSafeELFFile(r) + if err != nil { + return 0, fmt.Errorf("reading vDSO ELF: %w", err) + } + + sections := hdr.SectionsByType(elf.SHT_NOTE) + if len(sections) == 0 { + return 0, fmt.Errorf("no note section found in vDSO ELF") + } + + for _, sec := range sections { + sr := sec.Open() + var n elfNoteHeader + + // Read notes until we find one named 'Linux'. + for { + if err := binary.Read(sr, hdr.ByteOrder, &n); err != nil { + if errors.Is(err, io.EOF) { + // We looked at all the notes in this section + break + } + return 0, fmt.Errorf("reading note header: %w", err) + } + + // If a note name is defined, it follows the note header. + var name string + if n.NameSize > 0 { + // Read the note name, aligned to 4 bytes. + buf := make([]byte, internal.Align(n.NameSize, 4)) + if err := binary.Read(sr, hdr.ByteOrder, &buf); err != nil { + return 0, fmt.Errorf("reading note name: %w", err) + } + + // Read nul-terminated string. + name = unix.ByteSliceToString(buf[:n.NameSize]) + } + + // If a note descriptor is defined, it follows the name. + // It is possible for a note to have a descriptor but not a name. + if n.DescSize > 0 { + // LINUX_VERSION_CODE is a uint32 value. + if name == "Linux" && n.DescSize == 4 && n.Type == 0 { + var version uint32 + if err := binary.Read(sr, hdr.ByteOrder, &version); err != nil { + return 0, fmt.Errorf("reading note descriptor: %w", err) + } + return version, nil + } + + // Discard the note descriptor if it exists but we're not interested in it. + if _, err := io.CopyN(io.Discard, sr, int64(internal.Align(n.DescSize, 4))); err != nil { + return 0, err + } + } + } + } + + return 0, fmt.Errorf("no Linux note in ELF") +} diff --git a/vendor/github.com/cilium/ebpf/internal/linux/version.go b/vendor/github.com/cilium/ebpf/internal/linux/version.go new file mode 100644 index 0000000000..798dd3fed0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/linux/version.go @@ -0,0 +1,34 @@ +package linux + +import ( + "fmt" + "sync" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +// KernelVersion returns the version of the currently running kernel. +var KernelVersion = sync.OnceValues(detectKernelVersion) + +// detectKernelVersion returns the version of the running kernel. +func detectKernelVersion() (internal.Version, error) { + vc, err := vdsoVersion() + if err != nil { + return internal.Version{}, err + } + return internal.NewVersionFromCode(vc), nil +} + +// KernelRelease returns the release string of the running kernel. +// Its format depends on the Linux distribution and corresponds to directory +// names in /lib/modules by convention. Some examples are 5.15.17-1-lts and +// 4.19.0-16-amd64. +func KernelRelease() (string, error) { + var uname unix.Utsname + if err := unix.Uname(&uname); err != nil { + return "", fmt.Errorf("uname failed: %w", err) + } + + return unix.ByteSliceToString(uname.Release[:]), nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/math.go b/vendor/github.com/cilium/ebpf/internal/math.go new file mode 100644 index 0000000000..10cde66860 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/math.go @@ -0,0 +1,33 @@ +package internal + +// Align returns 'n' updated to 'alignment' boundary. +func Align[I Integer](n, alignment I) I { + return (n + alignment - 1) / alignment * alignment +} + +// IsPow returns true if n is a power of two. +func IsPow[I Integer](n I) bool { + return n != 0 && (n&(n-1)) == 0 +} + +// Between returns the value clamped between a and b. +func Between[I Integer](val, a, b I) I { + lower, upper := a, b + if lower > upper { + upper, lower = a, b + } + + val = min(val, upper) + return max(val, lower) +} + +// Integer represents all possible integer types. +// Remove when x/exp/constraints is moved to the standard library. +type Integer interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr +} + +// List of integer types known by the Go compiler. Used by TestIntegerConstraint +// to warn if a new integer type is introduced. Remove when x/exp/constraints +// is moved to the standard library. +var integers = []string{"int", "int8", "int16", "int32", "int64", "uint", "uint8", "uint16", "uint32", "uint64", "uintptr"} diff --git a/vendor/github.com/cilium/ebpf/internal/output.go b/vendor/github.com/cilium/ebpf/internal/output.go new file mode 100644 index 0000000000..dd6e6cbafe --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/output.go @@ -0,0 +1,97 @@ +package internal + +import ( + "bytes" + "errors" + "go/format" + "go/scanner" + "io" + "reflect" + "strings" + "unicode" +) + +// Identifier turns a C style type or field name into an exportable Go equivalent. +func Identifier(str string) string { + prev := rune(-1) + return strings.Map(func(r rune) rune { + // See https://golang.org/ref/spec#Identifiers + switch { + case unicode.IsLetter(r): + if prev == -1 { + r = unicode.ToUpper(r) + } + + case r == '_': + switch { + // The previous rune was deleted, or we are at the + // beginning of the string. + case prev == -1: + fallthrough + + // The previous rune is a lower case letter or a digit. + case unicode.IsDigit(prev) || (unicode.IsLetter(prev) && unicode.IsLower(prev)): + // delete the current rune, and force the + // next character to be uppercased. + r = -1 + } + + case unicode.IsDigit(r): + + default: + // Delete the current rune. prev is unchanged. + return -1 + } + + prev = r + return r + }, str) +} + +// WriteFormatted outputs a formatted src into out. +// +// If formatting fails it returns an informative error message. +func WriteFormatted(src []byte, out io.Writer) error { + formatted, err := format.Source(src) + if err == nil { + _, err = out.Write(formatted) + return err + } + + var el scanner.ErrorList + if !errors.As(err, &el) { + return err + } + + var nel scanner.ErrorList + for _, err := range el { + if !err.Pos.IsValid() { + nel = append(nel, err) + continue + } + + buf := src[err.Pos.Offset:] + nl := bytes.IndexRune(buf, '\n') + if nl == -1 { + nel = append(nel, err) + continue + } + + err.Msg += ": " + string(buf[:nl]) + nel = append(nel, err) + } + + return nel +} + +// GoTypeName is like %T, but elides the package name. +// +// Pointers to a type are peeled off. +func GoTypeName(t any) string { + rT := reflect.TypeOf(t) + for rT.Kind() == reflect.Pointer { + rT = rT.Elem() + } + // Doesn't return the correct Name for generic types due to https://github.com/golang/go/issues/55924 + return rT.Name() +} diff --git a/vendor/github.com/cilium/ebpf/internal/prog.go b/vendor/github.com/cilium/ebpf/internal/prog.go new file mode 100644 index 0000000000..d629145b62 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/prog.go @@ -0,0 +1,11 @@ +package internal + +// EmptyBPFContext is the smallest-possible BPF input context to be used for +// invoking `Program.{Run,Benchmark,Test}`. +// +// Programs require a context input buffer of at least 15 bytes. Looking in +// net/bpf/test_run.c, bpf_test_init() requires that the input is at least +// ETH_HLEN (14) bytes. As of Linux commit fd18942 ("bpf: Don't redirect packets +// with invalid pkt_len"), it also requires the skb to be non-empty after +// removing the Layer 2 header. +var EmptyBPFContext = make([]byte, 15) diff --git a/vendor/github.com/cilium/ebpf/internal/sys/doc.go b/vendor/github.com/cilium/ebpf/internal/sys/doc.go new file mode 100644 index 0000000000..dfe174448e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/doc.go @@ -0,0 +1,6 @@ +// Package sys contains bindings for the BPF syscall. +package sys + +// Regenerate types.go by invoking go generate in the current directory. + +//go:generate go run github.com/cilium/ebpf/internal/cmd/gentypes ../../btf/testdata/vmlinux.btf.gz diff --git a/vendor/github.com/cilium/ebpf/internal/sys/fd.go b/vendor/github.com/cilium/ebpf/internal/sys/fd.go new file mode 100644 index 0000000000..e2ba43fd3b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/fd.go @@ -0,0 +1,165 @@ +package sys + +import ( + "fmt" + "math" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + + "github.com/cilium/ebpf/internal/testutils/fdtrace" + "github.com/cilium/ebpf/internal/unix" +) + +var ErrClosedFd = unix.EBADF + +type FD struct { + raw int +} + +func newFD(value int) *FD { + fdtrace.TraceFD(value, 1) + + fd := &FD{value} + runtime.SetFinalizer(fd, (*FD).finalize) + return fd +} + +// finalize is set as the FD's runtime finalizer and +// sends a leak trace before calling FD.Close(). +func (fd *FD) finalize() { + if fd.raw < 0 { + return + } + + fdtrace.LeakFD(fd.raw) + + _ = fd.Close() +} + +// NewFD wraps a raw fd with a finalizer. +// +// You must not use the raw fd after calling this function, since the underlying +// file descriptor number may change. This is because the BPF UAPI assumes that +// zero is not a valid fd value. +func NewFD(value int) (*FD, error) { + if value < 0 { + return nil, fmt.Errorf("invalid fd %d", value) + } + + fd := newFD(value) + if value != 0 { + return fd, nil + } + + dup, err := fd.Dup() + _ = fd.Close() + return dup, err +} + +func (fd *FD) String() string { + return strconv.FormatInt(int64(fd.raw), 10) +} + +func (fd *FD) Int() int { + return fd.raw +} + +func (fd *FD) Uint() uint32 { + if fd.raw < 0 || int64(fd.raw) > math.MaxUint32 { + // Best effort: this is the number most likely to be an invalid file + // descriptor. It is equal to -1 (on two's complement arches). + return math.MaxUint32 + } + return uint32(fd.raw) +} + +func (fd *FD) Close() error { + if fd.raw < 0 { + return nil + } + + return unix.Close(fd.Disown()) +} + +// Disown destroys the FD and returns its raw file descriptor without closing +// it. After this call, the underlying fd is no longer tied to the FD's +// lifecycle. +func (fd *FD) Disown() int { + value := fd.raw + fdtrace.ForgetFD(value) + fd.raw = -1 + + runtime.SetFinalizer(fd, nil) + return value +} + +func (fd *FD) Dup() (*FD, error) { + if fd.raw < 0 { + return nil, ErrClosedFd + } + + // Always require the fd to be larger than zero: the BPF API treats the value + // as "no argument provided". + dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 1) + if err != nil { + return nil, fmt.Errorf("can't dup fd: %v", err) + } + + return newFD(dup), nil +} + +// File takes ownership of FD and turns it into an [*os.File]. +// +// You must not use the FD after the call returns. +// +// Returns nil if the FD is not valid. +func (fd *FD) File(name string) *os.File { + if fd.raw < 0 { + return nil + } + + return os.NewFile(uintptr(fd.Disown()), name) +} + +// ObjGetTyped wraps [ObjGet] with a readlink call to extract the type of the +// underlying bpf object. +func ObjGetTyped(attr *ObjGetAttr) (*FD, ObjType, error) { + fd, err := ObjGet(attr) + if err != nil { + return nil, 0, err + } + + typ, err := readType(fd) + if err != nil { + _ = fd.Close() + return nil, 0, fmt.Errorf("reading fd type: %w", err) + } + + return fd, typ, nil +} + +// readType returns the bpf object type of the file descriptor by calling +// readlink(3). Returns an error if the file descriptor does not represent a bpf +// object. +func readType(fd *FD) (ObjType, error) { + s, err := os.Readlink(filepath.Join("/proc/self/fd/", fd.String())) + if err != nil { + return 0, fmt.Errorf("readlink fd %d: %w", fd.Int(), err) + } + + s = strings.TrimPrefix(s, "anon_inode:") + + switch s { + case "bpf-map": + return BPF_TYPE_MAP, nil + case "bpf-prog": + return BPF_TYPE_PROG, nil + case "bpf-link": + return BPF_TYPE_LINK, nil + } + + return 0, fmt.Errorf("unknown type %s of fd %d", s, fd.Int()) +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/pinning.go b/vendor/github.com/cilium/ebpf/internal/sys/pinning.go new file mode 100644 index 0000000000..9a4c6c7a15 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/pinning.go @@ -0,0 +1,65 @@ +package sys + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "runtime" + + "github.com/cilium/ebpf/internal/linux" + "github.com/cilium/ebpf/internal/unix" +) + +func Pin(currentPath, newPath string, fd *FD) error { + if newPath == "" { + return errors.New("given pinning path cannot be empty") + } + if currentPath == newPath { + return nil + } + + fsType, err := linux.FSType(filepath.Dir(newPath)) + if err != nil { + return err + } + if fsType != unix.BPF_FS_MAGIC { + return fmt.Errorf("%s is not on a bpf filesystem", newPath) + } + + defer runtime.KeepAlive(fd) + + if currentPath == "" { + return ObjPin(&ObjPinAttr{ + Pathname: NewStringPointer(newPath), + BpfFd: fd.Uint(), + }) + } + + // Renameat2 is used instead of os.Rename to disallow the new path replacing + // an existing path. + err = unix.Renameat2(unix.AT_FDCWD, currentPath, unix.AT_FDCWD, newPath, unix.RENAME_NOREPLACE) + if err == nil { + // Object is now moved to the new pinning path. + return nil + } + if !os.IsNotExist(err) { + return fmt.Errorf("unable to move pinned object to new path %v: %w", newPath, err) + } + // Internal state not in sync with the file system so let's fix it. + return ObjPin(&ObjPinAttr{ + Pathname: NewStringPointer(newPath), + BpfFd: fd.Uint(), + }) +} + +func Unpin(pinnedPath string) error { + if pinnedPath == "" { + return nil + } + err := os.Remove(pinnedPath) + if err == nil || os.IsNotExist(err) { + return nil + } + return err +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/ptr.go b/vendor/github.com/cilium/ebpf/internal/sys/ptr.go new file mode 100644 index 0000000000..af0c014e3b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/ptr.go @@ -0,0 +1,52 @@ +package sys + +import ( + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +// NewPointer creates a 64-bit pointer from an unsafe Pointer. +func NewPointer(ptr unsafe.Pointer) Pointer { + return Pointer{ptr: ptr} +} + +// NewSlicePointer creates a 64-bit pointer from a slice. +func NewSlicePointer[T comparable](buf []T) Pointer { + if len(buf) == 0 { + return Pointer{} + } + + return Pointer{ptr: unsafe.Pointer(unsafe.SliceData(buf))} +} + +// NewSlicePointerLen creates a 64-bit pointer from a byte slice. +// +// Useful to assign both the pointer and the length in one go. +func NewSlicePointerLen(buf []byte) (Pointer, uint32) { + return NewSlicePointer(buf), uint32(len(buf)) +} + +// NewStringPointer creates a 64-bit pointer from a string. +func NewStringPointer(str string) Pointer { + p, err := unix.BytePtrFromString(str) + if err != nil { + return Pointer{} + } + + return Pointer{ptr: unsafe.Pointer(p)} +} + +// NewStringSlicePointer allocates an array of Pointers to each string in the +// given slice of strings and returns a 64-bit pointer to the start of the +// resulting array. +// +// Use this function to pass arrays of strings as syscall arguments. +func NewStringSlicePointer(strings []string) Pointer { + sp := make([]Pointer, 0, len(strings)) + for _, s := range strings { + sp = append(sp, NewStringPointer(s)) + } + + return Pointer{ptr: unsafe.Pointer(&sp[0])} +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/ptr_32_be.go b/vendor/github.com/cilium/ebpf/internal/sys/ptr_32_be.go new file mode 100644 index 0000000000..6278c79c9e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/ptr_32_be.go @@ -0,0 +1,14 @@ +//go:build armbe || mips || mips64p32 + +package sys + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type Pointer struct { + pad uint32 + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/ptr_32_le.go b/vendor/github.com/cilium/ebpf/internal/sys/ptr_32_le.go new file mode 100644 index 0000000000..c27b537e8e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/ptr_32_le.go @@ -0,0 +1,14 @@ +//go:build 386 || amd64p32 || arm || mipsle || mips64p32le + +package sys + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type Pointer struct { + ptr unsafe.Pointer + pad uint32 +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/ptr_64.go b/vendor/github.com/cilium/ebpf/internal/sys/ptr_64.go new file mode 100644 index 0000000000..2d7828230a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/ptr_64.go @@ -0,0 +1,13 @@ +//go:build !386 && !amd64p32 && !arm && !mipsle && !mips64p32le && !armbe && !mips && !mips64p32 + +package sys + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type Pointer struct { + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/signals.go b/vendor/github.com/cilium/ebpf/internal/sys/signals.go new file mode 100644 index 0000000000..e5337191d6 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/signals.go @@ -0,0 +1,83 @@ +package sys + +import ( + "fmt" + "runtime" + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +// A sigset containing only SIGPROF. +var profSet unix.Sigset_t + +func init() { + // See sigsetAdd for details on the implementation. Open coded here so + // that the compiler will check the constant calculations for us. + profSet.Val[sigprofBit/wordBits] |= 1 << (sigprofBit % wordBits) +} + +// maskProfilerSignal locks the calling goroutine to its underlying OS thread +// and adds SIGPROF to the thread's signal mask. This prevents pprof from +// interrupting expensive syscalls like e.g. BPF_PROG_LOAD. +// +// The caller must defer unmaskProfilerSignal() to reverse the operation. +func maskProfilerSignal() { + runtime.LockOSThread() + + if err := unix.PthreadSigmask(unix.SIG_BLOCK, &profSet, nil); err != nil { + runtime.UnlockOSThread() + panic(fmt.Errorf("masking profiler signal: %w", err)) + } +} + +// unmaskProfilerSignal removes SIGPROF from the underlying thread's signal +// mask, allowing it to be interrupted for profiling once again. +// +// It also unlocks the current goroutine from its underlying OS thread. +func unmaskProfilerSignal() { + defer runtime.UnlockOSThread() + + if err := unix.PthreadSigmask(unix.SIG_UNBLOCK, &profSet, nil); err != nil { + panic(fmt.Errorf("unmasking profiler signal: %w", err)) + } +} + +const ( + // Signal is the nth bit in the bitfield. + sigprofBit = int(unix.SIGPROF - 1) + // The number of bits in one Sigset_t word. + wordBits = int(unsafe.Sizeof(unix.Sigset_t{}.Val[0])) * 8 +) + +// sigsetAdd adds signal to set. +// +// Note: Sigset_t.Val's value type is uint32 or uint64 depending on the arch. +// This function must be able to deal with both and so must avoid any direct +// references to u32 or u64 types. +func sigsetAdd(set *unix.Sigset_t, signal unix.Signal) error { + if signal < 1 { + return fmt.Errorf("signal %d must be larger than 0", signal) + } + + // For amd64, runtime.sigaddset() performs the following operation: + // set[(signal-1)/32] |= 1 << ((uint32(signal) - 1) & 31) + // + // This trick depends on sigset being two u32's, causing a signal in the + // bottom 31 bits to be written to the low word if bit 32 is low, or the high + // word if bit 32 is high. + + // Signal is the nth bit in the bitfield. + bit := int(signal - 1) + // Word within the sigset the bit needs to be written to. + word := bit / wordBits + + if word >= len(set.Val) { + return fmt.Errorf("signal %d does not fit within unix.Sigset_t", signal) + } + + // Write the signal bit into its corresponding word at the corrected offset. + set.Val[word] |= 1 << (bit % wordBits) + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/syscall.go b/vendor/github.com/cilium/ebpf/internal/sys/syscall.go new file mode 100644 index 0000000000..4fdb74c57a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/syscall.go @@ -0,0 +1,215 @@ +package sys + +import ( + "runtime" + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +// ENOTSUPP is a Linux internal error code that has leaked into UAPI. +// +// It is not the same as ENOTSUP or EOPNOTSUPP. +const ENOTSUPP = unix.Errno(524) + +// BPF wraps SYS_BPF. +// +// Any pointers contained in attr must use the Pointer type from this package. +func BPF(cmd Cmd, attr unsafe.Pointer, size uintptr) (uintptr, error) { + // Prevent the Go profiler from repeatedly interrupting the verifier, + // which could otherwise lead to a livelock due to receiving EAGAIN. + if cmd == BPF_PROG_LOAD || cmd == BPF_PROG_RUN { + maskProfilerSignal() + defer unmaskProfilerSignal() + } + + for { + r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size) + runtime.KeepAlive(attr) + + // As of ~4.20 the verifier can be interrupted by a signal, + // and returns EAGAIN in that case. + if errNo == unix.EAGAIN && cmd == BPF_PROG_LOAD { + continue + } + + var err error + if errNo != 0 { + err = wrappedErrno{errNo} + } + + return r1, err + } +} + +// Info is implemented by all structs that can be passed to the ObjInfo syscall. +// +// MapInfo +// ProgInfo +// LinkInfo +// BtfInfo +type Info interface { + info() (unsafe.Pointer, uint32) +} + +var _ Info = (*MapInfo)(nil) + +func (i *MapInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +var _ Info = (*ProgInfo)(nil) + +func (i *ProgInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +var _ Info = (*LinkInfo)(nil) + +func (i *LinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *TracingLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *CgroupLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *NetNsLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *XDPLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *TcxLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *NetfilterLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *NetkitLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *KprobeMultiLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *KprobeLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +var _ Info = (*BtfInfo)(nil) + +func (i *BtfInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +func (i *PerfEventLinkInfo) info() (unsafe.Pointer, uint32) { + return unsafe.Pointer(i), uint32(unsafe.Sizeof(*i)) +} + +// ObjInfo retrieves information about a BPF Fd. +// +// info may be one of MapInfo, ProgInfo, LinkInfo and BtfInfo. +func ObjInfo(fd *FD, info Info) error { + ptr, len := info.info() + err := ObjGetInfoByFd(&ObjGetInfoByFdAttr{ + BpfFd: fd.Uint(), + InfoLen: len, + Info: NewPointer(ptr), + }) + runtime.KeepAlive(fd) + return err +} + +// BPFObjName is a null-terminated string made up of +// 'A-Za-z0-9_' characters. +type ObjName [BPF_OBJ_NAME_LEN]byte + +// NewObjName truncates the result if it is too long. +func NewObjName(name string) ObjName { + var result ObjName + copy(result[:BPF_OBJ_NAME_LEN-1], name) + return result +} + +// LogLevel controls the verbosity of the kernel's eBPF program verifier. +type LogLevel uint32 + +const ( + BPF_LOG_LEVEL1 LogLevel = 1 << iota + BPF_LOG_LEVEL2 + BPF_LOG_STATS +) + +// LinkID uniquely identifies a bpf_link. +type LinkID uint32 + +// BTFID uniquely identifies a BTF blob loaded into the kernel. +type BTFID uint32 + +// TypeID identifies a type in a BTF blob. +type TypeID uint32 + +// Flags used by bpf_mprog. +const ( + BPF_F_REPLACE = 1 << (iota + 2) + BPF_F_BEFORE + BPF_F_AFTER + BPF_F_ID + BPF_F_LINK_MPROG = 1 << 13 // aka BPF_F_LINK +) + +// Flags used by BPF_PROG_LOAD. +const ( + BPF_F_SLEEPABLE = 1 << 4 + BPF_F_XDP_HAS_FRAGS = 1 << 5 + BPF_F_XDP_DEV_BOUND_ONLY = 1 << 6 +) + +const BPF_TAG_SIZE = 8 +const BPF_OBJ_NAME_LEN = 16 + +// wrappedErrno wraps [unix.Errno] to prevent direct comparisons with +// syscall.E* or unix.E* constants. +// +// You should never export an error of this type. +type wrappedErrno struct { + unix.Errno +} + +func (we wrappedErrno) Unwrap() error { + return we.Errno +} + +func (we wrappedErrno) Error() string { + if we.Errno == ENOTSUPP { + return "operation not supported" + } + return we.Errno.Error() +} + +type syscallError struct { + error + errno unix.Errno +} + +func Error(err error, errno unix.Errno) error { + return &syscallError{err, errno} +} + +func (se *syscallError) Is(target error) bool { + return target == se.error +} + +func (se *syscallError) Unwrap() error { + return se.errno +} diff --git a/vendor/github.com/cilium/ebpf/internal/sys/types.go b/vendor/github.com/cilium/ebpf/internal/sys/types.go new file mode 100644 index 0000000000..82f3492a83 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sys/types.go @@ -0,0 +1,1577 @@ +// Code generated by internal/cmd/gentypes; DO NOT EDIT. + +package sys + +import ( + "unsafe" +) + +const ( + BPF_ADJ_ROOM_ENCAP_L2_MASK = 255 + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56 + BPF_ANY = 0 + BPF_CSUM_LEVEL_DEC = 2 + BPF_CSUM_LEVEL_INC = 1 + BPF_CSUM_LEVEL_QUERY = 0 + BPF_CSUM_LEVEL_RESET = 3 + BPF_EXIST = 2 + BPF_FIB_LKUP_RET_BLACKHOLE = 1 + BPF_FIB_LKUP_RET_FRAG_NEEDED = 8 + BPF_FIB_LKUP_RET_FWD_DISABLED = 5 + BPF_FIB_LKUP_RET_NOT_FWDED = 4 + BPF_FIB_LKUP_RET_NO_NEIGH = 7 + BPF_FIB_LKUP_RET_NO_SRC_ADDR = 9 + BPF_FIB_LKUP_RET_PROHIBIT = 3 + BPF_FIB_LKUP_RET_SUCCESS = 0 + BPF_FIB_LKUP_RET_UNREACHABLE = 2 + BPF_FIB_LKUP_RET_UNSUPP_LWT = 6 + BPF_FIB_LOOKUP_DIRECT = 1 + BPF_FIB_LOOKUP_MARK = 32 + BPF_FIB_LOOKUP_OUTPUT = 2 + BPF_FIB_LOOKUP_SKIP_NEIGH = 4 + BPF_FIB_LOOKUP_SRC = 16 + BPF_FIB_LOOKUP_TBID = 8 + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = 1 + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = 4 + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = 2 + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = 128 + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = 256 + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = 64 + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = 2 + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = 4 + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = 8 + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = 16 + BPF_F_ADJ_ROOM_FIXED_GSO = 1 + BPF_F_ADJ_ROOM_NO_CSUM_RESET = 32 + BPF_F_BPRM_SECUREEXEC = 1 + BPF_F_BROADCAST = 8 + BPF_F_CLONE = 512 + BPF_F_CTXLEN_MASK = 4503595332403200 + BPF_F_CURRENT_CPU = 4294967295 + BPF_F_CURRENT_NETNS = 18446744073709551615 + BPF_F_DONT_FRAGMENT = 4 + BPF_F_EXCLUDE_INGRESS = 16 + BPF_F_FAST_STACK_CMP = 512 + BPF_F_GET_BRANCH_RECORDS_SIZE = 1 + BPF_F_HDR_FIELD_MASK = 15 + BPF_F_INDEX_MASK = 4294967295 + BPF_F_INGRESS = 1 + BPF_F_INNER_MAP = 4096 + BPF_F_INVALIDATE_HASH = 2 + BPF_F_KPROBE_MULTI_RETURN = 1 + BPF_F_LINK = 8192 + BPF_F_LOCK = 4 + BPF_F_MARK_ENFORCE = 64 + BPF_F_MARK_MANGLED_0 = 32 + BPF_F_MMAPABLE = 1024 + BPF_F_NEIGH = 2 + BPF_F_NEXTHOP = 8 + BPF_F_NO_COMMON_LRU = 2 + BPF_F_NO_PREALLOC = 1 + BPF_F_NO_TUNNEL_KEY = 16 + BPF_F_NO_USER_CONV = 262144 + BPF_F_NUMA_NODE = 4 + BPF_F_PATH_FD = 16384 + BPF_F_PEER = 4 + BPF_F_PRESERVE_ELEMS = 2048 + BPF_F_PSEUDO_HDR = 16 + BPF_F_RDONLY = 8 + BPF_F_RDONLY_PROG = 128 + BPF_F_RECOMPUTE_CSUM = 1 + BPF_F_REUSE_STACKID = 1024 + BPF_F_SEGV_ON_FAULT = 131072 + BPF_F_SEQ_NUMBER = 8 + BPF_F_SKIP_FIELD_MASK = 255 + BPF_F_STACK_BUILD_ID = 32 + BPF_F_SYSCTL_BASE_NAME = 1 + BPF_F_TIMER_ABS = 1 + BPF_F_TIMER_CPU_PIN = 2 + BPF_F_TOKEN_FD = 65536 + BPF_F_TUNINFO_FLAGS = 16 + BPF_F_TUNINFO_IPV6 = 1 + BPF_F_UPROBE_MULTI_RETURN = 1 + BPF_F_USER_BUILD_ID = 2048 + BPF_F_USER_STACK = 256 + BPF_F_VTYPE_BTF_OBJ_FD = 32768 + BPF_F_WRONLY = 16 + BPF_F_WRONLY_PROG = 256 + BPF_F_ZERO_CSUM_TX = 2 + BPF_F_ZERO_SEED = 64 + BPF_LOAD_HDR_OPT_TCP_SYN = 1 + BPF_LOCAL_STORAGE_GET_F_CREATE = 1 + BPF_MAX_LOOPS = 8388608 + BPF_MAX_TRAMP_LINKS = 38 + BPF_NOEXIST = 1 + BPF_RB_AVAIL_DATA = 0 + BPF_RB_CONS_POS = 2 + BPF_RB_FORCE_WAKEUP = 2 + BPF_RB_NO_WAKEUP = 1 + BPF_RB_PROD_POS = 3 + BPF_RB_RING_SIZE = 1 + BPF_REG_0 = 0 + BPF_REG_1 = 1 + BPF_REG_10 = 10 + BPF_REG_2 = 2 + BPF_REG_3 = 3 + BPF_REG_4 = 4 + BPF_REG_5 = 5 + BPF_REG_6 = 6 + BPF_REG_7 = 7 + BPF_REG_8 = 8 + BPF_REG_9 = 9 + BPF_RINGBUF_BUSY_BIT = 2147483648 + BPF_RINGBUF_DISCARD_BIT = 1073741824 + BPF_RINGBUF_HDR_SZ = 8 + BPF_SKB_CLOCK_MONOTONIC = 1 + BPF_SKB_CLOCK_REALTIME = 0 + BPF_SKB_CLOCK_TAI = 2 + BPF_SKB_TSTAMP_DELIVERY_MONO = 1 + BPF_SKB_TSTAMP_UNSPEC = 0 + BPF_SK_LOOKUP_F_NO_REUSEPORT = 2 + BPF_SK_LOOKUP_F_REPLACE = 1 + BPF_SK_STORAGE_GET_F_CREATE = 1 + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB = 4 + BPF_SOCK_OPS_ALL_CB_FLAGS = 127 + BPF_SOCK_OPS_BASE_RTT = 7 + BPF_SOCK_OPS_HDR_OPT_LEN_CB = 14 + BPF_SOCK_OPS_NEEDS_ECN = 6 + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = 16 + BPF_SOCK_OPS_PARSE_HDR_OPT_CB = 13 + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = 32 + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB = 5 + BPF_SOCK_OPS_RETRANS_CB = 9 + BPF_SOCK_OPS_RETRANS_CB_FLAG = 2 + BPF_SOCK_OPS_RTO_CB = 8 + BPF_SOCK_OPS_RTO_CB_FLAG = 1 + BPF_SOCK_OPS_RTT_CB = 12 + BPF_SOCK_OPS_RTT_CB_FLAG = 8 + BPF_SOCK_OPS_RWND_INIT = 2 + BPF_SOCK_OPS_STATE_CB = 10 + BPF_SOCK_OPS_STATE_CB_FLAG = 4 + BPF_SOCK_OPS_TCP_CONNECT_CB = 3 + BPF_SOCK_OPS_TCP_LISTEN_CB = 11 + BPF_SOCK_OPS_TIMEOUT_INIT = 1 + BPF_SOCK_OPS_VOID = 0 + BPF_SOCK_OPS_WRITE_HDR_OPT_CB = 15 + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = 64 + BPF_TASK_ITER_ALL_PROCS = 0 + BPF_TASK_ITER_ALL_THREADS = 1 + BPF_TASK_ITER_PROC_THREADS = 2 + BPF_TCP_BOUND_INACTIVE = 13 + BPF_TCP_CLOSE = 7 + BPF_TCP_CLOSE_WAIT = 8 + BPF_TCP_CLOSING = 11 + BPF_TCP_ESTABLISHED = 1 + BPF_TCP_FIN_WAIT1 = 4 + BPF_TCP_FIN_WAIT2 = 5 + BPF_TCP_LAST_ACK = 9 + BPF_TCP_LISTEN = 10 + BPF_TCP_MAX_STATES = 14 + BPF_TCP_NEW_SYN_RECV = 12 + BPF_TCP_SYN_RECV = 3 + BPF_TCP_SYN_SENT = 2 + BPF_TCP_TIME_WAIT = 6 + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1 + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2 + BPF_XFRM_STATE_OPTS_SZ = 36 +) + +type AdjRoomMode uint32 + +const ( + BPF_ADJ_ROOM_NET AdjRoomMode = 0 + BPF_ADJ_ROOM_MAC AdjRoomMode = 1 +) + +type AttachType uint32 + +const ( + BPF_CGROUP_INET_INGRESS AttachType = 0 + BPF_CGROUP_INET_EGRESS AttachType = 1 + BPF_CGROUP_INET_SOCK_CREATE AttachType = 2 + BPF_CGROUP_SOCK_OPS AttachType = 3 + BPF_SK_SKB_STREAM_PARSER AttachType = 4 + BPF_SK_SKB_STREAM_VERDICT AttachType = 5 + BPF_CGROUP_DEVICE AttachType = 6 + BPF_SK_MSG_VERDICT AttachType = 7 + BPF_CGROUP_INET4_BIND AttachType = 8 + BPF_CGROUP_INET6_BIND AttachType = 9 + BPF_CGROUP_INET4_CONNECT AttachType = 10 + BPF_CGROUP_INET6_CONNECT AttachType = 11 + BPF_CGROUP_INET4_POST_BIND AttachType = 12 + BPF_CGROUP_INET6_POST_BIND AttachType = 13 + BPF_CGROUP_UDP4_SENDMSG AttachType = 14 + BPF_CGROUP_UDP6_SENDMSG AttachType = 15 + BPF_LIRC_MODE2 AttachType = 16 + BPF_FLOW_DISSECTOR AttachType = 17 + BPF_CGROUP_SYSCTL AttachType = 18 + BPF_CGROUP_UDP4_RECVMSG AttachType = 19 + BPF_CGROUP_UDP6_RECVMSG AttachType = 20 + BPF_CGROUP_GETSOCKOPT AttachType = 21 + BPF_CGROUP_SETSOCKOPT AttachType = 22 + BPF_TRACE_RAW_TP AttachType = 23 + BPF_TRACE_FENTRY AttachType = 24 + BPF_TRACE_FEXIT AttachType = 25 + BPF_MODIFY_RETURN AttachType = 26 + BPF_LSM_MAC AttachType = 27 + BPF_TRACE_ITER AttachType = 28 + BPF_CGROUP_INET4_GETPEERNAME AttachType = 29 + BPF_CGROUP_INET6_GETPEERNAME AttachType = 30 + BPF_CGROUP_INET4_GETSOCKNAME AttachType = 31 + BPF_CGROUP_INET6_GETSOCKNAME AttachType = 32 + BPF_XDP_DEVMAP AttachType = 33 + BPF_CGROUP_INET_SOCK_RELEASE AttachType = 34 + BPF_XDP_CPUMAP AttachType = 35 + BPF_SK_LOOKUP AttachType = 36 + BPF_XDP AttachType = 37 + BPF_SK_SKB_VERDICT AttachType = 38 + BPF_SK_REUSEPORT_SELECT AttachType = 39 + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE AttachType = 40 + BPF_PERF_EVENT AttachType = 41 + BPF_TRACE_KPROBE_MULTI AttachType = 42 + BPF_LSM_CGROUP AttachType = 43 + BPF_STRUCT_OPS AttachType = 44 + BPF_NETFILTER AttachType = 45 + BPF_TCX_INGRESS AttachType = 46 + BPF_TCX_EGRESS AttachType = 47 + BPF_TRACE_UPROBE_MULTI AttachType = 48 + BPF_CGROUP_UNIX_CONNECT AttachType = 49 + BPF_CGROUP_UNIX_SENDMSG AttachType = 50 + BPF_CGROUP_UNIX_RECVMSG AttachType = 51 + BPF_CGROUP_UNIX_GETPEERNAME AttachType = 52 + BPF_CGROUP_UNIX_GETSOCKNAME AttachType = 53 + BPF_NETKIT_PRIMARY AttachType = 54 + BPF_NETKIT_PEER AttachType = 55 + BPF_TRACE_KPROBE_SESSION AttachType = 56 + __MAX_BPF_ATTACH_TYPE AttachType = 57 +) + +type Cmd uint32 + +const ( + BPF_MAP_CREATE Cmd = 0 + BPF_MAP_LOOKUP_ELEM Cmd = 1 + BPF_MAP_UPDATE_ELEM Cmd = 2 + BPF_MAP_DELETE_ELEM Cmd = 3 + BPF_MAP_GET_NEXT_KEY Cmd = 4 + BPF_PROG_LOAD Cmd = 5 + BPF_OBJ_PIN Cmd = 6 + BPF_OBJ_GET Cmd = 7 + BPF_PROG_ATTACH Cmd = 8 + BPF_PROG_DETACH Cmd = 9 + BPF_PROG_TEST_RUN Cmd = 10 + BPF_PROG_RUN Cmd = 10 + BPF_PROG_GET_NEXT_ID Cmd = 11 + BPF_MAP_GET_NEXT_ID Cmd = 12 + BPF_PROG_GET_FD_BY_ID Cmd = 13 + BPF_MAP_GET_FD_BY_ID Cmd = 14 + BPF_OBJ_GET_INFO_BY_FD Cmd = 15 + BPF_PROG_QUERY Cmd = 16 + BPF_RAW_TRACEPOINT_OPEN Cmd = 17 + BPF_BTF_LOAD Cmd = 18 + BPF_BTF_GET_FD_BY_ID Cmd = 19 + BPF_TASK_FD_QUERY Cmd = 20 + BPF_MAP_LOOKUP_AND_DELETE_ELEM Cmd = 21 + BPF_MAP_FREEZE Cmd = 22 + BPF_BTF_GET_NEXT_ID Cmd = 23 + BPF_MAP_LOOKUP_BATCH Cmd = 24 + BPF_MAP_LOOKUP_AND_DELETE_BATCH Cmd = 25 + BPF_MAP_UPDATE_BATCH Cmd = 26 + BPF_MAP_DELETE_BATCH Cmd = 27 + BPF_LINK_CREATE Cmd = 28 + BPF_LINK_UPDATE Cmd = 29 + BPF_LINK_GET_FD_BY_ID Cmd = 30 + BPF_LINK_GET_NEXT_ID Cmd = 31 + BPF_ENABLE_STATS Cmd = 32 + BPF_ITER_CREATE Cmd = 33 + BPF_LINK_DETACH Cmd = 34 + BPF_PROG_BIND_MAP Cmd = 35 + BPF_TOKEN_CREATE Cmd = 36 + __MAX_BPF_CMD Cmd = 37 +) + +type FunctionId uint32 + +const ( + BPF_FUNC_unspec FunctionId = 0 + BPF_FUNC_map_lookup_elem FunctionId = 1 + BPF_FUNC_map_update_elem FunctionId = 2 + BPF_FUNC_map_delete_elem FunctionId = 3 + BPF_FUNC_probe_read FunctionId = 4 + BPF_FUNC_ktime_get_ns FunctionId = 5 + BPF_FUNC_trace_printk FunctionId = 6 + BPF_FUNC_get_prandom_u32 FunctionId = 7 + BPF_FUNC_get_smp_processor_id FunctionId = 8 + BPF_FUNC_skb_store_bytes FunctionId = 9 + BPF_FUNC_l3_csum_replace FunctionId = 10 + BPF_FUNC_l4_csum_replace FunctionId = 11 + BPF_FUNC_tail_call FunctionId = 12 + BPF_FUNC_clone_redirect FunctionId = 13 + BPF_FUNC_get_current_pid_tgid FunctionId = 14 + BPF_FUNC_get_current_uid_gid FunctionId = 15 + BPF_FUNC_get_current_comm FunctionId = 16 + BPF_FUNC_get_cgroup_classid FunctionId = 17 + BPF_FUNC_skb_vlan_push FunctionId = 18 + BPF_FUNC_skb_vlan_pop FunctionId = 19 + BPF_FUNC_skb_get_tunnel_key FunctionId = 20 + BPF_FUNC_skb_set_tunnel_key FunctionId = 21 + BPF_FUNC_perf_event_read FunctionId = 22 + BPF_FUNC_redirect FunctionId = 23 + BPF_FUNC_get_route_realm FunctionId = 24 + BPF_FUNC_perf_event_output FunctionId = 25 + BPF_FUNC_skb_load_bytes FunctionId = 26 + BPF_FUNC_get_stackid FunctionId = 27 + BPF_FUNC_csum_diff FunctionId = 28 + BPF_FUNC_skb_get_tunnel_opt FunctionId = 29 + BPF_FUNC_skb_set_tunnel_opt FunctionId = 30 + BPF_FUNC_skb_change_proto FunctionId = 31 + BPF_FUNC_skb_change_type FunctionId = 32 + BPF_FUNC_skb_under_cgroup FunctionId = 33 + BPF_FUNC_get_hash_recalc FunctionId = 34 + BPF_FUNC_get_current_task FunctionId = 35 + BPF_FUNC_probe_write_user FunctionId = 36 + BPF_FUNC_current_task_under_cgroup FunctionId = 37 + BPF_FUNC_skb_change_tail FunctionId = 38 + BPF_FUNC_skb_pull_data FunctionId = 39 + BPF_FUNC_csum_update FunctionId = 40 + BPF_FUNC_set_hash_invalid FunctionId = 41 + BPF_FUNC_get_numa_node_id FunctionId = 42 + BPF_FUNC_skb_change_head FunctionId = 43 + BPF_FUNC_xdp_adjust_head FunctionId = 44 + BPF_FUNC_probe_read_str FunctionId = 45 + BPF_FUNC_get_socket_cookie FunctionId = 46 + BPF_FUNC_get_socket_uid FunctionId = 47 + BPF_FUNC_set_hash FunctionId = 48 + BPF_FUNC_setsockopt FunctionId = 49 + BPF_FUNC_skb_adjust_room FunctionId = 50 + BPF_FUNC_redirect_map FunctionId = 51 + BPF_FUNC_sk_redirect_map FunctionId = 52 + BPF_FUNC_sock_map_update FunctionId = 53 + BPF_FUNC_xdp_adjust_meta FunctionId = 54 + BPF_FUNC_perf_event_read_value FunctionId = 55 + BPF_FUNC_perf_prog_read_value FunctionId = 56 + BPF_FUNC_getsockopt FunctionId = 57 + BPF_FUNC_override_return FunctionId = 58 + BPF_FUNC_sock_ops_cb_flags_set FunctionId = 59 + BPF_FUNC_msg_redirect_map FunctionId = 60 + BPF_FUNC_msg_apply_bytes FunctionId = 61 + BPF_FUNC_msg_cork_bytes FunctionId = 62 + BPF_FUNC_msg_pull_data FunctionId = 63 + BPF_FUNC_bind FunctionId = 64 + BPF_FUNC_xdp_adjust_tail FunctionId = 65 + BPF_FUNC_skb_get_xfrm_state FunctionId = 66 + BPF_FUNC_get_stack FunctionId = 67 + BPF_FUNC_skb_load_bytes_relative FunctionId = 68 + BPF_FUNC_fib_lookup FunctionId = 69 + BPF_FUNC_sock_hash_update FunctionId = 70 + BPF_FUNC_msg_redirect_hash FunctionId = 71 + BPF_FUNC_sk_redirect_hash FunctionId = 72 + BPF_FUNC_lwt_push_encap FunctionId = 73 + BPF_FUNC_lwt_seg6_store_bytes FunctionId = 74 + BPF_FUNC_lwt_seg6_adjust_srh FunctionId = 75 + BPF_FUNC_lwt_seg6_action FunctionId = 76 + BPF_FUNC_rc_repeat FunctionId = 77 + BPF_FUNC_rc_keydown FunctionId = 78 + BPF_FUNC_skb_cgroup_id FunctionId = 79 + BPF_FUNC_get_current_cgroup_id FunctionId = 80 + BPF_FUNC_get_local_storage FunctionId = 81 + BPF_FUNC_sk_select_reuseport FunctionId = 82 + BPF_FUNC_skb_ancestor_cgroup_id FunctionId = 83 + BPF_FUNC_sk_lookup_tcp FunctionId = 84 + BPF_FUNC_sk_lookup_udp FunctionId = 85 + BPF_FUNC_sk_release FunctionId = 86 + BPF_FUNC_map_push_elem FunctionId = 87 + BPF_FUNC_map_pop_elem FunctionId = 88 + BPF_FUNC_map_peek_elem FunctionId = 89 + BPF_FUNC_msg_push_data FunctionId = 90 + BPF_FUNC_msg_pop_data FunctionId = 91 + BPF_FUNC_rc_pointer_rel FunctionId = 92 + BPF_FUNC_spin_lock FunctionId = 93 + BPF_FUNC_spin_unlock FunctionId = 94 + BPF_FUNC_sk_fullsock FunctionId = 95 + BPF_FUNC_tcp_sock FunctionId = 96 + BPF_FUNC_skb_ecn_set_ce FunctionId = 97 + BPF_FUNC_get_listener_sock FunctionId = 98 + BPF_FUNC_skc_lookup_tcp FunctionId = 99 + BPF_FUNC_tcp_check_syncookie FunctionId = 100 + BPF_FUNC_sysctl_get_name FunctionId = 101 + BPF_FUNC_sysctl_get_current_value FunctionId = 102 + BPF_FUNC_sysctl_get_new_value FunctionId = 103 + BPF_FUNC_sysctl_set_new_value FunctionId = 104 + BPF_FUNC_strtol FunctionId = 105 + BPF_FUNC_strtoul FunctionId = 106 + BPF_FUNC_sk_storage_get FunctionId = 107 + BPF_FUNC_sk_storage_delete FunctionId = 108 + BPF_FUNC_send_signal FunctionId = 109 + BPF_FUNC_tcp_gen_syncookie FunctionId = 110 + BPF_FUNC_skb_output FunctionId = 111 + BPF_FUNC_probe_read_user FunctionId = 112 + BPF_FUNC_probe_read_kernel FunctionId = 113 + BPF_FUNC_probe_read_user_str FunctionId = 114 + BPF_FUNC_probe_read_kernel_str FunctionId = 115 + BPF_FUNC_tcp_send_ack FunctionId = 116 + BPF_FUNC_send_signal_thread FunctionId = 117 + BPF_FUNC_jiffies64 FunctionId = 118 + BPF_FUNC_read_branch_records FunctionId = 119 + BPF_FUNC_get_ns_current_pid_tgid FunctionId = 120 + BPF_FUNC_xdp_output FunctionId = 121 + BPF_FUNC_get_netns_cookie FunctionId = 122 + BPF_FUNC_get_current_ancestor_cgroup_id FunctionId = 123 + BPF_FUNC_sk_assign FunctionId = 124 + BPF_FUNC_ktime_get_boot_ns FunctionId = 125 + BPF_FUNC_seq_printf FunctionId = 126 + BPF_FUNC_seq_write FunctionId = 127 + BPF_FUNC_sk_cgroup_id FunctionId = 128 + BPF_FUNC_sk_ancestor_cgroup_id FunctionId = 129 + BPF_FUNC_ringbuf_output FunctionId = 130 + BPF_FUNC_ringbuf_reserve FunctionId = 131 + BPF_FUNC_ringbuf_submit FunctionId = 132 + BPF_FUNC_ringbuf_discard FunctionId = 133 + BPF_FUNC_ringbuf_query FunctionId = 134 + BPF_FUNC_csum_level FunctionId = 135 + BPF_FUNC_skc_to_tcp6_sock FunctionId = 136 + BPF_FUNC_skc_to_tcp_sock FunctionId = 137 + BPF_FUNC_skc_to_tcp_timewait_sock FunctionId = 138 + BPF_FUNC_skc_to_tcp_request_sock FunctionId = 139 + BPF_FUNC_skc_to_udp6_sock FunctionId = 140 + BPF_FUNC_get_task_stack FunctionId = 141 + BPF_FUNC_load_hdr_opt FunctionId = 142 + BPF_FUNC_store_hdr_opt FunctionId = 143 + BPF_FUNC_reserve_hdr_opt FunctionId = 144 + BPF_FUNC_inode_storage_get FunctionId = 145 + BPF_FUNC_inode_storage_delete FunctionId = 146 + BPF_FUNC_d_path FunctionId = 147 + BPF_FUNC_copy_from_user FunctionId = 148 + BPF_FUNC_snprintf_btf FunctionId = 149 + BPF_FUNC_seq_printf_btf FunctionId = 150 + BPF_FUNC_skb_cgroup_classid FunctionId = 151 + BPF_FUNC_redirect_neigh FunctionId = 152 + BPF_FUNC_per_cpu_ptr FunctionId = 153 + BPF_FUNC_this_cpu_ptr FunctionId = 154 + BPF_FUNC_redirect_peer FunctionId = 155 + BPF_FUNC_task_storage_get FunctionId = 156 + BPF_FUNC_task_storage_delete FunctionId = 157 + BPF_FUNC_get_current_task_btf FunctionId = 158 + BPF_FUNC_bprm_opts_set FunctionId = 159 + BPF_FUNC_ktime_get_coarse_ns FunctionId = 160 + BPF_FUNC_ima_inode_hash FunctionId = 161 + BPF_FUNC_sock_from_file FunctionId = 162 + BPF_FUNC_check_mtu FunctionId = 163 + BPF_FUNC_for_each_map_elem FunctionId = 164 + BPF_FUNC_snprintf FunctionId = 165 + BPF_FUNC_sys_bpf FunctionId = 166 + BPF_FUNC_btf_find_by_name_kind FunctionId = 167 + BPF_FUNC_sys_close FunctionId = 168 + BPF_FUNC_timer_init FunctionId = 169 + BPF_FUNC_timer_set_callback FunctionId = 170 + BPF_FUNC_timer_start FunctionId = 171 + BPF_FUNC_timer_cancel FunctionId = 172 + BPF_FUNC_get_func_ip FunctionId = 173 + BPF_FUNC_get_attach_cookie FunctionId = 174 + BPF_FUNC_task_pt_regs FunctionId = 175 + BPF_FUNC_get_branch_snapshot FunctionId = 176 + BPF_FUNC_trace_vprintk FunctionId = 177 + BPF_FUNC_skc_to_unix_sock FunctionId = 178 + BPF_FUNC_kallsyms_lookup_name FunctionId = 179 + BPF_FUNC_find_vma FunctionId = 180 + BPF_FUNC_loop FunctionId = 181 + BPF_FUNC_strncmp FunctionId = 182 + BPF_FUNC_get_func_arg FunctionId = 183 + BPF_FUNC_get_func_ret FunctionId = 184 + BPF_FUNC_get_func_arg_cnt FunctionId = 185 + BPF_FUNC_get_retval FunctionId = 186 + BPF_FUNC_set_retval FunctionId = 187 + BPF_FUNC_xdp_get_buff_len FunctionId = 188 + BPF_FUNC_xdp_load_bytes FunctionId = 189 + BPF_FUNC_xdp_store_bytes FunctionId = 190 + BPF_FUNC_copy_from_user_task FunctionId = 191 + BPF_FUNC_skb_set_tstamp FunctionId = 192 + BPF_FUNC_ima_file_hash FunctionId = 193 + BPF_FUNC_kptr_xchg FunctionId = 194 + BPF_FUNC_map_lookup_percpu_elem FunctionId = 195 + BPF_FUNC_skc_to_mptcp_sock FunctionId = 196 + BPF_FUNC_dynptr_from_mem FunctionId = 197 + BPF_FUNC_ringbuf_reserve_dynptr FunctionId = 198 + BPF_FUNC_ringbuf_submit_dynptr FunctionId = 199 + BPF_FUNC_ringbuf_discard_dynptr FunctionId = 200 + BPF_FUNC_dynptr_read FunctionId = 201 + BPF_FUNC_dynptr_write FunctionId = 202 + BPF_FUNC_dynptr_data FunctionId = 203 + BPF_FUNC_tcp_raw_gen_syncookie_ipv4 FunctionId = 204 + BPF_FUNC_tcp_raw_gen_syncookie_ipv6 FunctionId = 205 + BPF_FUNC_tcp_raw_check_syncookie_ipv4 FunctionId = 206 + BPF_FUNC_tcp_raw_check_syncookie_ipv6 FunctionId = 207 + BPF_FUNC_ktime_get_tai_ns FunctionId = 208 + BPF_FUNC_user_ringbuf_drain FunctionId = 209 + BPF_FUNC_cgrp_storage_get FunctionId = 210 + BPF_FUNC_cgrp_storage_delete FunctionId = 211 + __BPF_FUNC_MAX_ID FunctionId = 212 +) + +type HdrStartOff uint32 + +const ( + BPF_HDR_START_MAC HdrStartOff = 0 + BPF_HDR_START_NET HdrStartOff = 1 +) + +type LinkType uint32 + +const ( + BPF_LINK_TYPE_UNSPEC LinkType = 0 + BPF_LINK_TYPE_RAW_TRACEPOINT LinkType = 1 + BPF_LINK_TYPE_TRACING LinkType = 2 + BPF_LINK_TYPE_CGROUP LinkType = 3 + BPF_LINK_TYPE_ITER LinkType = 4 + BPF_LINK_TYPE_NETNS LinkType = 5 + BPF_LINK_TYPE_XDP LinkType = 6 + BPF_LINK_TYPE_PERF_EVENT LinkType = 7 + BPF_LINK_TYPE_KPROBE_MULTI LinkType = 8 + BPF_LINK_TYPE_STRUCT_OPS LinkType = 9 + BPF_LINK_TYPE_NETFILTER LinkType = 10 + BPF_LINK_TYPE_TCX LinkType = 11 + BPF_LINK_TYPE_UPROBE_MULTI LinkType = 12 + BPF_LINK_TYPE_NETKIT LinkType = 13 + BPF_LINK_TYPE_SOCKMAP LinkType = 14 + __MAX_BPF_LINK_TYPE LinkType = 15 +) + +type MapType uint32 + +const ( + BPF_MAP_TYPE_UNSPEC MapType = 0 + BPF_MAP_TYPE_HASH MapType = 1 + BPF_MAP_TYPE_ARRAY MapType = 2 + BPF_MAP_TYPE_PROG_ARRAY MapType = 3 + BPF_MAP_TYPE_PERF_EVENT_ARRAY MapType = 4 + BPF_MAP_TYPE_PERCPU_HASH MapType = 5 + BPF_MAP_TYPE_PERCPU_ARRAY MapType = 6 + BPF_MAP_TYPE_STACK_TRACE MapType = 7 + BPF_MAP_TYPE_CGROUP_ARRAY MapType = 8 + BPF_MAP_TYPE_LRU_HASH MapType = 9 + BPF_MAP_TYPE_LRU_PERCPU_HASH MapType = 10 + BPF_MAP_TYPE_LPM_TRIE MapType = 11 + BPF_MAP_TYPE_ARRAY_OF_MAPS MapType = 12 + BPF_MAP_TYPE_HASH_OF_MAPS MapType = 13 + BPF_MAP_TYPE_DEVMAP MapType = 14 + BPF_MAP_TYPE_SOCKMAP MapType = 15 + BPF_MAP_TYPE_CPUMAP MapType = 16 + BPF_MAP_TYPE_XSKMAP MapType = 17 + BPF_MAP_TYPE_SOCKHASH MapType = 18 + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED MapType = 19 + BPF_MAP_TYPE_CGROUP_STORAGE MapType = 19 + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY MapType = 20 + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED MapType = 21 + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE MapType = 21 + BPF_MAP_TYPE_QUEUE MapType = 22 + BPF_MAP_TYPE_STACK MapType = 23 + BPF_MAP_TYPE_SK_STORAGE MapType = 24 + BPF_MAP_TYPE_DEVMAP_HASH MapType = 25 + BPF_MAP_TYPE_STRUCT_OPS MapType = 26 + BPF_MAP_TYPE_RINGBUF MapType = 27 + BPF_MAP_TYPE_INODE_STORAGE MapType = 28 + BPF_MAP_TYPE_TASK_STORAGE MapType = 29 + BPF_MAP_TYPE_BLOOM_FILTER MapType = 30 + BPF_MAP_TYPE_USER_RINGBUF MapType = 31 + BPF_MAP_TYPE_CGRP_STORAGE MapType = 32 + BPF_MAP_TYPE_ARENA MapType = 33 + __MAX_BPF_MAP_TYPE MapType = 34 +) + +type ObjType uint32 + +const ( + BPF_TYPE_UNSPEC ObjType = 0 + BPF_TYPE_PROG ObjType = 1 + BPF_TYPE_MAP ObjType = 2 + BPF_TYPE_LINK ObjType = 3 +) + +type PerfEventType uint32 + +const ( + BPF_PERF_EVENT_UNSPEC PerfEventType = 0 + BPF_PERF_EVENT_UPROBE PerfEventType = 1 + BPF_PERF_EVENT_URETPROBE PerfEventType = 2 + BPF_PERF_EVENT_KPROBE PerfEventType = 3 + BPF_PERF_EVENT_KRETPROBE PerfEventType = 4 + BPF_PERF_EVENT_TRACEPOINT PerfEventType = 5 + BPF_PERF_EVENT_EVENT PerfEventType = 6 +) + +type ProgType uint32 + +const ( + BPF_PROG_TYPE_UNSPEC ProgType = 0 + BPF_PROG_TYPE_SOCKET_FILTER ProgType = 1 + BPF_PROG_TYPE_KPROBE ProgType = 2 + BPF_PROG_TYPE_SCHED_CLS ProgType = 3 + BPF_PROG_TYPE_SCHED_ACT ProgType = 4 + BPF_PROG_TYPE_TRACEPOINT ProgType = 5 + BPF_PROG_TYPE_XDP ProgType = 6 + BPF_PROG_TYPE_PERF_EVENT ProgType = 7 + BPF_PROG_TYPE_CGROUP_SKB ProgType = 8 + BPF_PROG_TYPE_CGROUP_SOCK ProgType = 9 + BPF_PROG_TYPE_LWT_IN ProgType = 10 + BPF_PROG_TYPE_LWT_OUT ProgType = 11 + BPF_PROG_TYPE_LWT_XMIT ProgType = 12 + BPF_PROG_TYPE_SOCK_OPS ProgType = 13 + BPF_PROG_TYPE_SK_SKB ProgType = 14 + BPF_PROG_TYPE_CGROUP_DEVICE ProgType = 15 + BPF_PROG_TYPE_SK_MSG ProgType = 16 + BPF_PROG_TYPE_RAW_TRACEPOINT ProgType = 17 + BPF_PROG_TYPE_CGROUP_SOCK_ADDR ProgType = 18 + BPF_PROG_TYPE_LWT_SEG6LOCAL ProgType = 19 + BPF_PROG_TYPE_LIRC_MODE2 ProgType = 20 + BPF_PROG_TYPE_SK_REUSEPORT ProgType = 21 + BPF_PROG_TYPE_FLOW_DISSECTOR ProgType = 22 + BPF_PROG_TYPE_CGROUP_SYSCTL ProgType = 23 + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE ProgType = 24 + BPF_PROG_TYPE_CGROUP_SOCKOPT ProgType = 25 + BPF_PROG_TYPE_TRACING ProgType = 26 + BPF_PROG_TYPE_STRUCT_OPS ProgType = 27 + BPF_PROG_TYPE_EXT ProgType = 28 + BPF_PROG_TYPE_LSM ProgType = 29 + BPF_PROG_TYPE_SK_LOOKUP ProgType = 30 + BPF_PROG_TYPE_SYSCALL ProgType = 31 + BPF_PROG_TYPE_NETFILTER ProgType = 32 + __MAX_BPF_PROG_TYPE ProgType = 33 +) + +type RetCode uint32 + +const ( + BPF_OK RetCode = 0 + BPF_DROP RetCode = 2 + BPF_REDIRECT RetCode = 7 + BPF_LWT_REROUTE RetCode = 128 + BPF_FLOW_DISSECTOR_CONTINUE RetCode = 129 +) + +type SkAction uint32 + +const ( + SK_DROP SkAction = 0 + SK_PASS SkAction = 1 +) + +type StackBuildIdStatus uint32 + +const ( + BPF_STACK_BUILD_ID_EMPTY StackBuildIdStatus = 0 + BPF_STACK_BUILD_ID_VALID StackBuildIdStatus = 1 + BPF_STACK_BUILD_ID_IP StackBuildIdStatus = 2 +) + +type StatsType uint32 + +const ( + BPF_STATS_RUN_TIME StatsType = 0 +) + +type TcxActionBase int32 + +const ( + TCX_NEXT TcxActionBase = -1 + TCX_PASS TcxActionBase = 0 + TCX_DROP TcxActionBase = 2 + TCX_REDIRECT TcxActionBase = 7 +) + +type XdpAction uint32 + +const ( + XDP_ABORTED XdpAction = 0 + XDP_DROP XdpAction = 1 + XDP_PASS XdpAction = 2 + XDP_TX XdpAction = 3 + XDP_REDIRECT XdpAction = 4 +) + +type BtfInfo struct { + Btf Pointer + BtfSize uint32 + Id BTFID + Name Pointer + NameLen uint32 + KernelBtf uint32 +} + +type FuncInfo struct { + InsnOff uint32 + TypeId uint32 +} + +type LineInfo struct { + InsnOff uint32 + FileNameOff uint32 + LineOff uint32 + LineCol uint32 +} + +type LinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + Extra [48]uint8 +} + +type MapInfo struct { + Type uint32 + Id uint32 + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + MapFlags uint32 + Name ObjName + Ifindex uint32 + BtfVmlinuxValueTypeId TypeID + NetnsDev uint64 + NetnsIno uint64 + BtfId uint32 + BtfKeyTypeId TypeID + BtfValueTypeId TypeID + BtfVmlinuxId uint32 + MapExtra uint64 +} + +type ProgInfo struct { + Type uint32 + Id uint32 + Tag [8]uint8 + JitedProgLen uint32 + XlatedProgLen uint32 + JitedProgInsns Pointer + XlatedProgInsns Pointer + LoadTime uint64 + CreatedByUid uint32 + NrMapIds uint32 + MapIds Pointer + Name ObjName + Ifindex uint32 + _ [4]byte /* unsupported bitfield */ + NetnsDev uint64 + NetnsIno uint64 + NrJitedKsyms uint32 + NrJitedFuncLens uint32 + JitedKsyms Pointer + JitedFuncLens Pointer + BtfId BTFID + FuncInfoRecSize uint32 + FuncInfo Pointer + NrFuncInfo uint32 + NrLineInfo uint32 + LineInfo Pointer + JitedLineInfo Pointer + NrJitedLineInfo uint32 + LineInfoRecSize uint32 + JitedLineInfoRecSize uint32 + NrProgTags uint32 + ProgTags uint64 + RunTimeNs uint64 + RunCnt uint64 + RecursionMisses uint64 + VerifiedInsns uint32 + AttachBtfObjId BTFID + AttachBtfId TypeID + _ [4]byte +} + +type SkLookup struct { + Cookie uint64 + Family uint32 + Protocol uint32 + RemoteIp4 [4]uint8 + RemoteIp6 [16]uint8 + RemotePort uint16 + _ [2]byte + LocalIp4 [4]uint8 + LocalIp6 [16]uint8 + LocalPort uint32 + IngressIfindex uint32 + _ [4]byte +} + +type XdpMd struct { + Data uint32 + DataEnd uint32 + DataMeta uint32 + IngressIfindex uint32 + RxQueueIndex uint32 + EgressIfindex uint32 +} + +type BtfGetFdByIdAttr struct{ Id uint32 } + +func BtfGetFdById(attr *BtfGetFdByIdAttr) (*FD, error) { + fd, err := BPF(BPF_BTF_GET_FD_BY_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type BtfGetNextIdAttr struct { + Id BTFID + NextId BTFID +} + +func BtfGetNextId(attr *BtfGetNextIdAttr) error { + _, err := BPF(BPF_BTF_GET_NEXT_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type BtfLoadAttr struct { + Btf Pointer + BtfLogBuf Pointer + BtfSize uint32 + BtfLogSize uint32 + BtfLogLevel uint32 + BtfLogTrueSize uint32 + BtfFlags uint32 + BtfTokenFd int32 +} + +func BtfLoad(attr *BtfLoadAttr) (*FD, error) { + fd, err := BPF(BPF_BTF_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type EnableStatsAttr struct{ Type uint32 } + +func EnableStats(attr *EnableStatsAttr) (*FD, error) { + fd, err := BPF(BPF_ENABLE_STATS, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type IterCreateAttr struct { + LinkFd uint32 + Flags uint32 +} + +func IterCreate(attr *IterCreateAttr) (*FD, error) { + fd, err := BPF(BPF_ITER_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + TargetBtfId TypeID + _ [44]byte +} + +func LinkCreate(attr *LinkCreateAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateIterAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + IterInfo Pointer + IterInfoLen uint32 + _ [36]byte +} + +func LinkCreateIter(attr *LinkCreateIterAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateKprobeMultiAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + KprobeMultiFlags uint32 + Count uint32 + Syms Pointer + Addrs Pointer + Cookies Pointer + _ [16]byte +} + +func LinkCreateKprobeMulti(attr *LinkCreateKprobeMultiAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateNetfilterAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + Pf uint32 + Hooknum uint32 + Priority int32 + NetfilterFlags uint32 + _ [32]byte +} + +func LinkCreateNetfilter(attr *LinkCreateNetfilterAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateNetkitAttr struct { + ProgFd uint32 + TargetIfindex uint32 + AttachType AttachType + Flags uint32 + RelativeFdOrId uint32 + _ [4]byte + ExpectedRevision uint64 + _ [32]byte +} + +func LinkCreateNetkit(attr *LinkCreateNetkitAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreatePerfEventAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + BpfCookie uint64 + _ [40]byte +} + +func LinkCreatePerfEvent(attr *LinkCreatePerfEventAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateTcxAttr struct { + ProgFd uint32 + TargetIfindex uint32 + AttachType AttachType + Flags uint32 + RelativeFdOrId uint32 + _ [4]byte + ExpectedRevision uint64 + _ [32]byte +} + +func LinkCreateTcx(attr *LinkCreateTcxAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateTracingAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + TargetBtfId BTFID + _ [4]byte + Cookie uint64 + _ [32]byte +} + +func LinkCreateTracing(attr *LinkCreateTracingAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkCreateUprobeMultiAttr struct { + ProgFd uint32 + TargetFd uint32 + AttachType AttachType + Flags uint32 + Path Pointer + Offsets Pointer + RefCtrOffsets Pointer + Cookies Pointer + Count uint32 + UprobeMultiFlags uint32 + Pid uint32 + _ [4]byte +} + +func LinkCreateUprobeMulti(attr *LinkCreateUprobeMultiAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkGetFdByIdAttr struct{ Id LinkID } + +func LinkGetFdById(attr *LinkGetFdByIdAttr) (*FD, error) { + fd, err := BPF(BPF_LINK_GET_FD_BY_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type LinkGetNextIdAttr struct { + Id LinkID + NextId LinkID +} + +func LinkGetNextId(attr *LinkGetNextIdAttr) error { + _, err := BPF(BPF_LINK_GET_NEXT_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type LinkUpdateAttr struct { + LinkFd uint32 + NewProgFd uint32 + Flags uint32 + OldProgFd uint32 +} + +func LinkUpdate(attr *LinkUpdateAttr) error { + _, err := BPF(BPF_LINK_UPDATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapCreateAttr struct { + MapType MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + MapFlags uint32 + InnerMapFd uint32 + NumaNode uint32 + MapName ObjName + MapIfindex uint32 + BtfFd uint32 + BtfKeyTypeId TypeID + BtfValueTypeId TypeID + BtfVmlinuxValueTypeId TypeID + MapExtra uint64 + ValueTypeBtfObjFd int32 + MapTokenFd int32 +} + +func MapCreate(attr *MapCreateAttr) (*FD, error) { + fd, err := BPF(BPF_MAP_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type MapDeleteBatchAttr struct { + InBatch Pointer + OutBatch Pointer + Keys Pointer + Values Pointer + Count uint32 + MapFd uint32 + ElemFlags uint64 + Flags uint64 +} + +func MapDeleteBatch(attr *MapDeleteBatchAttr) error { + _, err := BPF(BPF_MAP_DELETE_BATCH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapDeleteElemAttr struct { + MapFd uint32 + _ [4]byte + Key Pointer + Value Pointer + Flags uint64 +} + +func MapDeleteElem(attr *MapDeleteElemAttr) error { + _, err := BPF(BPF_MAP_DELETE_ELEM, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapFreezeAttr struct{ MapFd uint32 } + +func MapFreeze(attr *MapFreezeAttr) error { + _, err := BPF(BPF_MAP_FREEZE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapGetFdByIdAttr struct{ Id uint32 } + +func MapGetFdById(attr *MapGetFdByIdAttr) (*FD, error) { + fd, err := BPF(BPF_MAP_GET_FD_BY_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type MapGetNextIdAttr struct { + Id uint32 + NextId uint32 +} + +func MapGetNextId(attr *MapGetNextIdAttr) error { + _, err := BPF(BPF_MAP_GET_NEXT_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapGetNextKeyAttr struct { + MapFd uint32 + _ [4]byte + Key Pointer + NextKey Pointer +} + +func MapGetNextKey(attr *MapGetNextKeyAttr) error { + _, err := BPF(BPF_MAP_GET_NEXT_KEY, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapLookupAndDeleteBatchAttr struct { + InBatch Pointer + OutBatch Pointer + Keys Pointer + Values Pointer + Count uint32 + MapFd uint32 + ElemFlags uint64 + Flags uint64 +} + +func MapLookupAndDeleteBatch(attr *MapLookupAndDeleteBatchAttr) error { + _, err := BPF(BPF_MAP_LOOKUP_AND_DELETE_BATCH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapLookupAndDeleteElemAttr struct { + MapFd uint32 + _ [4]byte + Key Pointer + Value Pointer + Flags uint64 +} + +func MapLookupAndDeleteElem(attr *MapLookupAndDeleteElemAttr) error { + _, err := BPF(BPF_MAP_LOOKUP_AND_DELETE_ELEM, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapLookupBatchAttr struct { + InBatch Pointer + OutBatch Pointer + Keys Pointer + Values Pointer + Count uint32 + MapFd uint32 + ElemFlags uint64 + Flags uint64 +} + +func MapLookupBatch(attr *MapLookupBatchAttr) error { + _, err := BPF(BPF_MAP_LOOKUP_BATCH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapLookupElemAttr struct { + MapFd uint32 + _ [4]byte + Key Pointer + Value Pointer + Flags uint64 +} + +func MapLookupElem(attr *MapLookupElemAttr) error { + _, err := BPF(BPF_MAP_LOOKUP_ELEM, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapUpdateBatchAttr struct { + InBatch Pointer + OutBatch Pointer + Keys Pointer + Values Pointer + Count uint32 + MapFd uint32 + ElemFlags uint64 + Flags uint64 +} + +func MapUpdateBatch(attr *MapUpdateBatchAttr) error { + _, err := BPF(BPF_MAP_UPDATE_BATCH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type MapUpdateElemAttr struct { + MapFd uint32 + _ [4]byte + Key Pointer + Value Pointer + Flags uint64 +} + +func MapUpdateElem(attr *MapUpdateElemAttr) error { + _, err := BPF(BPF_MAP_UPDATE_ELEM, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ObjGetAttr struct { + Pathname Pointer + BpfFd uint32 + FileFlags uint32 + PathFd int32 + _ [4]byte +} + +func ObjGet(attr *ObjGetAttr) (*FD, error) { + fd, err := BPF(BPF_OBJ_GET, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type ObjGetInfoByFdAttr struct { + BpfFd uint32 + InfoLen uint32 + Info Pointer +} + +func ObjGetInfoByFd(attr *ObjGetInfoByFdAttr) error { + _, err := BPF(BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ObjPinAttr struct { + Pathname Pointer + BpfFd uint32 + FileFlags uint32 + PathFd int32 + _ [4]byte +} + +func ObjPin(attr *ObjPinAttr) error { + _, err := BPF(BPF_OBJ_PIN, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ProgAttachAttr struct { + TargetFdOrIfindex uint32 + AttachBpfFd uint32 + AttachType uint32 + AttachFlags uint32 + ReplaceBpfFd uint32 + RelativeFdOrId uint32 + ExpectedRevision uint64 +} + +func ProgAttach(attr *ProgAttachAttr) error { + _, err := BPF(BPF_PROG_ATTACH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ProgBindMapAttr struct { + ProgFd uint32 + MapFd uint32 + Flags uint32 +} + +func ProgBindMap(attr *ProgBindMapAttr) error { + _, err := BPF(BPF_PROG_BIND_MAP, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ProgDetachAttr struct { + TargetFdOrIfindex uint32 + AttachBpfFd uint32 + AttachType uint32 + AttachFlags uint32 + _ [4]byte + RelativeFdOrId uint32 + ExpectedRevision uint64 +} + +func ProgDetach(attr *ProgDetachAttr) error { + _, err := BPF(BPF_PROG_DETACH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ProgGetFdByIdAttr struct{ Id uint32 } + +func ProgGetFdById(attr *ProgGetFdByIdAttr) (*FD, error) { + fd, err := BPF(BPF_PROG_GET_FD_BY_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type ProgGetNextIdAttr struct { + Id uint32 + NextId uint32 +} + +func ProgGetNextId(attr *ProgGetNextIdAttr) error { + _, err := BPF(BPF_PROG_GET_NEXT_ID, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ProgLoadAttr struct { + ProgType ProgType + InsnCnt uint32 + Insns Pointer + License Pointer + LogLevel LogLevel + LogSize uint32 + LogBuf Pointer + KernVersion uint32 + ProgFlags uint32 + ProgName ObjName + ProgIfindex uint32 + ExpectedAttachType AttachType + ProgBtfFd uint32 + FuncInfoRecSize uint32 + FuncInfo Pointer + FuncInfoCnt uint32 + LineInfoRecSize uint32 + LineInfo Pointer + LineInfoCnt uint32 + AttachBtfId TypeID + AttachBtfObjFd uint32 + CoreReloCnt uint32 + FdArray Pointer + CoreRelos Pointer + CoreReloRecSize uint32 + LogTrueSize uint32 + ProgTokenFd int32 + _ [4]byte +} + +func ProgLoad(attr *ProgLoadAttr) (*FD, error) { + fd, err := BPF(BPF_PROG_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type ProgQueryAttr struct { + TargetFdOrIfindex uint32 + AttachType AttachType + QueryFlags uint32 + AttachFlags uint32 + ProgIds Pointer + Count uint32 + _ [4]byte + ProgAttachFlags Pointer + LinkIds Pointer + LinkAttachFlags Pointer + Revision uint64 +} + +func ProgQuery(attr *ProgQueryAttr) error { + _, err := BPF(BPF_PROG_QUERY, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type ProgRunAttr struct { + ProgFd uint32 + Retval uint32 + DataSizeIn uint32 + DataSizeOut uint32 + DataIn Pointer + DataOut Pointer + Repeat uint32 + Duration uint32 + CtxSizeIn uint32 + CtxSizeOut uint32 + CtxIn Pointer + CtxOut Pointer + Flags uint32 + Cpu uint32 + BatchSize uint32 + _ [4]byte +} + +func ProgRun(attr *ProgRunAttr) error { + _, err := BPF(BPF_PROG_TEST_RUN, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type RawTracepointOpenAttr struct { + Name Pointer + ProgFd uint32 + _ [4]byte + Cookie uint64 +} + +func RawTracepointOpen(attr *RawTracepointOpenAttr) (*FD, error) { + fd, err := BPF(BPF_RAW_TRACEPOINT_OPEN, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return NewFD(int(fd)) +} + +type CgroupLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + CgroupId uint64 + AttachType AttachType + _ [36]byte +} + +type IterLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + TargetName Pointer + TargetNameLen uint32 +} + +type KprobeLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + PerfEventType PerfEventType + _ [4]byte + FuncName Pointer + NameLen uint32 + Offset uint32 + Addr uint64 + Missed uint64 + Cookie uint64 +} + +type KprobeMultiLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + Addrs Pointer + Count uint32 + Flags uint32 + Missed uint64 + Cookies uint64 + _ [16]byte +} + +type NetNsLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + NetnsIno uint32 + AttachType AttachType + _ [40]byte +} + +type NetfilterLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + Pf uint32 + Hooknum uint32 + Priority int32 + Flags uint32 + _ [32]byte +} + +type NetkitLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + Ifindex uint32 + AttachType AttachType + _ [40]byte +} + +type PerfEventLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + PerfEventType PerfEventType +} + +type RawTracepointLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + TpName Pointer + TpNameLen uint32 + _ [36]byte +} + +type TcxLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + Ifindex uint32 + AttachType AttachType + _ [40]byte +} + +type TracingLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + AttachType AttachType + TargetObjId uint32 + TargetBtfId TypeID + _ [36]byte +} + +type XDPLinkInfo struct { + Type LinkType + Id LinkID + ProgId uint32 + _ [4]byte + Ifindex uint32 + _ [44]byte +} diff --git a/vendor/github.com/cilium/ebpf/internal/sysenc/buffer.go b/vendor/github.com/cilium/ebpf/internal/sysenc/buffer.go new file mode 100644 index 0000000000..1b4f052ee2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sysenc/buffer.go @@ -0,0 +1,85 @@ +package sysenc + +import ( + "unsafe" + + "github.com/cilium/ebpf/internal/sys" +) + +type Buffer struct { + ptr unsafe.Pointer + // Size of the buffer. syscallPointerOnly if created from UnsafeBuffer or when using + // zero-copy unmarshaling. + size int +} + +const syscallPointerOnly = -1 + +func newBuffer(buf []byte) Buffer { + if len(buf) == 0 { + return Buffer{} + } + return Buffer{unsafe.Pointer(&buf[0]), len(buf)} +} + +// UnsafeBuffer constructs a Buffer for zero-copy unmarshaling. +// +// [Pointer] is the only valid method to call on such a Buffer. +// Use [SyscallBuffer] instead if possible. +func UnsafeBuffer(ptr unsafe.Pointer) Buffer { + return Buffer{ptr, syscallPointerOnly} +} + +// SyscallOutput prepares a Buffer for a syscall to write into. +// +// size is the length of the desired buffer in bytes. +// The buffer may point at the underlying memory of dst, in which case [Unmarshal] +// becomes a no-op. +// +// The contents of the buffer are undefined and may be non-zero. +func SyscallOutput(dst any, size int) Buffer { + if dstBuf := unsafeBackingMemory(dst); len(dstBuf) == size { + buf := newBuffer(dstBuf) + buf.size = syscallPointerOnly + return buf + } + + return newBuffer(make([]byte, size)) +} + +// CopyTo copies the buffer into dst. +// +// Returns the number of copied bytes. +func (b Buffer) CopyTo(dst []byte) int { + return copy(dst, b.Bytes()) +} + +// AppendTo appends the buffer onto dst. +func (b Buffer) AppendTo(dst []byte) []byte { + return append(dst, b.Bytes()...) +} + +// Pointer returns the location where a syscall should write. +func (b Buffer) Pointer() sys.Pointer { + // NB: This deliberately ignores b.length to support zero-copy + // marshaling / unmarshaling using unsafe.Pointer. + return sys.NewPointer(b.ptr) +} + +// Unmarshal the buffer into the provided value. +func (b Buffer) Unmarshal(data any) error { + if b.size == syscallPointerOnly { + return nil + } + + return Unmarshal(data, b.Bytes()) +} + +// Bytes returns the buffer as a byte slice. Returns nil if the Buffer was +// created using UnsafeBuffer or by zero-copy unmarshaling. +func (b Buffer) Bytes() []byte { + if b.size == syscallPointerOnly { + return nil + } + return unsafe.Slice((*byte)(b.ptr), b.size) +} diff --git a/vendor/github.com/cilium/ebpf/internal/sysenc/doc.go b/vendor/github.com/cilium/ebpf/internal/sysenc/doc.go new file mode 100644 index 0000000000..676ad98ba1 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sysenc/doc.go @@ -0,0 +1,3 @@ +// Package sysenc provides efficient conversion of Go values to system +// call interfaces. +package sysenc diff --git a/vendor/github.com/cilium/ebpf/internal/sysenc/layout.go b/vendor/github.com/cilium/ebpf/internal/sysenc/layout.go new file mode 100644 index 0000000000..52d111e7af --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sysenc/layout.go @@ -0,0 +1,41 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found at https://go.dev/LICENSE. + +package sysenc + +import ( + "reflect" + "sync" +) + +var hasUnexportedFieldsCache sync.Map // map[reflect.Type]bool + +func hasUnexportedFields(typ reflect.Type) bool { + switch typ.Kind() { + case reflect.Slice, reflect.Array, reflect.Pointer: + return hasUnexportedFields(typ.Elem()) + + case reflect.Struct: + if unexported, ok := hasUnexportedFieldsCache.Load(typ); ok { + return unexported.(bool) + } + + unexported := false + for i, n := 0, typ.NumField(); i < n; i++ { + field := typ.Field(i) + // Package binary allows _ fields but always writes zeroes into them. + if (!field.IsExported() && field.Name != "_") || hasUnexportedFields(field.Type) { + unexported = true + break + } + } + + hasUnexportedFieldsCache.Store(typ, unexported) + return unexported + + default: + // NB: It's not clear what this means for Chan and so on. + return false + } +} diff --git a/vendor/github.com/cilium/ebpf/internal/sysenc/marshal.go b/vendor/github.com/cilium/ebpf/internal/sysenc/marshal.go new file mode 100644 index 0000000000..0026af8f24 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/sysenc/marshal.go @@ -0,0 +1,177 @@ +package sysenc + +import ( + "bytes" + "encoding" + "encoding/binary" + "errors" + "fmt" + "reflect" + "slices" + "sync" + "unsafe" + + "github.com/cilium/ebpf/internal" +) + +// Marshal turns data into a byte slice using the system's native endianness. +// +// If possible, avoids allocations by directly using the backing memory +// of data. This means that the variable must not be modified for the lifetime +// of the returned [Buffer]. +// +// Returns an error if the data can't be turned into a byte slice according to +// the behaviour of [binary.Write]. +func Marshal(data any, size int) (Buffer, error) { + if data == nil { + return Buffer{}, errors.New("can't marshal a nil value") + } + + var buf []byte + var err error + switch value := data.(type) { + case encoding.BinaryMarshaler: + buf, err = value.MarshalBinary() + case string: + buf = unsafe.Slice(unsafe.StringData(value), len(value)) + case []byte: + buf = value + case int16: + buf = internal.NativeEndian.AppendUint16(make([]byte, 0, 2), uint16(value)) + case uint16: + buf = internal.NativeEndian.AppendUint16(make([]byte, 0, 2), value) + case int32: + buf = internal.NativeEndian.AppendUint32(make([]byte, 0, 4), uint32(value)) + case uint32: + buf = internal.NativeEndian.AppendUint32(make([]byte, 0, 4), value) + case int64: + buf = internal.NativeEndian.AppendUint64(make([]byte, 0, 8), uint64(value)) + case uint64: + buf = internal.NativeEndian.AppendUint64(make([]byte, 0, 8), value) + default: + if buf := unsafeBackingMemory(data); len(buf) == size { + return newBuffer(buf), nil + } + + wr := internal.NewBuffer(make([]byte, 0, size)) + defer internal.PutBuffer(wr) + + err = binary.Write(wr, internal.NativeEndian, value) + buf = wr.Bytes() + } + if err != nil { + return Buffer{}, err + } + + if len(buf) != size { + return Buffer{}, fmt.Errorf("%T doesn't marshal to %d bytes", data, size) + } + + return newBuffer(buf), nil +} + +var bytesReaderPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Reader) + }, +} + +// Unmarshal a byte slice in the system's native endianness into data. +// +// Returns an error if buf can't be unmarshalled according to the behaviour +// of [binary.Read]. +func Unmarshal(data interface{}, buf []byte) error { + switch value := data.(type) { + case encoding.BinaryUnmarshaler: + return value.UnmarshalBinary(buf) + + case *string: + *value = string(buf) + return nil + + case *[]byte: + // Backwards compat: unmarshaling into a slice replaces the whole slice. + *value = slices.Clone(buf) + return nil + + default: + if dataBuf := unsafeBackingMemory(data); len(dataBuf) == len(buf) { + copy(dataBuf, buf) + return nil + } + + rd := bytesReaderPool.Get().(*bytes.Reader) + defer bytesReaderPool.Put(rd) + + rd.Reset(buf) + + if err := binary.Read(rd, internal.NativeEndian, value); err != nil { + return err + } + + if rd.Len() != 0 { + return fmt.Errorf("unmarshaling %T doesn't consume all data", data) + } + + return nil + } +} + +// unsafeBackingMemory returns the backing memory of data if it can be used +// instead of calling into package binary. +// +// Returns nil if the value is not a pointer or a slice, or if it contains +// padding or unexported fields. +func unsafeBackingMemory(data any) []byte { + if data == nil { + return nil + } + + value := reflect.ValueOf(data) + var valueSize int + switch value.Kind() { + case reflect.Pointer: + if value.IsNil() { + return nil + } + + if elemType := value.Type().Elem(); elemType.Kind() != reflect.Slice { + valueSize = int(elemType.Size()) + break + } + + // We're dealing with a pointer to a slice. Dereference and + // handle it like a regular slice. + value = value.Elem() + fallthrough + + case reflect.Slice: + valueSize = int(value.Type().Elem().Size()) * value.Len() + + default: + // Prevent Value.UnsafePointer from panicking. + return nil + } + + // Some nil pointer types currently crash binary.Size. Call it after our own + // code so that the panic isn't reachable. + // See https://github.com/golang/go/issues/60892 + if size := binary.Size(data); size == -1 || size != valueSize { + // The type contains padding or unsupported types. + return nil + } + + if hasUnexportedFields(reflect.TypeOf(data)) { + return nil + } + + // Reinterpret the pointer as a byte slice. This violates the unsafe.Pointer + // rules because it's very unlikely that the source data has "an equivalent + // memory layout". However, we can make it safe-ish because of the + // following reasons: + // - There is no alignment mismatch since we cast to a type with an + // alignment of 1. + // - There are no pointers in the source type so we don't upset the GC. + // - The length is verified at runtime. + return unsafe.Slice((*byte)(value.UnsafePointer()), valueSize) +} diff --git a/vendor/github.com/cilium/ebpf/internal/testutils/fdtrace/fd_trace.go b/vendor/github.com/cilium/ebpf/internal/testutils/fdtrace/fd_trace.go new file mode 100644 index 0000000000..562df2cc0c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/testutils/fdtrace/fd_trace.go @@ -0,0 +1,103 @@ +package fdtrace + +import ( + "bytes" + "fmt" + "os" + "runtime" + "sync" + "sync/atomic" +) + +// foundLeak is atomic since the GC may collect objects in parallel. +var foundLeak atomic.Bool + +func onLeakFD(fs *runtime.Frames) { + foundLeak.Store(true) + fmt.Fprintln(os.Stderr, "leaked fd created at:") + fmt.Fprintln(os.Stderr, formatFrames(fs)) +} + +// fds is a registry of all file descriptors wrapped into sys.fds that were +// created while an fd tracer was active. +var fds *sync.Map // map[int]*runtime.Frames + +// TraceFD associates raw with the current execution stack. +// +// skip controls how many entries of the stack the function should skip. +func TraceFD(raw int, skip int) { + if fds == nil { + return + } + + // Attempt to store the caller's stack for the given fd value. + // Panic if fds contains an existing stack for the fd. + old, exist := fds.LoadOrStore(raw, callersFrames(skip)) + if exist { + f := old.(*runtime.Frames) + panic(fmt.Sprintf("found existing stack for fd %d:\n%s", raw, formatFrames(f))) + } +} + +// ForgetFD removes any existing association for raw. +func ForgetFD(raw int) { + if fds != nil { + fds.Delete(raw) + } +} + +// LeakFD indicates that raw was leaked. +// +// Calling the function with a value that was not passed to [TraceFD] before +// is undefined. +func LeakFD(raw int) { + if fds == nil { + return + } + + // Invoke the fd leak callback. Calls LoadAndDelete to guarantee the callback + // is invoked at most once for one sys.FD allocation, runtime.Frames can only + // be unwound once. + f, ok := fds.LoadAndDelete(raw) + if ok { + onLeakFD(f.(*runtime.Frames)) + } +} + +// flushFrames removes all elements from fds and returns them as a slice. This +// deals with the fact that a runtime.Frames can only be unwound once using +// Next(). +func flushFrames() []*runtime.Frames { + var frames []*runtime.Frames + fds.Range(func(key, value any) bool { + frames = append(frames, value.(*runtime.Frames)) + fds.Delete(key) + return true + }) + return frames +} + +func callersFrames(skip int) *runtime.Frames { + c := make([]uintptr, 32) + + // Skip runtime.Callers and this function. + i := runtime.Callers(skip+2, c) + if i == 0 { + return nil + } + + return runtime.CallersFrames(c) +} + +// formatFrames formats a runtime.Frames as a human-readable string. +func formatFrames(fs *runtime.Frames) string { + var b bytes.Buffer + for { + f, more := fs.Next() + b.WriteString(fmt.Sprintf("\t%s+%#x\n\t\t%s:%d\n", f.Function, f.PC-f.Entry, f.File, f.Line)) + if !more { + break + } + } + return b.String() +} diff --git a/vendor/github.com/cilium/ebpf/internal/testutils/fdtrace/main.go b/vendor/github.com/cilium/ebpf/internal/testutils/fdtrace/main.go new file mode 100644 index 0000000000..c1f7b42d91 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/testutils/fdtrace/main.go @@ -0,0 +1,31 @@ +package fdtrace + +import ( + "os" + "sync" +) + +type testingM interface { + Run() int +} + +// TestMain runs m with fd tracing enabled. +// +// The function calls [os.Exit] and does not return. +func TestMain(m testingM) { + fds = new(sync.Map) + + ret := m.Run() + + if fs := flushFrames(); len(fs) != 0 { + for _, f := range fs { + onLeakFD(f) + } + } + + if foundLeak.Load() { + ret = 99 + } + + os.Exit(ret) +} diff --git a/vendor/github.com/cilium/ebpf/internal/tracefs/kprobe.go b/vendor/github.com/cilium/ebpf/internal/tracefs/kprobe.go new file mode 100644 index 0000000000..bdadf4d8ea --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/tracefs/kprobe.go @@ -0,0 +1,368 @@ +package tracefs + +import ( + "crypto/rand" + "errors" + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "sync" + "syscall" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/linux" + "github.com/cilium/ebpf/internal/unix" +) + +var ( + ErrInvalidInput = errors.New("invalid input") + + ErrInvalidMaxActive = errors.New("can only set maxactive on kretprobes") +) + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -type=ProbeType -linecomment + +type ProbeType uint8 + +const ( + Kprobe ProbeType = iota // kprobe + Uprobe // uprobe +) + +func (pt ProbeType) eventsFile() (*os.File, error) { + path, err := sanitizeTracefsPath(fmt.Sprintf("%s_events", pt.String())) + if err != nil { + return nil, err + } + + return os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0666) +} + +type ProbeArgs struct { + Type ProbeType + Symbol, Group, Path string + Offset, RefCtrOffset, Cookie uint64 + Pid, RetprobeMaxActive int + Ret bool +} + +// RandomGroup generates a pseudorandom string for use as a tracefs group name. +// Returns an error when the output string would exceed 63 characters (kernel +// limitation), when rand.Read() fails or when prefix contains characters not +// allowed by IsValidTraceID. +func RandomGroup(prefix string) (string, error) { + if !validIdentifier(prefix) { + return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, ErrInvalidInput) + } + + b := make([]byte, 8) + if _, err := rand.Read(b); err != nil { + return "", fmt.Errorf("reading random bytes: %w", err) + } + + group := fmt.Sprintf("%s_%x", prefix, b) + if len(group) > 63 { + return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, ErrInvalidInput) + } + + return group, nil +} + +// validIdentifier implements the equivalent of a regex match +// against "^[a-zA-Z_][0-9a-zA-Z_]*$". +// +// Trace event groups, names and kernel symbols must adhere to this set +// of characters. Non-empty, first character must not be a number, all +// characters must be alphanumeric or underscore. +func validIdentifier(s string) bool { + if len(s) < 1 { + return false + } + for i, c := range []byte(s) { + switch { + case c >= 'a' && c <= 'z': + case c >= 'A' && c <= 'Z': + case c == '_': + case i > 0 && c >= '0' && c <= '9': + + default: + return false + } + } + + return true +} + +func sanitizeTracefsPath(path ...string) (string, error) { + base, err := getTracefsPath() + if err != nil { + return "", err + } + l := filepath.Join(path...) + p := filepath.Join(base, l) + if !strings.HasPrefix(p, base) { + return "", fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, ErrInvalidInput) + } + return p, nil +} + +// getTracefsPath will return a correct path to the tracefs mount point. +// Since kernel 4.1 tracefs should be mounted by default at /sys/kernel/tracing, +// but may be also be available at /sys/kernel/debug/tracing if debugfs is mounted. +// The available tracefs paths will depends on distribution choices. +var getTracefsPath = sync.OnceValues(func() (string, error) { + if !internal.OnLinux { + return "", fmt.Errorf("tracefs: %w", internal.ErrNotSupportedOnOS) + } + + for _, p := range []struct { + path string + fsType int64 + }{ + {"/sys/kernel/tracing", unix.TRACEFS_MAGIC}, + {"/sys/kernel/debug/tracing", unix.TRACEFS_MAGIC}, + // RHEL/CentOS + {"/sys/kernel/debug/tracing", unix.DEBUGFS_MAGIC}, + } { + if fsType, err := linux.FSType(p.path); err == nil && fsType == p.fsType { + return p.path, nil + } + } + + return "", errors.New("neither debugfs nor tracefs are mounted") +}) + +// sanitizeIdentifier replaces every invalid character for the tracefs api with an underscore. +// +// It is equivalent to calling regexp.MustCompile("[^a-zA-Z0-9]+").ReplaceAllString("_"). +func sanitizeIdentifier(s string) string { + var skip bool + return strings.Map(func(c rune) rune { + switch { + case c >= 'a' && c <= 'z', + c >= 'A' && c <= 'Z', + c >= '0' && c <= '9': + skip = false + return c + + case skip: + return -1 + + default: + skip = true + return '_' + } + }, s) +} + +// EventID reads a trace event's ID from tracefs given its group and name. +// The kernel requires group and name to be alphanumeric or underscore. +func EventID(group, name string) (uint64, error) { + if !validIdentifier(group) { + return 0, fmt.Errorf("invalid tracefs group: %q", group) + } + + if !validIdentifier(name) { + return 0, fmt.Errorf("invalid tracefs name: %q", name) + } + + path, err := sanitizeTracefsPath("events", group, name, "id") + if err != nil { + return 0, err + } + tid, err := internal.ReadUint64FromFile("%d\n", path) + if errors.Is(err, os.ErrNotExist) { + return 0, err + } + if err != nil { + return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err) + } + + return tid, nil +} + +func probePrefix(ret bool, maxActive int) string { + if ret { + if maxActive > 0 { + return fmt.Sprintf("r%d", maxActive) + } + return "r" + } + return "p" +} + +// Event represents an entry in a tracefs probe events file. +type Event struct { + typ ProbeType + group, name string + // event id allocated by the kernel. 0 if the event has already been removed. + id uint64 +} + +// NewEvent creates a new ephemeral trace event. +// +// Returns os.ErrNotExist if symbol is not a valid +// kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist +// if a probe with the same group and symbol already exists. Returns an error if +// args.RetprobeMaxActive is used on non kprobe types. Returns ErrNotSupported if +// the kernel is too old to support kretprobe maxactive. +func NewEvent(args ProbeArgs) (*Event, error) { + // Before attempting to create a trace event through tracefs, + // check if an event with the same group and name already exists. + // Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate + // entry, so we need to rely on reads for detecting uniqueness. + eventName := sanitizeIdentifier(args.Symbol) + _, err := EventID(args.Group, eventName) + if err == nil { + return nil, fmt.Errorf("trace event %s/%s: %w", args.Group, eventName, os.ErrExist) + } + if errors.Is(err, unix.EINVAL) { + return nil, fmt.Errorf("trace event %s/%s: %w (unknown symbol?)", args.Group, eventName, err) + } + if !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("checking trace event %s/%s: %w", args.Group, eventName, err) + } + + // Open the kprobe_events file in tracefs. + f, err := args.Type.eventsFile() + if err != nil { + return nil, err + } + defer f.Close() + + var pe, token string + switch args.Type { + case Kprobe: + // The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt): + // p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + // r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe + // -:[GRP/]EVENT : Clear a probe + // + // Some examples: + // r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy + // p:ebpf_5678/p_my_kprobe __x64_sys_execve + // + // Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the + // kernel default to NR_CPUS. This is desired in most eBPF cases since + // subsampling or rate limiting logic can be more accurately implemented in + // the eBPF program itself. + // See Documentation/kprobes.txt for more details. + if args.RetprobeMaxActive != 0 && !args.Ret { + return nil, ErrInvalidMaxActive + } + token = KprobeToken(args) + pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.Ret, args.RetprobeMaxActive), args.Group, eventName, token) + case Uprobe: + // The uprobe_events syntax is as follows: + // p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe + // r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe + // -:[GRP/]EVENT : Clear a probe + // + // Some examples: + // r:ebpf_1234/readline /bin/bash:0x12345 + // p:ebpf_5678/main_mySymbol /bin/mybin:0x12345(0x123) + // + // See Documentation/trace/uprobetracer.txt for more details. + if args.RetprobeMaxActive != 0 { + return nil, ErrInvalidMaxActive + } + token = UprobeToken(args) + pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(args.Ret, 0), args.Group, eventName, token) + } + _, err = f.WriteString(pe) + + // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL + // when trying to create a retprobe for a missing symbol. + if errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("token %s: not found: %w", token, err) + } + // Since commit ab105a4fb894, EILSEQ is returned when a kprobe sym+offset is resolved + // to an invalid insn boundary. The exact conditions that trigger this error are + // arch specific however. + if errors.Is(err, syscall.EILSEQ) { + return nil, fmt.Errorf("token %s: bad insn boundary: %w", token, os.ErrNotExist) + } + // ERANGE is returned when the `SYM[+offs]` token is too big and cannot + // be resolved. + if errors.Is(err, syscall.ERANGE) { + return nil, fmt.Errorf("token %s: offset too big: %w", token, os.ErrNotExist) + } + + if err != nil { + return nil, fmt.Errorf("token %s: writing '%s': %w", token, pe, err) + } + + // Get the newly-created trace event's id. + tid, err := EventID(args.Group, eventName) + if args.RetprobeMaxActive != 0 && errors.Is(err, os.ErrNotExist) { + // Kernels < 4.12 don't support maxactive and therefore auto generate + // group and event names from the symbol and offset. The symbol is used + // without any sanitization. + // See https://elixir.bootlin.com/linux/v4.10/source/kernel/trace/trace_kprobe.c#L712 + event := fmt.Sprintf("kprobes/r_%s_%d", args.Symbol, args.Offset) + if err := removeEvent(args.Type, event); err != nil { + return nil, fmt.Errorf("failed to remove spurious maxactive event: %s", err) + } + return nil, fmt.Errorf("create trace event with non-default maxactive: %w", internal.ErrNotSupported) + } + if err != nil { + return nil, fmt.Errorf("get trace event id: %w", err) + } + + evt := &Event{args.Type, args.Group, eventName, tid} + runtime.SetFinalizer(evt, (*Event).Close) + return evt, nil +} + +// Close removes the event from tracefs. +// +// Returns os.ErrClosed if the event has already been closed before. +func (evt *Event) Close() error { + if evt.id == 0 { + return os.ErrClosed + } + + evt.id = 0 + runtime.SetFinalizer(evt, nil) + pe := fmt.Sprintf("%s/%s", evt.group, evt.name) + return removeEvent(evt.typ, pe) +} + +func removeEvent(typ ProbeType, pe string) error { + f, err := typ.eventsFile() + if err != nil { + return err + } + defer f.Close() + + // See [k,u]probe_events syntax above. The probe type does not need to be specified + // for removals. + if _, err = f.WriteString("-:" + pe); err != nil { + return fmt.Errorf("remove event %q from %s: %w", pe, f.Name(), err) + } + + return nil +} + +// ID returns the tracefs ID associated with the event. +func (evt *Event) ID() uint64 { + return evt.id +} + +// Group returns the tracefs group used by the event. +func (evt *Event) Group() string { + return evt.group +} + +// KprobeToken creates the SYM[+offs] token for the tracefs api. +func KprobeToken(args ProbeArgs) string { + po := args.Symbol + + if args.Offset != 0 { + po += fmt.Sprintf("+%#x", args.Offset) + } + + return po +} diff --git a/vendor/github.com/cilium/ebpf/internal/tracefs/probetype_string.go b/vendor/github.com/cilium/ebpf/internal/tracefs/probetype_string.go new file mode 100644 index 0000000000..87cb0a059b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/tracefs/probetype_string.go @@ -0,0 +1,24 @@ +// Code generated by "stringer -type=ProbeType -linecomment"; DO NOT EDIT. + +package tracefs + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[Kprobe-0] + _ = x[Uprobe-1] +} + +const _ProbeType_name = "kprobeuprobe" + +var _ProbeType_index = [...]uint8{0, 6, 12} + +func (i ProbeType) String() string { + if i >= ProbeType(len(_ProbeType_index)-1) { + return "ProbeType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _ProbeType_name[_ProbeType_index[i]:_ProbeType_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/internal/tracefs/uprobe.go b/vendor/github.com/cilium/ebpf/internal/tracefs/uprobe.go new file mode 100644 index 0000000000..994f31260d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/tracefs/uprobe.go @@ -0,0 +1,16 @@ +package tracefs + +import "fmt" + +// UprobeToken creates the PATH:OFFSET(REF_CTR_OFFSET) token for the tracefs api. +func UprobeToken(args ProbeArgs) string { + po := fmt.Sprintf("%s:%#x", args.Path, args.Offset) + + if args.RefCtrOffset != 0 { + // This is not documented in Documentation/trace/uprobetracer.txt. + // elixir.bootlin.com/linux/v5.15-rc7/source/kernel/trace/trace.c#L5564 + po += fmt.Sprintf("(%#x)", args.RefCtrOffset) + } + + return po +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/doc.go b/vendor/github.com/cilium/ebpf/internal/unix/doc.go new file mode 100644 index 0000000000..d168d36f18 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/doc.go @@ -0,0 +1,11 @@ +// Package unix re-exports Linux specific parts of golang.org/x/sys/unix. +// +// It avoids breaking compilation on other OS by providing stubs as follows: +// - Invoking a function always returns an error. +// - Errnos have distinct, non-zero values. +// - Constants have distinct but meaningless values. +// - Types use the same names for members, but may or may not follow the +// Linux layout. +package unix + +// Note: please don't add any custom API to this package. Use internal/sys instead. diff --git a/vendor/github.com/cilium/ebpf/internal/unix/errno_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/errno_linux.go new file mode 100644 index 0000000000..0c4886bd13 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/errno_linux.go @@ -0,0 +1,29 @@ +package unix + +import ( + "syscall" + + linux "golang.org/x/sys/unix" +) + +type Errno = syscall.Errno + +const ( + E2BIG = linux.E2BIG + EACCES = linux.EACCES + EAGAIN = linux.EAGAIN + EBADF = linux.EBADF + EEXIST = linux.EEXIST + EFAULT = linux.EFAULT + EILSEQ = linux.EILSEQ + EINTR = linux.EINTR + EINVAL = linux.EINVAL + ENODEV = linux.ENODEV + ENOENT = linux.ENOENT + ENOSPC = linux.ENOSPC + EOPNOTSUPP = linux.EOPNOTSUPP + EPERM = linux.EPERM + EPOLLIN = linux.EPOLLIN + ESRCH = linux.ESRCH + ESTALE = linux.ESTALE +) diff --git a/vendor/github.com/cilium/ebpf/internal/unix/errno_other.go b/vendor/github.com/cilium/ebpf/internal/unix/errno_other.go new file mode 100644 index 0000000000..fc2b042b54 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/errno_other.go @@ -0,0 +1,29 @@ +//go:build !linux && !windows + +package unix + +import "syscall" + +type Errno = syscall.Errno + +// Errnos are distinct and non-zero. +const ( + E2BIG Errno = iota + 1 + EACCES + EAGAIN + EBADF + EEXIST + EFAULT + EILSEQ + EINTR + EINVAL + ENODEV + ENOENT + ENOSPC + ENOTSUP + ENOTSUPP + EOPNOTSUPP + EPERM + ESRCH + ESTALE +) diff --git a/vendor/github.com/cilium/ebpf/internal/unix/errno_string_windows.go b/vendor/github.com/cilium/ebpf/internal/unix/errno_string_windows.go new file mode 100644 index 0000000000..6077e983f3 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/errno_string_windows.go @@ -0,0 +1,59 @@ +// Code generated by "stringer -type=Errno -tags=windows -output=errno_string_windows.go"; DO NOT EDIT. + +package unix + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[EPERM-1] + _ = x[ENOENT-2] + _ = x[ESRCH-3] + _ = x[EINTR-4] + _ = x[E2BIG-7] + _ = x[EBADF-9] + _ = x[EAGAIN-11] + _ = x[EACCES-13] + _ = x[EFAULT-14] + _ = x[EEXIST-17] + _ = x[ENODEV-19] + _ = x[EINVAL-22] + _ = x[ENOSPC-28] + _ = x[EILSEQ-42] + _ = x[ENOTSUP-129] + _ = x[EOPNOTSUPP-130] + _ = x[ENOTSUPP-536870912] + _ = x[ESTALE-536870913] +} + +const _Errno_name = "EPERMENOENTESRCHEINTRE2BIGEBADFEAGAINEACCESEFAULTEEXISTENODEVEINVALENOSPCEILSEQENOTSUPEOPNOTSUPPENOTSUPPESTALE" + +var _Errno_map = map[Errno]string{ + 1: _Errno_name[0:5], + 2: _Errno_name[5:11], + 3: _Errno_name[11:16], + 4: _Errno_name[16:21], + 7: _Errno_name[21:26], + 9: _Errno_name[26:31], + 11: _Errno_name[31:37], + 13: _Errno_name[37:43], + 14: _Errno_name[43:49], + 17: _Errno_name[49:55], + 19: _Errno_name[55:61], + 22: _Errno_name[61:67], + 28: _Errno_name[67:73], + 42: _Errno_name[73:79], + 129: _Errno_name[79:86], + 130: _Errno_name[86:96], + 536870912: _Errno_name[96:104], + 536870913: _Errno_name[104:110], +} + +func (i Errno) String() string { + if str, ok := _Errno_map[i]; ok { + return str + } + return "Errno(" + strconv.FormatInt(int64(i), 10) + ")" +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/errno_windows.go b/vendor/github.com/cilium/ebpf/internal/unix/errno_windows.go new file mode 100644 index 0000000000..7500cd6d4e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/errno_windows.go @@ -0,0 +1,78 @@ +package unix + +// The code in this file is derived from syscall_unix.go in the Go source code, +// licensed under the MIT license. + +import ( + "errors" + "os" + "syscall" +) + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -type=Errno -tags=windows -output=errno_string_windows.go + +// Windows specific constants for Unix errnos. +// +// The values do not always match Linux, for example EILSEQ and EOPNOTSUPP. +// +// See https://learn.microsoft.com/en-us/cpp/c-runtime-library/errno-constants?view=msvc-170 +const ( + EPERM Errno = 1 + ENOENT Errno = 2 + ESRCH Errno = 3 + EINTR Errno = 4 + E2BIG Errno = 7 + EBADF Errno = 9 + EAGAIN Errno = 11 + EACCES Errno = 13 + EFAULT Errno = 14 + EEXIST Errno = 17 + ENODEV Errno = 19 + EINVAL Errno = 22 + ENFILE Errno = 23 + EMFILE Errno = 24 + ENOSPC Errno = 28 + ENOSYS Errno = 40 + ENOTEMPTY Errno = 41 + EILSEQ Errno = 42 + ENOTSUP Errno = 129 + EOPNOTSUPP Errno = 130 + ETIMEDOUT Errno = 138 + EWOULDBLOCK Errno = 140 +) + +// These constants do not exist on Windows and therefore have a non-zero +// dummy value. +const ( + ENOTSUPP Errno = Errno(syscall.APPLICATION_ERROR) + iota + ESTALE +) + +// Errno is a Windows compatibility shim for Unix errnos. +type Errno uintptr + +func (e Errno) Error() string { + return e.String() +} + +func (e Errno) Is(target error) bool { + switch target { + case os.ErrPermission: + return e == EACCES || e == EPERM + case os.ErrExist: + return e == EEXIST || e == ENOTEMPTY + case os.ErrNotExist: + return e == ENOENT + case errors.ErrUnsupported: + return e == ENOSYS || e == ENOTSUP || e == EOPNOTSUPP + } + return false +} + +func (e Errno) Temporary() bool { + return e == EINTR || e == EMFILE || e == ENFILE || e.Timeout() +} + +func (e Errno) Timeout() bool { + return e == EAGAIN || e == EWOULDBLOCK || e == ETIMEDOUT +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/error.go b/vendor/github.com/cilium/ebpf/internal/unix/error.go new file mode 100644 index 0000000000..48017c1009 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/error.go @@ -0,0 +1,23 @@ +package unix + +import ( + "fmt" + "runtime" + "strings" + + "github.com/cilium/ebpf/internal" +) + +// errNonLinux returns an error which wraps [internal.ErrNotSupportedOnOS] and +// includes the name of the calling function. +func errNonLinux() error { + name := "unknown" + pc, _, _, ok := runtime.Caller(1) + if ok { + name = runtime.FuncForPC(pc).Name() + if pos := strings.LastIndexByte(name, '.'); pos != -1 { + name = name[pos+1:] + } + } + return fmt.Errorf("unix: %s: %w", name, internal.ErrNotSupportedOnOS) +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/strings_other.go b/vendor/github.com/cilium/ebpf/internal/unix/strings_other.go new file mode 100644 index 0000000000..49e1fe2172 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/strings_other.go @@ -0,0 +1,11 @@ +//go:build !linux && !windows + +package unix + +func BytePtrFromString(s string) (*byte, error) { + return nil, errNonLinux() +} + +func ByteSliceToString(s []byte) string { + return "" +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/strings_windows.go b/vendor/github.com/cilium/ebpf/internal/unix/strings_windows.go new file mode 100644 index 0000000000..7e3d4da777 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/strings_windows.go @@ -0,0 +1,19 @@ +package unix + +import ( + "syscall" + + "golang.org/x/sys/windows" +) + +func BytePtrFromString(s string) (*byte, error) { + p, err := windows.BytePtrFromString(s) + if err == syscall.EINVAL { + err = EINVAL + } + return p, err +} + +func ByteSliceToString(s []byte) string { + return windows.ByteSliceToString(s) +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go new file mode 100644 index 0000000000..385adf08b8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go @@ -0,0 +1,201 @@ +//go:build linux + +package unix + +import ( + "syscall" + + linux "golang.org/x/sys/unix" +) + +const ( + BPF_F_NO_PREALLOC = linux.BPF_F_NO_PREALLOC + BPF_F_NUMA_NODE = linux.BPF_F_NUMA_NODE + BPF_F_RDONLY = linux.BPF_F_RDONLY + BPF_F_WRONLY = linux.BPF_F_WRONLY + BPF_F_RDONLY_PROG = linux.BPF_F_RDONLY_PROG + BPF_F_WRONLY_PROG = linux.BPF_F_WRONLY_PROG + BPF_F_SLEEPABLE = linux.BPF_F_SLEEPABLE + BPF_F_XDP_HAS_FRAGS = linux.BPF_F_XDP_HAS_FRAGS + BPF_F_MMAPABLE = linux.BPF_F_MMAPABLE + BPF_F_INNER_MAP = linux.BPF_F_INNER_MAP + BPF_F_KPROBE_MULTI_RETURN = linux.BPF_F_KPROBE_MULTI_RETURN + BPF_F_UPROBE_MULTI_RETURN = linux.BPF_F_UPROBE_MULTI_RETURN + BPF_F_LOCK = linux.BPF_F_LOCK + BPF_OBJ_NAME_LEN = linux.BPF_OBJ_NAME_LEN + BPF_TAG_SIZE = linux.BPF_TAG_SIZE + BPF_RINGBUF_BUSY_BIT = linux.BPF_RINGBUF_BUSY_BIT + BPF_RINGBUF_DISCARD_BIT = linux.BPF_RINGBUF_DISCARD_BIT + BPF_RINGBUF_HDR_SZ = linux.BPF_RINGBUF_HDR_SZ + SYS_BPF = linux.SYS_BPF + F_DUPFD_CLOEXEC = linux.F_DUPFD_CLOEXEC + EPOLL_CTL_ADD = linux.EPOLL_CTL_ADD + EPOLL_CLOEXEC = linux.EPOLL_CLOEXEC + O_CLOEXEC = linux.O_CLOEXEC + O_NONBLOCK = linux.O_NONBLOCK + PROT_NONE = linux.PROT_NONE + PROT_READ = linux.PROT_READ + PROT_WRITE = linux.PROT_WRITE + MAP_ANON = linux.MAP_ANON + MAP_SHARED = linux.MAP_SHARED + MAP_PRIVATE = linux.MAP_PRIVATE + PERF_ATTR_SIZE_VER1 = linux.PERF_ATTR_SIZE_VER1 + PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE + PERF_TYPE_TRACEPOINT = linux.PERF_TYPE_TRACEPOINT + PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT + PERF_EVENT_IOC_DISABLE = linux.PERF_EVENT_IOC_DISABLE + PERF_EVENT_IOC_ENABLE = linux.PERF_EVENT_IOC_ENABLE + PERF_EVENT_IOC_SET_BPF = linux.PERF_EVENT_IOC_SET_BPF + PerfBitWatermark = linux.PerfBitWatermark + PerfBitWriteBackward = linux.PerfBitWriteBackward + PERF_SAMPLE_RAW = linux.PERF_SAMPLE_RAW + PERF_FLAG_FD_CLOEXEC = linux.PERF_FLAG_FD_CLOEXEC + RLIM_INFINITY = linux.RLIM_INFINITY + RLIMIT_MEMLOCK = linux.RLIMIT_MEMLOCK + BPF_STATS_RUN_TIME = linux.BPF_STATS_RUN_TIME + PERF_RECORD_LOST = linux.PERF_RECORD_LOST + PERF_RECORD_SAMPLE = linux.PERF_RECORD_SAMPLE + AT_FDCWD = linux.AT_FDCWD + RENAME_NOREPLACE = linux.RENAME_NOREPLACE + SO_ATTACH_BPF = linux.SO_ATTACH_BPF + SO_DETACH_BPF = linux.SO_DETACH_BPF + SOL_SOCKET = linux.SOL_SOCKET + SIGPROF = linux.SIGPROF + SIGUSR1 = linux.SIGUSR1 + SIG_BLOCK = linux.SIG_BLOCK + SIG_UNBLOCK = linux.SIG_UNBLOCK + BPF_FS_MAGIC = linux.BPF_FS_MAGIC + TRACEFS_MAGIC = linux.TRACEFS_MAGIC + DEBUGFS_MAGIC = linux.DEBUGFS_MAGIC + BPF_RB_NO_WAKEUP = linux.BPF_RB_NO_WAKEUP + BPF_RB_FORCE_WAKEUP = linux.BPF_RB_FORCE_WAKEUP + AF_UNSPEC = linux.AF_UNSPEC + IFF_UP = linux.IFF_UP +) + +type Statfs_t = linux.Statfs_t +type Stat_t = linux.Stat_t +type Rlimit = linux.Rlimit +type Signal = linux.Signal +type Sigset_t = linux.Sigset_t +type PerfEventMmapPage = linux.PerfEventMmapPage +type EpollEvent = linux.EpollEvent +type PerfEventAttr = linux.PerfEventAttr +type Utsname = linux.Utsname +type CPUSet = linux.CPUSet + +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { + return linux.Syscall(trap, a1, a2, a3) +} + +func PthreadSigmask(how int, set, oldset *Sigset_t) error { + return linux.PthreadSigmask(how, set, oldset) +} + +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return linux.FcntlInt(fd, cmd, arg) +} + +func IoctlSetInt(fd int, req uint, value int) error { + return linux.IoctlSetInt(fd, req, value) +} + +func Statfs(path string, buf *Statfs_t) (err error) { + return linux.Statfs(path, buf) +} + +func Close(fd int) (err error) { + return linux.Close(fd) +} + +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return linux.EpollWait(epfd, events, msec) +} + +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return linux.EpollCtl(epfd, op, fd, event) +} + +func Eventfd(initval uint, flags int) (fd int, err error) { + return linux.Eventfd(initval, flags) +} + +func Write(fd int, p []byte) (n int, err error) { + return linux.Write(fd, p) +} + +func EpollCreate1(flag int) (fd int, err error) { + return linux.EpollCreate1(flag) +} + +func SetNonblock(fd int, nonblocking bool) (err error) { + return linux.SetNonblock(fd, nonblocking) +} + +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return linux.Mmap(fd, offset, length, prot, flags) +} + +func Munmap(b []byte) (err error) { + return linux.Munmap(b) +} + +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags) +} + +func Uname(buf *Utsname) (err error) { + return linux.Uname(buf) +} + +func Getpid() int { + return linux.Getpid() +} + +func Gettid() int { + return linux.Gettid() +} + +func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) { + return linux.Tgkill(tgid, tid, sig) +} + +func BytePtrFromString(s string) (*byte, error) { + return linux.BytePtrFromString(s) +} + +func ByteSliceToString(s []byte) string { + return linux.ByteSliceToString(s) +} + +func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error { + return linux.Renameat2(olddirfd, oldpath, newdirfd, newpath, flags) +} + +func Prlimit(pid, resource int, new, old *Rlimit) error { + return linux.Prlimit(pid, resource, new, old) +} + +func Open(path string, mode int, perm uint32) (int, error) { + return linux.Open(path, mode, perm) +} + +func Fstat(fd int, stat *Stat_t) error { + return linux.Fstat(fd, stat) +} + +func SetsockoptInt(fd, level, opt, value int) error { + return linux.SetsockoptInt(fd, level, opt, value) +} + +func SchedSetaffinity(pid int, set *CPUSet) error { + return linux.SchedSetaffinity(pid, set) +} + +func SchedGetaffinity(pid int, set *CPUSet) error { + return linux.SchedGetaffinity(pid, set) +} + +func Auxv() ([][2]uintptr, error) { + return linux.Auxv() +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_other.go b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go new file mode 100644 index 0000000000..323f7ff34e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go @@ -0,0 +1,284 @@ +//go:build !linux + +package unix + +import ( + "syscall" +) + +// Constants are distinct to avoid breaking switch statements. +const ( + BPF_F_NO_PREALLOC = iota + BPF_F_NUMA_NODE + BPF_F_RDONLY + BPF_F_WRONLY + BPF_F_RDONLY_PROG + BPF_F_WRONLY_PROG + BPF_F_SLEEPABLE + BPF_F_MMAPABLE + BPF_F_INNER_MAP + BPF_F_KPROBE_MULTI_RETURN + BPF_F_UPROBE_MULTI_RETURN + BPF_F_XDP_HAS_FRAGS + BPF_OBJ_NAME_LEN + BPF_TAG_SIZE + BPF_RINGBUF_BUSY_BIT + BPF_RINGBUF_DISCARD_BIT + BPF_RINGBUF_HDR_SZ + SYS_BPF + F_DUPFD_CLOEXEC + EPOLLIN + EPOLL_CTL_ADD + EPOLL_CLOEXEC + O_CLOEXEC + O_NONBLOCK + PROT_NONE + PROT_READ + PROT_WRITE + MAP_ANON + MAP_SHARED + MAP_PRIVATE + PERF_ATTR_SIZE_VER1 + PERF_TYPE_SOFTWARE + PERF_TYPE_TRACEPOINT + PERF_COUNT_SW_BPF_OUTPUT + PERF_EVENT_IOC_DISABLE + PERF_EVENT_IOC_ENABLE + PERF_EVENT_IOC_SET_BPF + PerfBitWatermark + PerfBitWriteBackward + PERF_SAMPLE_RAW + PERF_FLAG_FD_CLOEXEC + RLIM_INFINITY + RLIMIT_MEMLOCK + BPF_STATS_RUN_TIME + PERF_RECORD_LOST + PERF_RECORD_SAMPLE + AT_FDCWD + RENAME_NOREPLACE + SO_ATTACH_BPF + SO_DETACH_BPF + SOL_SOCKET + SIGPROF + SIGUSR1 + SIG_BLOCK + SIG_UNBLOCK + BPF_FS_MAGIC + TRACEFS_MAGIC + DEBUGFS_MAGIC + BPF_RB_NO_WAKEUP + BPF_RB_FORCE_WAKEUP + BPF_F_LOCK + AF_UNSPEC + IFF_UP +) + +type Statfs_t struct { + Type int64 + Bsize int64 + Blocks uint64 + Bfree uint64 + Bavail uint64 + Files uint64 + Ffree uint64 + Fsid [2]int32 + Namelen int64 + Frsize int64 + Flags int64 + Spare [4]int64 +} + +type Stat_t struct { + Dev uint64 + Ino uint64 + Nlink uint64 + Mode uint32 + Uid uint32 + Gid uint32 + _ int32 + Rdev uint64 + Size int64 + Blksize int64 + Blocks int64 +} + +type Rlimit struct { + Cur uint64 + Max uint64 +} + +type Signal int + +type Sigset_t struct { + Val [4]uint64 +} + +func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno) { + return 0, 0, ENOTSUP +} + +func PthreadSigmask(how int, set, oldset *Sigset_t) error { + return errNonLinux() +} + +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return -1, errNonLinux() +} + +func IoctlSetInt(fd int, req uint, value int) error { + return errNonLinux() +} + +func Statfs(path string, buf *Statfs_t) error { + return errNonLinux() +} + +func Close(fd int) (err error) { + return errNonLinux() +} + +type EpollEvent struct { + Events uint32 + Fd int32 + Pad int32 +} + +func EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) { + return 0, errNonLinux() +} + +func EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error) { + return errNonLinux() +} + +func Eventfd(initval uint, flags int) (fd int, err error) { + return 0, errNonLinux() +} + +func Write(fd int, p []byte) (n int, err error) { + return 0, errNonLinux() +} + +func EpollCreate1(flag int) (fd int, err error) { + return 0, errNonLinux() +} + +type PerfEventMmapPage struct { + Version uint32 + Compat_version uint32 + Lock uint32 + Index uint32 + Offset int64 + Time_enabled uint64 + Time_running uint64 + Capabilities uint64 + Pmc_width uint16 + Time_shift uint16 + Time_mult uint32 + Time_offset uint64 + Time_zero uint64 + Size uint32 + + Data_head uint64 + Data_tail uint64 + Data_offset uint64 + Data_size uint64 + Aux_head uint64 + Aux_tail uint64 + Aux_offset uint64 + Aux_size uint64 +} + +func SetNonblock(fd int, nonblocking bool) (err error) { + return errNonLinux() +} + +func Mmap(fd int, offset int64, length int, prot int, flags int) (data []byte, err error) { + return []byte{}, errNonLinux() +} + +func Munmap(b []byte) (err error) { + return errNonLinux() +} + +type PerfEventAttr struct { + Type uint32 + Size uint32 + Config uint64 + Sample uint64 + Sample_type uint64 + Read_format uint64 + Bits uint64 + Wakeup uint32 + Bp_type uint32 + Ext1 uint64 + Ext2 uint64 + Branch_sample_type uint64 + Sample_regs_user uint64 + Sample_stack_user uint32 + Clockid int32 + Sample_regs_intr uint64 + Aux_watermark uint32 + Sample_max_stack uint16 +} + +func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { + return 0, errNonLinux() +} + +type Utsname struct { + Release [65]byte + Version [65]byte +} + +func Uname(buf *Utsname) (err error) { + return errNonLinux() +} + +func Getpid() int { + return -1 +} + +func Gettid() int { + return -1 +} + +func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) { + return errNonLinux() +} + +func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error { + return errNonLinux() +} + +func Prlimit(pid, resource int, new, old *Rlimit) error { + return errNonLinux() +} + +func Open(path string, mode int, perm uint32) (int, error) { + return -1, errNonLinux() +} + +func Fstat(fd int, stat *Stat_t) error { + return errNonLinux() +} + +func SetsockoptInt(fd, level, opt, value int) error { + return errNonLinux() +} + +type CPUSet struct{} + +func (*CPUSet) Set(int) {} + +func SchedSetaffinity(pid int, set *CPUSet) error { + return errNonLinux() +} + +func SchedGetaffinity(pid int, set *CPUSet) error { + return errNonLinux() +} + +func Auxv() ([][2]uintptr, error) { + return nil, errNonLinux() +} diff --git a/vendor/github.com/cilium/ebpf/internal/version.go b/vendor/github.com/cilium/ebpf/internal/version.go new file mode 100644 index 0000000000..a230830b01 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/version.go @@ -0,0 +1,77 @@ +package internal + +import ( + "fmt" +) + +const ( + // Version constant used in ELF binaries indicating that the loader needs to + // substitute the eBPF program's version with the value of the kernel's + // KERNEL_VERSION compile-time macro. Used for compatibility with BCC, gobpf + // and RedSift. + MagicKernelVersion = 0xFFFFFFFE +) + +// A Version in the form Major.Minor.Patch. +type Version [3]uint16 + +// NewVersion creates a version from a string like "Major.Minor.Patch". +// +// Patch is optional. +func NewVersion(ver string) (Version, error) { + var major, minor, patch uint16 + n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch) + if n < 2 { + return Version{}, fmt.Errorf("invalid version: %s", ver) + } + return Version{major, minor, patch}, nil +} + +// NewVersionFromCode creates a version from a LINUX_VERSION_CODE. +func NewVersionFromCode(code uint32) Version { + return Version{ + uint16(uint8(code >> 16)), + uint16(uint8(code >> 8)), + uint16(uint8(code)), + } +} + +func (v Version) String() string { + if v[2] == 0 { + return fmt.Sprintf("v%d.%d", v[0], v[1]) + } + return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2]) +} + +// Less returns true if the version is less than another version. +func (v Version) Less(other Version) bool { + for i, a := range v { + if a == other[i] { + continue + } + return a < other[i] + } + return false +} + +// Unspecified returns true if the version is all zero. +func (v Version) Unspecified() bool { + return v[0] == 0 && v[1] == 0 && v[2] == 0 +} + +// Kernel implements the kernel's KERNEL_VERSION macro from linux/version.h. +// It represents the kernel version and patch level as a single value. +func (v Version) Kernel() uint32 { + + // Kernels 4.4 and 4.9 have their SUBLEVEL clamped to 255 to avoid + // overflowing into PATCHLEVEL. + // See kernel commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255"). + s := v[2] + if s > 255 { + s = 255 + } + + // Truncate members to uint8 to prevent them from spilling over into + // each other when overflowing 8 bits. + return uint32(uint8(v[0]))<<16 | uint32(uint8(v[1]))<<8 | uint32(uint8(s)) +} diff --git a/vendor/github.com/cilium/ebpf/link/anchor.go b/vendor/github.com/cilium/ebpf/link/anchor.go new file mode 100644 index 0000000000..1a3b5f7681 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/anchor.go @@ -0,0 +1,137 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +const anchorFlags = sys.BPF_F_REPLACE | + sys.BPF_F_BEFORE | + sys.BPF_F_AFTER | + sys.BPF_F_ID | + sys.BPF_F_LINK_MPROG + +// Anchor is a reference to a link or program. +// +// It is used to describe where an attachment or detachment should take place +// for link types which support multiple attachment. +type Anchor interface { + // anchor returns an fd or ID and a set of flags. + // + // By default fdOrID is taken to reference a program, but BPF_F_LINK_MPROG + // changes this to refer to a link instead. + // + // BPF_F_BEFORE, BPF_F_AFTER, BPF_F_REPLACE modify where a link or program + // is attached. The default behaviour if none of these flags is specified + // matches BPF_F_AFTER. + anchor() (fdOrID, flags uint32, _ error) +} + +type firstAnchor struct{} + +func (firstAnchor) anchor() (fdOrID, flags uint32, _ error) { + return 0, sys.BPF_F_BEFORE, nil +} + +// Head is the position before all other programs or links. +func Head() Anchor { + return firstAnchor{} +} + +type lastAnchor struct{} + +func (lastAnchor) anchor() (fdOrID, flags uint32, _ error) { + return 0, sys.BPF_F_AFTER, nil +} + +// Tail is the position after all other programs or links. +func Tail() Anchor { + return lastAnchor{} +} + +// Before is the position just in front of target. +func BeforeLink(target Link) Anchor { + return anchor{target, sys.BPF_F_BEFORE} +} + +// After is the position just after target. +func AfterLink(target Link) Anchor { + return anchor{target, sys.BPF_F_AFTER} +} + +// Before is the position just in front of target. +func BeforeLinkByID(target ID) Anchor { + return anchor{target, sys.BPF_F_BEFORE} +} + +// After is the position just after target. +func AfterLinkByID(target ID) Anchor { + return anchor{target, sys.BPF_F_AFTER} +} + +// Before is the position just in front of target. +func BeforeProgram(target *ebpf.Program) Anchor { + return anchor{target, sys.BPF_F_BEFORE} +} + +// After is the position just after target. +func AfterProgram(target *ebpf.Program) Anchor { + return anchor{target, sys.BPF_F_AFTER} +} + +// Replace the target itself. +func ReplaceProgram(target *ebpf.Program) Anchor { + return anchor{target, sys.BPF_F_REPLACE} +} + +// Before is the position just in front of target. +func BeforeProgramByID(target ebpf.ProgramID) Anchor { + return anchor{target, sys.BPF_F_BEFORE} +} + +// After is the position just after target. +func AfterProgramByID(target ebpf.ProgramID) Anchor { + return anchor{target, sys.BPF_F_AFTER} +} + +// Replace the target itself. +func ReplaceProgramByID(target ebpf.ProgramID) Anchor { + return anchor{target, sys.BPF_F_REPLACE} +} + +type anchor struct { + target any + position uint32 +} + +func (ap anchor) anchor() (fdOrID, flags uint32, _ error) { + var typeFlag uint32 + switch target := ap.target.(type) { + case *ebpf.Program: + fd := target.FD() + if fd < 0 { + return 0, 0, sys.ErrClosedFd + } + fdOrID = uint32(fd) + typeFlag = 0 + case ebpf.ProgramID: + fdOrID = uint32(target) + typeFlag = sys.BPF_F_ID + case interface{ FD() int }: + fd := target.FD() + if fd < 0 { + return 0, 0, sys.ErrClosedFd + } + fdOrID = uint32(fd) + typeFlag = sys.BPF_F_LINK_MPROG + case ID: + fdOrID = uint32(target) + typeFlag = sys.BPF_F_LINK_MPROG | sys.BPF_F_ID + default: + return 0, 0, fmt.Errorf("invalid target %T", ap.target) + } + + return fdOrID, ap.position | typeFlag, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/cgroup.go b/vendor/github.com/cilium/ebpf/link/cgroup.go new file mode 100644 index 0000000000..f17d34f03c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/cgroup.go @@ -0,0 +1,208 @@ +package link + +import ( + "errors" + "fmt" + "os" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +type cgroupAttachFlags uint32 + +const ( + // Allow programs attached to sub-cgroups to override the verdict of this + // program. + flagAllowOverride cgroupAttachFlags = 1 << iota + // Allow attaching multiple programs to the cgroup. Only works if the cgroup + // has zero or more programs attached using the Multi flag. Implies override. + flagAllowMulti + // Set automatically by progAttachCgroup.Update(). Used for updating a + // specific given program attached in multi-mode. + flagReplace +) + +type CgroupOptions struct { + // Path to a cgroupv2 folder. + Path string + // One of the AttachCgroup* constants + Attach ebpf.AttachType + // Program must be of type CGroup*, and the attach type must match Attach. + Program *ebpf.Program +} + +// AttachCgroup links a BPF program to a cgroup. +// +// If the running kernel doesn't support bpf_link, attempts to emulate its +// semantics using the legacy PROG_ATTACH mechanism. If bpf_link is not +// available, the returned [Link] will not support pinning to bpffs. +// +// If you need more control over attachment flags or the attachment mechanism +// used, look at [RawAttachProgram] and [AttachRawLink] instead. +func AttachCgroup(opts CgroupOptions) (cg Link, err error) { + cgroup, err := os.Open(opts.Path) + if err != nil { + return nil, fmt.Errorf("can't open cgroup: %s", err) + } + defer func() { + if _, ok := cg.(*progAttachCgroup); ok { + // Skip closing the cgroup handle if we return a valid progAttachCgroup, + // where the handle is retained to implement Update(). + return + } + cgroup.Close() + }() + + cg, err = newLinkCgroup(cgroup, opts.Attach, opts.Program) + if err == nil { + return cg, nil + } + + if errors.Is(err, ErrNotSupported) { + cg, err = newProgAttachCgroup(cgroup, opts.Attach, opts.Program, flagAllowMulti) + } + if errors.Is(err, ErrNotSupported) { + cg, err = newProgAttachCgroup(cgroup, opts.Attach, opts.Program, flagAllowOverride) + } + if err != nil { + return nil, err + } + + return cg, nil +} + +type progAttachCgroup struct { + cgroup *os.File + current *ebpf.Program + attachType ebpf.AttachType + flags cgroupAttachFlags +} + +var _ Link = (*progAttachCgroup)(nil) + +func (cg *progAttachCgroup) isLink() {} + +// newProgAttachCgroup attaches prog to cgroup using BPF_PROG_ATTACH. +// cgroup and prog are retained by [progAttachCgroup]. +func newProgAttachCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program, flags cgroupAttachFlags) (*progAttachCgroup, error) { + if flags&flagAllowMulti > 0 { + if err := haveProgAttachReplace(); err != nil { + return nil, fmt.Errorf("can't support multiple programs: %w", err) + } + } + + // Use a program handle that cannot be closed by the caller. + clone, err := prog.Clone() + if err != nil { + return nil, err + } + + err = RawAttachProgram(RawAttachProgramOptions{ + Target: int(cgroup.Fd()), + Program: clone, + Flags: uint32(flags), + Attach: attach, + }) + if err != nil { + clone.Close() + return nil, fmt.Errorf("cgroup: %w", err) + } + + return &progAttachCgroup{cgroup, clone, attach, flags}, nil +} + +func (cg *progAttachCgroup) Close() error { + defer cg.cgroup.Close() + defer cg.current.Close() + + err := RawDetachProgram(RawDetachProgramOptions{ + Target: int(cg.cgroup.Fd()), + Program: cg.current, + Attach: cg.attachType, + }) + if err != nil { + return fmt.Errorf("close cgroup: %s", err) + } + return nil +} + +func (cg *progAttachCgroup) Update(prog *ebpf.Program) error { + new, err := prog.Clone() + if err != nil { + return err + } + + args := RawAttachProgramOptions{ + Target: int(cg.cgroup.Fd()), + Program: prog, + Attach: cg.attachType, + Flags: uint32(cg.flags), + } + + if cg.flags&flagAllowMulti > 0 { + // Atomically replacing multiple programs requires at least + // 5.5 (commit 7dd68b3279f17921 "bpf: Support replacing cgroup-bpf + // program in MULTI mode") + args.Anchor = ReplaceProgram(cg.current) + } + + if err := RawAttachProgram(args); err != nil { + new.Close() + return fmt.Errorf("can't update cgroup: %s", err) + } + + cg.current.Close() + cg.current = new + return nil +} + +func (cg *progAttachCgroup) Pin(string) error { + return fmt.Errorf("can't pin cgroup: %w", ErrNotSupported) +} + +func (cg *progAttachCgroup) Unpin() error { + return fmt.Errorf("can't unpin cgroup: %w", ErrNotSupported) +} + +func (cg *progAttachCgroup) Info() (*Info, error) { + return nil, fmt.Errorf("can't get cgroup info: %w", ErrNotSupported) +} + +type linkCgroup struct { + RawLink +} + +var _ Link = (*linkCgroup)(nil) + +// newLinkCgroup attaches prog to cgroup using BPF_LINK_CREATE. +func newLinkCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program) (*linkCgroup, error) { + link, err := AttachRawLink(RawLinkOptions{ + Target: int(cgroup.Fd()), + Program: prog, + Attach: attach, + }) + if err != nil { + return nil, err + } + + return &linkCgroup{*link}, err +} + +func (cg *linkCgroup) Info() (*Info, error) { + var info sys.CgroupLinkInfo + if err := sys.ObjInfo(cg.fd, &info); err != nil { + return nil, fmt.Errorf("cgroup link info: %s", err) + } + extra := &CgroupInfo{ + CgroupId: info.CgroupId, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/doc.go b/vendor/github.com/cilium/ebpf/link/doc.go new file mode 100644 index 0000000000..2bde35ed7a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/doc.go @@ -0,0 +1,2 @@ +// Package link allows attaching eBPF programs to various kernel hooks. +package link diff --git a/vendor/github.com/cilium/ebpf/link/iter.go b/vendor/github.com/cilium/ebpf/link/iter.go new file mode 100644 index 0000000000..0a39faef88 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/iter.go @@ -0,0 +1,84 @@ +package link + +import ( + "fmt" + "io" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +type IterOptions struct { + // Program must be of type Tracing with attach type + // AttachTraceIter. The kind of iterator to attach to is + // determined at load time via the AttachTo field. + // + // AttachTo requires the kernel to include BTF of itself, + // and it to be compiled with a recent pahole (>= 1.16). + Program *ebpf.Program + + // Map specifies the target map for bpf_map_elem and sockmap iterators. + // It may be nil. + Map *ebpf.Map +} + +// AttachIter attaches a BPF seq_file iterator. +func AttachIter(opts IterOptions) (*Iter, error) { + progFd := opts.Program.FD() + if progFd < 0 { + return nil, fmt.Errorf("invalid program: %s", sys.ErrClosedFd) + } + + var info bpfIterLinkInfoMap + if opts.Map != nil { + mapFd := opts.Map.FD() + if mapFd < 0 { + return nil, fmt.Errorf("invalid map: %w", sys.ErrClosedFd) + } + info.map_fd = uint32(mapFd) + } + + attr := sys.LinkCreateIterAttr{ + ProgFd: uint32(progFd), + AttachType: sys.AttachType(ebpf.AttachTraceIter), + IterInfo: sys.NewPointer(unsafe.Pointer(&info)), + IterInfoLen: uint32(unsafe.Sizeof(info)), + } + + fd, err := sys.LinkCreateIter(&attr) + if err != nil { + if haveFeatErr := haveBPFLink(); haveFeatErr != nil { + return nil, haveFeatErr + } + return nil, fmt.Errorf("can't link iterator: %w", err) + } + + return &Iter{RawLink{fd, ""}}, err +} + +// Iter represents an attached bpf_iter. +type Iter struct { + RawLink +} + +// Open creates a new instance of the iterator. +// +// Reading from the returned reader triggers the BPF program. +func (it *Iter) Open() (io.ReadCloser, error) { + attr := &sys.IterCreateAttr{ + LinkFd: it.fd.Uint(), + } + + fd, err := sys.IterCreate(attr) + if err != nil { + return nil, fmt.Errorf("can't create iterator: %w", err) + } + + return fd.File("bpf_iter"), nil +} + +// union bpf_iter_link_info.map +type bpfIterLinkInfoMap struct { + map_fd uint32 +} diff --git a/vendor/github.com/cilium/ebpf/link/kprobe.go b/vendor/github.com/cilium/ebpf/link/kprobe.go new file mode 100644 index 0000000000..6f93a27a25 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/kprobe.go @@ -0,0 +1,369 @@ +package link + +import ( + "errors" + "fmt" + "os" + "runtime" + "strings" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/linux" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/tracefs" + "github.com/cilium/ebpf/internal/unix" +) + +// KprobeOptions defines additional parameters that will be used +// when loading Kprobes. +type KprobeOptions struct { + // Arbitrary value that can be fetched from an eBPF program + // via `bpf_get_attach_cookie()`. + // + // Needs kernel 5.15+. + Cookie uint64 + // Offset of the kprobe relative to the traced symbol. + // Can be used to insert kprobes at arbitrary offsets in kernel functions, + // e.g. in places where functions have been inlined. + Offset uint64 + // Increase the maximum number of concurrent invocations of a kretprobe. + // Required when tracing some long running functions in the kernel. + // + // Deprecated: this setting forces the use of an outdated kernel API and is not portable + // across kernel versions. + RetprobeMaxActive int + // Prefix used for the event name if the kprobe must be attached using tracefs. + // The group name will be formatted as `_`. + // The default empty string is equivalent to "ebpf" as the prefix. + TraceFSPrefix string +} + +func (ko *KprobeOptions) cookie() uint64 { + if ko == nil { + return 0 + } + return ko.Cookie +} + +// Kprobe attaches the given eBPF program to a perf event that fires when the +// given kernel symbol starts executing. See /proc/kallsyms for available +// symbols. For example, printk(): +// +// kp, err := Kprobe("printk", prog, nil) +// +// Losing the reference to the resulting Link (kp) will close the Kprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// If attaching to symbol fails, automatically retries with the running +// platform's syscall prefix (e.g. __x64_) to support attaching to syscalls +// in a portable fashion. +// +// On kernels 6.11 and later, setting a kprobe on a nonexistent symbol using +// tracefs incorrectly returns [unix.EINVAL] instead of [os.ErrNotExist]. +// +// The returned Link may implement [PerfEvent]. +func Kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { + k, err := kprobe(symbol, prog, opts, false) + if err != nil { + return nil, err + } + + lnk, err := attachPerfEvent(k, prog, opts.cookie()) + if err != nil { + k.Close() + return nil, err + } + + return lnk, nil +} + +// Kretprobe attaches the given eBPF program to a perf event that fires right +// before the given kernel symbol exits, with the function stack left intact. +// See /proc/kallsyms for available symbols. For example, printk(): +// +// kp, err := Kretprobe("printk", prog, nil) +// +// Losing the reference to the resulting Link (kp) will close the Kretprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// If attaching to symbol fails, automatically retries with the running +// platform's syscall prefix (e.g. __x64_) to support attaching to syscalls +// in a portable fashion. +// +// On kernels 5.10 and earlier, setting a kretprobe on a nonexistent symbol +// incorrectly returns [unix.EINVAL] instead of [os.ErrNotExist]. +// +// The returned Link may implement [PerfEvent]. +func Kretprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions) (Link, error) { + k, err := kprobe(symbol, prog, opts, true) + if err != nil { + return nil, err + } + + lnk, err := attachPerfEvent(k, prog, opts.cookie()) + if err != nil { + k.Close() + return nil, err + } + + return lnk, nil +} + +// isValidKprobeSymbol implements the equivalent of a regex match +// against "^[a-zA-Z_][0-9a-zA-Z_.]*$". +func isValidKprobeSymbol(s string) bool { + if len(s) < 1 { + return false + } + + for i, c := range []byte(s) { + switch { + case c >= 'a' && c <= 'z': + case c >= 'A' && c <= 'Z': + case c == '_': + case i > 0 && c >= '0' && c <= '9': + + // Allow `.` in symbol name. GCC-compiled kernel may change symbol name + // to have a `.isra.$n` suffix, like `udp_send_skb.isra.52`. + // See: https://gcc.gnu.org/gcc-10/changes.html + case i > 0 && c == '.': + + default: + return false + } + } + + return true +} + +// kprobe opens a perf event on the given symbol and attaches prog to it. +// If ret is true, create a kretprobe. +func kprobe(symbol string, prog *ebpf.Program, opts *KprobeOptions, ret bool) (*perfEvent, error) { + if symbol == "" { + return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput) + } + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if !isValidKprobeSymbol(symbol) { + return nil, fmt.Errorf("symbol '%s' must be a valid symbol in /proc/kallsyms: %w", symbol, errInvalidInput) + } + if prog.Type() != ebpf.Kprobe { + return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput) + } + + args := tracefs.ProbeArgs{ + Type: tracefs.Kprobe, + Pid: perfAllThreads, + Symbol: symbol, + Ret: ret, + } + + if opts != nil { + args.RetprobeMaxActive = opts.RetprobeMaxActive + args.Cookie = opts.Cookie + args.Offset = opts.Offset + args.Group = opts.TraceFSPrefix + } + + // Use kprobe PMU if the kernel has it available. + tp, err := pmuProbe(args) + if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { + if prefix := linux.PlatformPrefix(); prefix != "" { + args.Symbol = prefix + symbol + tp, err = pmuProbe(args) + } + } + if err == nil { + return tp, nil + } + if !errors.Is(err, ErrNotSupported) { + return nil, fmt.Errorf("creating perf_kprobe PMU (arch-specific fallback for %q): %w", symbol, err) + } + + // Use tracefs if kprobe PMU is missing. + args.Symbol = symbol + tp, err = tracefsProbe(args) + if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { + if prefix := linux.PlatformPrefix(); prefix != "" { + args.Symbol = prefix + symbol + tp, err = tracefsProbe(args) + } + } + if err != nil { + return nil, fmt.Errorf("creating tracefs event (arch-specific fallback for %q): %w", symbol, err) + } + + return tp, nil +} + +// pmuProbe opens a perf event based on a Performance Monitoring Unit. +// +// Requires at least a 4.17 kernel. +// e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU" +// 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU" +// +// Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU +func pmuProbe(args tracefs.ProbeArgs) (*perfEvent, error) { + // Getting the PMU type will fail if the kernel doesn't support + // the perf_[k,u]probe PMU. + eventType, err := internal.ReadUint64FromFileOnce("%d\n", "/sys/bus/event_source/devices", args.Type.String(), "type") + if errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("%s: %w", args.Type, ErrNotSupported) + } + if err != nil { + return nil, err + } + + // Use tracefs if we want to set kretprobe's retprobeMaxActive. + if args.RetprobeMaxActive != 0 { + return nil, fmt.Errorf("pmu probe: non-zero retprobeMaxActive: %w", ErrNotSupported) + } + + var config uint64 + if args.Ret { + bit, err := internal.ReadUint64FromFileOnce("config:%d\n", "/sys/bus/event_source/devices", args.Type.String(), "/format/retprobe") + if err != nil { + return nil, err + } + config |= 1 << bit + } + + var ( + attr unix.PerfEventAttr + sp unsafe.Pointer + token string + ) + switch args.Type { + case tracefs.Kprobe: + // Create a pointer to a NUL-terminated string for the kernel. + sp, err = unsafeStringPtr(args.Symbol) + if err != nil { + return nil, err + } + + token = tracefs.KprobeToken(args) + + attr = unix.PerfEventAttr{ + // The minimum size required for PMU kprobes is PERF_ATTR_SIZE_VER1, + // since it added the config2 (Ext2) field. Use Ext2 as probe_offset. + Size: unix.PERF_ATTR_SIZE_VER1, + Type: uint32(eventType), // PMU event type read from sysfs + Ext1: uint64(uintptr(sp)), // Kernel symbol to trace + Ext2: args.Offset, // Kernel symbol offset + Config: config, // Retprobe flag + } + case tracefs.Uprobe: + sp, err = unsafeStringPtr(args.Path) + if err != nil { + return nil, err + } + + if args.RefCtrOffset != 0 { + config |= args.RefCtrOffset << uprobeRefCtrOffsetShift + } + + token = tracefs.UprobeToken(args) + + attr = unix.PerfEventAttr{ + // The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1, + // since it added the config2 (Ext2) field. The Size field controls the + // size of the internal buffer the kernel allocates for reading the + // perf_event_attr argument from userspace. + Size: unix.PERF_ATTR_SIZE_VER1, + Type: uint32(eventType), // PMU event type read from sysfs + Ext1: uint64(uintptr(sp)), // Uprobe path + Ext2: args.Offset, // Uprobe offset + Config: config, // RefCtrOffset, Retprobe flag + } + } + + cpu := 0 + if args.Pid != perfAllThreads { + cpu = -1 + } + rawFd, err := unix.PerfEventOpen(&attr, args.Pid, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC) + + // On some old kernels, kprobe PMU doesn't allow `.` in symbol names and + // return -EINVAL. Return ErrNotSupported to allow falling back to tracefs. + // https://github.com/torvalds/linux/blob/94710cac0ef4/kernel/trace/trace_kprobe.c#L340-L343 + if errors.Is(err, unix.EINVAL) && strings.Contains(args.Symbol, ".") { + return nil, fmt.Errorf("token %s: older kernels don't accept dots: %w", token, ErrNotSupported) + } + // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL + // when trying to create a retprobe for a missing symbol. + if errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("token %s: not found: %w", token, err) + } + // Since commit ab105a4fb894, EILSEQ is returned when a kprobe sym+offset is resolved + // to an invalid insn boundary. The exact conditions that trigger this error are + // arch specific however. + if errors.Is(err, unix.EILSEQ) { + return nil, fmt.Errorf("token %s: bad insn boundary: %w", token, os.ErrNotExist) + } + // Since at least commit cb9a19fe4aa51, ENOTSUPP is returned + // when attempting to set a uprobe on a trap instruction. + if errors.Is(err, sys.ENOTSUPP) { + return nil, fmt.Errorf("token %s: failed setting uprobe on offset %#x (possible trap insn): %w", token, args.Offset, err) + } + + if err != nil { + return nil, fmt.Errorf("token %s: opening perf event: %w", token, err) + } + + // Ensure the string pointer is not collected before PerfEventOpen returns. + runtime.KeepAlive(sp) + + fd, err := sys.NewFD(rawFd) + if err != nil { + return nil, err + } + + // Kernel has perf_[k,u]probe PMU available, initialize perf event. + return newPerfEvent(fd, nil), nil +} + +// tracefsProbe creates a trace event by writing an entry to /[k,u]probe_events. +// A new trace event group name is generated on every call to support creating +// multiple trace events for the same kernel or userspace symbol. +// Path and offset are only set in the case of uprobe(s) and are used to set +// the executable/library path on the filesystem and the offset where the probe is inserted. +// A perf event is then opened on the newly-created trace event and returned to the caller. +func tracefsProbe(args tracefs.ProbeArgs) (*perfEvent, error) { + groupPrefix := "ebpf" + if args.Group != "" { + groupPrefix = args.Group + } + + // Generate a random string for each trace event we attempt to create. + // This value is used as the 'group' token in tracefs to allow creating + // multiple kprobe trace events with the same name. + group, err := tracefs.RandomGroup(groupPrefix) + if err != nil { + return nil, fmt.Errorf("randomizing group name: %w", err) + } + args.Group = group + + // Create the [k,u]probe trace event using tracefs. + evt, err := tracefs.NewEvent(args) + if err != nil { + return nil, fmt.Errorf("creating probe entry on tracefs: %w", err) + } + + // Kprobes are ephemeral tracepoints and share the same perf event type. + fd, err := openTracepointPerfEvent(evt.ID(), args.Pid) + if err != nil { + // Make sure we clean up the created tracefs event when we return error. + // If a livepatch handler is already active on the symbol, the write to + // tracefs will succeed, a trace event will show up, but creating the + // perf event will fail with EBUSY. + _ = evt.Close() + return nil, err + } + + return newPerfEvent(fd, evt), nil +} diff --git a/vendor/github.com/cilium/ebpf/link/kprobe_multi.go b/vendor/github.com/cilium/ebpf/link/kprobe_multi.go new file mode 100644 index 0000000000..f19f9f4c73 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/kprobe_multi.go @@ -0,0 +1,255 @@ +package link + +import ( + "errors" + "fmt" + "os" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +// KprobeMultiOptions defines additional parameters that will be used +// when opening a KprobeMulti Link. +type KprobeMultiOptions struct { + // Symbols takes a list of kernel symbol names to attach an ebpf program to. + // + // Mutually exclusive with Addresses. + Symbols []string + + // Addresses takes a list of kernel symbol addresses in case they can not + // be referred to by name. + // + // Note that only start addresses can be specified, since the fprobe API + // limits the attach point to the function entry or return. + // + // Mutually exclusive with Symbols. + Addresses []uintptr + + // Cookies specifies arbitrary values that can be fetched from an eBPF + // program via `bpf_get_attach_cookie()`. + // + // If set, its length should be equal to the length of Symbols or Addresses. + // Each Cookie is assigned to the Symbol or Address specified at the + // corresponding slice index. + Cookies []uint64 + + // Session must be true when attaching Programs with the + // [ebpf.AttachTraceKprobeSession] attach type. + // + // This makes a Kprobe execute on both function entry and return. The entry + // program can share a cookie value with the return program and can decide + // whether the return program gets executed. + Session bool +} + +// KprobeMulti attaches the given eBPF program to the entry point of a given set +// of kernel symbols. +// +// The difference with Kprobe() is that multi-kprobe accomplishes this in a +// single system call, making it significantly faster than attaching many +// probes one at a time. +// +// Requires at least Linux 5.18. +func KprobeMulti(prog *ebpf.Program, opts KprobeMultiOptions) (Link, error) { + return kprobeMulti(prog, opts, 0) +} + +// KretprobeMulti attaches the given eBPF program to the return point of a given +// set of kernel symbols. +// +// The difference with Kretprobe() is that multi-kprobe accomplishes this in a +// single system call, making it significantly faster than attaching many +// probes one at a time. +// +// Requires at least Linux 5.18. +func KretprobeMulti(prog *ebpf.Program, opts KprobeMultiOptions) (Link, error) { + return kprobeMulti(prog, opts, sys.BPF_F_KPROBE_MULTI_RETURN) +} + +func kprobeMulti(prog *ebpf.Program, opts KprobeMultiOptions, flags uint32) (Link, error) { + if prog == nil { + return nil, errors.New("cannot attach a nil program") + } + + syms := uint32(len(opts.Symbols)) + addrs := uint32(len(opts.Addresses)) + cookies := uint32(len(opts.Cookies)) + + if syms == 0 && addrs == 0 { + return nil, fmt.Errorf("one of Symbols or Addresses is required: %w", errInvalidInput) + } + if syms != 0 && addrs != 0 { + return nil, fmt.Errorf("Symbols and Addresses are mutually exclusive: %w", errInvalidInput) + } + if cookies > 0 && cookies != syms && cookies != addrs { + return nil, fmt.Errorf("Cookies must be exactly Symbols or Addresses in length: %w", errInvalidInput) + } + + attachType := sys.BPF_TRACE_KPROBE_MULTI + if opts.Session { + attachType = sys.BPF_TRACE_KPROBE_SESSION + } + + attr := &sys.LinkCreateKprobeMultiAttr{ + ProgFd: uint32(prog.FD()), + AttachType: attachType, + KprobeMultiFlags: flags, + } + + switch { + case syms != 0: + attr.Count = syms + attr.Syms = sys.NewStringSlicePointer(opts.Symbols) + + case addrs != 0: + attr.Count = addrs + attr.Addrs = sys.NewPointer(unsafe.Pointer(&opts.Addresses[0])) + } + + if cookies != 0 { + attr.Cookies = sys.NewPointer(unsafe.Pointer(&opts.Cookies[0])) + } + + fd, err := sys.LinkCreateKprobeMulti(attr) + if err == nil { + return &kprobeMultiLink{RawLink{fd, ""}}, nil + } + + if errors.Is(err, unix.ESRCH) { + return nil, fmt.Errorf("couldn't find one or more symbols: %w", os.ErrNotExist) + } + + if opts.Session { + if haveFeatErr := haveBPFLinkKprobeSession(); haveFeatErr != nil { + return nil, haveFeatErr + } + } else { + if haveFeatErr := haveBPFLinkKprobeMulti(); haveFeatErr != nil { + return nil, haveFeatErr + } + } + + // Check EINVAL after running feature probes, since it's also returned when + // the kernel doesn't support the multi/session attach types. + if errors.Is(err, unix.EINVAL) { + return nil, fmt.Errorf("%w (missing kernel symbol or prog's AttachType not %s?)", err, ebpf.AttachType(attachType)) + } + + return nil, err +} + +type kprobeMultiLink struct { + RawLink +} + +var _ Link = (*kprobeMultiLink)(nil) + +func (kml *kprobeMultiLink) Update(_ *ebpf.Program) error { + return fmt.Errorf("update kprobe_multi: %w", ErrNotSupported) +} + +func (kml *kprobeMultiLink) Info() (*Info, error) { + var info sys.KprobeMultiLinkInfo + if err := sys.ObjInfo(kml.fd, &info); err != nil { + return nil, fmt.Errorf("kprobe multi link info: %s", err) + } + extra := &KprobeMultiInfo{ + count: info.Count, + flags: info.Flags, + missed: info.Missed, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + +var haveBPFLinkKprobeMulti = internal.NewFeatureTest("bpf_link_kprobe_multi", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Name: "probe_kpm_link", + Type: ebpf.Kprobe, + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + AttachType: ebpf.AttachTraceKprobeMulti, + License: "MIT", + }) + if errors.Is(err, unix.E2BIG) { + // Kernel doesn't support AttachType field. + return internal.ErrNotSupported + } + if err != nil { + return err + } + defer prog.Close() + + fd, err := sys.LinkCreateKprobeMulti(&sys.LinkCreateKprobeMultiAttr{ + ProgFd: uint32(prog.FD()), + AttachType: sys.BPF_TRACE_KPROBE_MULTI, + Count: 1, + Syms: sys.NewStringSlicePointer([]string{"vprintk"}), + }) + switch { + case errors.Is(err, unix.EINVAL): + return internal.ErrNotSupported + // If CONFIG_FPROBE isn't set. + case errors.Is(err, unix.EOPNOTSUPP): + return internal.ErrNotSupported + case err != nil: + return err + } + + fd.Close() + + return nil +}, "5.18") + +var haveBPFLinkKprobeSession = internal.NewFeatureTest("bpf_link_kprobe_session", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Name: "probe_kps_link", + Type: ebpf.Kprobe, + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + AttachType: ebpf.AttachTraceKprobeSession, + License: "MIT", + }) + if errors.Is(err, unix.E2BIG) { + // Kernel doesn't support AttachType field. + return internal.ErrNotSupported + } + if err != nil { + return err + } + defer prog.Close() + + fd, err := sys.LinkCreateKprobeMulti(&sys.LinkCreateKprobeMultiAttr{ + ProgFd: uint32(prog.FD()), + AttachType: sys.BPF_TRACE_KPROBE_SESSION, + Count: 1, + Syms: sys.NewStringSlicePointer([]string{"vprintk"}), + }) + switch { + case errors.Is(err, unix.EINVAL): + return internal.ErrNotSupported + // If CONFIG_FPROBE isn't set. + case errors.Is(err, unix.EOPNOTSUPP): + return internal.ErrNotSupported + case err != nil: + return err + } + + fd.Close() + + return nil +}, "6.10") diff --git a/vendor/github.com/cilium/ebpf/link/link.go b/vendor/github.com/cilium/ebpf/link/link.go new file mode 100644 index 0000000000..796769f8ea --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/link.go @@ -0,0 +1,537 @@ +package link + +import ( + "errors" + "fmt" + "os" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" +) + +var ErrNotSupported = internal.ErrNotSupported + +// Link represents a Program attached to a BPF hook. +type Link interface { + // Replace the current program with a new program. + // + // Passing a nil program is an error. May return an error wrapping ErrNotSupported. + Update(*ebpf.Program) error + + // Persist a link by pinning it into a bpffs. + // + // May return an error wrapping ErrNotSupported. + Pin(string) error + + // Undo a previous call to Pin. + // + // May return an error wrapping ErrNotSupported. + Unpin() error + + // Close frees resources. + // + // The link will be broken unless it has been successfully pinned. + // A link may continue past the lifetime of the process if Close is + // not called. + Close() error + + // Info returns metadata on a link. + // + // May return an error wrapping ErrNotSupported. + Info() (*Info, error) + + // Prevent external users from implementing this interface. + isLink() +} + +// NewLinkFromFD creates a link from a raw fd. +// +// Deprecated: use [NewFromFD] instead. +func NewLinkFromFD(fd int) (Link, error) { + return NewFromFD(fd) +} + +// NewFromFD creates a link from a raw fd. +// +// You should not use fd after calling this function. +func NewFromFD(fd int) (Link, error) { + sysFD, err := sys.NewFD(fd) + if err != nil { + return nil, err + } + + return wrapRawLink(&RawLink{fd: sysFD}) +} + +// NewFromID returns the link associated with the given id. +// +// Returns ErrNotExist if there is no link with the given id. +func NewFromID(id ID) (Link, error) { + getFdAttr := &sys.LinkGetFdByIdAttr{Id: id} + fd, err := sys.LinkGetFdById(getFdAttr) + if err != nil { + return nil, fmt.Errorf("get link fd from ID %d: %w", id, err) + } + + return wrapRawLink(&RawLink{fd, ""}) +} + +// LoadPinnedLink loads a Link from a pin (file) on the BPF virtual filesystem. +// +// Requires at least Linux 5.7. +func LoadPinnedLink(fileName string, opts *ebpf.LoadPinOptions) (Link, error) { + raw, err := loadPinnedRawLink(fileName, opts) + if err != nil { + return nil, err + } + + return wrapRawLink(raw) +} + +// wrap a RawLink in a more specific type if possible. +// +// The function takes ownership of raw and closes it on error. +func wrapRawLink(raw *RawLink) (_ Link, err error) { + defer func() { + if err != nil { + raw.Close() + } + }() + + info, err := raw.Info() + if err != nil { + return nil, err + } + + switch info.Type { + case RawTracepointType: + return &rawTracepoint{*raw}, nil + case TracingType: + return &tracing{*raw}, nil + case CgroupType: + return &linkCgroup{*raw}, nil + case IterType: + return &Iter{*raw}, nil + case NetNsType: + return &NetNsLink{*raw}, nil + case KprobeMultiType: + return &kprobeMultiLink{*raw}, nil + case UprobeMultiType: + return &uprobeMultiLink{*raw}, nil + case PerfEventType: + return &perfEventLink{*raw, nil}, nil + case TCXType: + return &tcxLink{*raw}, nil + case NetfilterType: + return &netfilterLink{*raw}, nil + case NetkitType: + return &netkitLink{*raw}, nil + case XDPType: + return &xdpLink{*raw}, nil + default: + return raw, nil + } +} + +// ID uniquely identifies a BPF link. +type ID = sys.LinkID + +// RawLinkOptions control the creation of a raw link. +type RawLinkOptions struct { + // File descriptor to attach to. This differs for each attach type. + Target int + // Program to attach. + Program *ebpf.Program + // Attach must match the attach type of Program. + Attach ebpf.AttachType + // BTF is the BTF of the attachment target. + BTF btf.TypeID + // Flags control the attach behaviour. + Flags uint32 +} + +// Info contains metadata on a link. +type Info struct { + Type Type + ID ID + Program ebpf.ProgramID + extra interface{} +} + +type TracingInfo struct { + AttachType sys.AttachType + TargetObjId uint32 + TargetBtfId sys.TypeID +} + +type CgroupInfo struct { + CgroupId uint64 + AttachType sys.AttachType + _ [4]byte +} + +type NetNsInfo struct { + NetnsIno uint32 + AttachType sys.AttachType +} + +type TCXInfo struct { + Ifindex uint32 + AttachType sys.AttachType +} + +type XDPInfo struct { + Ifindex uint32 +} + +type NetfilterInfo struct { + Pf uint32 + Hooknum uint32 + Priority int32 + Flags uint32 +} + +type NetkitInfo struct { + Ifindex uint32 + AttachType sys.AttachType +} + +type KprobeMultiInfo struct { + count uint32 + flags uint32 + missed uint64 +} + +// AddressCount is the number of addresses hooked by the kprobe. +func (kpm *KprobeMultiInfo) AddressCount() (uint32, bool) { + return kpm.count, kpm.count > 0 +} + +func (kpm *KprobeMultiInfo) Flags() (uint32, bool) { + return kpm.flags, kpm.count > 0 +} + +func (kpm *KprobeMultiInfo) Missed() (uint64, bool) { + return kpm.missed, kpm.count > 0 +} + +type PerfEventInfo struct { + Type sys.PerfEventType + extra interface{} +} + +func (r *PerfEventInfo) Kprobe() *KprobeInfo { + e, _ := r.extra.(*KprobeInfo) + return e +} + +type KprobeInfo struct { + address uint64 + missed uint64 +} + +func (kp *KprobeInfo) Address() (uint64, bool) { + return kp.address, kp.address > 0 +} + +func (kp *KprobeInfo) Missed() (uint64, bool) { + return kp.missed, kp.address > 0 +} + +// Tracing returns tracing type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) Tracing() *TracingInfo { + e, _ := r.extra.(*TracingInfo) + return e +} + +// Cgroup returns cgroup type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) Cgroup() *CgroupInfo { + e, _ := r.extra.(*CgroupInfo) + return e +} + +// NetNs returns netns type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) NetNs() *NetNsInfo { + e, _ := r.extra.(*NetNsInfo) + return e +} + +// XDP returns XDP type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) XDP() *XDPInfo { + e, _ := r.extra.(*XDPInfo) + return e +} + +// TCX returns TCX type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) TCX() *TCXInfo { + e, _ := r.extra.(*TCXInfo) + return e +} + +// Netfilter returns netfilter type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) Netfilter() *NetfilterInfo { + e, _ := r.extra.(*NetfilterInfo) + return e +} + +// Netkit returns netkit type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) Netkit() *NetkitInfo { + e, _ := r.extra.(*NetkitInfo) + return e +} + +// KprobeMulti returns kprobe-multi type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) KprobeMulti() *KprobeMultiInfo { + e, _ := r.extra.(*KprobeMultiInfo) + return e +} + +// PerfEvent returns perf-event type-specific link info. +// +// Returns nil if the type-specific link info isn't available. +func (r Info) PerfEvent() *PerfEventInfo { + e, _ := r.extra.(*PerfEventInfo) + return e +} + +// RawLink is the low-level API to bpf_link. +// +// You should consider using the higher level interfaces in this +// package instead. +type RawLink struct { + fd *sys.FD + pinnedPath string +} + +// AttachRawLink creates a raw link. +func AttachRawLink(opts RawLinkOptions) (*RawLink, error) { + if err := haveBPFLink(); err != nil { + return nil, err + } + + if opts.Target < 0 { + return nil, fmt.Errorf("invalid target: %s", sys.ErrClosedFd) + } + + progFd := opts.Program.FD() + if progFd < 0 { + return nil, fmt.Errorf("invalid program: %s", sys.ErrClosedFd) + } + + attr := sys.LinkCreateAttr{ + TargetFd: uint32(opts.Target), + ProgFd: uint32(progFd), + AttachType: sys.AttachType(opts.Attach), + TargetBtfId: opts.BTF, + Flags: opts.Flags, + } + fd, err := sys.LinkCreate(&attr) + if err != nil { + return nil, fmt.Errorf("create link: %w", err) + } + + return &RawLink{fd, ""}, nil +} + +func loadPinnedRawLink(fileName string, opts *ebpf.LoadPinOptions) (*RawLink, error) { + fd, typ, err := sys.ObjGetTyped(&sys.ObjGetAttr{ + Pathname: sys.NewStringPointer(fileName), + FileFlags: opts.Marshal(), + }) + if err != nil { + return nil, fmt.Errorf("load pinned link: %w", err) + } + + if typ != sys.BPF_TYPE_LINK { + _ = fd.Close() + return nil, fmt.Errorf("%s is not a Link", fileName) + } + + return &RawLink{fd, fileName}, nil +} + +func (l *RawLink) isLink() {} + +// FD returns the raw file descriptor. +func (l *RawLink) FD() int { + return l.fd.Int() +} + +// Close breaks the link. +// +// Use Pin if you want to make the link persistent. +func (l *RawLink) Close() error { + return l.fd.Close() +} + +// Pin persists a link past the lifetime of the process. +// +// Calling Close on a pinned Link will not break the link +// until the pin is removed. +func (l *RawLink) Pin(fileName string) error { + if err := sys.Pin(l.pinnedPath, fileName, l.fd); err != nil { + return err + } + l.pinnedPath = fileName + return nil +} + +// Unpin implements the Link interface. +func (l *RawLink) Unpin() error { + if err := sys.Unpin(l.pinnedPath); err != nil { + return err + } + l.pinnedPath = "" + return nil +} + +// IsPinned returns true if the Link has a non-empty pinned path. +func (l *RawLink) IsPinned() bool { + return l.pinnedPath != "" +} + +// Update implements the Link interface. +func (l *RawLink) Update(new *ebpf.Program) error { + return l.UpdateArgs(RawLinkUpdateOptions{ + New: new, + }) +} + +// RawLinkUpdateOptions control the behaviour of RawLink.UpdateArgs. +type RawLinkUpdateOptions struct { + New *ebpf.Program + Old *ebpf.Program + Flags uint32 +} + +// UpdateArgs updates a link based on args. +func (l *RawLink) UpdateArgs(opts RawLinkUpdateOptions) error { + newFd := opts.New.FD() + if newFd < 0 { + return fmt.Errorf("invalid program: %s", sys.ErrClosedFd) + } + + var oldFd int + if opts.Old != nil { + oldFd = opts.Old.FD() + if oldFd < 0 { + return fmt.Errorf("invalid replacement program: %s", sys.ErrClosedFd) + } + } + + attr := sys.LinkUpdateAttr{ + LinkFd: l.fd.Uint(), + NewProgFd: uint32(newFd), + OldProgFd: uint32(oldFd), + Flags: opts.Flags, + } + return sys.LinkUpdate(&attr) +} + +// Info returns metadata about the link. +// +// Linktype specific metadata is not included and can be retrieved +// via the linktype specific Info() method. +func (l *RawLink) Info() (*Info, error) { + var info sys.LinkInfo + + if err := sys.ObjInfo(l.fd, &info); err != nil { + return nil, fmt.Errorf("link info: %s", err) + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + nil, + }, nil +} + +// Iterator allows iterating over links attached into the kernel. +type Iterator struct { + // The ID of the current link. Only valid after a call to Next + ID ID + // The current link. Only valid until a call to Next. + // See Take if you want to retain the link. + Link Link + err error +} + +// Next retrieves the next link. +// +// Returns true if another link was found. Call [Iterator.Err] after the function returns false. +func (it *Iterator) Next() bool { + id := it.ID + for { + getIdAttr := &sys.LinkGetNextIdAttr{Id: id} + err := sys.LinkGetNextId(getIdAttr) + if errors.Is(err, os.ErrNotExist) { + // There are no more links. + break + } else if err != nil { + it.err = fmt.Errorf("get next link ID: %w", err) + break + } + + id = getIdAttr.NextId + l, err := NewFromID(id) + if errors.Is(err, os.ErrNotExist) { + // Couldn't load the link fast enough. Try next ID. + continue + } else if err != nil { + it.err = fmt.Errorf("get link for ID %d: %w", id, err) + break + } + + if it.Link != nil { + it.Link.Close() + } + it.ID, it.Link = id, l + return true + } + + // No more links or we encountered an error. + if it.Link != nil { + it.Link.Close() + } + it.Link = nil + return false +} + +// Take the ownership of the current link. +// +// It's the callers responsibility to close the link. +func (it *Iterator) Take() Link { + l := it.Link + it.Link = nil + return l +} + +// Err returns an error if iteration failed for some reason. +func (it *Iterator) Err() error { + return it.err +} + +func (it *Iterator) Close() { + if it.Link != nil { + it.Link.Close() + } +} diff --git a/vendor/github.com/cilium/ebpf/link/netfilter.go b/vendor/github.com/cilium/ebpf/link/netfilter.go new file mode 100644 index 0000000000..9436d11df9 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/netfilter.go @@ -0,0 +1,90 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +const NetfilterIPDefrag NetfilterAttachFlags = 0 // Enable IP packet defragmentation + +type NetfilterAttachFlags uint32 + +type NetfilterOptions struct { + // Program must be a netfilter BPF program. + Program *ebpf.Program + // The protocol family. + ProtocolFamily uint32 + // The number of the hook you are interested in. + HookNumber uint32 + // Priority within hook + Priority int32 + // Extra link flags + Flags uint32 + // Netfilter flags + NetfilterFlags NetfilterAttachFlags +} + +type netfilterLink struct { + RawLink +} + +// AttachNetfilter links a netfilter BPF program to a netfilter hook. +func AttachNetfilter(opts NetfilterOptions) (Link, error) { + if opts.Program == nil { + return nil, fmt.Errorf("netfilter program is nil") + } + + if t := opts.Program.Type(); t != ebpf.Netfilter { + return nil, fmt.Errorf("invalid program type %s, expected netfilter", t) + } + + progFd := opts.Program.FD() + if progFd < 0 { + return nil, fmt.Errorf("invalid program: %s", sys.ErrClosedFd) + } + + attr := sys.LinkCreateNetfilterAttr{ + ProgFd: uint32(opts.Program.FD()), + AttachType: sys.BPF_NETFILTER, + Flags: opts.Flags, + Pf: uint32(opts.ProtocolFamily), + Hooknum: uint32(opts.HookNumber), + Priority: opts.Priority, + NetfilterFlags: uint32(opts.NetfilterFlags), + } + + fd, err := sys.LinkCreateNetfilter(&attr) + if err != nil { + return nil, fmt.Errorf("attach netfilter link: %w", err) + } + + return &netfilterLink{RawLink{fd, ""}}, nil +} + +func (*netfilterLink) Update(_ *ebpf.Program) error { + return fmt.Errorf("netfilter update: %w", ErrNotSupported) +} + +func (nf *netfilterLink) Info() (*Info, error) { + var info sys.NetfilterLinkInfo + if err := sys.ObjInfo(nf.fd, &info); err != nil { + return nil, fmt.Errorf("netfilter link info: %s", err) + } + extra := &NetfilterInfo{ + Pf: info.Pf, + Hooknum: info.Hooknum, + Priority: info.Priority, + Flags: info.Flags, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + +var _ Link = (*netfilterLink)(nil) diff --git a/vendor/github.com/cilium/ebpf/link/netkit.go b/vendor/github.com/cilium/ebpf/link/netkit.go new file mode 100644 index 0000000000..5eee3b023a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/netkit.go @@ -0,0 +1,89 @@ +package link + +import ( + "fmt" + "runtime" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +type NetkitOptions struct { + // Index of the interface to attach to. + Interface int + // Program to attach. + Program *ebpf.Program + // One of the AttachNetkit* constants. + Attach ebpf.AttachType + // Attach relative to an anchor. Optional. + Anchor Anchor + // Only attach if the expected revision matches. + ExpectedRevision uint64 + // Flags control the attach behaviour. Specify an Anchor instead of + // F_LINK, F_ID, F_BEFORE, F_AFTER and R_REPLACE. Optional. + Flags uint32 +} + +func AttachNetkit(opts NetkitOptions) (Link, error) { + if opts.Interface < 0 { + return nil, fmt.Errorf("interface %d is out of bounds", opts.Interface) + } + + if opts.Flags&anchorFlags != 0 { + return nil, fmt.Errorf("disallowed flags: use Anchor to specify attach target") + } + + attr := sys.LinkCreateNetkitAttr{ + ProgFd: uint32(opts.Program.FD()), + AttachType: sys.AttachType(opts.Attach), + TargetIfindex: uint32(opts.Interface), + ExpectedRevision: opts.ExpectedRevision, + Flags: opts.Flags, + } + + if opts.Anchor != nil { + fdOrID, flags, err := opts.Anchor.anchor() + if err != nil { + return nil, fmt.Errorf("attach netkit link: %w", err) + } + + attr.RelativeFdOrId = fdOrID + attr.Flags |= flags + } + + fd, err := sys.LinkCreateNetkit(&attr) + runtime.KeepAlive(opts.Program) + runtime.KeepAlive(opts.Anchor) + if err != nil { + if haveFeatErr := haveNetkit(); haveFeatErr != nil { + return nil, haveFeatErr + } + return nil, fmt.Errorf("attach netkit link: %w", err) + } + + return &netkitLink{RawLink{fd, ""}}, nil +} + +type netkitLink struct { + RawLink +} + +var _ Link = (*netkitLink)(nil) + +func (netkit *netkitLink) Info() (*Info, error) { + var info sys.NetkitLinkInfo + if err := sys.ObjInfo(netkit.fd, &info); err != nil { + return nil, fmt.Errorf("netkit link info: %s", err) + } + extra := &NetkitInfo{ + Ifindex: info.Ifindex, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/netns.go b/vendor/github.com/cilium/ebpf/link/netns.go new file mode 100644 index 0000000000..b1edd340a3 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/netns.go @@ -0,0 +1,55 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +// NetNsLink is a program attached to a network namespace. +type NetNsLink struct { + RawLink +} + +// AttachNetNs attaches a program to a network namespace. +func AttachNetNs(ns int, prog *ebpf.Program) (*NetNsLink, error) { + var attach ebpf.AttachType + switch t := prog.Type(); t { + case ebpf.FlowDissector: + attach = ebpf.AttachFlowDissector + case ebpf.SkLookup: + attach = ebpf.AttachSkLookup + default: + return nil, fmt.Errorf("can't attach %v to network namespace", t) + } + + link, err := AttachRawLink(RawLinkOptions{ + Target: ns, + Program: prog, + Attach: attach, + }) + if err != nil { + return nil, err + } + + return &NetNsLink{*link}, nil +} + +func (ns *NetNsLink) Info() (*Info, error) { + var info sys.NetNsLinkInfo + if err := sys.ObjInfo(ns.fd, &info); err != nil { + return nil, fmt.Errorf("netns link info: %s", err) + } + extra := &NetNsInfo{ + NetnsIno: info.NetnsIno, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/perf_event.go b/vendor/github.com/cilium/ebpf/link/perf_event.go new file mode 100644 index 0000000000..7440e8b292 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/perf_event.go @@ -0,0 +1,332 @@ +package link + +import ( + "errors" + "fmt" + "os" + "runtime" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/tracefs" + "github.com/cilium/ebpf/internal/unix" +) + +// Getting the terminology right is usually the hardest part. For posterity and +// for staying sane during implementation: +// +// - trace event: Representation of a kernel runtime hook. Filesystem entries +// under /events. Can be tracepoints (static), kprobes or uprobes. +// Can be instantiated into perf events (see below). +// - tracepoint: A predetermined hook point in the kernel. Exposed as trace +// events in (sub)directories under /events. Cannot be closed or +// removed, they are static. +// - k(ret)probe: Ephemeral trace events based on entry or exit points of +// exported kernel symbols. kprobe-based (tracefs) trace events can be +// created system-wide by writing to the /kprobe_events file, or +// they can be scoped to the current process by creating PMU perf events. +// - u(ret)probe: Ephemeral trace events based on user provides ELF binaries +// and offsets. uprobe-based (tracefs) trace events can be +// created system-wide by writing to the /uprobe_events file, or +// they can be scoped to the current process by creating PMU perf events. +// - perf event: An object instantiated based on an existing trace event or +// kernel symbol. Referred to by fd in userspace. +// Exactly one eBPF program can be attached to a perf event. Multiple perf +// events can be created from a single trace event. Closing a perf event +// stops any further invocations of the attached eBPF program. + +var ( + errInvalidInput = tracefs.ErrInvalidInput +) + +const ( + perfAllThreads = -1 +) + +// A perfEvent represents a perf event kernel object. Exactly one eBPF program +// can be attached to it. It is created based on a tracefs trace event or a +// Performance Monitoring Unit (PMU). +type perfEvent struct { + // Trace event backing this perfEvent. May be nil. + tracefsEvent *tracefs.Event + + // This is the perf event FD. + fd *sys.FD +} + +func newPerfEvent(fd *sys.FD, event *tracefs.Event) *perfEvent { + pe := &perfEvent{event, fd} + // Both event and fd have their own finalizer, but we want to + // guarantee that they are closed in a certain order. + runtime.SetFinalizer(pe, (*perfEvent).Close) + return pe +} + +func (pe *perfEvent) Close() error { + runtime.SetFinalizer(pe, nil) + + if err := pe.fd.Close(); err != nil { + return fmt.Errorf("closing perf event fd: %w", err) + } + + if pe.tracefsEvent != nil { + return pe.tracefsEvent.Close() + } + + return nil +} + +// PerfEvent is implemented by some Link types which use a perf event under +// the hood. +type PerfEvent interface { + // PerfEvent returns a file for the underlying perf event. + // + // It is the callers responsibility to close the returned file. + // + // Making changes to the associated perf event lead to + // undefined behaviour. + PerfEvent() (*os.File, error) +} + +// perfEventLink represents a bpf perf link. +type perfEventLink struct { + RawLink + pe *perfEvent +} + +func (pl *perfEventLink) isLink() {} + +func (pl *perfEventLink) Close() error { + if err := pl.fd.Close(); err != nil { + return fmt.Errorf("perf link close: %w", err) + } + + // when created from pinned link + if pl.pe == nil { + return nil + } + + if err := pl.pe.Close(); err != nil { + return fmt.Errorf("perf event close: %w", err) + } + return nil +} + +func (pl *perfEventLink) Update(_ *ebpf.Program) error { + return fmt.Errorf("perf event link update: %w", ErrNotSupported) +} + +var _ PerfEvent = (*perfEventLink)(nil) + +func (pl *perfEventLink) PerfEvent() (*os.File, error) { + // when created from pinned link + if pl.pe == nil { + return nil, ErrNotSupported + } + + fd, err := pl.pe.fd.Dup() + if err != nil { + return nil, err + } + + return fd.File("perf-event"), nil +} + +func (pl *perfEventLink) Info() (*Info, error) { + var info sys.PerfEventLinkInfo + if err := sys.ObjInfo(pl.fd, &info); err != nil { + return nil, fmt.Errorf("perf event link info: %s", err) + } + + var extra2 interface{} + switch info.PerfEventType { + case sys.BPF_PERF_EVENT_KPROBE, sys.BPF_PERF_EVENT_KRETPROBE: + var kprobeInfo sys.KprobeLinkInfo + if err := sys.ObjInfo(pl.fd, &kprobeInfo); err != nil { + return nil, fmt.Errorf("kprobe link info: %s", err) + } + extra2 = &KprobeInfo{ + address: kprobeInfo.Addr, + missed: kprobeInfo.Missed, + } + } + + extra := &PerfEventInfo{ + Type: info.PerfEventType, + extra: extra2, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + +// perfEventIoctl implements Link and handles the perf event lifecycle +// via ioctl(). +type perfEventIoctl struct { + *perfEvent +} + +func (pi *perfEventIoctl) isLink() {} + +// Since 4.15 (e87c6bc3852b "bpf: permit multiple bpf attachments for a single perf event"), +// calling PERF_EVENT_IOC_SET_BPF appends the given program to a prog_array +// owned by the perf event, which means multiple programs can be attached +// simultaneously. +// +// Before 4.15, calling PERF_EVENT_IOC_SET_BPF more than once on a perf event +// returns EEXIST. +// +// Detaching a program from a perf event is currently not possible, so a +// program replacement mechanism cannot be implemented for perf events. +func (pi *perfEventIoctl) Update(_ *ebpf.Program) error { + return fmt.Errorf("perf event ioctl update: %w", ErrNotSupported) +} + +func (pi *perfEventIoctl) Pin(string) error { + return fmt.Errorf("perf event ioctl pin: %w", ErrNotSupported) +} + +func (pi *perfEventIoctl) Unpin() error { + return fmt.Errorf("perf event ioctl unpin: %w", ErrNotSupported) +} + +func (pi *perfEventIoctl) Info() (*Info, error) { + return nil, fmt.Errorf("perf event ioctl info: %w", ErrNotSupported) +} + +var _ PerfEvent = (*perfEventIoctl)(nil) + +func (pi *perfEventIoctl) PerfEvent() (*os.File, error) { + fd, err := pi.fd.Dup() + if err != nil { + return nil, err + } + + return fd.File("perf-event"), nil +} + +// attach the given eBPF prog to the perf event stored in pe. +// pe must contain a valid perf event fd. +// prog's type must match the program type stored in pe. +func attachPerfEvent(pe *perfEvent, prog *ebpf.Program, cookie uint64) (Link, error) { + if prog == nil { + return nil, errors.New("cannot attach a nil program") + } + if prog.FD() < 0 { + return nil, fmt.Errorf("invalid program: %w", sys.ErrClosedFd) + } + + if err := haveBPFLinkPerfEvent(); err == nil { + return attachPerfEventLink(pe, prog, cookie) + } + + if cookie != 0 { + return nil, fmt.Errorf("cookies are not supported: %w", ErrNotSupported) + } + + return attachPerfEventIoctl(pe, prog) +} + +func attachPerfEventIoctl(pe *perfEvent, prog *ebpf.Program) (*perfEventIoctl, error) { + // Assign the eBPF program to the perf event. + err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_SET_BPF, prog.FD()) + if err != nil { + return nil, fmt.Errorf("setting perf event bpf program: %w", err) + } + + // PERF_EVENT_IOC_ENABLE and _DISABLE ignore their given values. + if err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_ENABLE, 0); err != nil { + return nil, fmt.Errorf("enable perf event: %s", err) + } + + return &perfEventIoctl{pe}, nil +} + +// Use the bpf api to attach the perf event (BPF_LINK_TYPE_PERF_EVENT, 5.15+). +// +// https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e +func attachPerfEventLink(pe *perfEvent, prog *ebpf.Program, cookie uint64) (*perfEventLink, error) { + fd, err := sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{ + ProgFd: uint32(prog.FD()), + TargetFd: pe.fd.Uint(), + AttachType: sys.BPF_PERF_EVENT, + BpfCookie: cookie, + }) + if err != nil { + return nil, fmt.Errorf("cannot create bpf perf link: %v", err) + } + + return &perfEventLink{RawLink{fd: fd}, pe}, nil +} + +// unsafeStringPtr returns an unsafe.Pointer to a NUL-terminated copy of str. +func unsafeStringPtr(str string) (unsafe.Pointer, error) { + p, err := unix.BytePtrFromString(str) + if err != nil { + return nil, err + } + return unsafe.Pointer(p), nil +} + +// openTracepointPerfEvent opens a tracepoint-type perf event. System-wide +// [k,u]probes created by writing to /[k,u]probe_events are tracepoints +// behind the scenes, and can be attached to using these perf events. +func openTracepointPerfEvent(tid uint64, pid int) (*sys.FD, error) { + attr := unix.PerfEventAttr{ + Type: unix.PERF_TYPE_TRACEPOINT, + Config: tid, + Sample_type: unix.PERF_SAMPLE_RAW, + Sample: 1, + Wakeup: 1, + } + + cpu := 0 + if pid != perfAllThreads { + cpu = -1 + } + fd, err := unix.PerfEventOpen(&attr, pid, cpu, -1, unix.PERF_FLAG_FD_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("opening tracepoint perf event: %w", err) + } + + return sys.NewFD(fd) +} + +// Probe BPF perf link. +// +// https://elixir.bootlin.com/linux/v5.16.8/source/kernel/bpf/syscall.c#L4307 +// https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e +var haveBPFLinkPerfEvent = internal.NewFeatureTest("bpf_link_perf_event", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Name: "probe_bpf_perf_link", + Type: ebpf.Kprobe, + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + License: "MIT", + }) + if err != nil { + return err + } + defer prog.Close() + + _, err = sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{ + ProgFd: uint32(prog.FD()), + AttachType: sys.BPF_PERF_EVENT, + }) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}, "5.15") diff --git a/vendor/github.com/cilium/ebpf/link/program.go b/vendor/github.com/cilium/ebpf/link/program.go new file mode 100644 index 0000000000..d8a2a15f93 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/program.go @@ -0,0 +1,107 @@ +package link + +import ( + "fmt" + "runtime" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +type RawAttachProgramOptions struct { + // Target to query. This is usually a file descriptor but may refer to + // something else based on the attach type. + Target int + // Program to attach. + Program *ebpf.Program + // Attach must match the attach type of Program. + Attach ebpf.AttachType + // Attach relative to an anchor. Optional. + Anchor Anchor + // Flags control the attach behaviour. Specify an Anchor instead of + // F_LINK, F_ID, F_BEFORE, F_AFTER and F_REPLACE. Optional. + Flags uint32 + // Only attach if the internal revision matches the given value. + ExpectedRevision uint64 +} + +// RawAttachProgram is a low level wrapper around BPF_PROG_ATTACH. +// +// You should use one of the higher level abstractions available in this +// package if possible. +func RawAttachProgram(opts RawAttachProgramOptions) error { + if opts.Flags&anchorFlags != 0 { + return fmt.Errorf("disallowed flags: use Anchor to specify attach target") + } + + attr := sys.ProgAttachAttr{ + TargetFdOrIfindex: uint32(opts.Target), + AttachBpfFd: uint32(opts.Program.FD()), + AttachType: uint32(opts.Attach), + AttachFlags: uint32(opts.Flags), + ExpectedRevision: opts.ExpectedRevision, + } + + if opts.Anchor != nil { + fdOrID, flags, err := opts.Anchor.anchor() + if err != nil { + return fmt.Errorf("attach program: %w", err) + } + + if flags == sys.BPF_F_REPLACE { + // Ensure that replacing a program works on old kernels. + attr.ReplaceBpfFd = fdOrID + } else { + attr.RelativeFdOrId = fdOrID + attr.AttachFlags |= flags + } + } + + if err := sys.ProgAttach(&attr); err != nil { + if haveFeatErr := haveProgAttach(); haveFeatErr != nil { + return haveFeatErr + } + return fmt.Errorf("attach program: %w", err) + } + runtime.KeepAlive(opts.Program) + + return nil +} + +type RawDetachProgramOptions RawAttachProgramOptions + +// RawDetachProgram is a low level wrapper around BPF_PROG_DETACH. +// +// You should use one of the higher level abstractions available in this +// package if possible. +func RawDetachProgram(opts RawDetachProgramOptions) error { + if opts.Flags&anchorFlags != 0 { + return fmt.Errorf("disallowed flags: use Anchor to specify attach target") + } + + attr := sys.ProgDetachAttr{ + TargetFdOrIfindex: uint32(opts.Target), + AttachBpfFd: uint32(opts.Program.FD()), + AttachType: uint32(opts.Attach), + ExpectedRevision: opts.ExpectedRevision, + } + + if opts.Anchor != nil { + fdOrID, flags, err := opts.Anchor.anchor() + if err != nil { + return fmt.Errorf("detach program: %w", err) + } + + attr.RelativeFdOrId = fdOrID + attr.AttachFlags |= flags + } + + if err := sys.ProgDetach(&attr); err != nil { + if haveFeatErr := haveProgAttach(); haveFeatErr != nil { + return haveFeatErr + } + return fmt.Errorf("can't detach program: %w", err) + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/link/query.go b/vendor/github.com/cilium/ebpf/link/query.go new file mode 100644 index 0000000000..fe534f8efa --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/query.go @@ -0,0 +1,111 @@ +package link + +import ( + "fmt" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +// QueryOptions defines additional parameters when querying for programs. +type QueryOptions struct { + // Target to query. This is usually a file descriptor but may refer to + // something else based on the attach type. + Target int + // Attach specifies the AttachType of the programs queried for + Attach ebpf.AttachType + // QueryFlags are flags for BPF_PROG_QUERY, e.g. BPF_F_QUERY_EFFECTIVE + QueryFlags uint32 +} + +// QueryResult describes which programs and links are active. +type QueryResult struct { + // List of attached programs. + Programs []AttachedProgram + + // Incremented by one every time the set of attached programs changes. + // May be zero if not supported by the [ebpf.AttachType]. + Revision uint64 +} + +// HaveLinkInfo returns true if the kernel supports querying link information +// for a particular [ebpf.AttachType]. +func (qr *QueryResult) HaveLinkInfo() bool { + return qr.Revision > 0 +} + +type AttachedProgram struct { + ID ebpf.ProgramID + linkID ID +} + +// LinkID returns the ID associated with the program. +// +// Returns 0, false if the kernel doesn't support retrieving the ID or if the +// program wasn't attached via a link. See [QueryResult.HaveLinkInfo] if you +// need to tell the two apart. +func (ap *AttachedProgram) LinkID() (ID, bool) { + return ap.linkID, ap.linkID != 0 +} + +// QueryPrograms retrieves a list of programs for the given AttachType. +// +// Returns a slice of attached programs, which may be empty. +// revision counts how many times the set of attached programs has changed and +// may be zero if not supported by the [ebpf.AttachType]. +// Returns ErrNotSupportd on a kernel without BPF_PROG_QUERY +func QueryPrograms(opts QueryOptions) (*QueryResult, error) { + // query the number of programs to allocate correct slice size + attr := sys.ProgQueryAttr{ + TargetFdOrIfindex: uint32(opts.Target), + AttachType: sys.AttachType(opts.Attach), + QueryFlags: opts.QueryFlags, + } + err := sys.ProgQuery(&attr) + if err != nil { + if haveFeatErr := haveProgQuery(); haveFeatErr != nil { + return nil, fmt.Errorf("query programs: %w", haveFeatErr) + } + return nil, fmt.Errorf("query programs: %w", err) + } + if attr.Count == 0 { + return &QueryResult{Revision: attr.Revision}, nil + } + + // The minimum bpf_mprog revision is 1, so we can use the field to detect + // whether the attach type supports link ids. + haveLinkIDs := attr.Revision != 0 + + count := attr.Count + progIds := make([]ebpf.ProgramID, count) + attr = sys.ProgQueryAttr{ + TargetFdOrIfindex: uint32(opts.Target), + AttachType: sys.AttachType(opts.Attach), + QueryFlags: opts.QueryFlags, + Count: count, + ProgIds: sys.NewPointer(unsafe.Pointer(&progIds[0])), + } + + var linkIds []ID + if haveLinkIDs { + linkIds = make([]ID, count) + attr.LinkIds = sys.NewPointer(unsafe.Pointer(&linkIds[0])) + } + + if err := sys.ProgQuery(&attr); err != nil { + return nil, fmt.Errorf("query programs: %w", err) + } + + // NB: attr.Count might have changed between the two syscalls. + var programs []AttachedProgram + for i, id := range progIds[:attr.Count] { + ap := AttachedProgram{ID: id} + if haveLinkIDs { + ap.linkID = linkIds[i] + } + programs = append(programs, ap) + } + + return &QueryResult{programs, attr.Revision}, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/raw_tracepoint.go b/vendor/github.com/cilium/ebpf/link/raw_tracepoint.go new file mode 100644 index 0000000000..925e621cbb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/raw_tracepoint.go @@ -0,0 +1,87 @@ +package link + +import ( + "errors" + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +type RawTracepointOptions struct { + // Tracepoint name. + Name string + // Program must be of type RawTracepoint* + Program *ebpf.Program +} + +// AttachRawTracepoint links a BPF program to a raw_tracepoint. +// +// Requires at least Linux 4.17. +func AttachRawTracepoint(opts RawTracepointOptions) (Link, error) { + if t := opts.Program.Type(); t != ebpf.RawTracepoint && t != ebpf.RawTracepointWritable { + return nil, fmt.Errorf("invalid program type %s, expected RawTracepoint(Writable)", t) + } + if opts.Program.FD() < 0 { + return nil, fmt.Errorf("invalid program: %w", sys.ErrClosedFd) + } + + fd, err := sys.RawTracepointOpen(&sys.RawTracepointOpenAttr{ + Name: sys.NewStringPointer(opts.Name), + ProgFd: uint32(opts.Program.FD()), + }) + if err != nil { + return nil, err + } + + err = haveBPFLink() + if errors.Is(err, ErrNotSupported) { + // Prior to commit 70ed506c3bbc ("bpf: Introduce pinnable bpf_link abstraction") + // raw_tracepoints are just a plain fd. + return &simpleRawTracepoint{fd}, nil + } + + if err != nil { + return nil, err + } + + return &rawTracepoint{RawLink{fd: fd}}, nil +} + +type simpleRawTracepoint struct { + fd *sys.FD +} + +var _ Link = (*simpleRawTracepoint)(nil) + +func (frt *simpleRawTracepoint) isLink() {} + +func (frt *simpleRawTracepoint) Close() error { + return frt.fd.Close() +} + +func (frt *simpleRawTracepoint) Update(_ *ebpf.Program) error { + return fmt.Errorf("update raw_tracepoint: %w", ErrNotSupported) +} + +func (frt *simpleRawTracepoint) Pin(string) error { + return fmt.Errorf("pin raw_tracepoint: %w", ErrNotSupported) +} + +func (frt *simpleRawTracepoint) Unpin() error { + return fmt.Errorf("unpin raw_tracepoint: %w", ErrNotSupported) +} + +func (frt *simpleRawTracepoint) Info() (*Info, error) { + return nil, fmt.Errorf("can't get raw_tracepoint info: %w", ErrNotSupported) +} + +type rawTracepoint struct { + RawLink +} + +var _ Link = (*rawTracepoint)(nil) + +func (rt *rawTracepoint) Update(_ *ebpf.Program) error { + return fmt.Errorf("update raw_tracepoint: %w", ErrNotSupported) +} diff --git a/vendor/github.com/cilium/ebpf/link/socket_filter.go b/vendor/github.com/cilium/ebpf/link/socket_filter.go new file mode 100644 index 0000000000..84f0b656f8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/socket_filter.go @@ -0,0 +1,40 @@ +package link + +import ( + "syscall" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/unix" +) + +// AttachSocketFilter attaches a SocketFilter BPF program to a socket. +func AttachSocketFilter(conn syscall.Conn, program *ebpf.Program) error { + rawConn, err := conn.SyscallConn() + if err != nil { + return err + } + var ssoErr error + err = rawConn.Control(func(fd uintptr) { + ssoErr = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_ATTACH_BPF, program.FD()) + }) + if ssoErr != nil { + return ssoErr + } + return err +} + +// DetachSocketFilter detaches a SocketFilter BPF program from a socket. +func DetachSocketFilter(conn syscall.Conn) error { + rawConn, err := conn.SyscallConn() + if err != nil { + return err + } + var ssoErr error + err = rawConn.Control(func(fd uintptr) { + ssoErr = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_DETACH_BPF, 0) + }) + if ssoErr != nil { + return ssoErr + } + return err +} diff --git a/vendor/github.com/cilium/ebpf/link/syscalls.go b/vendor/github.com/cilium/ebpf/link/syscalls.go new file mode 100644 index 0000000000..25951b017d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/syscalls.go @@ -0,0 +1,200 @@ +package link + +import ( + "errors" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +// Type is the kind of link. +type Type = sys.LinkType + +// Valid link types. +const ( + UnspecifiedType = sys.BPF_LINK_TYPE_UNSPEC + RawTracepointType = sys.BPF_LINK_TYPE_RAW_TRACEPOINT + TracingType = sys.BPF_LINK_TYPE_TRACING + CgroupType = sys.BPF_LINK_TYPE_CGROUP + IterType = sys.BPF_LINK_TYPE_ITER + NetNsType = sys.BPF_LINK_TYPE_NETNS + XDPType = sys.BPF_LINK_TYPE_XDP + PerfEventType = sys.BPF_LINK_TYPE_PERF_EVENT + KprobeMultiType = sys.BPF_LINK_TYPE_KPROBE_MULTI + TCXType = sys.BPF_LINK_TYPE_TCX + UprobeMultiType = sys.BPF_LINK_TYPE_UPROBE_MULTI + NetfilterType = sys.BPF_LINK_TYPE_NETFILTER + NetkitType = sys.BPF_LINK_TYPE_NETKIT +) + +var haveProgAttach = internal.NewFeatureTest("BPF_PROG_ATTACH", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupSKB, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + return internal.ErrNotSupported + } + + // BPF_PROG_ATTACH was introduced at the same time as CGgroupSKB, + // so being able to load the program is enough to infer that we + // have the syscall. + prog.Close() + return nil +}, "4.10") + +var haveProgAttachReplace = internal.NewFeatureTest("BPF_PROG_ATTACH atomic replacement of MULTI progs", func() error { + if err := haveProgAttach(); err != nil { + return err + } + + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupSKB, + AttachType: ebpf.AttachCGroupInetIngress, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + + if err != nil { + return internal.ErrNotSupported + } + + defer prog.Close() + + // We know that we have BPF_PROG_ATTACH since we can load CGroupSKB programs. + // If passing BPF_F_REPLACE gives us EINVAL we know that the feature isn't + // present. + attr := sys.ProgAttachAttr{ + // We rely on this being checked after attachFlags. + TargetFdOrIfindex: ^uint32(0), + AttachBpfFd: uint32(prog.FD()), + AttachType: uint32(ebpf.AttachCGroupInetIngress), + AttachFlags: uint32(flagReplace), + } + + err = sys.ProgAttach(&attr) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}, "5.5") + +var haveBPFLink = internal.NewFeatureTest("bpf_link", func() error { + attr := sys.LinkCreateAttr{ + // This is a hopefully invalid file descriptor, which triggers EBADF. + TargetFd: ^uint32(0), + ProgFd: ^uint32(0), + AttachType: sys.AttachType(ebpf.AttachCGroupInetIngress), + } + _, err := sys.LinkCreate(&attr) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}, "5.7") + +var haveProgQuery = internal.NewFeatureTest("BPF_PROG_QUERY", func() error { + attr := sys.ProgQueryAttr{ + // We rely on this being checked during the syscall. + // With an otherwise correct payload we expect EBADF here + // as an indication that the feature is present. + TargetFdOrIfindex: ^uint32(0), + AttachType: sys.AttachType(ebpf.AttachCGroupInetIngress), + } + + err := sys.ProgQuery(&attr) + + if errors.Is(err, unix.EBADF) { + return nil + } + if err != nil { + return ErrNotSupported + } + return errors.New("syscall succeeded unexpectedly") +}, "4.15") + +var haveTCX = internal.NewFeatureTest("tcx", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.SchedCLS, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + + if err != nil { + return internal.ErrNotSupported + } + + defer prog.Close() + attr := sys.LinkCreateTcxAttr{ + // We rely on this being checked during the syscall. + // With an otherwise correct payload we expect ENODEV here + // as an indication that the feature is present. + TargetIfindex: ^uint32(0), + ProgFd: uint32(prog.FD()), + AttachType: sys.AttachType(ebpf.AttachTCXIngress), + } + + _, err = sys.LinkCreateTcx(&attr) + + if errors.Is(err, unix.ENODEV) { + return nil + } + if err != nil { + return ErrNotSupported + } + return errors.New("syscall succeeded unexpectedly") +}, "6.6") + +var haveNetkit = internal.NewFeatureTest("netkit", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.SchedCLS, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + + if err != nil { + return internal.ErrNotSupported + } + + defer prog.Close() + attr := sys.LinkCreateNetkitAttr{ + // We rely on this being checked during the syscall. + // With an otherwise correct payload we expect ENODEV here + // as an indication that the feature is present. + TargetIfindex: ^uint32(0), + ProgFd: uint32(prog.FD()), + AttachType: sys.AttachType(ebpf.AttachNetkitPrimary), + } + + _, err = sys.LinkCreateNetkit(&attr) + + if errors.Is(err, unix.ENODEV) { + return nil + } + if err != nil { + return ErrNotSupported + } + return errors.New("syscall succeeded unexpectedly") +}, "6.7") diff --git a/vendor/github.com/cilium/ebpf/link/tcx.go b/vendor/github.com/cilium/ebpf/link/tcx.go new file mode 100644 index 0000000000..ac045b71da --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/tcx.go @@ -0,0 +1,89 @@ +package link + +import ( + "fmt" + "runtime" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +type TCXOptions struct { + // Index of the interface to attach to. + Interface int + // Program to attach. + Program *ebpf.Program + // One of the AttachTCX* constants. + Attach ebpf.AttachType + // Attach relative to an anchor. Optional. + Anchor Anchor + // Only attach if the expected revision matches. + ExpectedRevision uint64 + // Flags control the attach behaviour. Specify an Anchor instead of + // F_LINK, F_ID, F_BEFORE, F_AFTER and R_REPLACE. Optional. + Flags uint32 +} + +func AttachTCX(opts TCXOptions) (Link, error) { + if opts.Interface < 0 { + return nil, fmt.Errorf("interface %d is out of bounds", opts.Interface) + } + + if opts.Flags&anchorFlags != 0 { + return nil, fmt.Errorf("disallowed flags: use Anchor to specify attach target") + } + + attr := sys.LinkCreateTcxAttr{ + ProgFd: uint32(opts.Program.FD()), + AttachType: sys.AttachType(opts.Attach), + TargetIfindex: uint32(opts.Interface), + ExpectedRevision: opts.ExpectedRevision, + Flags: opts.Flags, + } + + if opts.Anchor != nil { + fdOrID, flags, err := opts.Anchor.anchor() + if err != nil { + return nil, fmt.Errorf("attach tcx link: %w", err) + } + + attr.RelativeFdOrId = fdOrID + attr.Flags |= flags + } + + fd, err := sys.LinkCreateTcx(&attr) + runtime.KeepAlive(opts.Program) + runtime.KeepAlive(opts.Anchor) + if err != nil { + if haveFeatErr := haveTCX(); haveFeatErr != nil { + return nil, haveFeatErr + } + return nil, fmt.Errorf("attach tcx link: %w", err) + } + + return &tcxLink{RawLink{fd, ""}}, nil +} + +type tcxLink struct { + RawLink +} + +var _ Link = (*tcxLink)(nil) + +func (tcx *tcxLink) Info() (*Info, error) { + var info sys.TcxLinkInfo + if err := sys.ObjInfo(tcx.fd, &info); err != nil { + return nil, fmt.Errorf("tcx link info: %s", err) + } + extra := &TCXInfo{ + Ifindex: info.Ifindex, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/tracepoint.go b/vendor/github.com/cilium/ebpf/link/tracepoint.go new file mode 100644 index 0000000000..6fc78b9828 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/tracepoint.go @@ -0,0 +1,70 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/tracefs" +) + +// TracepointOptions defines additional parameters that will be used +// when loading Tracepoints. +type TracepointOptions struct { + // Arbitrary value that can be fetched from an eBPF program + // via `bpf_get_attach_cookie()`. + // + // Needs kernel 5.15+. + Cookie uint64 +} + +// Tracepoint attaches the given eBPF program to the tracepoint with the given +// group and name. See /sys/kernel/tracing/events to find available +// tracepoints. The top-level directory is the group, the event's subdirectory +// is the name. Example: +// +// tp, err := Tracepoint("syscalls", "sys_enter_fork", prog, nil) +// +// Losing the reference to the resulting Link (tp) will close the Tracepoint +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// Note that attaching eBPF programs to syscalls (sys_enter_*/sys_exit_*) is +// only possible as of kernel 4.14 (commit cf5f5ce). +// +// The returned Link may implement [PerfEvent]. +func Tracepoint(group, name string, prog *ebpf.Program, opts *TracepointOptions) (Link, error) { + if group == "" || name == "" { + return nil, fmt.Errorf("group and name cannot be empty: %w", errInvalidInput) + } + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if prog.Type() != ebpf.TracePoint { + return nil, fmt.Errorf("eBPF program type %s is not a Tracepoint: %w", prog.Type(), errInvalidInput) + } + + tid, err := tracefs.EventID(group, name) + if err != nil { + return nil, err + } + + fd, err := openTracepointPerfEvent(tid, perfAllThreads) + if err != nil { + return nil, err + } + + var cookie uint64 + if opts != nil { + cookie = opts.Cookie + } + + pe := newPerfEvent(fd, nil) + + lnk, err := attachPerfEvent(pe, prog, cookie) + if err != nil { + pe.Close() + return nil, err + } + + return lnk, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/tracing.go b/vendor/github.com/cilium/ebpf/link/tracing.go new file mode 100644 index 0000000000..a461007f9f --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/tracing.go @@ -0,0 +1,218 @@ +package link + +import ( + "errors" + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +type tracing struct { + RawLink +} + +func (f *tracing) Update(_ *ebpf.Program) error { + return fmt.Errorf("tracing update: %w", ErrNotSupported) +} + +func (f *tracing) Info() (*Info, error) { + var info sys.TracingLinkInfo + if err := sys.ObjInfo(f.fd, &info); err != nil { + return nil, fmt.Errorf("tracing link info: %s", err) + } + extra := &TracingInfo{ + TargetObjId: info.TargetObjId, + TargetBtfId: info.TargetBtfId, + AttachType: info.AttachType, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} + +// AttachFreplace attaches the given eBPF program to the function it replaces. +// +// The program and name can either be provided at link time, or can be provided +// at program load time. If they were provided at load time, they should be nil +// and empty respectively here, as they will be ignored by the kernel. +// Examples: +// +// AttachFreplace(dispatcher, "function", replacement) +// AttachFreplace(nil, "", replacement) +func AttachFreplace(targetProg *ebpf.Program, name string, prog *ebpf.Program) (Link, error) { + if (name == "") != (targetProg == nil) { + return nil, fmt.Errorf("must provide both or neither of name and targetProg: %w", errInvalidInput) + } + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if prog.Type() != ebpf.Extension { + return nil, fmt.Errorf("eBPF program type %s is not an Extension: %w", prog.Type(), errInvalidInput) + } + + var ( + target int + typeID btf.TypeID + ) + if targetProg != nil { + btfHandle, err := targetProg.Handle() + if err != nil { + return nil, err + } + defer btfHandle.Close() + + spec, err := btfHandle.Spec(nil) + if err != nil { + return nil, err + } + + var function *btf.Func + if err := spec.TypeByName(name, &function); err != nil { + return nil, err + } + + target = targetProg.FD() + typeID, err = spec.TypeID(function) + if err != nil { + return nil, err + } + } + + link, err := AttachRawLink(RawLinkOptions{ + Target: target, + Program: prog, + Attach: ebpf.AttachNone, + BTF: typeID, + }) + if errors.Is(err, sys.ENOTSUPP) { + // This may be returned by bpf_tracing_prog_attach via bpf_arch_text_poke. + return nil, fmt.Errorf("create raw tracepoint: %w", ErrNotSupported) + } + if err != nil { + return nil, err + } + + return &tracing{*link}, nil +} + +type TracingOptions struct { + // Program must be of type Tracing with attach type + // AttachTraceFEntry/AttachTraceFExit/AttachModifyReturn or + // AttachTraceRawTp. + Program *ebpf.Program + // Program attach type. Can be one of: + // - AttachTraceFEntry + // - AttachTraceFExit + // - AttachModifyReturn + // - AttachTraceRawTp + // This field is optional. + AttachType ebpf.AttachType + // Arbitrary value that can be fetched from an eBPF program + // via `bpf_get_attach_cookie()`. + Cookie uint64 +} + +type LSMOptions struct { + // Program must be of type LSM with attach type + // AttachLSMMac. + Program *ebpf.Program + // Arbitrary value that can be fetched from an eBPF program + // via `bpf_get_attach_cookie()`. + Cookie uint64 +} + +// attachBTFID links all BPF program types (Tracing/LSM) that they attach to a btf_id. +func attachBTFID(program *ebpf.Program, at ebpf.AttachType, cookie uint64) (Link, error) { + if program.FD() < 0 { + return nil, fmt.Errorf("invalid program %w", sys.ErrClosedFd) + } + + var ( + fd *sys.FD + err error + ) + switch at { + case ebpf.AttachTraceFEntry, ebpf.AttachTraceFExit, ebpf.AttachTraceRawTp, + ebpf.AttachModifyReturn, ebpf.AttachLSMMac: + // Attach via BPF link + fd, err = sys.LinkCreateTracing(&sys.LinkCreateTracingAttr{ + ProgFd: uint32(program.FD()), + AttachType: sys.AttachType(at), + Cookie: cookie, + }) + if err == nil { + break + } + if !errors.Is(err, unix.EINVAL) && !errors.Is(err, sys.ENOTSUPP) { + return nil, fmt.Errorf("create tracing link: %w", err) + } + fallthrough + case ebpf.AttachNone: + // Attach via RawTracepointOpen + if cookie > 0 { + return nil, fmt.Errorf("create raw tracepoint with cookie: %w", ErrNotSupported) + } + + fd, err = sys.RawTracepointOpen(&sys.RawTracepointOpenAttr{ + ProgFd: uint32(program.FD()), + }) + if errors.Is(err, sys.ENOTSUPP) { + // This may be returned by bpf_tracing_prog_attach via bpf_arch_text_poke. + return nil, fmt.Errorf("create raw tracepoint: %w", ErrNotSupported) + } + if err != nil { + return nil, fmt.Errorf("create raw tracepoint: %w", err) + } + default: + return nil, fmt.Errorf("invalid attach type: %s", at.String()) + } + + raw := RawLink{fd: fd} + info, err := raw.Info() + if err != nil { + raw.Close() + return nil, err + } + + if info.Type == RawTracepointType { + // Sadness upon sadness: a Tracing program with AttachRawTp returns + // a raw_tracepoint link. Other types return a tracing link. + return &rawTracepoint{raw}, nil + } + return &tracing{raw}, nil +} + +// AttachTracing links a tracing (fentry/fexit/fmod_ret) BPF program or +// a BTF-powered raw tracepoint (tp_btf) BPF Program to a BPF hook defined +// in kernel modules. +func AttachTracing(opts TracingOptions) (Link, error) { + if t := opts.Program.Type(); t != ebpf.Tracing { + return nil, fmt.Errorf("invalid program type %s, expected Tracing", t) + } + + switch opts.AttachType { + case ebpf.AttachTraceFEntry, ebpf.AttachTraceFExit, ebpf.AttachModifyReturn, + ebpf.AttachTraceRawTp, ebpf.AttachNone: + default: + return nil, fmt.Errorf("invalid attach type: %s", opts.AttachType.String()) + } + + return attachBTFID(opts.Program, opts.AttachType, opts.Cookie) +} + +// AttachLSM links a Linux security module (LSM) BPF Program to a BPF +// hook defined in kernel modules. +func AttachLSM(opts LSMOptions) (Link, error) { + if t := opts.Program.Type(); t != ebpf.LSM { + return nil, fmt.Errorf("invalid program type %s, expected LSM", t) + } + + return attachBTFID(opts.Program, ebpf.AttachLSMMac, opts.Cookie) +} diff --git a/vendor/github.com/cilium/ebpf/link/uprobe.go b/vendor/github.com/cilium/ebpf/link/uprobe.go new file mode 100644 index 0000000000..1852a3fadd --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/uprobe.go @@ -0,0 +1,335 @@ +package link + +import ( + "debug/elf" + "errors" + "fmt" + "os" + "sync" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/tracefs" +) + +var ( + uprobeRefCtrOffsetPMUPath = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset" + // elixir.bootlin.com/linux/v5.15-rc7/source/kernel/events/core.c#L9799 + uprobeRefCtrOffsetShift = 32 + haveRefCtrOffsetPMU = internal.NewFeatureTest("RefCtrOffsetPMU", func() error { + _, err := os.Stat(uprobeRefCtrOffsetPMUPath) + if errors.Is(err, os.ErrNotExist) { + return internal.ErrNotSupported + } + if err != nil { + return err + } + return nil + }, "4.20") + + // ErrNoSymbol indicates that the given symbol was not found + // in the ELF symbols table. + ErrNoSymbol = errors.New("not found") +) + +// Executable defines an executable program on the filesystem. +type Executable struct { + // Path of the executable on the filesystem. + path string + // Parsed ELF and dynamic symbols' cachedAddresses. + cachedAddresses map[string]uint64 + // Keep track of symbol table lazy load. + cacheAddressesOnce sync.Once +} + +// UprobeOptions defines additional parameters that will be used +// when loading Uprobes. +type UprobeOptions struct { + // Symbol address. Must be provided in case of external symbols (shared libs). + // If set, overrides the address eventually parsed from the executable. + Address uint64 + // The offset relative to given symbol. Useful when tracing an arbitrary point + // inside the frame of given symbol. + // + // Note: this field changed from being an absolute offset to being relative + // to Address. + Offset uint64 + // Only set the uprobe on the given process ID. Useful when tracing + // shared library calls or programs that have many running instances. + PID int + // Automatically manage SDT reference counts (semaphores). + // + // If this field is set, the Kernel will increment/decrement the + // semaphore located in the process memory at the provided address on + // probe attach/detach. + // + // See also: + // sourceware.org/systemtap/wiki/UserSpaceProbeImplementation (Semaphore Handling) + // github.com/torvalds/linux/commit/1cc33161a83d + // github.com/torvalds/linux/commit/a6ca88b241d5 + RefCtrOffset uint64 + // Arbitrary value that can be fetched from an eBPF program + // via `bpf_get_attach_cookie()`. + // + // Needs kernel 5.15+. + Cookie uint64 + // Prefix used for the event name if the uprobe must be attached using tracefs. + // The group name will be formatted as `_`. + // The default empty string is equivalent to "ebpf" as the prefix. + TraceFSPrefix string +} + +func (uo *UprobeOptions) cookie() uint64 { + if uo == nil { + return 0 + } + return uo.Cookie +} + +// To open a new Executable, use: +// +// OpenExecutable("/bin/bash") +// +// The returned value can then be used to open Uprobe(s). +func OpenExecutable(path string) (*Executable, error) { + if path == "" { + return nil, fmt.Errorf("path cannot be empty") + } + + f, err := internal.OpenSafeELFFile(path) + if err != nil { + return nil, fmt.Errorf("parse ELF file: %w", err) + } + defer f.Close() + + if f.Type != elf.ET_EXEC && f.Type != elf.ET_DYN { + // ELF is not an executable or a shared object. + return nil, errors.New("the given file is not an executable or a shared object") + } + + return &Executable{ + path: path, + cachedAddresses: make(map[string]uint64), + }, nil +} + +func (ex *Executable) load(f *internal.SafeELFFile) error { + syms, err := f.Symbols() + if err != nil && !errors.Is(err, elf.ErrNoSymbols) { + return err + } + + dynsyms, err := f.DynamicSymbols() + if err != nil && !errors.Is(err, elf.ErrNoSymbols) { + return err + } + + syms = append(syms, dynsyms...) + + for _, s := range syms { + if elf.ST_TYPE(s.Info) != elf.STT_FUNC { + // Symbol not associated with a function or other executable code. + continue + } + + address := s.Value + + // Loop over ELF segments. + for _, prog := range f.Progs { + // Skip uninteresting segments. + if prog.Type != elf.PT_LOAD || (prog.Flags&elf.PF_X) == 0 { + continue + } + + if prog.Vaddr <= s.Value && s.Value < (prog.Vaddr+prog.Memsz) { + // If the symbol value is contained in the segment, calculate + // the symbol offset. + // + // fn symbol offset = fn symbol VA - .text VA + .text offset + // + // stackoverflow.com/a/40249502 + address = s.Value - prog.Vaddr + prog.Off + break + } + } + + ex.cachedAddresses[s.Name] = address + } + + return nil +} + +// address calculates the address of a symbol in the executable. +// +// opts must not be nil. +func (ex *Executable) address(symbol string, address, offset uint64) (uint64, error) { + if address > 0 { + return address + offset, nil + } + + var err error + ex.cacheAddressesOnce.Do(func() { + var f *internal.SafeELFFile + f, err = internal.OpenSafeELFFile(ex.path) + if err != nil { + err = fmt.Errorf("parse ELF file: %w", err) + return + } + defer f.Close() + + err = ex.load(f) + }) + if err != nil { + return 0, fmt.Errorf("lazy load symbols: %w", err) + } + + address, ok := ex.cachedAddresses[symbol] + if !ok { + return 0, fmt.Errorf("symbol %s: %w", symbol, ErrNoSymbol) + } + + // Symbols with location 0 from section undef are shared library calls and + // are relocated before the binary is executed. Dynamic linking is not + // implemented by the library, so mark this as unsupported for now. + // + // Since only offset values are stored and not elf.Symbol, if the value is 0, + // assume it's an external symbol. + if address == 0 { + return 0, fmt.Errorf("cannot resolve %s library call '%s': %w "+ + "(consider providing UprobeOptions.Address)", ex.path, symbol, ErrNotSupported) + } + + return address + offset, nil +} + +// Uprobe attaches the given eBPF program to a perf event that fires when the +// given symbol starts executing in the given Executable. +// For example, /bin/bash::main(): +// +// ex, _ = OpenExecutable("/bin/bash") +// ex.Uprobe("main", prog, nil) +// +// When using symbols which belongs to shared libraries, +// an offset must be provided via options: +// +// up, err := ex.Uprobe("main", prog, &UprobeOptions{Offset: 0x123}) +// +// Note: Setting the Offset field in the options supersedes the symbol's offset. +// +// Losing the reference to the resulting Link (up) will close the Uprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// Functions provided by shared libraries can currently not be traced and +// will result in an ErrNotSupported. +// +// The returned Link may implement [PerfEvent]. +func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { + u, err := ex.uprobe(symbol, prog, opts, false) + if err != nil { + return nil, err + } + + lnk, err := attachPerfEvent(u, prog, opts.cookie()) + if err != nil { + u.Close() + return nil, err + } + + return lnk, nil +} + +// Uretprobe attaches the given eBPF program to a perf event that fires right +// before the given symbol exits. For example, /bin/bash::main(): +// +// ex, _ = OpenExecutable("/bin/bash") +// ex.Uretprobe("main", prog, nil) +// +// When using symbols which belongs to shared libraries, +// an offset must be provided via options: +// +// up, err := ex.Uretprobe("main", prog, &UprobeOptions{Offset: 0x123}) +// +// Note: Setting the Offset field in the options supersedes the symbol's offset. +// +// Losing the reference to the resulting Link (up) will close the Uprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// Functions provided by shared libraries can currently not be traced and +// will result in an ErrNotSupported. +// +// The returned Link may implement [PerfEvent]. +func (ex *Executable) Uretprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { + u, err := ex.uprobe(symbol, prog, opts, true) + if err != nil { + return nil, err + } + + lnk, err := attachPerfEvent(u, prog, opts.cookie()) + if err != nil { + u.Close() + return nil, err + } + + return lnk, nil +} + +// uprobe opens a perf event for the given binary/symbol and attaches prog to it. +// If ret is true, create a uretprobe. +func (ex *Executable) uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions, ret bool) (*perfEvent, error) { + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if prog.Type() != ebpf.Kprobe { + return nil, fmt.Errorf("eBPF program type %s is not Kprobe: %w", prog.Type(), errInvalidInput) + } + if opts == nil { + opts = &UprobeOptions{} + } + + offset, err := ex.address(symbol, opts.Address, opts.Offset) + if err != nil { + return nil, err + } + + pid := opts.PID + if pid == 0 { + pid = perfAllThreads + } + + if opts.RefCtrOffset != 0 { + if err := haveRefCtrOffsetPMU(); err != nil { + return nil, fmt.Errorf("uprobe ref_ctr_offset: %w", err) + } + } + + args := tracefs.ProbeArgs{ + Type: tracefs.Uprobe, + Symbol: symbol, + Path: ex.path, + Offset: offset, + Pid: pid, + RefCtrOffset: opts.RefCtrOffset, + Ret: ret, + Cookie: opts.Cookie, + Group: opts.TraceFSPrefix, + } + + // Use uprobe PMU if the kernel has it available. + tp, err := pmuProbe(args) + if err == nil { + return tp, nil + } + if !errors.Is(err, ErrNotSupported) { + return nil, fmt.Errorf("creating perf_uprobe PMU: %w", err) + } + + // Use tracefs if uprobe PMU is missing. + tp, err = tracefsProbe(args) + if err != nil { + return nil, fmt.Errorf("creating trace event '%s:%s' in tracefs: %w", ex.path, symbol, err) + } + + return tp, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/uprobe_multi.go b/vendor/github.com/cilium/ebpf/link/uprobe_multi.go new file mode 100644 index 0000000000..49dc18b449 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/uprobe_multi.go @@ -0,0 +1,219 @@ +package link + +import ( + "errors" + "fmt" + "os" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/unix" +) + +// UprobeMultiOptions defines additional parameters that will be used +// when opening a UprobeMulti Link. +type UprobeMultiOptions struct { + // Symbol addresses. If set, overrides the addresses eventually parsed from + // the executable. Mutually exclusive with UprobeMulti's symbols argument. + Addresses []uint64 + + // Offsets into functions provided by UprobeMulti's symbols argument. + // For example: to set uprobes to main+5 and _start+10, call UprobeMulti + // with: + // symbols: "main", "_start" + // opt.Offsets: 5, 10 + Offsets []uint64 + + // Optional list of associated ref counter offsets. + RefCtrOffsets []uint64 + + // Optional list of associated BPF cookies. + Cookies []uint64 + + // Only set the uprobe_multi link on the given process ID, zero PID means + // system-wide. + PID uint32 +} + +func (ex *Executable) UprobeMulti(symbols []string, prog *ebpf.Program, opts *UprobeMultiOptions) (Link, error) { + return ex.uprobeMulti(symbols, prog, opts, 0) +} + +func (ex *Executable) UretprobeMulti(symbols []string, prog *ebpf.Program, opts *UprobeMultiOptions) (Link, error) { + + // The return probe is not limited for symbols entry, so there's no special + // setup for return uprobes (other than the extra flag). The symbols, opts.Offsets + // and opts.Addresses arrays follow the same logic as for entry uprobes. + return ex.uprobeMulti(symbols, prog, opts, sys.BPF_F_UPROBE_MULTI_RETURN) +} + +func (ex *Executable) uprobeMulti(symbols []string, prog *ebpf.Program, opts *UprobeMultiOptions, flags uint32) (Link, error) { + if prog == nil { + return nil, errors.New("cannot attach a nil program") + } + + if opts == nil { + opts = &UprobeMultiOptions{} + } + + addresses, err := ex.addresses(symbols, opts.Addresses, opts.Offsets) + if err != nil { + return nil, err + } + + addrs := len(addresses) + cookies := len(opts.Cookies) + refCtrOffsets := len(opts.RefCtrOffsets) + + if addrs == 0 { + return nil, fmt.Errorf("Addresses are required: %w", errInvalidInput) + } + if refCtrOffsets > 0 && refCtrOffsets != addrs { + return nil, fmt.Errorf("RefCtrOffsets must be exactly Addresses in length: %w", errInvalidInput) + } + if cookies > 0 && cookies != addrs { + return nil, fmt.Errorf("Cookies must be exactly Addresses in length: %w", errInvalidInput) + } + + attr := &sys.LinkCreateUprobeMultiAttr{ + Path: sys.NewStringPointer(ex.path), + ProgFd: uint32(prog.FD()), + AttachType: sys.BPF_TRACE_UPROBE_MULTI, + UprobeMultiFlags: flags, + Count: uint32(addrs), + Offsets: sys.NewPointer(unsafe.Pointer(&addresses[0])), + Pid: opts.PID, + } + + if refCtrOffsets != 0 { + attr.RefCtrOffsets = sys.NewPointer(unsafe.Pointer(&opts.RefCtrOffsets[0])) + } + if cookies != 0 { + attr.Cookies = sys.NewPointer(unsafe.Pointer(&opts.Cookies[0])) + } + + fd, err := sys.LinkCreateUprobeMulti(attr) + if errors.Is(err, unix.ESRCH) { + return nil, fmt.Errorf("%w (specified pid not found?)", os.ErrNotExist) + } + // Since Linux commit 46ba0e49b642 ("bpf: fix multi-uprobe PID filtering + // logic"), if the provided pid overflows MaxInt32 (turning it negative), the + // kernel will return EINVAL instead of ESRCH. + if errors.Is(err, unix.EINVAL) { + return nil, fmt.Errorf("%w (invalid pid, missing symbol or prog's AttachType not AttachTraceUprobeMulti?)", err) + } + + if err != nil { + if haveFeatErr := haveBPFLinkUprobeMulti(); haveFeatErr != nil { + return nil, haveFeatErr + } + return nil, err + } + + return &uprobeMultiLink{RawLink{fd, ""}}, nil +} + +func (ex *Executable) addresses(symbols []string, addresses, offsets []uint64) ([]uint64, error) { + n := len(symbols) + if n == 0 { + n = len(addresses) + } + + if n == 0 { + return nil, fmt.Errorf("%w: neither symbols nor addresses given", errInvalidInput) + } + + if symbols != nil && len(symbols) != n { + return nil, fmt.Errorf("%w: have %d symbols but want %d", errInvalidInput, len(symbols), n) + } + + if addresses != nil && len(addresses) != n { + return nil, fmt.Errorf("%w: have %d addresses but want %d", errInvalidInput, len(addresses), n) + } + + if offsets != nil && len(offsets) != n { + return nil, fmt.Errorf("%w: have %d offsets but want %d", errInvalidInput, len(offsets), n) + } + + results := make([]uint64, 0, n) + for i := 0; i < n; i++ { + var sym string + if symbols != nil { + sym = symbols[i] + } + + var addr, off uint64 + if addresses != nil { + addr = addresses[i] + } + + if offsets != nil { + off = offsets[i] + } + + result, err := ex.address(sym, addr, off) + if err != nil { + return nil, err + } + + results = append(results, result) + } + + return results, nil +} + +type uprobeMultiLink struct { + RawLink +} + +var _ Link = (*uprobeMultiLink)(nil) + +func (kml *uprobeMultiLink) Update(_ *ebpf.Program) error { + return fmt.Errorf("update uprobe_multi: %w", ErrNotSupported) +} + +var haveBPFLinkUprobeMulti = internal.NewFeatureTest("bpf_link_uprobe_multi", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Name: "probe_upm_link", + Type: ebpf.Kprobe, + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + AttachType: ebpf.AttachTraceUprobeMulti, + License: "MIT", + }) + if errors.Is(err, unix.E2BIG) { + // Kernel doesn't support AttachType field. + return internal.ErrNotSupported + } + if err != nil { + return err + } + defer prog.Close() + + // We try to create uprobe multi link on '/' path which results in + // error with -EBADF in case uprobe multi link is supported. + fd, err := sys.LinkCreateUprobeMulti(&sys.LinkCreateUprobeMultiAttr{ + ProgFd: uint32(prog.FD()), + AttachType: sys.BPF_TRACE_UPROBE_MULTI, + Path: sys.NewStringPointer("/"), + Offsets: sys.NewPointer(unsafe.Pointer(&[]uint64{0})), + Count: 1, + }) + switch { + case errors.Is(err, unix.EBADF): + return nil + case errors.Is(err, unix.EINVAL): + return internal.ErrNotSupported + case err != nil: + return err + } + + // should not happen + fd.Close() + return errors.New("successfully attached uprobe_multi to /, kernel bug?") +}, "6.6") diff --git a/vendor/github.com/cilium/ebpf/link/xdp.go b/vendor/github.com/cilium/ebpf/link/xdp.go new file mode 100644 index 0000000000..2ec441229a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/xdp.go @@ -0,0 +1,80 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/sys" +) + +// XDPAttachFlags represents how XDP program will be attached to interface. +type XDPAttachFlags uint32 + +const ( + // XDPGenericMode (SKB) links XDP BPF program for drivers which do + // not yet support native XDP. + XDPGenericMode XDPAttachFlags = 1 << (iota + 1) + // XDPDriverMode links XDP BPF program into the driver’s receive path. + XDPDriverMode + // XDPOffloadMode offloads the entire XDP BPF program into hardware. + XDPOffloadMode +) + +type XDPOptions struct { + // Program must be an XDP BPF program. + Program *ebpf.Program + + // Interface is the interface index to attach program to. + Interface int + + // Flags is one of XDPAttachFlags (optional). + // + // Only one XDP mode should be set, without flag defaults + // to driver/generic mode (best effort). + Flags XDPAttachFlags +} + +// AttachXDP links an XDP BPF program to an XDP hook. +func AttachXDP(opts XDPOptions) (Link, error) { + if t := opts.Program.Type(); t != ebpf.XDP { + return nil, fmt.Errorf("invalid program type %s, expected XDP", t) + } + + if opts.Interface < 1 { + return nil, fmt.Errorf("invalid interface index: %d", opts.Interface) + } + + rawLink, err := AttachRawLink(RawLinkOptions{ + Program: opts.Program, + Attach: ebpf.AttachXDP, + Target: opts.Interface, + Flags: uint32(opts.Flags), + }) + + if err != nil { + return nil, fmt.Errorf("failed to attach link: %w", err) + } + + return &xdpLink{*rawLink}, nil +} + +type xdpLink struct { + RawLink +} + +func (xdp *xdpLink) Info() (*Info, error) { + var info sys.XDPLinkInfo + if err := sys.ObjInfo(xdp.fd, &info); err != nil { + return nil, fmt.Errorf("xdp link info: %s", err) + } + extra := &XDPInfo{ + Ifindex: info.Ifindex, + } + + return &Info{ + info.Type, + info.Id, + ebpf.ProgramID(info.ProgId), + extra, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/linker.go b/vendor/github.com/cilium/ebpf/linker.go new file mode 100644 index 0000000000..024b72bbc8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/linker.go @@ -0,0 +1,506 @@ +package ebpf + +import ( + "debug/elf" + "encoding/binary" + "errors" + "fmt" + "io" + "io/fs" + "math" + "slices" + "strings" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/kallsyms" +) + +// handles stores handle objects to avoid gc cleanup +type handles []*btf.Handle + +func (hs *handles) add(h *btf.Handle) (int, error) { + if h == nil { + return 0, nil + } + + if len(*hs) == math.MaxInt16 { + return 0, fmt.Errorf("can't add more than %d module FDs to fdArray", math.MaxInt16) + } + + *hs = append(*hs, h) + + // return length of slice so that indexes start at 1 + return len(*hs), nil +} + +func (hs handles) fdArray() []int32 { + // first element of fda is reserved as no module can be indexed with 0 + fda := []int32{0} + for _, h := range hs { + fda = append(fda, int32(h.FD())) + } + + return fda +} + +func (hs *handles) Close() error { + var errs []error + for _, h := range *hs { + errs = append(errs, h.Close()) + } + return errors.Join(errs...) +} + +// splitSymbols splits insns into subsections delimited by Symbol Instructions. +// insns cannot be empty and must start with a Symbol Instruction. +// +// The resulting map is indexed by Symbol name. +func splitSymbols(insns asm.Instructions) (map[string]asm.Instructions, error) { + if len(insns) == 0 { + return nil, errors.New("insns is empty") + } + + currentSym := insns[0].Symbol() + if currentSym == "" { + return nil, errors.New("insns must start with a Symbol") + } + + start := 0 + progs := make(map[string]asm.Instructions) + for i, ins := range insns[1:] { + i := i + 1 + + sym := ins.Symbol() + if sym == "" { + continue + } + + // New symbol, flush the old one out. + progs[currentSym] = slices.Clone(insns[start:i]) + + if progs[sym] != nil { + return nil, fmt.Errorf("insns contains duplicate Symbol %s", sym) + } + currentSym = sym + start = i + } + + if tail := insns[start:]; len(tail) > 0 { + progs[currentSym] = slices.Clone(tail) + } + + return progs, nil +} + +// The linker is responsible for resolving bpf-to-bpf calls between programs +// within an ELF. Each BPF program must be a self-contained binary blob, +// so when an instruction in one ELF program section wants to jump to +// a function in another, the linker needs to pull in the bytecode +// (and BTF info) of the target function and concatenate the instruction +// streams. +// +// Later on in the pipeline, all call sites are fixed up with relative jumps +// within this newly-created instruction stream to then finally hand off to +// the kernel with BPF_PROG_LOAD. +// +// Each function is denoted by an ELF symbol and the compiler takes care of +// register setup before each jump instruction. + +// hasFunctionReferences returns true if insns contains one or more bpf2bpf +// function references. +func hasFunctionReferences(insns asm.Instructions) bool { + for _, i := range insns { + if i.IsFunctionReference() { + return true + } + } + return false +} + +// applyRelocations collects and applies any CO-RE relocations in insns. +// +// Passing a nil target will relocate against the running kernel. insns are +// modified in place. +func applyRelocations(insns asm.Instructions, targets []*btf.Spec, kmodName string, bo binary.ByteOrder, b *btf.Builder) error { + var relos []*btf.CORERelocation + var reloInsns []*asm.Instruction + iter := insns.Iterate() + for iter.Next() { + if relo := btf.CORERelocationMetadata(iter.Ins); relo != nil { + relos = append(relos, relo) + reloInsns = append(reloInsns, iter.Ins) + } + } + + if len(relos) == 0 { + return nil + } + + if bo == nil { + bo = internal.NativeEndian + } + + if len(targets) == 0 { + kernelTarget, err := btf.LoadKernelSpec() + if err != nil { + return fmt.Errorf("load kernel spec: %w", err) + } + targets = append(targets, kernelTarget) + + if kmodName != "" { + kmodTarget, err := btf.LoadKernelModuleSpec(kmodName) + // Ignore ErrNotExists to cater to kernels which have CONFIG_DEBUG_INFO_BTF_MODULES disabled. + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("load kernel module spec: %w", err) + } + if err == nil { + targets = append(targets, kmodTarget) + } + } + } + + fixups, err := btf.CORERelocate(relos, targets, bo, b.Add) + if err != nil { + return err + } + + for i, fixup := range fixups { + if err := fixup.Apply(reloInsns[i]); err != nil { + return fmt.Errorf("fixup for %s: %w", relos[i], err) + } + } + + return nil +} + +// flattenPrograms resolves bpf-to-bpf calls for a set of programs. +// +// Links all programs in names by modifying their ProgramSpec in progs. +func flattenPrograms(progs map[string]*ProgramSpec, names []string) { + // Pre-calculate all function references. + refs := make(map[*ProgramSpec][]string) + for _, prog := range progs { + refs[prog] = prog.Instructions.FunctionReferences() + } + + // Create a flattened instruction stream, but don't modify progs yet to + // avoid linking multiple times. + flattened := make([]asm.Instructions, 0, len(names)) + for _, name := range names { + flattened = append(flattened, flattenInstructions(name, progs, refs)) + } + + // Finally, assign the flattened instructions. + for i, name := range names { + progs[name].Instructions = flattened[i] + } +} + +// flattenInstructions resolves bpf-to-bpf calls for a single program. +// +// Flattens the instructions of prog by concatenating the instructions of all +// direct and indirect dependencies. +// +// progs contains all referenceable programs, while refs contain the direct +// dependencies of each program. +func flattenInstructions(name string, progs map[string]*ProgramSpec, refs map[*ProgramSpec][]string) asm.Instructions { + prog := progs[name] + progRefs := refs[prog] + + if len(progRefs) == 0 { + // No references, nothing to do. + return prog.Instructions + } + + insns := make(asm.Instructions, len(prog.Instructions)) + copy(insns, prog.Instructions) + + // Add all direct references of prog to the list of to be linked programs. + pending := make([]string, len(progRefs)) + copy(pending, progRefs) + + // All references for which we've appended instructions. + linked := make(map[string]bool) + + // Iterate all pending references. We can't use a range since pending is + // modified in the body below. + for len(pending) > 0 { + var ref string + ref, pending = pending[0], pending[1:] + + if linked[ref] { + // We've already linked this ref, don't append instructions again. + continue + } + + progRef := progs[ref] + if progRef == nil { + // We don't have instructions that go with this reference. This + // happens when calling extern functions. + continue + } + + insns = append(insns, progRef.Instructions...) + linked[ref] = true + + // Make sure we link indirect references. + pending = append(pending, refs[progRef]...) + } + + return insns +} + +// fixupAndValidate is called by the ELF reader right before marshaling the +// instruction stream. It performs last-minute adjustments to the program and +// runs some sanity checks before sending it off to the kernel. +func fixupAndValidate(insns asm.Instructions) error { + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + + // Map load was tagged with a Reference, but does not contain a Map pointer. + needsMap := ins.Reference() != "" || ins.Metadata.Get(kconfigMetaKey{}) != nil + if ins.IsLoadFromMap() && needsMap && ins.Map() == nil { + return fmt.Errorf("instruction %d: %w", iter.Index, asm.ErrUnsatisfiedMapReference) + } + + fixupProbeReadKernel(ins) + } + + return nil +} + +// POISON_CALL_KFUNC_BASE in libbpf. +// https://github.com/libbpf/libbpf/blob/2778cbce609aa1e2747a69349f7f46a2f94f0522/src/libbpf.c#L5767 +const kfuncCallPoisonBase = 2002000000 + +// fixupKfuncs loops over all instructions in search for kfunc calls. +// If at least one is found, the current kernels BTF and module BTFis are searched to set Instruction.Constant +// and Instruction.Offset to the correct values. +func fixupKfuncs(insns asm.Instructions) (_ handles, err error) { + closeOnError := func(c io.Closer) { + if err != nil { + c.Close() + } + } + + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + if metadata := ins.Metadata.Get(kfuncMetaKey{}); metadata != nil { + goto fixups + } + } + + return nil, nil + +fixups: + // only load the kernel spec if we found at least one kfunc call + kernelSpec, err := btf.LoadKernelSpec() + if err != nil { + return nil, err + } + + fdArray := make(handles, 0) + defer closeOnError(&fdArray) + + for { + ins := iter.Ins + + metadata := ins.Metadata.Get(kfuncMetaKey{}) + if metadata == nil { + if !iter.Next() { + // break loop if this was the last instruction in the stream. + break + } + continue + } + + // check meta, if no meta return err + kfm, _ := metadata.(*kfuncMeta) + if kfm == nil { + return nil, fmt.Errorf("kfuncMetaKey doesn't contain kfuncMeta") + } + + target := btf.Type((*btf.Func)(nil)) + spec, module, err := findTargetInKernel(kernelSpec, kfm.Func.Name, &target) + if kfm.Binding == elf.STB_WEAK && errors.Is(err, btf.ErrNotFound) { + if ins.IsKfuncCall() { + // If the kfunc call is weak and not found, poison the call. Use a recognizable constant + // to make it easier to debug. And set src to zero so the verifier doesn't complain + // about the invalid imm/offset values before dead-code elimination. + ins.Constant = kfuncCallPoisonBase + ins.Src = 0 + } else if ins.OpCode.IsDWordLoad() { + // If the kfunc DWordLoad is weak and not found, set its address to 0. + ins.Constant = 0 + ins.Src = 0 + } else { + return nil, fmt.Errorf("only kfunc calls and dword loads may have kfunc metadata") + } + + iter.Next() + continue + } + // Error on non-weak kfunc not found. + if errors.Is(err, btf.ErrNotFound) { + return nil, fmt.Errorf("kfunc %q: %w", kfm.Func.Name, ErrNotSupported) + } + if err != nil { + return nil, err + } + + idx, err := fdArray.add(module) + if err != nil { + return nil, err + } + + if err := btf.CheckTypeCompatibility(kfm.Func.Type, target.(*btf.Func).Type); err != nil { + return nil, &incompatibleKfuncError{kfm.Func.Name, err} + } + + id, err := spec.TypeID(target) + if err != nil { + return nil, err + } + + ins.Constant = int64(id) + ins.Offset = int16(idx) + + if !iter.Next() { + break + } + } + + return fdArray, nil +} + +type incompatibleKfuncError struct { + name string + err error +} + +func (ike *incompatibleKfuncError) Error() string { + return fmt.Sprintf("kfunc %q: %s", ike.name, ike.err) +} + +// fixupProbeReadKernel replaces calls to bpf_probe_read_{kernel,user}(_str) +// with bpf_probe_read(_str) on kernels that don't support it yet. +func fixupProbeReadKernel(ins *asm.Instruction) { + if !ins.IsBuiltinCall() { + return + } + + // Kernel supports bpf_probe_read_kernel, nothing to do. + if haveProbeReadKernel() == nil { + return + } + + switch asm.BuiltinFunc(ins.Constant) { + case asm.FnProbeReadKernel, asm.FnProbeReadUser: + ins.Constant = int64(asm.FnProbeRead) + case asm.FnProbeReadKernelStr, asm.FnProbeReadUserStr: + ins.Constant = int64(asm.FnProbeReadStr) + } +} + +// resolveKconfigReferences creates and populates a .kconfig map if necessary. +// +// Returns a nil Map and no error if no references exist. +func resolveKconfigReferences(insns asm.Instructions) (_ *Map, err error) { + closeOnError := func(c io.Closer) { + if err != nil { + c.Close() + } + } + + var spec *MapSpec + iter := insns.Iterate() + for iter.Next() { + meta, _ := iter.Ins.Metadata.Get(kconfigMetaKey{}).(*kconfigMeta) + if meta != nil { + spec = meta.Map + break + } + } + + if spec == nil { + return nil, nil + } + + cpy := spec.Copy() + if err := resolveKconfig(cpy); err != nil { + return nil, err + } + + kconfig, err := NewMap(cpy) + if err != nil { + return nil, err + } + defer closeOnError(kconfig) + + // Resolve all instructions which load from .kconfig map with actual map + // and offset inside it. + iter = insns.Iterate() + for iter.Next() { + meta, _ := iter.Ins.Metadata.Get(kconfigMetaKey{}).(*kconfigMeta) + if meta == nil { + continue + } + + if meta.Map != spec { + return nil, fmt.Errorf("instruction %d: reference to multiple .kconfig maps is not allowed", iter.Index) + } + + if err := iter.Ins.AssociateMap(kconfig); err != nil { + return nil, fmt.Errorf("instruction %d: %w", iter.Index, err) + } + + // Encode a map read at the offset of the var in the datasec. + iter.Ins.Constant = int64(uint64(meta.Offset) << 32) + iter.Ins.Metadata.Set(kconfigMetaKey{}, nil) + } + + return kconfig, nil +} + +func resolveKsymReferences(insns asm.Instructions) error { + var missing []string + + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + meta, _ := ins.Metadata.Get(ksymMetaKey{}).(*ksymMeta) + if meta == nil { + continue + } + + addr, err := kallsyms.Address(meta.Name) + if err != nil { + return fmt.Errorf("resolve ksym %s: %w", meta.Name, err) + } + if addr != 0 { + ins.Constant = int64(addr) + continue + } + + if meta.Binding == elf.STB_WEAK { + // A weak ksym variable in eBPF C means its resolution is optional. + // Set a zero constant explicitly for clarity. + ins.Constant = 0 + continue + } + + if !slices.Contains(missing, meta.Name) { + missing = append(missing, meta.Name) + } + } + + if len(missing) > 0 { + return fmt.Errorf("kernel is missing symbol: %s", strings.Join(missing, ",")) + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/map.go b/vendor/github.com/cilium/ebpf/map.go new file mode 100644 index 0000000000..e5d8bd7809 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/map.go @@ -0,0 +1,1750 @@ +package ebpf + +import ( + "bytes" + "errors" + "fmt" + "io" + "math/rand" + "os" + "path/filepath" + "reflect" + "slices" + "strings" + "sync" + "time" + "unsafe" + + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/sysenc" + "github.com/cilium/ebpf/internal/unix" +) + +// Errors returned by Map and MapIterator methods. +var ( + ErrKeyNotExist = errors.New("key does not exist") + ErrKeyExist = errors.New("key already exists") + ErrIterationAborted = errors.New("iteration aborted") + ErrMapIncompatible = errors.New("map spec is incompatible with existing map") + errMapNoBTFValue = errors.New("map spec does not contain a BTF Value") + + // pre-allocating these errors here since they may get called in hot code paths + // and cause unnecessary memory allocations + errMapLookupKeyNotExist = fmt.Errorf("lookup: %w", sysErrKeyNotExist) +) + +// MapOptions control loading a map into the kernel. +type MapOptions struct { + // The base path to pin maps in if requested via PinByName. + // Existing maps will be re-used if they are compatible, otherwise an + // error is returned. + PinPath string + LoadPinOptions LoadPinOptions +} + +// MapID represents the unique ID of an eBPF map +type MapID uint32 + +// MapSpec defines a Map. +type MapSpec struct { + // Name is passed to the kernel as a debug aid. Must only contain + // alpha numeric and '_' characters. + Name string + Type MapType + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + + // Flags is passed to the kernel and specifies additional map + // creation attributes. + Flags uint32 + + // Automatically pin and load a map from MapOptions.PinPath. + // Generates an error if an existing pinned map is incompatible with the MapSpec. + Pinning PinType + + // Specify numa node during map creation + // (effective only if sys.BPF_F_NUMA_NODE flag is set, + // which can be imported from golang.org/x/sys/unix) + NumaNode uint32 + + // The initial contents of the map. May be nil. + Contents []MapKV + + // InnerMap is used as a template for ArrayOfMaps and HashOfMaps + InnerMap *MapSpec + + // Extra trailing bytes found in the ELF map definition when using structs + // larger than libbpf's bpf_map_def. nil if no trailing bytes were present. + // Must be nil or empty before instantiating the MapSpec into a Map. + Extra *bytes.Reader + + // The key and value type of this map. May be nil. + Key, Value btf.Type +} + +func (ms *MapSpec) String() string { + return fmt.Sprintf("%s(keySize=%d, valueSize=%d, maxEntries=%d, flags=%d)", ms.Type, ms.KeySize, ms.ValueSize, ms.MaxEntries, ms.Flags) +} + +// Copy returns a copy of the spec. +// +// MapSpec.Contents is a shallow copy. +func (ms *MapSpec) Copy() *MapSpec { + if ms == nil { + return nil + } + + cpy := *ms + cpy.Contents = slices.Clone(cpy.Contents) + cpy.Key = btf.Copy(cpy.Key) + cpy.Value = btf.Copy(cpy.Value) + + if cpy.InnerMap == ms { + cpy.InnerMap = &cpy + } else { + cpy.InnerMap = ms.InnerMap.Copy() + } + + if cpy.Extra != nil { + extra := *cpy.Extra + cpy.Extra = &extra + } + + return &cpy +} + +// fixupMagicFields fills fields of MapSpec which are usually +// left empty in ELF or which depend on runtime information. +// +// The method doesn't modify Spec, instead returning a copy. +// The copy is only performed if fixups are necessary, so callers mustn't mutate +// the returned spec. +func (spec *MapSpec) fixupMagicFields() (*MapSpec, error) { + switch spec.Type { + case ArrayOfMaps, HashOfMaps: + if spec.ValueSize != 0 && spec.ValueSize != 4 { + return nil, errors.New("ValueSize must be zero or four for map of map") + } + + spec = spec.Copy() + spec.ValueSize = 4 + + case PerfEventArray: + if spec.KeySize != 0 && spec.KeySize != 4 { + return nil, errors.New("KeySize must be zero or four for perf event array") + } + + if spec.ValueSize != 0 && spec.ValueSize != 4 { + return nil, errors.New("ValueSize must be zero or four for perf event array") + } + + spec = spec.Copy() + spec.KeySize = 4 + spec.ValueSize = 4 + + n, err := PossibleCPU() + if err != nil { + return nil, fmt.Errorf("fixup perf event array: %w", err) + } + + if n := uint32(n); spec.MaxEntries == 0 || spec.MaxEntries > n { + // MaxEntries should be zero most of the time, but there is code + // out there which hardcodes large constants. Clamp the number + // of entries to the number of CPUs at most. Allow creating maps with + // less than n items since some kernel selftests relied on this + // behaviour in the past. + spec.MaxEntries = n + } + + case CPUMap: + n, err := PossibleCPU() + if err != nil { + return nil, fmt.Errorf("fixup cpu map: %w", err) + } + + if n := uint32(n); spec.MaxEntries == 0 || spec.MaxEntries > n { + // Perform clamping similar to PerfEventArray. + spec.MaxEntries = n + } + } + + return spec, nil +} + +// dataSection returns the contents and BTF Datasec descriptor of the spec. +func (ms *MapSpec) dataSection() ([]byte, *btf.Datasec, error) { + if ms.Value == nil { + return nil, nil, errMapNoBTFValue + } + + ds, ok := ms.Value.(*btf.Datasec) + if !ok { + return nil, nil, fmt.Errorf("map value BTF is a %T, not a *btf.Datasec", ms.Value) + } + + if n := len(ms.Contents); n != 1 { + return nil, nil, fmt.Errorf("expected one key, found %d", n) + } + + kv := ms.Contents[0] + value, ok := kv.Value.([]byte) + if !ok { + return nil, nil, fmt.Errorf("value at first map key is %T, not []byte", kv.Value) + } + + return value, ds, nil +} + +func (ms *MapSpec) readOnly() bool { + return (ms.Flags & sys.BPF_F_RDONLY_PROG) > 0 +} + +func (ms *MapSpec) writeOnly() bool { + return (ms.Flags & sys.BPF_F_WRONLY_PROG) > 0 +} + +// MapKV is used to initialize the contents of a Map. +type MapKV struct { + Key interface{} + Value interface{} +} + +// Compatible returns nil if an existing map may be used instead of creating +// one from the spec. +// +// Returns an error wrapping [ErrMapIncompatible] otherwise. +func (ms *MapSpec) Compatible(m *Map) error { + ms, err := ms.fixupMagicFields() + if err != nil { + return err + } + + diffs := []string{} + if m.typ != ms.Type { + diffs = append(diffs, fmt.Sprintf("Type: %s changed to %s", m.typ, ms.Type)) + } + if m.keySize != ms.KeySize { + diffs = append(diffs, fmt.Sprintf("KeySize: %d changed to %d", m.keySize, ms.KeySize)) + } + if m.valueSize != ms.ValueSize { + diffs = append(diffs, fmt.Sprintf("ValueSize: %d changed to %d", m.valueSize, ms.ValueSize)) + } + if m.maxEntries != ms.MaxEntries { + diffs = append(diffs, fmt.Sprintf("MaxEntries: %d changed to %d", m.maxEntries, ms.MaxEntries)) + } + + // BPF_F_RDONLY_PROG is set unconditionally for devmaps. Explicitly allow this + // mismatch. + if !((ms.Type == DevMap || ms.Type == DevMapHash) && m.flags^ms.Flags == sys.BPF_F_RDONLY_PROG) && + m.flags != ms.Flags { + diffs = append(diffs, fmt.Sprintf("Flags: %d changed to %d", m.flags, ms.Flags)) + } + + if len(diffs) == 0 { + return nil + } + + return fmt.Errorf("%s: %w", strings.Join(diffs, ", "), ErrMapIncompatible) +} + +// Map represents a Map file descriptor. +// +// It is not safe to close a map which is used by other goroutines. +// +// Methods which take interface{} arguments by default encode +// them using binary.Read/Write in the machine's native endianness. +// +// Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler +// if you require custom encoding. +type Map struct { + name string + fd *sys.FD + typ MapType + keySize uint32 + valueSize uint32 + maxEntries uint32 + flags uint32 + pinnedPath string + // Per CPU maps return values larger than the size in the spec + fullValueSize int + + memory *Memory +} + +// NewMapFromFD creates a map from a raw fd. +// +// You should not use fd after calling this function. +func NewMapFromFD(fd int) (*Map, error) { + f, err := sys.NewFD(fd) + if err != nil { + return nil, err + } + + return newMapFromFD(f) +} + +func newMapFromFD(fd *sys.FD) (*Map, error) { + info, err := newMapInfoFromFd(fd) + if err != nil { + fd.Close() + return nil, fmt.Errorf("get map info: %w", err) + } + + return newMap(fd, info.Name, info.Type, info.KeySize, info.ValueSize, info.MaxEntries, info.Flags) +} + +// NewMap creates a new Map. +// +// It's equivalent to calling NewMapWithOptions with default options. +func NewMap(spec *MapSpec) (*Map, error) { + return NewMapWithOptions(spec, MapOptions{}) +} + +// NewMapWithOptions creates a new Map. +// +// Creating a map for the first time will perform feature detection +// by creating small, temporary maps. +// +// The caller is responsible for ensuring the process' rlimit is set +// sufficiently high for locking memory during map creation. This can be done +// by calling rlimit.RemoveMemlock() prior to calling NewMapWithOptions. +// +// May return an error wrapping ErrMapIncompatible. +func NewMapWithOptions(spec *MapSpec, opts MapOptions) (*Map, error) { + m, err := newMapWithOptions(spec, opts) + if err != nil { + return nil, fmt.Errorf("creating map: %w", err) + } + + if err := m.finalize(spec); err != nil { + m.Close() + return nil, fmt.Errorf("populating map: %w", err) + } + + return m, nil +} + +func newMapWithOptions(spec *MapSpec, opts MapOptions) (_ *Map, err error) { + closeOnError := func(c io.Closer) { + if err != nil { + c.Close() + } + } + + switch spec.Pinning { + case PinByName: + if spec.Name == "" { + return nil, fmt.Errorf("pin by name: missing Name") + } + + if opts.PinPath == "" { + return nil, fmt.Errorf("pin by name: missing MapOptions.PinPath") + } + + path := filepath.Join(opts.PinPath, spec.Name) + m, err := LoadPinnedMap(path, &opts.LoadPinOptions) + if errors.Is(err, unix.ENOENT) { + break + } + if err != nil { + return nil, fmt.Errorf("load pinned map: %w", err) + } + defer closeOnError(m) + + if err := spec.Compatible(m); err != nil { + return nil, fmt.Errorf("use pinned map %s: %w", spec.Name, err) + } + + return m, nil + + case PinNone: + // Nothing to do here + + default: + return nil, fmt.Errorf("pin type %d: %w", int(spec.Pinning), ErrNotSupported) + } + + var innerFd *sys.FD + if spec.Type == ArrayOfMaps || spec.Type == HashOfMaps { + if spec.InnerMap == nil { + return nil, fmt.Errorf("%s requires InnerMap", spec.Type) + } + + if spec.InnerMap.Pinning != PinNone { + return nil, errors.New("inner maps cannot be pinned") + } + + template, err := spec.InnerMap.createMap(nil) + if err != nil { + return nil, fmt.Errorf("inner map: %w", err) + } + defer template.Close() + + // Intentionally skip populating and freezing (finalizing) + // the inner map template since it will be removed shortly. + + innerFd = template.fd + } + + m, err := spec.createMap(innerFd) + if err != nil { + return nil, err + } + defer closeOnError(m) + + if spec.Pinning == PinByName { + path := filepath.Join(opts.PinPath, spec.Name) + if err := m.Pin(path); err != nil { + return nil, fmt.Errorf("pin map to %s: %w", path, err) + } + } + + return m, nil +} + +// Memory returns a memory-mapped region for the Map. The Map must have been +// created with the BPF_F_MMAPABLE flag. Repeated calls to Memory return the +// same mapping. Callers are responsible for coordinating access to Memory. +func (m *Map) Memory() (*Memory, error) { + if m.memory != nil { + return m.memory, nil + } + + if m.flags&sys.BPF_F_MMAPABLE == 0 { + return nil, fmt.Errorf("Map was not created with the BPF_F_MMAPABLE flag: %w", ErrNotSupported) + } + + size, err := m.memorySize() + if err != nil { + return nil, err + } + + mm, err := newMemory(m.FD(), size) + if err != nil { + return nil, fmt.Errorf("creating new Memory: %w", err) + } + + m.memory = mm + + return mm, nil +} + +func (m *Map) memorySize() (int, error) { + switch m.Type() { + case Array: + // In Arrays, values are always laid out on 8-byte boundaries regardless of + // architecture. Multiply by MaxEntries and align the result to the host's + // page size. + size := int(internal.Align(m.ValueSize(), 8) * m.MaxEntries()) + size = internal.Align(size, os.Getpagesize()) + return size, nil + case Arena: + // For Arenas, MaxEntries denotes the maximum number of pages available to + // the arena. + return int(m.MaxEntries()) * os.Getpagesize(), nil + } + + return 0, fmt.Errorf("determine memory size of map type %s: %w", m.Type(), ErrNotSupported) +} + +// createMap validates the spec's properties and creates the map in the kernel +// using the given opts. It does not populate or freeze the map. +func (spec *MapSpec) createMap(inner *sys.FD) (_ *Map, err error) { + closeOnError := func(closer io.Closer) { + if err != nil { + closer.Close() + } + } + + // Kernels 4.13 through 5.4 used a struct bpf_map_def that contained + // additional 'inner_map_idx' and later 'numa_node' fields. + // In order to support loading these definitions, tolerate the presence of + // extra bytes, but require them to be zeroes. + if spec.Extra != nil { + if _, err := io.Copy(internal.DiscardZeroes{}, spec.Extra); err != nil { + return nil, errors.New("extra contains unhandled non-zero bytes, drain before creating map") + } + } + + spec, err = spec.fixupMagicFields() + if err != nil { + return nil, err + } + + attr := sys.MapCreateAttr{ + MapType: sys.MapType(spec.Type), + KeySize: spec.KeySize, + ValueSize: spec.ValueSize, + MaxEntries: spec.MaxEntries, + MapFlags: spec.Flags, + NumaNode: spec.NumaNode, + } + + if inner != nil { + attr.InnerMapFd = inner.Uint() + } + + if haveObjName() == nil { + attr.MapName = sys.NewObjName(spec.Name) + } + + if spec.Key != nil || spec.Value != nil { + handle, keyTypeID, valueTypeID, err := btf.MarshalMapKV(spec.Key, spec.Value) + if err != nil && !errors.Is(err, btf.ErrNotSupported) { + return nil, fmt.Errorf("load BTF: %w", err) + } + + if handle != nil { + defer handle.Close() + + // Use BTF k/v during map creation. + attr.BtfFd = uint32(handle.FD()) + attr.BtfKeyTypeId = keyTypeID + attr.BtfValueTypeId = valueTypeID + } + } + + fd, err := sys.MapCreate(&attr) + + // Some map types don't support BTF k/v in earlier kernel versions. + // Remove BTF metadata and retry map creation. + if (errors.Is(err, sys.ENOTSUPP) || errors.Is(err, unix.EINVAL)) && attr.BtfFd != 0 { + attr.BtfFd, attr.BtfKeyTypeId, attr.BtfValueTypeId = 0, 0, 0 + fd, err = sys.MapCreate(&attr) + } + if err != nil { + return nil, handleMapCreateError(attr, spec, err) + } + + defer closeOnError(fd) + m, err := newMap(fd, spec.Name, spec.Type, spec.KeySize, spec.ValueSize, spec.MaxEntries, spec.Flags) + if err != nil { + return nil, fmt.Errorf("map create: %w", err) + } + return m, nil +} + +func handleMapCreateError(attr sys.MapCreateAttr, spec *MapSpec, err error) error { + if errors.Is(err, unix.EPERM) { + return fmt.Errorf("map create: %w (MEMLOCK may be too low, consider rlimit.RemoveMemlock)", err) + } + if errors.Is(err, unix.EINVAL) && spec.MaxEntries == 0 { + return fmt.Errorf("map create: %w (MaxEntries may be incorrectly set to zero)", err) + } + if errors.Is(err, unix.EINVAL) && spec.Type == UnspecifiedMap { + return fmt.Errorf("map create: cannot use type %s", UnspecifiedMap) + } + if errors.Is(err, unix.EINVAL) && spec.Flags&sys.BPF_F_NO_PREALLOC > 0 { + return fmt.Errorf("map create: %w (noPrealloc flag may be incompatible with map type %s)", err, spec.Type) + } + + if spec.Type.canStoreMap() { + if haveFeatErr := haveNestedMaps(); haveFeatErr != nil { + return fmt.Errorf("map create: %w", haveFeatErr) + } + } + + if spec.readOnly() || spec.writeOnly() { + if haveFeatErr := haveMapMutabilityModifiers(); haveFeatErr != nil { + return fmt.Errorf("map create: %w", haveFeatErr) + } + } + if spec.Flags&sys.BPF_F_MMAPABLE > 0 { + if haveFeatErr := haveMmapableMaps(); haveFeatErr != nil { + return fmt.Errorf("map create: %w", haveFeatErr) + } + } + if spec.Flags&sys.BPF_F_INNER_MAP > 0 { + if haveFeatErr := haveInnerMaps(); haveFeatErr != nil { + return fmt.Errorf("map create: %w", haveFeatErr) + } + } + if spec.Flags&sys.BPF_F_NO_PREALLOC > 0 { + if haveFeatErr := haveNoPreallocMaps(); haveFeatErr != nil { + return fmt.Errorf("map create: %w", haveFeatErr) + } + } + // BPF_MAP_TYPE_RINGBUF's max_entries must be a power-of-2 multiple of kernel's page size. + if errors.Is(err, unix.EINVAL) && + (attr.MapType == sys.BPF_MAP_TYPE_RINGBUF || attr.MapType == sys.BPF_MAP_TYPE_USER_RINGBUF) { + pageSize := uint32(os.Getpagesize()) + maxEntries := attr.MaxEntries + if maxEntries%pageSize != 0 || !internal.IsPow(maxEntries) { + return fmt.Errorf("map create: %w (ring map size %d not a multiple of page size %d)", err, maxEntries, pageSize) + } + } + + return fmt.Errorf("map create: %w", err) +} + +// newMap allocates and returns a new Map structure. +// Sets the fullValueSize on per-CPU maps. +func newMap(fd *sys.FD, name string, typ MapType, keySize, valueSize, maxEntries, flags uint32) (*Map, error) { + m := &Map{ + name, + fd, + typ, + keySize, + valueSize, + maxEntries, + flags, + "", + int(valueSize), + nil, + } + + if !typ.hasPerCPUValue() { + return m, nil + } + + possibleCPUs, err := PossibleCPU() + if err != nil { + return nil, err + } + + m.fullValueSize = int(internal.Align(valueSize, 8)) * possibleCPUs + return m, nil +} + +func (m *Map) String() string { + if m.name != "" { + return fmt.Sprintf("%s(%s)#%v", m.typ, m.name, m.fd) + } + return fmt.Sprintf("%s#%v", m.typ, m.fd) +} + +// Type returns the underlying type of the map. +func (m *Map) Type() MapType { + return m.typ +} + +// KeySize returns the size of the map key in bytes. +func (m *Map) KeySize() uint32 { + return m.keySize +} + +// ValueSize returns the size of the map value in bytes. +func (m *Map) ValueSize() uint32 { + return m.valueSize +} + +// MaxEntries returns the maximum number of elements the map can hold. +func (m *Map) MaxEntries() uint32 { + return m.maxEntries +} + +// Flags returns the flags of the map. +func (m *Map) Flags() uint32 { + return m.flags +} + +// Info returns metadata about the map. This was first introduced in Linux 4.5, +// but newer kernels support more MapInfo fields with the introduction of more +// features. See [MapInfo] and its methods for more details. +// +// Returns an error wrapping ErrNotSupported if the kernel supports neither +// BPF_OBJ_GET_INFO_BY_FD nor reading map information from /proc/self/fdinfo. +func (m *Map) Info() (*MapInfo, error) { + return newMapInfoFromFd(m.fd) +} + +// Handle returns a reference to the Map's type information in the kernel. +// +// Returns ErrNotSupported if the kernel has no BTF support, or if there is no +// BTF associated with the Map. +func (m *Map) Handle() (*btf.Handle, error) { + info, err := m.Info() + if err != nil { + return nil, err + } + + id, ok := info.BTFID() + if !ok { + return nil, fmt.Errorf("map %s: retrieve BTF ID: %w", m, ErrNotSupported) + } + + return btf.NewHandleFromID(id) +} + +// MapLookupFlags controls the behaviour of the map lookup calls. +type MapLookupFlags uint64 + +// LookupLock look up the value of a spin-locked map. +const LookupLock MapLookupFlags = sys.BPF_F_LOCK + +// Lookup retrieves a value from a Map. +// +// Calls Close() on valueOut if it is of type **Map or **Program, +// and *valueOut is not nil. +// +// Returns an error if the key doesn't exist, see ErrKeyNotExist. +func (m *Map) Lookup(key, valueOut interface{}) error { + return m.LookupWithFlags(key, valueOut, 0) +} + +// LookupWithFlags retrieves a value from a Map with flags. +// +// Passing LookupLock flag will look up the value of a spin-locked +// map without returning the lock. This must be specified if the +// elements contain a spinlock. +// +// Calls Close() on valueOut if it is of type **Map or **Program, +// and *valueOut is not nil. +// +// Returns an error if the key doesn't exist, see ErrKeyNotExist. +func (m *Map) LookupWithFlags(key, valueOut interface{}, flags MapLookupFlags) error { + if m.typ.hasPerCPUValue() { + return m.lookupPerCPU(key, valueOut, flags) + } + + valueBytes := makeMapSyscallOutput(valueOut, m.fullValueSize) + if err := m.lookup(key, valueBytes.Pointer(), flags); err != nil { + return err + } + + return m.unmarshalValue(valueOut, valueBytes) +} + +// LookupAndDelete retrieves and deletes a value from a Map. +// +// Returns ErrKeyNotExist if the key doesn't exist. +func (m *Map) LookupAndDelete(key, valueOut interface{}) error { + return m.LookupAndDeleteWithFlags(key, valueOut, 0) +} + +// LookupAndDeleteWithFlags retrieves and deletes a value from a Map. +// +// Passing LookupLock flag will look up and delete the value of a spin-locked +// map without returning the lock. This must be specified if the elements +// contain a spinlock. +// +// Returns ErrKeyNotExist if the key doesn't exist. +func (m *Map) LookupAndDeleteWithFlags(key, valueOut interface{}, flags MapLookupFlags) error { + if m.typ.hasPerCPUValue() { + return m.lookupAndDeletePerCPU(key, valueOut, flags) + } + + valueBytes := makeMapSyscallOutput(valueOut, m.fullValueSize) + if err := m.lookupAndDelete(key, valueBytes.Pointer(), flags); err != nil { + return err + } + return m.unmarshalValue(valueOut, valueBytes) +} + +// LookupBytes gets a value from Map. +// +// Returns a nil value if a key doesn't exist. +func (m *Map) LookupBytes(key interface{}) ([]byte, error) { + valueBytes := make([]byte, m.fullValueSize) + valuePtr := sys.NewSlicePointer(valueBytes) + + err := m.lookup(key, valuePtr, 0) + if errors.Is(err, ErrKeyNotExist) { + return nil, nil + } + + return valueBytes, err +} + +func (m *Map) lookupPerCPU(key, valueOut any, flags MapLookupFlags) error { + slice, err := ensurePerCPUSlice(valueOut) + if err != nil { + return err + } + valueBytes := make([]byte, m.fullValueSize) + if err := m.lookup(key, sys.NewSlicePointer(valueBytes), flags); err != nil { + return err + } + return unmarshalPerCPUValue(slice, int(m.valueSize), valueBytes) +} + +func (m *Map) lookup(key interface{}, valueOut sys.Pointer, flags MapLookupFlags) error { + keyPtr, err := m.marshalKey(key) + if err != nil { + return fmt.Errorf("can't marshal key: %w", err) + } + + attr := sys.MapLookupElemAttr{ + MapFd: m.fd.Uint(), + Key: keyPtr, + Value: valueOut, + Flags: uint64(flags), + } + + if err = sys.MapLookupElem(&attr); err != nil { + if errors.Is(err, unix.ENOENT) { + return errMapLookupKeyNotExist + } + return fmt.Errorf("lookup: %w", wrapMapError(err)) + } + return nil +} + +func (m *Map) lookupAndDeletePerCPU(key, valueOut any, flags MapLookupFlags) error { + slice, err := ensurePerCPUSlice(valueOut) + if err != nil { + return err + } + valueBytes := make([]byte, m.fullValueSize) + if err := m.lookupAndDelete(key, sys.NewSlicePointer(valueBytes), flags); err != nil { + return err + } + return unmarshalPerCPUValue(slice, int(m.valueSize), valueBytes) +} + +// ensurePerCPUSlice allocates a slice for a per-CPU value if necessary. +func ensurePerCPUSlice(sliceOrPtr any) (any, error) { + sliceOrPtrType := reflect.TypeOf(sliceOrPtr) + if sliceOrPtrType.Kind() == reflect.Slice { + // The target is a slice, the caller is responsible for ensuring that + // size is correct. + return sliceOrPtr, nil + } + + slicePtrType := sliceOrPtrType + if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice { + return nil, fmt.Errorf("per-cpu value requires a slice or a pointer to slice") + } + + possibleCPUs, err := PossibleCPU() + if err != nil { + return nil, err + } + + sliceType := slicePtrType.Elem() + slice := reflect.MakeSlice(sliceType, possibleCPUs, possibleCPUs) + + sliceElemType := sliceType.Elem() + sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr + reflect.ValueOf(sliceOrPtr).Elem().Set(slice) + if !sliceElemIsPointer { + return slice.Interface(), nil + } + sliceElemType = sliceElemType.Elem() + + for i := 0; i < possibleCPUs; i++ { + newElem := reflect.New(sliceElemType) + slice.Index(i).Set(newElem) + } + + return slice.Interface(), nil +} + +func (m *Map) lookupAndDelete(key any, valuePtr sys.Pointer, flags MapLookupFlags) error { + keyPtr, err := m.marshalKey(key) + if err != nil { + return fmt.Errorf("can't marshal key: %w", err) + } + + attr := sys.MapLookupAndDeleteElemAttr{ + MapFd: m.fd.Uint(), + Key: keyPtr, + Value: valuePtr, + Flags: uint64(flags), + } + + if err := sys.MapLookupAndDeleteElem(&attr); err != nil { + return fmt.Errorf("lookup and delete: %w", wrapMapError(err)) + } + + return nil +} + +// MapUpdateFlags controls the behaviour of the Map.Update call. +// +// The exact semantics depend on the specific MapType. +type MapUpdateFlags uint64 + +const ( + // UpdateAny creates a new element or update an existing one. + UpdateAny MapUpdateFlags = iota + // UpdateNoExist creates a new element. + UpdateNoExist MapUpdateFlags = 1 << (iota - 1) + // UpdateExist updates an existing element. + UpdateExist + // UpdateLock updates elements under bpf_spin_lock. + UpdateLock +) + +// Put replaces or creates a value in map. +// +// It is equivalent to calling Update with UpdateAny. +func (m *Map) Put(key, value interface{}) error { + return m.Update(key, value, UpdateAny) +} + +// Update changes the value of a key. +func (m *Map) Update(key, value any, flags MapUpdateFlags) error { + if m.typ.hasPerCPUValue() { + return m.updatePerCPU(key, value, flags) + } + + valuePtr, err := m.marshalValue(value) + if err != nil { + return fmt.Errorf("marshal value: %w", err) + } + + return m.update(key, valuePtr, flags) +} + +func (m *Map) updatePerCPU(key, value any, flags MapUpdateFlags) error { + valuePtr, err := marshalPerCPUValue(value, int(m.valueSize)) + if err != nil { + return fmt.Errorf("marshal value: %w", err) + } + + return m.update(key, valuePtr, flags) +} + +func (m *Map) update(key any, valuePtr sys.Pointer, flags MapUpdateFlags) error { + keyPtr, err := m.marshalKey(key) + if err != nil { + return fmt.Errorf("marshal key: %w", err) + } + + attr := sys.MapUpdateElemAttr{ + MapFd: m.fd.Uint(), + Key: keyPtr, + Value: valuePtr, + Flags: uint64(flags), + } + + if err = sys.MapUpdateElem(&attr); err != nil { + return fmt.Errorf("update: %w", wrapMapError(err)) + } + + return nil +} + +// Delete removes a value. +// +// Returns ErrKeyNotExist if the key does not exist. +func (m *Map) Delete(key interface{}) error { + keyPtr, err := m.marshalKey(key) + if err != nil { + return fmt.Errorf("can't marshal key: %w", err) + } + + attr := sys.MapDeleteElemAttr{ + MapFd: m.fd.Uint(), + Key: keyPtr, + } + + if err = sys.MapDeleteElem(&attr); err != nil { + return fmt.Errorf("delete: %w", wrapMapError(err)) + } + return nil +} + +// NextKey finds the key following an initial key. +// +// See NextKeyBytes for details. +// +// Returns ErrKeyNotExist if there is no next key. +func (m *Map) NextKey(key, nextKeyOut interface{}) error { + nextKeyBytes := makeMapSyscallOutput(nextKeyOut, int(m.keySize)) + + if err := m.nextKey(key, nextKeyBytes.Pointer()); err != nil { + return err + } + + if err := nextKeyBytes.Unmarshal(nextKeyOut); err != nil { + return fmt.Errorf("can't unmarshal next key: %w", err) + } + return nil +} + +// NextKeyBytes returns the key following an initial key as a byte slice. +// +// Passing nil will return the first key. +// +// Use Iterate if you want to traverse all entries in the map. +// +// Returns nil if there are no more keys. +func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) { + nextKey := make([]byte, m.keySize) + nextKeyPtr := sys.NewSlicePointer(nextKey) + + err := m.nextKey(key, nextKeyPtr) + if errors.Is(err, ErrKeyNotExist) { + return nil, nil + } + + return nextKey, err +} + +func (m *Map) nextKey(key interface{}, nextKeyOut sys.Pointer) error { + var ( + keyPtr sys.Pointer + err error + ) + + if key != nil { + keyPtr, err = m.marshalKey(key) + if err != nil { + return fmt.Errorf("can't marshal key: %w", err) + } + } + + attr := sys.MapGetNextKeyAttr{ + MapFd: m.fd.Uint(), + Key: keyPtr, + NextKey: nextKeyOut, + } + + if err = sys.MapGetNextKey(&attr); err != nil { + // Kernels 4.4.131 and earlier return EFAULT instead of a pointer to the + // first map element when a nil key pointer is specified. + if key == nil && errors.Is(err, unix.EFAULT) { + var guessKey []byte + guessKey, err = m.guessNonExistentKey() + if err != nil { + return err + } + + // Retry the syscall with a valid non-existing key. + attr.Key = sys.NewSlicePointer(guessKey) + if err = sys.MapGetNextKey(&attr); err == nil { + return nil + } + } + + return fmt.Errorf("next key: %w", wrapMapError(err)) + } + + return nil +} + +var mmapProtectedPage = sync.OnceValues(func() ([]byte, error) { + return unix.Mmap(-1, 0, os.Getpagesize(), unix.PROT_NONE, unix.MAP_ANON|unix.MAP_SHARED) +}) + +// guessNonExistentKey attempts to perform a map lookup that returns ENOENT. +// This is necessary on kernels before 4.4.132, since those don't support +// iterating maps from the start by providing an invalid key pointer. +func (m *Map) guessNonExistentKey() ([]byte, error) { + // Map a protected page and use that as the value pointer. This saves some + // work copying out the value, which we're not interested in. + page, err := mmapProtectedPage() + if err != nil { + return nil, err + } + valuePtr := sys.NewSlicePointer(page) + + randKey := make([]byte, int(m.keySize)) + + for i := 0; i < 4; i++ { + switch i { + // For hash maps, the 0 key is less likely to be occupied. They're often + // used for storing data related to pointers, and their access pattern is + // generally scattered across the keyspace. + case 0: + // An all-0xff key is guaranteed to be out of bounds of any array, since + // those have a fixed key size of 4 bytes. The only corner case being + // arrays with 2^32 max entries, but those are prohibitively expensive + // in many environments. + case 1: + for r := range randKey { + randKey[r] = 0xff + } + // Inspired by BCC, 0x55 is an alternating binary pattern (0101), so + // is unlikely to be taken. + case 2: + for r := range randKey { + randKey[r] = 0x55 + } + // Last ditch effort, generate a random key. + case 3: + rand.New(rand.NewSource(time.Now().UnixNano())).Read(randKey) + } + + err := m.lookup(randKey, valuePtr, 0) + if errors.Is(err, ErrKeyNotExist) { + return randKey, nil + } + } + + return nil, errors.New("couldn't find non-existing key") +} + +// BatchLookup looks up many elements in a map at once. +// +// "keysOut" and "valuesOut" must be of type slice, a pointer +// to a slice or buffer will not work. +// "cursor" is an pointer to an opaque handle. It must be non-nil. Pass +// "cursor" to subsequent calls of this function to continue the batching +// operation in the case of chunking. +// +// Warning: This API is not very safe to use as the kernel implementation for +// batching relies on the user to be aware of subtle details with regarding to +// different map type implementations. +// +// ErrKeyNotExist is returned when the batch lookup has reached +// the end of all possible results, even when partial results +// are returned. It should be used to evaluate when lookup is "done". +func (m *Map) BatchLookup(cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + n, err := m.batchLookup(sys.BPF_MAP_LOOKUP_BATCH, cursor, keysOut, valuesOut, opts) + if err != nil { + return n, fmt.Errorf("map batch lookup: %w", err) + } + return n, nil +} + +// BatchLookupAndDelete looks up many elements in a map at once, +// +// It then deletes all those elements. +// "keysOut" and "valuesOut" must be of type slice, a pointer +// to a slice or buffer will not work. +// "cursor" is an pointer to an opaque handle. It must be non-nil. Pass +// "cursor" to subsequent calls of this function to continue the batching +// operation in the case of chunking. +// +// Warning: This API is not very safe to use as the kernel implementation for +// batching relies on the user to be aware of subtle details with regarding to +// different map type implementations. +// +// ErrKeyNotExist is returned when the batch lookup has reached +// the end of all possible results, even when partial results +// are returned. It should be used to evaluate when lookup is "done". +func (m *Map) BatchLookupAndDelete(cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + n, err := m.batchLookup(sys.BPF_MAP_LOOKUP_AND_DELETE_BATCH, cursor, keysOut, valuesOut, opts) + if err != nil { + return n, fmt.Errorf("map batch lookup and delete: %w", err) + } + return n, nil +} + +// MapBatchCursor represents a starting point for a batch operation. +type MapBatchCursor struct { + m *Map + opaque []byte +} + +func (m *Map) batchLookup(cmd sys.Cmd, cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + if m.typ.hasPerCPUValue() { + return m.batchLookupPerCPU(cmd, cursor, keysOut, valuesOut, opts) + } + + count, err := batchCount(keysOut, valuesOut) + if err != nil { + return 0, err + } + + valueBuf := sysenc.SyscallOutput(valuesOut, count*int(m.fullValueSize)) + + n, err := m.batchLookupCmd(cmd, cursor, count, keysOut, valueBuf.Pointer(), opts) + if errors.Is(err, unix.ENOSPC) { + // Hash tables return ENOSPC when the size of the batch is smaller than + // any bucket. + return n, fmt.Errorf("%w (batch size too small?)", err) + } else if err != nil { + return n, err + } + + err = valueBuf.Unmarshal(valuesOut) + if err != nil { + return 0, err + } + + return n, nil +} + +func (m *Map) batchLookupPerCPU(cmd sys.Cmd, cursor *MapBatchCursor, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + count, err := sliceLen(keysOut) + if err != nil { + return 0, fmt.Errorf("keys: %w", err) + } + + valueBuf := make([]byte, count*int(m.fullValueSize)) + valuePtr := sys.NewSlicePointer(valueBuf) + + n, sysErr := m.batchLookupCmd(cmd, cursor, count, keysOut, valuePtr, opts) + if sysErr != nil && !errors.Is(sysErr, unix.ENOENT) { + return 0, err + } + + err = unmarshalBatchPerCPUValue(valuesOut, count, int(m.valueSize), valueBuf) + if err != nil { + return 0, err + } + + return n, sysErr +} + +func (m *Map) batchLookupCmd(cmd sys.Cmd, cursor *MapBatchCursor, count int, keysOut any, valuePtr sys.Pointer, opts *BatchOptions) (int, error) { + cursorLen := int(m.keySize) + if cursorLen < 4 { + // * generic_map_lookup_batch requires that batch_out is key_size bytes. + // This is used by array and LPM maps. + // + // * __htab_map_lookup_and_delete_batch requires u32. This is used by the + // various hash maps. + // + // Use a minimum of 4 bytes to avoid having to distinguish between the two. + cursorLen = 4 + } + + inBatch := cursor.opaque + if inBatch == nil { + // This is the first lookup, allocate a buffer to hold the cursor. + cursor.opaque = make([]byte, cursorLen) + cursor.m = m + } else if cursor.m != m { + // Prevent reuse of a cursor across maps. First, it's unlikely to work. + // Second, the maps may require different cursorLen and cursor.opaque + // may therefore be too short. This could lead to the kernel clobbering + // user space memory. + return 0, errors.New("a cursor may not be reused across maps") + } + + if err := haveBatchAPI(); err != nil { + return 0, err + } + + keyBuf := sysenc.SyscallOutput(keysOut, count*int(m.keySize)) + + attr := sys.MapLookupBatchAttr{ + MapFd: m.fd.Uint(), + Keys: keyBuf.Pointer(), + Values: valuePtr, + Count: uint32(count), + InBatch: sys.NewSlicePointer(inBatch), + OutBatch: sys.NewSlicePointer(cursor.opaque), + } + + if opts != nil { + attr.ElemFlags = opts.ElemFlags + attr.Flags = opts.Flags + } + + _, sysErr := sys.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + sysErr = wrapMapError(sysErr) + if sysErr != nil && !errors.Is(sysErr, unix.ENOENT) { + return 0, sysErr + } + + if err := keyBuf.Unmarshal(keysOut); err != nil { + return 0, err + } + + return int(attr.Count), sysErr +} + +// BatchUpdate updates the map with multiple keys and values +// simultaneously. +// "keys" and "values" must be of type slice, a pointer +// to a slice or buffer will not work. +func (m *Map) BatchUpdate(keys, values interface{}, opts *BatchOptions) (int, error) { + if m.typ.hasPerCPUValue() { + return m.batchUpdatePerCPU(keys, values, opts) + } + + count, err := batchCount(keys, values) + if err != nil { + return 0, err + } + + valuePtr, err := marshalMapSyscallInput(values, count*int(m.valueSize)) + if err != nil { + return 0, err + } + + return m.batchUpdate(count, keys, valuePtr, opts) +} + +func (m *Map) batchUpdate(count int, keys any, valuePtr sys.Pointer, opts *BatchOptions) (int, error) { + keyPtr, err := marshalMapSyscallInput(keys, count*int(m.keySize)) + if err != nil { + return 0, err + } + + attr := sys.MapUpdateBatchAttr{ + MapFd: m.fd.Uint(), + Keys: keyPtr, + Values: valuePtr, + Count: uint32(count), + } + if opts != nil { + attr.ElemFlags = opts.ElemFlags + attr.Flags = opts.Flags + } + + err = sys.MapUpdateBatch(&attr) + if err != nil { + if haveFeatErr := haveBatchAPI(); haveFeatErr != nil { + return 0, haveFeatErr + } + return int(attr.Count), fmt.Errorf("batch update: %w", wrapMapError(err)) + } + + return int(attr.Count), nil +} + +func (m *Map) batchUpdatePerCPU(keys, values any, opts *BatchOptions) (int, error) { + count, err := sliceLen(keys) + if err != nil { + return 0, fmt.Errorf("keys: %w", err) + } + + valueBuf, err := marshalBatchPerCPUValue(values, count, int(m.valueSize)) + if err != nil { + return 0, err + } + + return m.batchUpdate(count, keys, sys.NewSlicePointer(valueBuf), opts) +} + +// BatchDelete batch deletes entries in the map by keys. +// "keys" must be of type slice, a pointer to a slice or buffer will not work. +func (m *Map) BatchDelete(keys interface{}, opts *BatchOptions) (int, error) { + count, err := sliceLen(keys) + if err != nil { + return 0, fmt.Errorf("keys: %w", err) + } + + keyPtr, err := marshalMapSyscallInput(keys, count*int(m.keySize)) + if err != nil { + return 0, fmt.Errorf("cannot marshal keys: %v", err) + } + + attr := sys.MapDeleteBatchAttr{ + MapFd: m.fd.Uint(), + Keys: keyPtr, + Count: uint32(count), + } + + if opts != nil { + attr.ElemFlags = opts.ElemFlags + attr.Flags = opts.Flags + } + + if err = sys.MapDeleteBatch(&attr); err != nil { + if haveFeatErr := haveBatchAPI(); haveFeatErr != nil { + return 0, haveFeatErr + } + return int(attr.Count), fmt.Errorf("batch delete: %w", wrapMapError(err)) + } + + return int(attr.Count), nil +} + +func batchCount(keys, values any) (int, error) { + keysLen, err := sliceLen(keys) + if err != nil { + return 0, fmt.Errorf("keys: %w", err) + } + + valuesLen, err := sliceLen(values) + if err != nil { + return 0, fmt.Errorf("values: %w", err) + } + + if keysLen != valuesLen { + return 0, fmt.Errorf("keys and values must have the same length") + } + + return keysLen, nil +} + +// Iterate traverses a map. +// +// It's safe to create multiple iterators at the same time. +// +// It's not possible to guarantee that all keys in a map will be +// returned if there are concurrent modifications to the map. +func (m *Map) Iterate() *MapIterator { + return newMapIterator(m) +} + +// Close the Map's underlying file descriptor, which could unload the +// Map from the kernel if it is not pinned or in use by a loaded Program. +func (m *Map) Close() error { + if m == nil { + // This makes it easier to clean up when iterating maps + // of maps / programs. + return nil + } + + return m.fd.Close() +} + +// FD gets the file descriptor of the Map. +// +// Calling this function is invalid after Close has been called. +func (m *Map) FD() int { + return m.fd.Int() +} + +// Clone creates a duplicate of the Map. +// +// Closing the duplicate does not affect the original, and vice versa. +// Changes made to the map are reflected by both instances however. +// If the original map was pinned, the cloned map will not be pinned by default. +// +// Cloning a nil Map returns nil. +func (m *Map) Clone() (*Map, error) { + if m == nil { + return nil, nil + } + + dup, err := m.fd.Dup() + if err != nil { + return nil, fmt.Errorf("can't clone map: %w", err) + } + + return &Map{ + m.name, + dup, + m.typ, + m.keySize, + m.valueSize, + m.maxEntries, + m.flags, + "", + m.fullValueSize, + nil, + }, nil +} + +// Pin persists the map on the BPF virtual file system past the lifetime of +// the process that created it . +// +// Calling Pin on a previously pinned map will overwrite the path, except when +// the new path already exists. Re-pinning across filesystems is not supported. +// You can Clone a map to pin it to a different path. +// +// This requires bpffs to be mounted above fileName. +// See https://docs.cilium.io/en/stable/network/kubernetes/configuration/#mounting-bpffs-with-systemd +func (m *Map) Pin(fileName string) error { + if err := sys.Pin(m.pinnedPath, fileName, m.fd); err != nil { + return err + } + m.pinnedPath = fileName + return nil +} + +// Unpin removes the persisted state for the map from the BPF virtual filesystem. +// +// Failed calls to Unpin will not alter the state returned by IsPinned. +// +// Unpinning an unpinned Map returns nil. +func (m *Map) Unpin() error { + if err := sys.Unpin(m.pinnedPath); err != nil { + return err + } + m.pinnedPath = "" + return nil +} + +// IsPinned returns true if the map has a non-empty pinned path. +func (m *Map) IsPinned() bool { + return m.pinnedPath != "" +} + +// Freeze prevents a map to be modified from user space. +// +// It makes no changes to kernel-side restrictions. +func (m *Map) Freeze() error { + attr := sys.MapFreezeAttr{ + MapFd: m.fd.Uint(), + } + + if err := sys.MapFreeze(&attr); err != nil { + if haveFeatErr := haveMapMutabilityModifiers(); haveFeatErr != nil { + return fmt.Errorf("can't freeze map: %w", haveFeatErr) + } + return fmt.Errorf("can't freeze map: %w", err) + } + return nil +} + +// finalize populates the Map according to the Contents specified +// in spec and freezes the Map if requested by spec. +func (m *Map) finalize(spec *MapSpec) error { + for _, kv := range spec.Contents { + if err := m.Put(kv.Key, kv.Value); err != nil { + return fmt.Errorf("putting value: key %v: %w", kv.Key, err) + } + } + + if isConstantDataSection(spec.Name) || isKconfigSection(spec.Name) { + if err := m.Freeze(); err != nil { + return fmt.Errorf("freezing map: %w", err) + } + } + + return nil +} + +func (m *Map) marshalKey(data interface{}) (sys.Pointer, error) { + if data == nil { + if m.keySize == 0 { + // Queues have a key length of zero, so passing nil here is valid. + return sys.NewPointer(nil), nil + } + return sys.Pointer{}, errors.New("can't use nil as key of map") + } + + return marshalMapSyscallInput(data, int(m.keySize)) +} + +func (m *Map) marshalValue(data interface{}) (sys.Pointer, error) { + var ( + buf []byte + err error + ) + + switch value := data.(type) { + case *Map: + if !m.typ.canStoreMap() { + return sys.Pointer{}, fmt.Errorf("can't store map in %s", m.typ) + } + buf, err = marshalMap(value, int(m.valueSize)) + + case *Program: + if !m.typ.canStoreProgram() { + return sys.Pointer{}, fmt.Errorf("can't store program in %s", m.typ) + } + buf, err = marshalProgram(value, int(m.valueSize)) + + default: + return marshalMapSyscallInput(data, int(m.valueSize)) + } + + if err != nil { + return sys.Pointer{}, err + } + + return sys.NewSlicePointer(buf), nil +} + +func (m *Map) unmarshalValue(value any, buf sysenc.Buffer) error { + switch value := value.(type) { + case **Map: + if !m.typ.canStoreMap() { + return fmt.Errorf("can't read a map from %s", m.typ) + } + + other, err := unmarshalMap(buf) + if err != nil { + return err + } + + // The caller might close the map externally, so ignore errors. + _ = (*value).Close() + + *value = other + return nil + + case *Map: + if !m.typ.canStoreMap() { + return fmt.Errorf("can't read a map from %s", m.typ) + } + return errors.New("require pointer to *Map") + + case **Program: + if !m.typ.canStoreProgram() { + return fmt.Errorf("can't read a program from %s", m.typ) + } + + other, err := unmarshalProgram(buf) + if err != nil { + return err + } + + // The caller might close the program externally, so ignore errors. + _ = (*value).Close() + + *value = other + return nil + + case *Program: + if !m.typ.canStoreProgram() { + return fmt.Errorf("can't read a program from %s", m.typ) + } + return errors.New("require pointer to *Program") + } + + return buf.Unmarshal(value) +} + +// LoadPinnedMap opens a Map from a pin (file) on the BPF virtual filesystem. +// +// Requires at least Linux 4.5. +func LoadPinnedMap(fileName string, opts *LoadPinOptions) (*Map, error) { + fd, typ, err := sys.ObjGetTyped(&sys.ObjGetAttr{ + Pathname: sys.NewStringPointer(fileName), + FileFlags: opts.Marshal(), + }) + if err != nil { + return nil, err + } + + if typ != sys.BPF_TYPE_MAP { + _ = fd.Close() + return nil, fmt.Errorf("%s is not a Map", fileName) + } + + m, err := newMapFromFD(fd) + if err == nil { + m.pinnedPath = fileName + } + + return m, err +} + +// unmarshalMap creates a map from a map ID encoded in host endianness. +func unmarshalMap(buf sysenc.Buffer) (*Map, error) { + var id uint32 + if err := buf.Unmarshal(&id); err != nil { + return nil, err + } + return NewMapFromID(MapID(id)) +} + +// marshalMap marshals the fd of a map into a buffer in host endianness. +func marshalMap(m *Map, length int) ([]byte, error) { + if m == nil { + return nil, errors.New("can't marshal a nil Map") + } + + if length != 4 { + return nil, fmt.Errorf("can't marshal map to %d bytes", length) + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, m.fd.Uint()) + return buf, nil +} + +// MapIterator iterates a Map. +// +// See Map.Iterate. +type MapIterator struct { + target *Map + // Temporary storage to avoid allocations in Next(). This is any instead + // of []byte to avoid allocations. + cursor any + count, maxEntries uint32 + done bool + err error +} + +func newMapIterator(target *Map) *MapIterator { + return &MapIterator{ + target: target, + maxEntries: target.maxEntries, + } +} + +// Next decodes the next key and value. +// +// Iterating a hash map from which keys are being deleted is not +// safe. You may see the same key multiple times. Iteration may +// also abort with an error, see IsIterationAborted. +// +// Returns false if there are no more entries. You must check +// the result of Err afterwards. +// +// See Map.Get for further caveats around valueOut. +func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { + if mi.err != nil || mi.done { + return false + } + + // For array-like maps NextKey returns nil only after maxEntries + // iterations. + for mi.count <= mi.maxEntries { + if mi.cursor == nil { + // Pass nil interface to NextKey to make sure the Map's first key + // is returned. If we pass an uninitialized []byte instead, it'll see a + // non-nil interface and try to marshal it. + mi.cursor = make([]byte, mi.target.keySize) + mi.err = mi.target.NextKey(nil, mi.cursor) + } else { + mi.err = mi.target.NextKey(mi.cursor, mi.cursor) + } + + if errors.Is(mi.err, ErrKeyNotExist) { + mi.done = true + mi.err = nil + return false + } else if mi.err != nil { + mi.err = fmt.Errorf("get next key: %w", mi.err) + return false + } + + mi.count++ + mi.err = mi.target.Lookup(mi.cursor, valueOut) + if errors.Is(mi.err, ErrKeyNotExist) { + // Even though the key should be valid, we couldn't look up + // its value. If we're iterating a hash map this is probably + // because a concurrent delete removed the value before we + // could get it. This means that the next call to NextKeyBytes + // is very likely to restart iteration. + // If we're iterating one of the fd maps like + // ProgramArray it means that a given slot doesn't have + // a valid fd associated. It's OK to continue to the next slot. + continue + } + if mi.err != nil { + mi.err = fmt.Errorf("look up next key: %w", mi.err) + return false + } + + buf := mi.cursor.([]byte) + if ptr, ok := keyOut.(unsafe.Pointer); ok { + copy(unsafe.Slice((*byte)(ptr), len(buf)), buf) + } else { + mi.err = sysenc.Unmarshal(keyOut, buf) + } + + return mi.err == nil + } + + mi.err = fmt.Errorf("%w", ErrIterationAborted) + return false +} + +// Err returns any encountered error. +// +// The method must be called after Next returns nil. +// +// Returns ErrIterationAborted if it wasn't possible to do a full iteration. +func (mi *MapIterator) Err() error { + return mi.err +} + +// MapGetNextID returns the ID of the next eBPF map. +// +// Returns ErrNotExist, if there is no next eBPF map. +func MapGetNextID(startID MapID) (MapID, error) { + attr := &sys.MapGetNextIdAttr{Id: uint32(startID)} + return MapID(attr.NextId), sys.MapGetNextId(attr) +} + +// NewMapFromID returns the map for a given id. +// +// Returns ErrNotExist, if there is no eBPF map with the given id. +func NewMapFromID(id MapID) (*Map, error) { + fd, err := sys.MapGetFdById(&sys.MapGetFdByIdAttr{ + Id: uint32(id), + }) + if err != nil { + return nil, err + } + + return newMapFromFD(fd) +} + +// sliceLen returns the length if the value is a slice or an error otherwise. +func sliceLen(slice any) (int, error) { + sliceValue := reflect.ValueOf(slice) + if sliceValue.Kind() != reflect.Slice { + return 0, fmt.Errorf("%T is not a slice", slice) + } + return sliceValue.Len(), nil +} diff --git a/vendor/github.com/cilium/ebpf/marshalers.go b/vendor/github.com/cilium/ebpf/marshalers.go new file mode 100644 index 0000000000..57a0a8e88a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/marshalers.go @@ -0,0 +1,210 @@ +package ebpf + +import ( + "encoding" + "errors" + "fmt" + "reflect" + "slices" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/sysenc" +) + +// marshalMapSyscallInput converts an arbitrary value into a pointer suitable +// to be passed to the kernel. +// +// As an optimization, it returns the original value if it is an +// unsafe.Pointer. +func marshalMapSyscallInput(data any, length int) (sys.Pointer, error) { + if ptr, ok := data.(unsafe.Pointer); ok { + return sys.NewPointer(ptr), nil + } + + buf, err := sysenc.Marshal(data, length) + if err != nil { + return sys.Pointer{}, err + } + + return buf.Pointer(), nil +} + +func makeMapSyscallOutput(dst any, length int) sysenc.Buffer { + if ptr, ok := dst.(unsafe.Pointer); ok { + return sysenc.UnsafeBuffer(ptr) + } + + _, ok := dst.(encoding.BinaryUnmarshaler) + if ok { + return sysenc.SyscallOutput(nil, length) + } + + return sysenc.SyscallOutput(dst, length) +} + +// appendPerCPUSlice encodes a slice containing one value per +// possible CPU into a buffer of bytes. +// +// Values are initialized to zero if the slice has less elements than CPUs. +func appendPerCPUSlice(buf []byte, slice any, possibleCPUs, elemLength, alignedElemLength int) ([]byte, error) { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return nil, errors.New("per-CPU value requires slice") + } + + sliceValue := reflect.ValueOf(slice) + sliceLen := sliceValue.Len() + if sliceLen > possibleCPUs { + return nil, fmt.Errorf("per-CPU value greater than number of CPUs") + } + + // Grow increases the slice's capacity, _if_necessary_ + buf = slices.Grow(buf, alignedElemLength*possibleCPUs) + for i := 0; i < sliceLen; i++ { + elem := sliceValue.Index(i).Interface() + elemBytes, err := sysenc.Marshal(elem, elemLength) + if err != nil { + return nil, err + } + + buf = elemBytes.AppendTo(buf) + buf = append(buf, make([]byte, alignedElemLength-elemLength)...) + } + + // Ensure buf is zero-padded full size. + buf = append(buf, make([]byte, (possibleCPUs-sliceLen)*alignedElemLength)...) + + return buf, nil +} + +// marshalPerCPUValue encodes a slice containing one value per +// possible CPU into a buffer of bytes. +// +// Values are initialized to zero if the slice has less elements than CPUs. +func marshalPerCPUValue(slice any, elemLength int) (sys.Pointer, error) { + possibleCPUs, err := PossibleCPU() + if err != nil { + return sys.Pointer{}, err + } + + alignedElemLength := internal.Align(elemLength, 8) + buf := make([]byte, 0, alignedElemLength*possibleCPUs) + buf, err = appendPerCPUSlice(buf, slice, possibleCPUs, elemLength, alignedElemLength) + if err != nil { + return sys.Pointer{}, err + } + + return sys.NewSlicePointer(buf), nil +} + +// marshalBatchPerCPUValue encodes a batch-sized slice of slices containing +// one value per possible CPU into a buffer of bytes. +func marshalBatchPerCPUValue(slice any, batchLen, elemLength int) ([]byte, error) { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return nil, fmt.Errorf("batch value requires a slice") + } + sliceValue := reflect.ValueOf(slice) + + possibleCPUs, err := PossibleCPU() + if err != nil { + return nil, err + } + if sliceValue.Len() != batchLen*possibleCPUs { + return nil, fmt.Errorf("per-CPU slice has incorrect length, expected %d, got %d", + batchLen*possibleCPUs, sliceValue.Len()) + } + alignedElemLength := internal.Align(elemLength, 8) + buf := make([]byte, 0, batchLen*alignedElemLength*possibleCPUs) + for i := 0; i < batchLen; i++ { + batch := sliceValue.Slice(i*possibleCPUs, (i+1)*possibleCPUs).Interface() + buf, err = appendPerCPUSlice(buf, batch, possibleCPUs, elemLength, alignedElemLength) + if err != nil { + return nil, fmt.Errorf("batch %d: %w", i, err) + } + } + return buf, nil +} + +// unmarshalPerCPUValue decodes a buffer into a slice containing one value per +// possible CPU. +// +// slice must be a literal slice and not a pointer. +func unmarshalPerCPUValue(slice any, elemLength int, buf []byte) error { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return fmt.Errorf("per-CPU value requires a slice") + } + + possibleCPUs, err := PossibleCPU() + if err != nil { + return err + } + + sliceValue := reflect.ValueOf(slice) + if sliceValue.Len() != possibleCPUs { + return fmt.Errorf("per-CPU slice has incorrect length, expected %d, got %d", + possibleCPUs, sliceValue.Len()) + } + + sliceElemType := sliceType.Elem() + sliceElemIsPointer := sliceElemType.Kind() == reflect.Ptr + stride := internal.Align(elemLength, 8) + for i := 0; i < possibleCPUs; i++ { + var elem any + v := sliceValue.Index(i) + if sliceElemIsPointer { + if !v.Elem().CanAddr() { + return fmt.Errorf("per-CPU slice elements cannot be nil") + } + elem = v.Elem().Addr().Interface() + } else { + elem = v.Addr().Interface() + } + err := sysenc.Unmarshal(elem, buf[:elemLength]) + if err != nil { + return fmt.Errorf("cpu %d: %w", i, err) + } + + buf = buf[stride:] + } + return nil +} + +// unmarshalBatchPerCPUValue decodes a buffer into a batch-sized slice +// containing one value per possible CPU. +// +// slice must have length batchLen * PossibleCPUs(). +func unmarshalBatchPerCPUValue(slice any, batchLen, elemLength int, buf []byte) error { + sliceType := reflect.TypeOf(slice) + if sliceType.Kind() != reflect.Slice { + return fmt.Errorf("batch requires a slice") + } + + sliceValue := reflect.ValueOf(slice) + possibleCPUs, err := PossibleCPU() + if err != nil { + return err + } + if sliceValue.Len() != batchLen*possibleCPUs { + return fmt.Errorf("per-CPU slice has incorrect length, expected %d, got %d", + sliceValue.Len(), batchLen*possibleCPUs) + } + + fullValueSize := possibleCPUs * internal.Align(elemLength, 8) + if len(buf) != batchLen*fullValueSize { + return fmt.Errorf("input buffer has incorrect length, expected %d, got %d", + len(buf), batchLen*fullValueSize) + } + + for i := 0; i < batchLen; i++ { + elem := sliceValue.Slice(i*possibleCPUs, (i+1)*possibleCPUs).Interface() + if err := unmarshalPerCPUValue(elem, elemLength, buf[:fullValueSize]); err != nil { + return fmt.Errorf("batch %d: %w", i, err) + } + buf = buf[fullValueSize:] + } + return nil +} diff --git a/vendor/github.com/cilium/ebpf/memory.go b/vendor/github.com/cilium/ebpf/memory.go new file mode 100644 index 0000000000..312c967131 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/memory.go @@ -0,0 +1,145 @@ +package ebpf + +import ( + "errors" + "fmt" + "io" + "runtime" + + "github.com/cilium/ebpf/internal/unix" +) + +// Memory is the building block for accessing the memory of specific bpf map +// types (Array and Arena at the time of writing) without going through the bpf +// syscall interface. +// +// Given the fd of a bpf map created with the BPF_F_MMAPABLE flag, a shared +// 'file'-based memory-mapped region can be allocated in the process' address +// space, exposing the bpf map's memory by simply accessing a memory location. + +var ErrReadOnly = errors.New("resource is read-only") + +// Memory implements accessing a Map's memory without making any syscalls. +// Pay attention to the difference between Go and C struct alignment rules. Use +// [structs.HostLayout] on supported Go versions to help with alignment. +// +// Note on memory coherence: avoid using packed structs in memory shared between +// user space and eBPF C programs. This drops a struct's memory alignment to 1, +// forcing the compiler to use single-byte loads and stores for field accesses. +// This may lead to partially-written data to be observed from user space. +// +// On most architectures, the memmove implementation used by Go's copy() will +// access data in word-sized chunks. If paired with a matching access pattern on +// the eBPF C side (and if using default memory alignment), accessing shared +// memory without atomics or other synchronization primitives should be sound +// for individual values. For accesses beyond a single value, the usual +// concurrent programming rules apply. +type Memory struct { + b []byte + ro bool +} + +func newMemory(fd, size int) (*Memory, error) { + // Typically, maps created with BPF_F_RDONLY_PROG remain writable from user + // space until frozen. As a security precaution, the kernel doesn't allow + // mapping bpf map memory as read-write into user space if the bpf map was + // frozen, or if it was created using the RDONLY_PROG flag. + // + // The user would be able to write to the map after freezing (since the kernel + // can't change the protection mode of an already-mapped page), while the + // verifier assumes the contents to be immutable. + b, err := unix.Mmap(fd, 0, size, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED) + + // If the map is frozen when an rw mapping is requested, expect EPERM. If the + // map was created with BPF_F_RDONLY_PROG, expect EACCES. + var ro bool + if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EACCES) { + ro = true + b, err = unix.Mmap(fd, 0, size, unix.PROT_READ, unix.MAP_SHARED) + } + if err != nil { + return nil, fmt.Errorf("setting up memory-mapped region: %w", err) + } + + mm := &Memory{ + b, + ro, + } + runtime.SetFinalizer(mm, (*Memory).close) + + return mm, nil +} + +func (mm *Memory) close() { + if err := unix.Munmap(mm.b); err != nil { + panic(fmt.Errorf("unmapping memory: %w", err)) + } + mm.b = nil +} + +// Size returns the size of the memory-mapped region in bytes. +func (mm *Memory) Size() int { + return len(mm.b) +} + +// ReadOnly returns true if the memory-mapped region is read-only. +func (mm *Memory) ReadOnly() bool { + return mm.ro +} + +// bounds returns true if an access at off of the given size is within bounds. +func (mm *Memory) bounds(off uint64, size uint64) bool { + return off+size < uint64(len(mm.b)) +} + +// ReadAt implements [io.ReaderAt]. Useful for creating a new [io.OffsetWriter]. +// +// See [Memory] for details around memory coherence. +func (mm *Memory) ReadAt(p []byte, off int64) (int, error) { + if mm.b == nil { + return 0, fmt.Errorf("memory-mapped region closed") + } + + if p == nil { + return 0, fmt.Errorf("input buffer p is nil") + } + + if off < 0 || off >= int64(len(mm.b)) { + return 0, fmt.Errorf("read offset out of range") + } + + n := copy(p, mm.b[off:]) + if n < len(p) { + return n, io.EOF + } + + return n, nil +} + +// WriteAt implements [io.WriterAt]. Useful for creating a new +// [io.SectionReader]. +// +// See [Memory] for details around memory coherence. +func (mm *Memory) WriteAt(p []byte, off int64) (int, error) { + if mm.b == nil { + return 0, fmt.Errorf("memory-mapped region closed") + } + if mm.ro { + return 0, fmt.Errorf("memory-mapped region not writable: %w", ErrReadOnly) + } + + if p == nil { + return 0, fmt.Errorf("output buffer p is nil") + } + + if off < 0 || off >= int64(len(mm.b)) { + return 0, fmt.Errorf("write offset out of range") + } + + n := copy(mm.b[off:], p) + if n < len(p) { + return n, io.EOF + } + + return n, nil +} diff --git a/vendor/github.com/cilium/ebpf/netlify.toml b/vendor/github.com/cilium/ebpf/netlify.toml new file mode 100644 index 0000000000..764c3b447b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/netlify.toml @@ -0,0 +1,5 @@ +[build] + base = "docs/" + publish = "site/" + command = "mkdocs build" + environment = { PYTHON_VERSION = "3.13" } diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go new file mode 100644 index 0000000000..8fcae6a3fb --- /dev/null +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -0,0 +1,1177 @@ +package ebpf + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "math" + "path/filepath" + "runtime" + "strings" + "time" + "unsafe" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/kallsyms" + "github.com/cilium/ebpf/internal/linux" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/sysenc" + "github.com/cilium/ebpf/internal/unix" +) + +// ErrNotSupported is returned whenever the kernel doesn't support a feature. +var ErrNotSupported = internal.ErrNotSupported + +// errBadRelocation is returned when the verifier rejects a program due to a +// bad CO-RE relocation. +// +// This error is detected based on heuristics and therefore may not be reliable. +var errBadRelocation = errors.New("bad CO-RE relocation") + +// errUnknownKfunc is returned when the verifier rejects a program due to an +// unknown kfunc. +// +// This error is detected based on heuristics and therefore may not be reliable. +var errUnknownKfunc = errors.New("unknown kfunc") + +// ProgramID represents the unique ID of an eBPF program. +type ProgramID uint32 + +const ( + // Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN. + // This is currently the maximum of spare space allocated for SKB + // and XDP programs, and equal to XDP_PACKET_HEADROOM + NET_IP_ALIGN. + outputPad = 256 + 2 +) + +// minVerifierLogSize is the default number of bytes allocated for the +// verifier log. +const minVerifierLogSize = 64 * 1024 + +// maxVerifierLogSize is the maximum size of verifier log buffer the kernel +// will accept before returning EINVAL. May be increased to MaxUint32 in the +// future, but avoid the unnecessary EINVAL for now. +const maxVerifierLogSize = math.MaxUint32 >> 2 + +// ProgramOptions control loading a program into the kernel. +type ProgramOptions struct { + // Bitmap controlling the detail emitted by the kernel's eBPF verifier log. + // LogLevel-type values can be ORed together to request specific kinds of + // verifier output. See the documentation on [ebpf.LogLevel] for details. + // + // opts.LogLevel = (ebpf.LogLevelBranch | ebpf.LogLevelStats) + // + // If left to its default value, the program will first be loaded without + // verifier output enabled. Upon error, the program load will be repeated + // with LogLevelBranch and the given (or default) LogSize value. + // + // Unless LogDisabled is set, setting this to a non-zero value will enable the verifier + // log, populating the [ebpf.Program.VerifierLog] field on successful loads + // and including detailed verifier errors if the program is rejected. This + // will always allocate an output buffer, but will result in only a single + // attempt at loading the program. + LogLevel LogLevel + + // Starting size of the verifier log buffer. If the verifier log is larger + // than this size, the buffer will be grown to fit the entire log. Leave at + // its default value unless troubleshooting. + LogSizeStart uint32 + + // Disables the verifier log completely, regardless of other options. + LogDisabled bool + + // Type information used for CO-RE relocations. + // + // This is useful in environments where the kernel BTF is not available + // (containers) or where it is in a non-standard location. Defaults to + // use the kernel BTF from a well-known location if nil. + KernelTypes *btf.Spec + + // Type information used for CO-RE relocations of kernel modules, + // indexed by module name. + // + // This is useful in environments where the kernel BTF is not available + // (containers) or where it is in a non-standard location. Defaults to + // use the kernel module BTF from a well-known location if nil. + KernelModuleTypes map[string]*btf.Spec +} + +// ProgramSpec defines a Program. +type ProgramSpec struct { + // Name is passed to the kernel as a debug aid. Must only contain + // alpha numeric and '_' characters. + Name string + + // Type determines at which hook in the kernel a program will run. + Type ProgramType + + // AttachType of the program, needed to differentiate allowed context + // accesses in some newer program types like CGroupSockAddr. + // + // Available on kernels 4.17 and later. + AttachType AttachType + + // Name of a kernel data structure or function to attach to. Its + // interpretation depends on Type and AttachType. + AttachTo string + + // The program to attach to. Must be provided manually. + AttachTarget *Program + + // The name of the ELF section this program originated from. + SectionName string + + Instructions asm.Instructions + + // Flags is passed to the kernel and specifies additional program + // load attributes. + Flags uint32 + + // License of the program. Some helpers are only available if + // the license is deemed compatible with the GPL. + // + // See https://www.kernel.org/doc/html/latest/process/license-rules.html#id1 + License string + + // Version used by Kprobe programs. + // + // Deprecated on kernels 5.0 and later. Leave empty to let the library + // detect this value automatically. + KernelVersion uint32 + + // The byte order this program was compiled for, may be nil. + ByteOrder binary.ByteOrder +} + +// Copy returns a copy of the spec. +func (ps *ProgramSpec) Copy() *ProgramSpec { + if ps == nil { + return nil + } + + cpy := *ps + cpy.Instructions = make(asm.Instructions, len(ps.Instructions)) + copy(cpy.Instructions, ps.Instructions) + return &cpy +} + +// Tag calculates the kernel tag for a series of instructions. +// +// Use asm.Instructions.Tag if you need to calculate for non-native endianness. +func (ps *ProgramSpec) Tag() (string, error) { + return ps.Instructions.Tag(internal.NativeEndian) +} + +// kernelModule returns the kernel module providing the symbol in +// ProgramSpec.AttachTo, if any. Returns an empty string if the symbol is not +// present or not part of a kernel module. +func (ps *ProgramSpec) kernelModule() (string, error) { + if ps.targetsKernelModule() { + return kallsyms.Module(ps.AttachTo) + } + + return "", nil +} + +// targetsKernelModule returns true if the program supports being attached to a +// symbol provided by a kernel module. +func (ps *ProgramSpec) targetsKernelModule() bool { + if ps.AttachTo == "" { + return false + } + + switch ps.Type { + case Tracing: + switch ps.AttachType { + case AttachTraceFEntry, AttachTraceFExit: + return true + } + case Kprobe: + return true + } + + return false +} + +// VerifierError is returned by [NewProgram] and [NewProgramWithOptions] if a +// program is rejected by the verifier. +// +// Use [errors.As] to access the error. +type VerifierError = internal.VerifierError + +// Program represents BPF program loaded into the kernel. +// +// It is not safe to close a Program which is used by other goroutines. +type Program struct { + // Contains the output of the kernel verifier if enabled, + // otherwise it is empty. + VerifierLog string + + fd *sys.FD + name string + pinnedPath string + typ ProgramType +} + +// NewProgram creates a new Program. +// +// See [NewProgramWithOptions] for details. +// +// Returns a [VerifierError] containing the full verifier log if the program is +// rejected by the kernel. +func NewProgram(spec *ProgramSpec) (*Program, error) { + return NewProgramWithOptions(spec, ProgramOptions{}) +} + +// NewProgramWithOptions creates a new Program. +// +// Loading a program for the first time will perform +// feature detection by loading small, temporary programs. +// +// Returns a [VerifierError] containing the full verifier log if the program is +// rejected by the kernel. +func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) { + if spec == nil { + return nil, errors.New("can't load a program from a nil spec") + } + + prog, err := newProgramWithOptions(spec, opts) + if errors.Is(err, asm.ErrUnsatisfiedMapReference) { + return nil, fmt.Errorf("cannot load program without loading its whole collection: %w", err) + } + return prog, err +} + +var ( + coreBadLoad = []byte(fmt.Sprintf("(18) r10 = 0x%x\n", btf.COREBadRelocationSentinel)) + // This log message was introduced by ebb676daa1a3 ("bpf: Print function name in + // addition to function id") which first appeared in v4.10 and has remained + // unchanged since. + coreBadCall = []byte(fmt.Sprintf("invalid func unknown#%d\n", btf.COREBadRelocationSentinel)) + kfuncBadCall = []byte(fmt.Sprintf("invalid func unknown#%d\n", kfuncCallPoisonBase)) +) + +func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) { + if len(spec.Instructions) == 0 { + return nil, errors.New("instructions cannot be empty") + } + + if spec.Type == UnspecifiedProgram { + return nil, errors.New("can't load program of unspecified type") + } + + if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian { + return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian) + } + + // Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load") + // require the version field to be set to the value of the KERNEL_VERSION + // macro for kprobe-type programs. + // Overwrite Kprobe program version if set to zero or the magic version constant. + kv := spec.KernelVersion + if spec.Type == Kprobe && (kv == 0 || kv == internal.MagicKernelVersion) { + v, err := linux.KernelVersion() + if err != nil { + return nil, fmt.Errorf("detecting kernel version: %w", err) + } + kv = v.Kernel() + } + + attr := &sys.ProgLoadAttr{ + ProgType: sys.ProgType(spec.Type), + ProgFlags: spec.Flags, + ExpectedAttachType: sys.AttachType(spec.AttachType), + License: sys.NewStringPointer(spec.License), + KernVersion: kv, + } + + if haveObjName() == nil { + attr.ProgName = sys.NewObjName(spec.Name) + } + + insns := make(asm.Instructions, len(spec.Instructions)) + copy(insns, spec.Instructions) + + kmodName, err := spec.kernelModule() + if err != nil { + return nil, fmt.Errorf("kernel module search: %w", err) + } + + var targets []*btf.Spec + if opts.KernelTypes != nil { + targets = append(targets, opts.KernelTypes) + } + if kmodName != "" && opts.KernelModuleTypes != nil { + if modBTF, ok := opts.KernelModuleTypes[kmodName]; ok { + targets = append(targets, modBTF) + } + } + + var b btf.Builder + if err := applyRelocations(insns, targets, kmodName, spec.ByteOrder, &b); err != nil { + return nil, fmt.Errorf("apply CO-RE relocations: %w", err) + } + + errExtInfos := haveProgramExtInfos() + if !b.Empty() && errors.Is(errExtInfos, ErrNotSupported) { + // There is at least one CO-RE relocation which relies on a stable local + // type ID. + // Return ErrNotSupported instead of E2BIG if there is no BTF support. + return nil, errExtInfos + } + + if errExtInfos == nil { + // Only add func and line info if the kernel supports it. This allows + // BPF compiled with modern toolchains to work on old kernels. + fib, lib, err := btf.MarshalExtInfos(insns, &b) + if err != nil { + return nil, fmt.Errorf("marshal ext_infos: %w", err) + } + + attr.FuncInfoRecSize = btf.FuncInfoSize + attr.FuncInfoCnt = uint32(len(fib)) / btf.FuncInfoSize + attr.FuncInfo = sys.NewSlicePointer(fib) + + attr.LineInfoRecSize = btf.LineInfoSize + attr.LineInfoCnt = uint32(len(lib)) / btf.LineInfoSize + attr.LineInfo = sys.NewSlicePointer(lib) + } + + if !b.Empty() { + handle, err := btf.NewHandle(&b) + if err != nil { + return nil, fmt.Errorf("load BTF: %w", err) + } + defer handle.Close() + + attr.ProgBtfFd = uint32(handle.FD()) + } + + kconfig, err := resolveKconfigReferences(insns) + if err != nil { + return nil, fmt.Errorf("resolve .kconfig: %w", err) + } + defer kconfig.Close() + + if err := resolveKsymReferences(insns); err != nil { + return nil, fmt.Errorf("resolve .ksyms: %w", err) + } + + if err := fixupAndValidate(insns); err != nil { + return nil, err + } + + handles, err := fixupKfuncs(insns) + if err != nil { + return nil, fmt.Errorf("fixing up kfuncs: %w", err) + } + defer handles.Close() + + if len(handles) > 0 { + fdArray := handles.fdArray() + attr.FdArray = sys.NewPointer(unsafe.Pointer(&fdArray[0])) + } + + buf := bytes.NewBuffer(make([]byte, 0, insns.Size())) + err = insns.Marshal(buf, internal.NativeEndian) + if err != nil { + return nil, err + } + + bytecode := buf.Bytes() + attr.Insns = sys.NewSlicePointer(bytecode) + attr.InsnCnt = uint32(len(bytecode) / asm.InstructionSize) + + if spec.AttachTarget != nil { + targetID, err := findTargetInProgram(spec.AttachTarget, spec.AttachTo, spec.Type, spec.AttachType) + if err != nil { + return nil, fmt.Errorf("attach %s/%s: %w", spec.Type, spec.AttachType, err) + } + + attr.AttachBtfId = targetID + attr.AttachBtfObjFd = uint32(spec.AttachTarget.FD()) + defer runtime.KeepAlive(spec.AttachTarget) + } else if spec.AttachTo != "" { + module, targetID, err := findProgramTargetInKernel(spec.AttachTo, spec.Type, spec.AttachType) + if err != nil && !errors.Is(err, errUnrecognizedAttachType) { + // We ignore errUnrecognizedAttachType since AttachTo may be non-empty + // for programs that don't attach anywhere. + return nil, fmt.Errorf("attach %s/%s: %w", spec.Type, spec.AttachType, err) + } + + attr.AttachBtfId = targetID + if module != nil { + attr.AttachBtfObjFd = uint32(module.FD()) + defer module.Close() + } + } + + // The caller requested a specific verifier log level. Set up the log buffer + // so that there is a chance of loading the program in a single shot. + logSize := internal.Between(opts.LogSizeStart, minVerifierLogSize, maxVerifierLogSize) + var logBuf []byte + if !opts.LogDisabled && opts.LogLevel != 0 { + logBuf = make([]byte, logSize) + attr.LogLevel = opts.LogLevel + attr.LogSize = uint32(len(logBuf)) + attr.LogBuf = sys.NewSlicePointer(logBuf) + } + + for { + var fd *sys.FD + fd, err = sys.ProgLoad(attr) + if err == nil { + return &Program{unix.ByteSliceToString(logBuf), fd, spec.Name, "", spec.Type}, nil + } + + if opts.LogDisabled { + break + } + + if attr.LogTrueSize != 0 && attr.LogSize >= attr.LogTrueSize { + // The log buffer already has the correct size. + break + } + + if attr.LogSize != 0 && !errors.Is(err, unix.ENOSPC) { + // Logging is enabled and the error is not ENOSPC, so we can infer + // that the log buffer is large enough. + break + } + + if attr.LogLevel == 0 { + // Logging is not enabled but loading the program failed. Enable + // basic logging. + attr.LogLevel = LogLevelBranch + } + + // Make an educated guess how large the buffer should be by multiplying. + // Ensure the size doesn't overflow. + const factor = 2 + logSize = internal.Between(logSize, minVerifierLogSize, maxVerifierLogSize/factor) + logSize *= factor + + if attr.LogTrueSize != 0 { + // The kernel has given us a hint how large the log buffer has to be. + logSize = attr.LogTrueSize + } + + logBuf = make([]byte, logSize) + attr.LogSize = logSize + attr.LogBuf = sys.NewSlicePointer(logBuf) + } + + end := bytes.IndexByte(logBuf, 0) + if end < 0 { + end = len(logBuf) + } + + tail := logBuf[max(end-256, 0):end] + switch { + case errors.Is(err, unix.EPERM): + if len(logBuf) > 0 && logBuf[0] == 0 { + // EPERM due to RLIMIT_MEMLOCK happens before the verifier, so we can + // check that the log is empty to reduce false positives. + return nil, fmt.Errorf("load program: %w (MEMLOCK may be too low, consider rlimit.RemoveMemlock)", err) + } + + case errors.Is(err, unix.EFAULT): + // EFAULT is returned when the kernel hits a verifier bug, and always + // overrides ENOSPC, defeating the buffer growth strategy. Warn the user + // that they may need to increase the buffer size manually. + return nil, fmt.Errorf("load program: %w (hit verifier bug, increase LogSizeStart to fit the log and check dmesg)", err) + + case errors.Is(err, unix.EINVAL): + if bytes.Contains(tail, coreBadCall) { + err = errBadRelocation + break + } else if bytes.Contains(tail, kfuncBadCall) { + err = errUnknownKfunc + break + } + + case errors.Is(err, unix.EACCES): + if bytes.Contains(tail, coreBadLoad) { + err = errBadRelocation + break + } + } + + // hasFunctionReferences may be expensive, so check it last. + if (errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM)) && + hasFunctionReferences(spec.Instructions) { + if err := haveBPFToBPFCalls(); err != nil { + return nil, fmt.Errorf("load program: %w", err) + } + } + + return nil, internal.ErrorWithLog("load program", err, logBuf) +} + +// NewProgramFromFD creates a program from a raw fd. +// +// You should not use fd after calling this function. +// +// Requires at least Linux 4.10. +func NewProgramFromFD(fd int) (*Program, error) { + f, err := sys.NewFD(fd) + if err != nil { + return nil, err + } + + return newProgramFromFD(f) +} + +// NewProgramFromID returns the program for a given id. +// +// Returns ErrNotExist, if there is no eBPF program with the given id. +func NewProgramFromID(id ProgramID) (*Program, error) { + fd, err := sys.ProgGetFdById(&sys.ProgGetFdByIdAttr{ + Id: uint32(id), + }) + if err != nil { + return nil, fmt.Errorf("get program by id: %w", err) + } + + return newProgramFromFD(fd) +} + +func newProgramFromFD(fd *sys.FD) (*Program, error) { + info, err := newProgramInfoFromFd(fd) + if err != nil { + fd.Close() + return nil, fmt.Errorf("discover program type: %w", err) + } + + return &Program{"", fd, info.Name, "", info.Type}, nil +} + +func (p *Program) String() string { + if p.name != "" { + return fmt.Sprintf("%s(%s)#%v", p.typ, p.name, p.fd) + } + return fmt.Sprintf("%s(%v)", p.typ, p.fd) +} + +// Type returns the underlying type of the program. +func (p *Program) Type() ProgramType { + return p.typ +} + +// Info returns metadata about the program. +// +// Requires at least 4.10. +func (p *Program) Info() (*ProgramInfo, error) { + return newProgramInfoFromFd(p.fd) +} + +// Handle returns a reference to the program's type information in the kernel. +// +// Returns ErrNotSupported if the kernel has no BTF support, or if there is no +// BTF associated with the program. +func (p *Program) Handle() (*btf.Handle, error) { + info, err := p.Info() + if err != nil { + return nil, err + } + + id, ok := info.BTFID() + if !ok { + return nil, fmt.Errorf("program %s: retrieve BTF ID: %w", p, ErrNotSupported) + } + + return btf.NewHandleFromID(id) +} + +// FD gets the file descriptor of the Program. +// +// It is invalid to call this function after Close has been called. +func (p *Program) FD() int { + return p.fd.Int() +} + +// Clone creates a duplicate of the Program. +// +// Closing the duplicate does not affect the original, and vice versa. +// +// Cloning a nil Program returns nil. +func (p *Program) Clone() (*Program, error) { + if p == nil { + return nil, nil + } + + dup, err := p.fd.Dup() + if err != nil { + return nil, fmt.Errorf("can't clone program: %w", err) + } + + return &Program{p.VerifierLog, dup, p.name, "", p.typ}, nil +} + +// Pin persists the Program on the BPF virtual file system past the lifetime of +// the process that created it +// +// Calling Pin on a previously pinned program will overwrite the path, except when +// the new path already exists. Re-pinning across filesystems is not supported. +// +// This requires bpffs to be mounted above fileName. +// See https://docs.cilium.io/en/stable/network/kubernetes/configuration/#mounting-bpffs-with-systemd +func (p *Program) Pin(fileName string) error { + if err := sys.Pin(p.pinnedPath, fileName, p.fd); err != nil { + return err + } + p.pinnedPath = fileName + return nil +} + +// Unpin removes the persisted state for the Program from the BPF virtual filesystem. +// +// Failed calls to Unpin will not alter the state returned by IsPinned. +// +// Unpinning an unpinned Program returns nil. +func (p *Program) Unpin() error { + if err := sys.Unpin(p.pinnedPath); err != nil { + return err + } + p.pinnedPath = "" + return nil +} + +// IsPinned returns true if the Program has a non-empty pinned path. +func (p *Program) IsPinned() bool { + return p.pinnedPath != "" +} + +// Close the Program's underlying file descriptor, which could unload +// the program from the kernel if it is not pinned or attached to a +// kernel hook. +func (p *Program) Close() error { + if p == nil { + return nil + } + + return p.fd.Close() +} + +// Various options for Run'ing a Program +type RunOptions struct { + // Program's data input. Required field. + // + // The kernel expects at least 14 bytes input for an ethernet header for + // XDP and SKB programs. + Data []byte + // Program's data after Program has run. Caller must allocate. Optional field. + DataOut []byte + // Program's context input. Optional field. + Context interface{} + // Program's context after Program has run. Must be a pointer or slice. Optional field. + ContextOut interface{} + // Minimum number of times to run Program. Optional field. Defaults to 1. + // + // The program may be executed more often than this due to interruptions, e.g. + // when runtime.AllThreadsSyscall is invoked. + Repeat uint32 + // Optional flags. + Flags uint32 + // CPU to run Program on. Optional field. + // Note not all program types support this field. + CPU uint32 + // Called whenever the syscall is interrupted, and should be set to testing.B.ResetTimer + // or similar. Typically used during benchmarking. Optional field. + // + // Deprecated: use [testing.B.ReportMetric] with unit "ns/op" instead. + Reset func() +} + +// Test runs the Program in the kernel with the given input and returns the +// value returned by the eBPF program. +// +// Note: the kernel expects at least 14 bytes input for an ethernet header for +// XDP and SKB programs. +// +// This function requires at least Linux 4.12. +func (p *Program) Test(in []byte) (uint32, []byte, error) { + // Older kernels ignore the dataSizeOut argument when copying to user space. + // Combined with things like bpf_xdp_adjust_head() we don't really know what the final + // size will be. Hence we allocate an output buffer which we hope will always be large + // enough, and panic if the kernel wrote past the end of the allocation. + // See https://patchwork.ozlabs.org/cover/1006822/ + var out []byte + if len(in) > 0 { + out = make([]byte, len(in)+outputPad) + } + + opts := RunOptions{ + Data: in, + DataOut: out, + Repeat: 1, + } + + ret, _, err := p.run(&opts) + if err != nil { + return ret, nil, fmt.Errorf("test program: %w", err) + } + return ret, opts.DataOut, nil +} + +// Run runs the Program in kernel with given RunOptions. +// +// Note: the same restrictions from Test apply. +func (p *Program) Run(opts *RunOptions) (uint32, error) { + if opts == nil { + opts = &RunOptions{} + } + + ret, _, err := p.run(opts) + if err != nil { + return ret, fmt.Errorf("run program: %w", err) + } + return ret, nil +} + +// Benchmark runs the Program with the given input for a number of times +// and returns the time taken per iteration. +// +// Returns the result of the last execution of the program and the time per +// run or an error. reset is called whenever the benchmark syscall is +// interrupted, and should be set to testing.B.ResetTimer or similar. +// +// This function requires at least Linux 4.12. +func (p *Program) Benchmark(in []byte, repeat int, reset func()) (uint32, time.Duration, error) { + if uint(repeat) > math.MaxUint32 { + return 0, 0, fmt.Errorf("repeat is too high") + } + + opts := RunOptions{ + Data: in, + Repeat: uint32(repeat), + Reset: reset, + } + + ret, total, err := p.run(&opts) + if err != nil { + return ret, total, fmt.Errorf("benchmark program: %w", err) + } + return ret, total, nil +} + +var haveProgRun = internal.NewFeatureTest("BPF_PROG_RUN", func() error { + prog, err := NewProgram(&ProgramSpec{ + // SocketFilter does not require privileges on newer kernels. + Type: SocketFilter, + Instructions: asm.Instructions{ + asm.LoadImm(asm.R0, 0, asm.DWord), + asm.Return(), + }, + License: "MIT", + }) + if err != nil { + // This may be because we lack sufficient permissions, etc. + return err + } + defer prog.Close() + + in := internal.EmptyBPFContext + attr := sys.ProgRunAttr{ + ProgFd: uint32(prog.FD()), + DataSizeIn: uint32(len(in)), + DataIn: sys.NewSlicePointer(in), + } + + err = sys.ProgRun(&attr) + switch { + case errors.Is(err, unix.EINVAL): + // Check for EINVAL specifically, rather than err != nil since we + // otherwise misdetect due to insufficient permissions. + return internal.ErrNotSupported + + case errors.Is(err, unix.EINTR): + // We know that PROG_TEST_RUN is supported if we get EINTR. + return nil + + case errors.Is(err, sys.ENOTSUPP): + // The first PROG_TEST_RUN patches shipped in 4.12 didn't include + // a test runner for SocketFilter. ENOTSUPP means PROG_TEST_RUN is + // supported, but not for the program type used in the probe. + return nil + } + + return err +}, "4.12") + +func (p *Program) run(opts *RunOptions) (uint32, time.Duration, error) { + if uint(len(opts.Data)) > math.MaxUint32 { + return 0, 0, fmt.Errorf("input is too long") + } + + if err := haveProgRun(); err != nil { + return 0, 0, err + } + + var ctxBytes []byte + if opts.Context != nil { + ctx := new(bytes.Buffer) + if err := binary.Write(ctx, internal.NativeEndian, opts.Context); err != nil { + return 0, 0, fmt.Errorf("cannot serialize context: %v", err) + } + ctxBytes = ctx.Bytes() + } + + var ctxOut []byte + if opts.ContextOut != nil { + ctxOut = make([]byte, binary.Size(opts.ContextOut)) + } + + attr := sys.ProgRunAttr{ + ProgFd: p.fd.Uint(), + DataSizeIn: uint32(len(opts.Data)), + DataSizeOut: uint32(len(opts.DataOut)), + DataIn: sys.NewSlicePointer(opts.Data), + DataOut: sys.NewSlicePointer(opts.DataOut), + Repeat: uint32(opts.Repeat), + CtxSizeIn: uint32(len(ctxBytes)), + CtxSizeOut: uint32(len(ctxOut)), + CtxIn: sys.NewSlicePointer(ctxBytes), + CtxOut: sys.NewSlicePointer(ctxOut), + Flags: opts.Flags, + Cpu: opts.CPU, + } + +retry: + for { + err := sys.ProgRun(&attr) + if err == nil { + break retry + } + + if errors.Is(err, unix.EINTR) { + if attr.Repeat <= 1 { + // Older kernels check whether enough repetitions have been + // executed only after checking for pending signals. + // + // run signal? done? run ... + // + // As a result we can get EINTR for repeat==1 even though + // the program was run exactly once. Treat this as a + // successful run instead. + // + // Since commit 607b9cc92bd7 ("bpf: Consolidate shared test timing code") + // the conditions are reversed: + // run done? signal? ... + break retry + } + + if opts.Reset != nil { + opts.Reset() + } + continue retry + } + + if errors.Is(err, sys.ENOTSUPP) { + return 0, 0, fmt.Errorf("kernel doesn't support running %s: %w", p.Type(), ErrNotSupported) + } + + return 0, 0, err + } + + if opts.DataOut != nil { + if int(attr.DataSizeOut) > cap(opts.DataOut) { + // Houston, we have a problem. The program created more data than we allocated, + // and the kernel wrote past the end of our buffer. + panic("kernel wrote past end of output buffer") + } + opts.DataOut = opts.DataOut[:int(attr.DataSizeOut)] + } + + if len(ctxOut) != 0 { + b := bytes.NewReader(ctxOut) + if err := binary.Read(b, internal.NativeEndian, opts.ContextOut); err != nil { + return 0, 0, fmt.Errorf("failed to decode ContextOut: %v", err) + } + } + + total := time.Duration(attr.Duration) * time.Nanosecond + return attr.Retval, total, nil +} + +func unmarshalProgram(buf sysenc.Buffer) (*Program, error) { + var id uint32 + if err := buf.Unmarshal(&id); err != nil { + return nil, err + } + + // Looking up an entry in a nested map or prog array returns an id, + // not an fd. + return NewProgramFromID(ProgramID(id)) +} + +func marshalProgram(p *Program, length int) ([]byte, error) { + if p == nil { + return nil, errors.New("can't marshal a nil Program") + } + + if length != 4 { + return nil, fmt.Errorf("can't marshal program to %d bytes", length) + } + + buf := make([]byte, 4) + internal.NativeEndian.PutUint32(buf, p.fd.Uint()) + return buf, nil +} + +// LoadPinnedProgram loads a Program from a pin (file) on the BPF virtual +// filesystem. +// +// Requires at least Linux 4.11. +func LoadPinnedProgram(fileName string, opts *LoadPinOptions) (*Program, error) { + fd, typ, err := sys.ObjGetTyped(&sys.ObjGetAttr{ + Pathname: sys.NewStringPointer(fileName), + FileFlags: opts.Marshal(), + }) + if err != nil { + return nil, err + } + + if typ != sys.BPF_TYPE_PROG { + _ = fd.Close() + return nil, fmt.Errorf("%s is not a Program", fileName) + } + + info, err := newProgramInfoFromFd(fd) + if err != nil { + _ = fd.Close() + return nil, fmt.Errorf("info for %s: %w", fileName, err) + } + + var progName string + if haveObjName() == nil { + progName = info.Name + } else { + progName = filepath.Base(fileName) + } + + return &Program{"", fd, progName, fileName, info.Type}, nil +} + +// SanitizeName replaces all invalid characters in name with replacement. +// Passing a negative value for replacement will delete characters instead +// of replacing them. Use this to automatically generate valid names for maps +// and programs at runtime. +// +// The set of allowed characters depends on the running kernel version. +// Dots are only allowed as of kernel 5.2. +func SanitizeName(name string, replacement rune) string { + return strings.Map(func(char rune) rune { + if invalidBPFObjNameChar(char) { + return replacement + } + return char + }, name) +} + +// ProgramGetNextID returns the ID of the next eBPF program. +// +// Returns ErrNotExist, if there is no next eBPF program. +func ProgramGetNextID(startID ProgramID) (ProgramID, error) { + attr := &sys.ProgGetNextIdAttr{Id: uint32(startID)} + return ProgramID(attr.NextId), sys.ProgGetNextId(attr) +} + +// BindMap binds map to the program and is only released once program is released. +// +// This may be used in cases where metadata should be associated with the program +// which otherwise does not contain any references to the map. +func (p *Program) BindMap(m *Map) error { + attr := &sys.ProgBindMapAttr{ + ProgFd: uint32(p.FD()), + MapFd: uint32(m.FD()), + } + + return sys.ProgBindMap(attr) +} + +var errUnrecognizedAttachType = errors.New("unrecognized attach type") + +// find an attach target type in the kernel. +// +// name, progType and attachType determine which type we need to attach to. +// +// The attach target may be in a loaded kernel module. +// In that case the returned handle will be non-nil. +// The caller is responsible for closing the handle. +// +// Returns errUnrecognizedAttachType if the combination of progType and attachType +// is not recognised. +func findProgramTargetInKernel(name string, progType ProgramType, attachType AttachType) (*btf.Handle, btf.TypeID, error) { + type match struct { + p ProgramType + a AttachType + } + + var ( + typeName, featureName string + target btf.Type + ) + + switch (match{progType, attachType}) { + case match{LSM, AttachLSMMac}: + typeName = "bpf_lsm_" + name + featureName = name + " LSM hook" + target = (*btf.Func)(nil) + case match{Tracing, AttachTraceIter}: + typeName = "bpf_iter_" + name + featureName = name + " iterator" + target = (*btf.Func)(nil) + case match{Tracing, AttachTraceFEntry}: + typeName = name + featureName = fmt.Sprintf("fentry %s", name) + target = (*btf.Func)(nil) + case match{Tracing, AttachTraceFExit}: + typeName = name + featureName = fmt.Sprintf("fexit %s", name) + target = (*btf.Func)(nil) + case match{Tracing, AttachModifyReturn}: + typeName = name + featureName = fmt.Sprintf("fmod_ret %s", name) + target = (*btf.Func)(nil) + case match{Tracing, AttachTraceRawTp}: + typeName = fmt.Sprintf("btf_trace_%s", name) + featureName = fmt.Sprintf("raw_tp %s", name) + target = (*btf.Typedef)(nil) + default: + return nil, 0, errUnrecognizedAttachType + } + + spec, err := btf.LoadKernelSpec() + if err != nil { + return nil, 0, fmt.Errorf("load kernel spec: %w", err) + } + + spec, module, err := findTargetInKernel(spec, typeName, &target) + if errors.Is(err, btf.ErrNotFound) { + return nil, 0, &internal.UnsupportedFeatureError{Name: featureName} + } + // See cilium/ebpf#894. Until we can disambiguate between equally-named kernel + // symbols, we should explicitly refuse program loads. They will not reliably + // do what the caller intended. + if errors.Is(err, btf.ErrMultipleMatches) { + return nil, 0, fmt.Errorf("attaching to ambiguous kernel symbol is not supported: %w", err) + } + if err != nil { + return nil, 0, fmt.Errorf("find target for %s: %w", featureName, err) + } + + id, err := spec.TypeID(target) + if err != nil { + module.Close() + return nil, 0, err + } + + return module, id, nil +} + +// findTargetInKernel attempts to find a named type in the current kernel. +// +// target will point at the found type after a successful call. Searches both +// vmlinux and any loaded modules. +// +// Returns a non-nil handle if the type was found in a module, or btf.ErrNotFound +// if the type wasn't found at all. +func findTargetInKernel(kernelSpec *btf.Spec, typeName string, target *btf.Type) (*btf.Spec, *btf.Handle, error) { + err := kernelSpec.TypeByName(typeName, target) + if errors.Is(err, btf.ErrNotFound) { + spec, module, err := findTargetInModule(kernelSpec, typeName, target) + if err != nil { + return nil, nil, fmt.Errorf("find target in modules: %w", err) + } + return spec, module, nil + } + if err != nil { + return nil, nil, fmt.Errorf("find target in vmlinux: %w", err) + } + return kernelSpec, nil, err +} + +// findTargetInModule attempts to find a named type in any loaded module. +// +// base must contain the kernel's types and is used to parse kmod BTF. Modules +// are searched in the order they were loaded. +// +// Returns btf.ErrNotFound if the target can't be found in any module. +func findTargetInModule(base *btf.Spec, typeName string, target *btf.Type) (*btf.Spec, *btf.Handle, error) { + it := new(btf.HandleIterator) + defer it.Handle.Close() + + for it.Next() { + info, err := it.Handle.Info() + if err != nil { + return nil, nil, fmt.Errorf("get info for BTF ID %d: %w", it.ID, err) + } + + if !info.IsModule() { + continue + } + + spec, err := it.Handle.Spec(base) + if err != nil { + return nil, nil, fmt.Errorf("parse types for module %s: %w", info.Name, err) + } + + err = spec.TypeByName(typeName, target) + if errors.Is(err, btf.ErrNotFound) { + continue + } + if err != nil { + return nil, nil, fmt.Errorf("lookup type in module %s: %w", info.Name, err) + } + + return spec, it.Take(), nil + } + if err := it.Err(); err != nil { + return nil, nil, fmt.Errorf("iterate modules: %w", err) + } + + return nil, nil, btf.ErrNotFound +} + +// find an attach target type in a program. +// +// Returns errUnrecognizedAttachType. +func findTargetInProgram(prog *Program, name string, progType ProgramType, attachType AttachType) (btf.TypeID, error) { + type match struct { + p ProgramType + a AttachType + } + + var typeName string + switch (match{progType, attachType}) { + case match{Extension, AttachNone}, + match{Tracing, AttachTraceFEntry}, + match{Tracing, AttachTraceFExit}: + typeName = name + default: + return 0, errUnrecognizedAttachType + } + + btfHandle, err := prog.Handle() + if err != nil { + return 0, fmt.Errorf("load target BTF: %w", err) + } + defer btfHandle.Close() + + spec, err := btfHandle.Spec(nil) + if err != nil { + return 0, err + } + + var targetFunc *btf.Func + err = spec.TypeByName(typeName, &targetFunc) + if err != nil { + return 0, fmt.Errorf("find target %s: %w", typeName, err) + } + + return spec.TypeID(targetFunc) +} diff --git a/vendor/github.com/cilium/ebpf/syscalls.go b/vendor/github.com/cilium/ebpf/syscalls.go new file mode 100644 index 0000000000..25c84c3c5c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/syscalls.go @@ -0,0 +1,355 @@ +package ebpf + +import ( + "bytes" + "errors" + "fmt" + "math" + "os" + "runtime" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/linux" + "github.com/cilium/ebpf/internal/sys" + "github.com/cilium/ebpf/internal/tracefs" + "github.com/cilium/ebpf/internal/unix" +) + +var ( + // pre-allocating these here since they may + // get called in hot code paths and cause + // unnecessary memory allocations + sysErrKeyNotExist = sys.Error(ErrKeyNotExist, unix.ENOENT) + sysErrKeyExist = sys.Error(ErrKeyExist, unix.EEXIST) + sysErrNotSupported = sys.Error(ErrNotSupported, sys.ENOTSUPP) +) + +// invalidBPFObjNameChar returns true if char may not appear in +// a BPF object name. +func invalidBPFObjNameChar(char rune) bool { + dotAllowed := objNameAllowsDot() == nil + + switch { + case char >= 'A' && char <= 'Z': + return false + case char >= 'a' && char <= 'z': + return false + case char >= '0' && char <= '9': + return false + case dotAllowed && char == '.': + return false + case char == '_': + return false + default: + return true + } +} + +func progLoad(insns asm.Instructions, typ ProgramType, license string) (*sys.FD, error) { + buf := bytes.NewBuffer(make([]byte, 0, insns.Size())) + if err := insns.Marshal(buf, internal.NativeEndian); err != nil { + return nil, err + } + bytecode := buf.Bytes() + + return sys.ProgLoad(&sys.ProgLoadAttr{ + ProgType: sys.ProgType(typ), + License: sys.NewStringPointer(license), + Insns: sys.NewSlicePointer(bytecode), + InsnCnt: uint32(len(bytecode) / asm.InstructionSize), + }) +} + +var haveNestedMaps = internal.NewFeatureTest("nested maps", func() error { + _, err := sys.MapCreate(&sys.MapCreateAttr{ + MapType: sys.MapType(ArrayOfMaps), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + // Invalid file descriptor. + InnerMapFd: ^uint32(0), + }) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}, "4.12") + +var haveMapMutabilityModifiers = internal.NewFeatureTest("read- and write-only maps", func() error { + // This checks BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG. Since + // BPF_MAP_FREEZE appeared in 5.2 as well we don't do a separate check. + m, err := sys.MapCreate(&sys.MapCreateAttr{ + MapType: sys.MapType(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapFlags: sys.BPF_F_RDONLY_PROG, + }) + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}, "5.2") + +var haveMmapableMaps = internal.NewFeatureTest("mmapable maps", func() error { + // This checks BPF_F_MMAPABLE, which appeared in 5.5 for array maps. + m, err := sys.MapCreate(&sys.MapCreateAttr{ + MapType: sys.MapType(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapFlags: sys.BPF_F_MMAPABLE, + }) + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}, "5.5") + +var haveInnerMaps = internal.NewFeatureTest("inner maps", func() error { + // This checks BPF_F_INNER_MAP, which appeared in 5.10. + m, err := sys.MapCreate(&sys.MapCreateAttr{ + MapType: sys.MapType(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapFlags: sys.BPF_F_INNER_MAP, + }) + + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}, "5.10") + +var haveNoPreallocMaps = internal.NewFeatureTest("prealloc maps", func() error { + // This checks BPF_F_NO_PREALLOC, which appeared in 4.6. + m, err := sys.MapCreate(&sys.MapCreateAttr{ + MapType: sys.MapType(Hash), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapFlags: sys.BPF_F_NO_PREALLOC, + }) + + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}, "4.6") + +func wrapMapError(err error) error { + if err == nil { + return nil + } + + if errors.Is(err, unix.ENOENT) { + return sysErrKeyNotExist + } + + if errors.Is(err, unix.EEXIST) { + return sysErrKeyExist + } + + if errors.Is(err, sys.ENOTSUPP) { + return sysErrNotSupported + } + + if errors.Is(err, unix.E2BIG) { + return fmt.Errorf("key too big for map: %w", err) + } + + return err +} + +var haveObjName = internal.NewFeatureTest("object names", func() error { + attr := sys.MapCreateAttr{ + MapType: sys.MapType(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapName: sys.NewObjName("feature_test"), + } + + // Tolerate EPERM as this runs during ELF loading which is potentially + // unprivileged. Only EINVAL is conclusive, thrown from CHECK_ATTR. + fd, err := sys.MapCreate(&attr) + if errors.Is(err, unix.EPERM) { + return nil + } + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if err != nil { + return err + } + + _ = fd.Close() + return nil +}, "4.15") + +var objNameAllowsDot = internal.NewFeatureTest("dot in object names", func() error { + if err := haveObjName(); err != nil { + return err + } + + attr := sys.MapCreateAttr{ + MapType: sys.MapType(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapName: sys.NewObjName(".test"), + } + + // Tolerate EPERM, otherwise MapSpec.Name has its dots removed when run by + // unprivileged tools. (bpf2go, other code gen). Only EINVAL is conclusive, + // thrown from bpf_obj_name_cpy(). + fd, err := sys.MapCreate(&attr) + if errors.Is(err, unix.EPERM) { + return nil + } + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if err != nil { + return err + } + + _ = fd.Close() + return nil +}, "5.2") + +var haveBatchAPI = internal.NewFeatureTest("map batch api", func() error { + var maxEntries uint32 = 2 + attr := sys.MapCreateAttr{ + MapType: sys.MapType(Hash), + KeySize: 4, + ValueSize: 4, + MaxEntries: maxEntries, + } + + fd, err := sys.MapCreate(&attr) + if err != nil { + return internal.ErrNotSupported + } + defer fd.Close() + + keys := []uint32{1, 2} + values := []uint32{3, 4} + kp, _ := marshalMapSyscallInput(keys, 8) + vp, _ := marshalMapSyscallInput(values, 8) + + err = sys.MapUpdateBatch(&sys.MapUpdateBatchAttr{ + MapFd: fd.Uint(), + Keys: kp, + Values: vp, + Count: maxEntries, + }) + if err != nil { + return internal.ErrNotSupported + } + return nil +}, "5.6") + +var haveProbeReadKernel = internal.NewFeatureTest("bpf_probe_read_kernel", func() error { + insns := asm.Instructions{ + asm.Mov.Reg(asm.R1, asm.R10), + asm.Add.Imm(asm.R1, -8), + asm.Mov.Imm(asm.R2, 8), + asm.Mov.Imm(asm.R3, 0), + asm.FnProbeReadKernel.Call(), + asm.Return(), + } + + fd, err := progLoad(insns, Kprobe, "GPL") + if err != nil { + return internal.ErrNotSupported + } + _ = fd.Close() + return nil +}, "5.5") + +var haveBPFToBPFCalls = internal.NewFeatureTest("bpf2bpf calls", func() error { + insns := asm.Instructions{ + asm.Call.Label("prog2").WithSymbol("prog1"), + asm.Return(), + asm.Mov.Imm(asm.R0, 0).WithSymbol("prog2"), + asm.Return(), + } + + fd, err := progLoad(insns, SocketFilter, "MIT") + if err != nil { + return internal.ErrNotSupported + } + _ = fd.Close() + return nil +}, "4.16") + +var haveSyscallWrapper = internal.NewFeatureTest("syscall wrapper", func() error { + prefix := linux.PlatformPrefix() + if prefix == "" { + return fmt.Errorf("unable to find the platform prefix for (%s)", runtime.GOARCH) + } + + args := tracefs.ProbeArgs{ + Type: tracefs.Kprobe, + Symbol: prefix + "sys_bpf", + Pid: -1, + } + + var err error + args.Group, err = tracefs.RandomGroup("ebpf_probe") + if err != nil { + return err + } + + evt, err := tracefs.NewEvent(args) + if errors.Is(err, os.ErrNotExist) { + return internal.ErrNotSupported + } + if err != nil { + return err + } + + return evt.Close() +}, "4.17") + +var haveProgramExtInfos = internal.NewFeatureTest("program ext_infos", func() error { + insns := asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + } + + buf := bytes.NewBuffer(make([]byte, 0, insns.Size())) + if err := insns.Marshal(buf, internal.NativeEndian); err != nil { + return err + } + bytecode := buf.Bytes() + + _, err := sys.ProgLoad(&sys.ProgLoadAttr{ + ProgType: sys.ProgType(SocketFilter), + License: sys.NewStringPointer("MIT"), + Insns: sys.NewSlicePointer(bytecode), + InsnCnt: uint32(len(bytecode) / asm.InstructionSize), + FuncInfoCnt: 1, + ProgBtfFd: math.MaxUint32, + }) + + if errors.Is(err, unix.EBADF) { + return nil + } + + if errors.Is(err, unix.E2BIG) { + return ErrNotSupported + } + + return err +}, "5.0") diff --git a/vendor/github.com/cilium/ebpf/types.go b/vendor/github.com/cilium/ebpf/types.go new file mode 100644 index 0000000000..d7fb00509d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types.go @@ -0,0 +1,322 @@ +package ebpf + +import ( + "github.com/cilium/ebpf/internal/sys" +) + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -output types_string.go -type=MapType,ProgramType,PinType + +// MapType indicates the type map structure +// that will be initialized in the kernel. +type MapType uint32 + +// All the various map types that can be created +const ( + UnspecifiedMap MapType = iota + // Hash is a hash map + Hash + // Array is an array map + Array + // ProgramArray - A program array map is a special kind of array map whose map + // values contain only file descriptors referring to other eBPF + // programs. Thus, both the key_size and value_size must be + // exactly four bytes. This map is used in conjunction with the + // TailCall helper. + ProgramArray + // PerfEventArray - A perf event array is used in conjunction with PerfEventRead + // and PerfEventOutput calls, to read the raw bpf_perf_data from the registers. + PerfEventArray + // PerCPUHash - This data structure is useful for people who have high performance + // network needs and can reconcile adds at the end of some cycle, so that + // hashes can be lock free without the use of XAdd, which can be costly. + PerCPUHash + // PerCPUArray - This data structure is useful for people who have high performance + // network needs and can reconcile adds at the end of some cycle, so that + // hashes can be lock free without the use of XAdd, which can be costly. + // Each CPU gets a copy of this hash, the contents of all of which can be reconciled + // later. + PerCPUArray + // StackTrace - This holds whole user and kernel stack traces, it can be retrieved with + // GetStackID + StackTrace + // CGroupArray - This is a very niche structure used to help SKBInCGroup determine + // if an skb is from a socket belonging to a specific cgroup + CGroupArray + // LRUHash - This allows you to create a small hash structure that will purge the + // least recently used items rather than throw an error when you run out of memory + LRUHash + // LRUCPUHash - This is NOT like PerCPUHash, this structure is shared among the CPUs, + // it has more to do with including the CPU id with the LRU calculation so that if a + // particular CPU is using a value over-and-over again, then it will be saved, but if + // a value is being retrieved a lot but sparsely across CPUs it is not as important, basically + // giving weight to CPU locality over overall usage. + LRUCPUHash + // LPMTrie - This is an implementation of Longest-Prefix-Match Trie structure. It is useful, + // for storing things like IP addresses which can be bit masked allowing for keys of differing + // values to refer to the same reference based on their masks. See wikipedia for more details. + LPMTrie + // ArrayOfMaps - Each item in the array is another map. The inner map mustn't be a map of maps + // itself. + ArrayOfMaps + // HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps + // itself. + HashOfMaps + // DevMap - Specialized map to store references to network devices. + DevMap + // SockMap - Specialized map to store references to sockets. + SockMap + // CPUMap - Specialized map to store references to CPUs. + CPUMap + // XSKMap - Specialized map for XDP programs to store references to open sockets. + XSKMap + // SockHash - Specialized hash to store references to sockets. + SockHash + // CGroupStorage - Special map for CGroups. + CGroupStorage + // ReusePortSockArray - Specialized map to store references to sockets that can be reused. + ReusePortSockArray + // PerCPUCGroupStorage - Special per CPU map for CGroups. + PerCPUCGroupStorage + // Queue - FIFO storage for BPF programs. + Queue + // Stack - LIFO storage for BPF programs. + Stack + // SkStorage - Specialized map for local storage at SK for BPF programs. + SkStorage + // DevMapHash - Hash-based indexing scheme for references to network devices. + DevMapHash + // StructOpsMap - This map holds a kernel struct with its function pointer implemented in a BPF + // program. + StructOpsMap + // RingBuf - Similar to PerfEventArray, but shared across all CPUs. + RingBuf + // InodeStorage - Specialized local storage map for inodes. + InodeStorage + // TaskStorage - Specialized local storage map for task_struct. + TaskStorage + // BloomFilter - Space-efficient data structure to quickly test whether an element exists in a set. + BloomFilter + // UserRingbuf - The reverse of RingBuf, used to send messages from user space to BPF programs. + UserRingbuf + // CgroupStorage - Store data keyed on a cgroup. If the cgroup disappears, the key is automatically removed. + CgroupStorage + // Arena - Sparse shared memory region between a BPF program and user space. + Arena +) + +// hasPerCPUValue returns true if the Map stores a value per CPU. +func (mt MapType) hasPerCPUValue() bool { + return mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash || mt == PerCPUCGroupStorage +} + +// canStoreMapOrProgram returns true if the Map stores references to another Map +// or Program. +func (mt MapType) canStoreMapOrProgram() bool { + return mt.canStoreMap() || mt.canStoreProgram() +} + +// canStoreMap returns true if the map type accepts a map fd +// for update and returns a map id for lookup. +func (mt MapType) canStoreMap() bool { + return mt == ArrayOfMaps || mt == HashOfMaps +} + +// canStoreProgram returns true if the map type accepts a program fd +// for update and returns a program id for lookup. +func (mt MapType) canStoreProgram() bool { + return mt == ProgramArray +} + +// canHaveValueSize returns true if the map type supports setting a value size. +func (mt MapType) canHaveValueSize() bool { + switch mt { + case RingBuf, Arena: + return false + + // Special-case perf events since they require a value size of either 0 or 4 + // for historical reasons. Let the library fix this up later. + case PerfEventArray: + return false + } + + return true +} + +// ProgramType of the eBPF program +type ProgramType uint32 + +// eBPF program types +const ( + UnspecifiedProgram = ProgramType(sys.BPF_PROG_TYPE_UNSPEC) + SocketFilter = ProgramType(sys.BPF_PROG_TYPE_SOCKET_FILTER) + Kprobe = ProgramType(sys.BPF_PROG_TYPE_KPROBE) + SchedCLS = ProgramType(sys.BPF_PROG_TYPE_SCHED_CLS) + SchedACT = ProgramType(sys.BPF_PROG_TYPE_SCHED_ACT) + TracePoint = ProgramType(sys.BPF_PROG_TYPE_TRACEPOINT) + XDP = ProgramType(sys.BPF_PROG_TYPE_XDP) + PerfEvent = ProgramType(sys.BPF_PROG_TYPE_PERF_EVENT) + CGroupSKB = ProgramType(sys.BPF_PROG_TYPE_CGROUP_SKB) + CGroupSock = ProgramType(sys.BPF_PROG_TYPE_CGROUP_SOCK) + LWTIn = ProgramType(sys.BPF_PROG_TYPE_LWT_IN) + LWTOut = ProgramType(sys.BPF_PROG_TYPE_LWT_OUT) + LWTXmit = ProgramType(sys.BPF_PROG_TYPE_LWT_XMIT) + SockOps = ProgramType(sys.BPF_PROG_TYPE_SOCK_OPS) + SkSKB = ProgramType(sys.BPF_PROG_TYPE_SK_SKB) + CGroupDevice = ProgramType(sys.BPF_PROG_TYPE_CGROUP_DEVICE) + SkMsg = ProgramType(sys.BPF_PROG_TYPE_SK_MSG) + RawTracepoint = ProgramType(sys.BPF_PROG_TYPE_RAW_TRACEPOINT) + CGroupSockAddr = ProgramType(sys.BPF_PROG_TYPE_CGROUP_SOCK_ADDR) + LWTSeg6Local = ProgramType(sys.BPF_PROG_TYPE_LWT_SEG6LOCAL) + LircMode2 = ProgramType(sys.BPF_PROG_TYPE_LIRC_MODE2) + SkReuseport = ProgramType(sys.BPF_PROG_TYPE_SK_REUSEPORT) + FlowDissector = ProgramType(sys.BPF_PROG_TYPE_FLOW_DISSECTOR) + CGroupSysctl = ProgramType(sys.BPF_PROG_TYPE_CGROUP_SYSCTL) + RawTracepointWritable = ProgramType(sys.BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) + CGroupSockopt = ProgramType(sys.BPF_PROG_TYPE_CGROUP_SOCKOPT) + Tracing = ProgramType(sys.BPF_PROG_TYPE_TRACING) + StructOps = ProgramType(sys.BPF_PROG_TYPE_STRUCT_OPS) + Extension = ProgramType(sys.BPF_PROG_TYPE_EXT) + LSM = ProgramType(sys.BPF_PROG_TYPE_LSM) + SkLookup = ProgramType(sys.BPF_PROG_TYPE_SK_LOOKUP) + Syscall = ProgramType(sys.BPF_PROG_TYPE_SYSCALL) + Netfilter = ProgramType(sys.BPF_PROG_TYPE_NETFILTER) +) + +// AttachType of the eBPF program, needed to differentiate allowed context accesses in +// some newer program types like CGroupSockAddr. Should be set to AttachNone if not required. +// Will cause invalid argument (EINVAL) at program load time if set incorrectly. +type AttachType uint32 + +//go:generate go run golang.org/x/tools/cmd/stringer@latest -type AttachType -trimprefix Attach + +// AttachNone is an alias for AttachCGroupInetIngress for readability reasons. +const AttachNone AttachType = 0 + +const ( + AttachCGroupInetIngress = AttachType(sys.BPF_CGROUP_INET_INGRESS) + AttachCGroupInetEgress = AttachType(sys.BPF_CGROUP_INET_EGRESS) + AttachCGroupInetSockCreate = AttachType(sys.BPF_CGROUP_INET_SOCK_CREATE) + AttachCGroupSockOps = AttachType(sys.BPF_CGROUP_SOCK_OPS) + AttachSkSKBStreamParser = AttachType(sys.BPF_SK_SKB_STREAM_PARSER) + AttachSkSKBStreamVerdict = AttachType(sys.BPF_SK_SKB_STREAM_VERDICT) + AttachCGroupDevice = AttachType(sys.BPF_CGROUP_DEVICE) + AttachSkMsgVerdict = AttachType(sys.BPF_SK_MSG_VERDICT) + AttachCGroupInet4Bind = AttachType(sys.BPF_CGROUP_INET4_BIND) + AttachCGroupInet6Bind = AttachType(sys.BPF_CGROUP_INET6_BIND) + AttachCGroupInet4Connect = AttachType(sys.BPF_CGROUP_INET4_CONNECT) + AttachCGroupInet6Connect = AttachType(sys.BPF_CGROUP_INET6_CONNECT) + AttachCGroupInet4PostBind = AttachType(sys.BPF_CGROUP_INET4_POST_BIND) + AttachCGroupInet6PostBind = AttachType(sys.BPF_CGROUP_INET6_POST_BIND) + AttachCGroupUDP4Sendmsg = AttachType(sys.BPF_CGROUP_UDP4_SENDMSG) + AttachCGroupUDP6Sendmsg = AttachType(sys.BPF_CGROUP_UDP6_SENDMSG) + AttachLircMode2 = AttachType(sys.BPF_LIRC_MODE2) + AttachFlowDissector = AttachType(sys.BPF_FLOW_DISSECTOR) + AttachCGroupSysctl = AttachType(sys.BPF_CGROUP_SYSCTL) + AttachCGroupUDP4Recvmsg = AttachType(sys.BPF_CGROUP_UDP4_RECVMSG) + AttachCGroupUDP6Recvmsg = AttachType(sys.BPF_CGROUP_UDP6_RECVMSG) + AttachCGroupGetsockopt = AttachType(sys.BPF_CGROUP_GETSOCKOPT) + AttachCGroupSetsockopt = AttachType(sys.BPF_CGROUP_SETSOCKOPT) + AttachTraceRawTp = AttachType(sys.BPF_TRACE_RAW_TP) + AttachTraceFEntry = AttachType(sys.BPF_TRACE_FENTRY) + AttachTraceFExit = AttachType(sys.BPF_TRACE_FEXIT) + AttachModifyReturn = AttachType(sys.BPF_MODIFY_RETURN) + AttachLSMMac = AttachType(sys.BPF_LSM_MAC) + AttachTraceIter = AttachType(sys.BPF_TRACE_ITER) + AttachCgroupInet4GetPeername = AttachType(sys.BPF_CGROUP_INET4_GETPEERNAME) + AttachCgroupInet6GetPeername = AttachType(sys.BPF_CGROUP_INET6_GETPEERNAME) + AttachCgroupInet4GetSockname = AttachType(sys.BPF_CGROUP_INET4_GETSOCKNAME) + AttachCgroupInet6GetSockname = AttachType(sys.BPF_CGROUP_INET6_GETSOCKNAME) + AttachXDPDevMap = AttachType(sys.BPF_XDP_DEVMAP) + AttachCgroupInetSockRelease = AttachType(sys.BPF_CGROUP_INET_SOCK_RELEASE) + AttachXDPCPUMap = AttachType(sys.BPF_XDP_CPUMAP) + AttachSkLookup = AttachType(sys.BPF_SK_LOOKUP) + AttachXDP = AttachType(sys.BPF_XDP) + AttachSkSKBVerdict = AttachType(sys.BPF_SK_SKB_VERDICT) + AttachSkReuseportSelect = AttachType(sys.BPF_SK_REUSEPORT_SELECT) + AttachSkReuseportSelectOrMigrate = AttachType(sys.BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) + AttachPerfEvent = AttachType(sys.BPF_PERF_EVENT) + AttachTraceKprobeMulti = AttachType(sys.BPF_TRACE_KPROBE_MULTI) + AttachTraceKprobeSession = AttachType(sys.BPF_TRACE_KPROBE_SESSION) + AttachLSMCgroup = AttachType(sys.BPF_LSM_CGROUP) + AttachStructOps = AttachType(sys.BPF_STRUCT_OPS) + AttachNetfilter = AttachType(sys.BPF_NETFILTER) + AttachTCXIngress = AttachType(sys.BPF_TCX_INGRESS) + AttachTCXEgress = AttachType(sys.BPF_TCX_EGRESS) + AttachTraceUprobeMulti = AttachType(sys.BPF_TRACE_UPROBE_MULTI) + AttachCgroupUnixConnect = AttachType(sys.BPF_CGROUP_UNIX_CONNECT) + AttachCgroupUnixSendmsg = AttachType(sys.BPF_CGROUP_UNIX_SENDMSG) + AttachCgroupUnixRecvmsg = AttachType(sys.BPF_CGROUP_UNIX_RECVMSG) + AttachCgroupUnixGetpeername = AttachType(sys.BPF_CGROUP_UNIX_GETPEERNAME) + AttachCgroupUnixGetsockname = AttachType(sys.BPF_CGROUP_UNIX_GETSOCKNAME) + AttachNetkitPrimary = AttachType(sys.BPF_NETKIT_PRIMARY) + AttachNetkitPeer = AttachType(sys.BPF_NETKIT_PEER) +) + +// AttachFlags of the eBPF program used in BPF_PROG_ATTACH command +type AttachFlags uint32 + +// PinType determines whether a map is pinned into a BPFFS. +type PinType uint32 + +// Valid pin types. +// +// Mirrors enum libbpf_pin_type. +const ( + PinNone PinType = iota + // Pin an object by using its name as the filename. + PinByName +) + +// LoadPinOptions control how a pinned object is loaded. +type LoadPinOptions struct { + // Request a read-only or write-only object. The default is a read-write + // object. Only one of the flags may be set. + ReadOnly bool + WriteOnly bool + + // Raw flags for the syscall. Other fields of this struct take precedence. + Flags uint32 +} + +// Marshal returns a value suitable for BPF_OBJ_GET syscall file_flags parameter. +func (lpo *LoadPinOptions) Marshal() uint32 { + if lpo == nil { + return 0 + } + + flags := lpo.Flags + if lpo.ReadOnly { + flags |= sys.BPF_F_RDONLY + } + if lpo.WriteOnly { + flags |= sys.BPF_F_WRONLY + } + return flags +} + +// BatchOptions batch map operations options +// +// Mirrors libbpf struct bpf_map_batch_opts +// Currently BPF_F_FLAG is the only supported +// flag (for ElemFlags). +type BatchOptions struct { + ElemFlags uint64 + Flags uint64 +} + +// LogLevel controls the verbosity of the kernel's eBPF program verifier. +// These constants can be used for the ProgramOptions.LogLevel field. +type LogLevel = sys.LogLevel + +const ( + // Print verifier state at branch points. + LogLevelBranch = sys.BPF_LOG_LEVEL1 + + // Print verifier state for every instruction. + // Available since Linux v5.2. + LogLevelInstruction = sys.BPF_LOG_LEVEL2 + + // Print verifier errors and stats at the end of the verification process. + // Available since Linux v5.2. + LogLevelStats = sys.BPF_LOG_STATS +) diff --git a/vendor/github.com/cilium/ebpf/types_string.go b/vendor/github.com/cilium/ebpf/types_string.go new file mode 100644 index 0000000000..f06685112c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/types_string.go @@ -0,0 +1,123 @@ +// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,PinType"; DO NOT EDIT. + +package ebpf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[UnspecifiedMap-0] + _ = x[Hash-1] + _ = x[Array-2] + _ = x[ProgramArray-3] + _ = x[PerfEventArray-4] + _ = x[PerCPUHash-5] + _ = x[PerCPUArray-6] + _ = x[StackTrace-7] + _ = x[CGroupArray-8] + _ = x[LRUHash-9] + _ = x[LRUCPUHash-10] + _ = x[LPMTrie-11] + _ = x[ArrayOfMaps-12] + _ = x[HashOfMaps-13] + _ = x[DevMap-14] + _ = x[SockMap-15] + _ = x[CPUMap-16] + _ = x[XSKMap-17] + _ = x[SockHash-18] + _ = x[CGroupStorage-19] + _ = x[ReusePortSockArray-20] + _ = x[PerCPUCGroupStorage-21] + _ = x[Queue-22] + _ = x[Stack-23] + _ = x[SkStorage-24] + _ = x[DevMapHash-25] + _ = x[StructOpsMap-26] + _ = x[RingBuf-27] + _ = x[InodeStorage-28] + _ = x[TaskStorage-29] + _ = x[BloomFilter-30] + _ = x[UserRingbuf-31] + _ = x[CgroupStorage-32] + _ = x[Arena-33] +} + +const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMapsDevMapSockMapCPUMapXSKMapSockHashCGroupStorageReusePortSockArrayPerCPUCGroupStorageQueueStackSkStorageDevMapHashStructOpsMapRingBufInodeStorageTaskStorageBloomFilterUserRingbufCgroupStorageArena" + +var _MapType_index = [...]uint16{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136, 142, 149, 155, 161, 169, 182, 200, 219, 224, 229, 238, 248, 260, 267, 279, 290, 301, 312, 325, 330} + +func (i MapType) String() string { + if i >= MapType(len(_MapType_index)-1) { + return "MapType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _MapType_name[_MapType_index[i]:_MapType_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[UnspecifiedProgram-0] + _ = x[SocketFilter-1] + _ = x[Kprobe-2] + _ = x[SchedCLS-3] + _ = x[SchedACT-4] + _ = x[TracePoint-5] + _ = x[XDP-6] + _ = x[PerfEvent-7] + _ = x[CGroupSKB-8] + _ = x[CGroupSock-9] + _ = x[LWTIn-10] + _ = x[LWTOut-11] + _ = x[LWTXmit-12] + _ = x[SockOps-13] + _ = x[SkSKB-14] + _ = x[CGroupDevice-15] + _ = x[SkMsg-16] + _ = x[RawTracepoint-17] + _ = x[CGroupSockAddr-18] + _ = x[LWTSeg6Local-19] + _ = x[LircMode2-20] + _ = x[SkReuseport-21] + _ = x[FlowDissector-22] + _ = x[CGroupSysctl-23] + _ = x[RawTracepointWritable-24] + _ = x[CGroupSockopt-25] + _ = x[Tracing-26] + _ = x[StructOps-27] + _ = x[Extension-28] + _ = x[LSM-29] + _ = x[SkLookup-30] + _ = x[Syscall-31] + _ = x[Netfilter-32] +} + +const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracingStructOpsExtensionLSMSkLookupSyscallNetfilter" + +var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265, 274, 283, 286, 294, 301, 310} + +func (i ProgramType) String() string { + if i >= ProgramType(len(_ProgramType_index)-1) { + return "ProgramType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[PinNone-0] + _ = x[PinByName-1] +} + +const _PinType_name = "PinNonePinByName" + +var _PinType_index = [...]uint8{0, 7, 16} + +func (i PinType) String() string { + if i >= PinType(len(_PinType_index)-1) { + return "PinType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _PinType_name[_PinType_index[i]:_PinType_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/variable.go b/vendor/github.com/cilium/ebpf/variable.go new file mode 100644 index 0000000000..288b173a11 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/variable.go @@ -0,0 +1,230 @@ +package ebpf + +import ( + "fmt" + "io" + + "github.com/cilium/ebpf/btf" + "github.com/cilium/ebpf/internal/sysenc" +) + +// VariableSpec is a convenience wrapper for modifying global variables of a +// CollectionSpec before loading it into the kernel. +// +// All operations on a VariableSpec's underlying MapSpec are performed in the +// host's native endianness. +type VariableSpec struct { + name string + offset uint64 + size uint64 + + m *MapSpec + t *btf.Var +} + +// Set sets the value of the VariableSpec to the provided input using the host's +// native endianness. +func (s *VariableSpec) Set(in any) error { + buf, err := sysenc.Marshal(in, int(s.size)) + if err != nil { + return fmt.Errorf("marshaling value %s: %w", s.name, err) + } + + b, _, err := s.m.dataSection() + if err != nil { + return fmt.Errorf("getting data section of map %s: %w", s.m.Name, err) + } + + if int(s.offset+s.size) > len(b) { + return fmt.Errorf("offset %d(+%d) for variable %s is out of bounds", s.offset, s.size, s.name) + } + + // MapSpec.Copy() performs a shallow copy. Fully copy the byte slice + // to avoid any changes affecting other copies of the MapSpec. + cpy := make([]byte, len(b)) + copy(cpy, b) + + buf.CopyTo(cpy[s.offset : s.offset+s.size]) + + s.m.Contents[0] = MapKV{Key: uint32(0), Value: cpy} + + return nil +} + +// Get writes the value of the VariableSpec to the provided output using the +// host's native endianness. +func (s *VariableSpec) Get(out any) error { + b, _, err := s.m.dataSection() + if err != nil { + return fmt.Errorf("getting data section of map %s: %w", s.m.Name, err) + } + + if int(s.offset+s.size) > len(b) { + return fmt.Errorf("offset %d(+%d) for variable %s is out of bounds", s.offset, s.size, s.name) + } + + if err := sysenc.Unmarshal(out, b[s.offset:s.offset+s.size]); err != nil { + return fmt.Errorf("unmarshaling value: %w", err) + } + + return nil +} + +// Size returns the size of the variable in bytes. +func (s *VariableSpec) Size() uint64 { + return s.size +} + +// MapName returns the name of the underlying MapSpec. +func (s *VariableSpec) MapName() string { + return s.m.Name +} + +// Offset returns the offset of the variable in the underlying MapSpec. +func (s *VariableSpec) Offset() uint64 { + return s.offset +} + +// Constant returns true if the VariableSpec represents a variable that is +// read-only from the perspective of the BPF program. +func (s *VariableSpec) Constant() bool { + return s.m.readOnly() +} + +// Type returns the [btf.Var] representing the variable in its data section. +// This is useful for inspecting the variable's decl tags and the type +// information of the inner type. +// +// Returns nil if the original ELF object did not contain BTF information. +func (s *VariableSpec) Type() *btf.Var { + return s.t +} + +func (s *VariableSpec) String() string { + return fmt.Sprintf("%s (type=%v, map=%s, offset=%d, size=%d)", s.name, s.t, s.m.Name, s.offset, s.size) +} + +// copy returns a new VariableSpec with the same values as the original, +// but with a different underlying MapSpec. This is useful when copying a +// CollectionSpec. Returns nil if a MapSpec with the same name is not found. +func (s *VariableSpec) copy(cpy *CollectionSpec) *VariableSpec { + out := &VariableSpec{ + name: s.name, + offset: s.offset, + size: s.size, + } + if s.t != nil { + out.t = btf.Copy(s.t).(*btf.Var) + } + + // Attempt to find a MapSpec with the same name in the copied CollectionSpec. + for _, m := range cpy.Maps { + if m.Name == s.m.Name { + out.m = m + return out + } + } + + return nil +} + +// Variable is a convenience wrapper for modifying global variables of a +// Collection after loading it into the kernel. Operations on a Variable are +// performed using direct memory access, bypassing the BPF map syscall API. +// +// On kernels older than 5.5, most interactions with Variable return +// [ErrNotSupported]. +type Variable struct { + name string + offset uint64 + size uint64 + t *btf.Var + + mm *Memory +} + +func newVariable(name string, offset, size uint64, t *btf.Var, mm *Memory) (*Variable, error) { + if mm != nil { + if int(offset+size) > mm.Size() { + return nil, fmt.Errorf("offset %d(+%d) is out of bounds", offset, size) + } + } + + return &Variable{ + name: name, + offset: offset, + size: size, + t: t, + mm: mm, + }, nil +} + +// Size returns the size of the variable. +func (v *Variable) Size() uint64 { + return v.size +} + +// ReadOnly returns true if the Variable represents a variable that is read-only +// after loading the Collection into the kernel. +// +// On systems without BPF_F_MMAPABLE support, ReadOnly always returns true. +func (v *Variable) ReadOnly() bool { + if v.mm == nil { + return true + } + return v.mm.ReadOnly() +} + +// Type returns the [btf.Var] representing the variable in its data section. +// This is useful for inspecting the variable's decl tags and the type +// information of the inner type. +// +// Returns nil if the original ELF object did not contain BTF information. +func (v *Variable) Type() *btf.Var { + return v.t +} + +func (v *Variable) String() string { + return fmt.Sprintf("%s (type=%v)", v.name, v.t) +} + +// Set the value of the Variable to the provided input. The input must marshal +// to the same length as the size of the Variable. +func (v *Variable) Set(in any) error { + if v.mm == nil { + return fmt.Errorf("variable %s: direct access requires Linux 5.5 or later: %w", v.name, ErrNotSupported) + } + + if v.ReadOnly() { + return fmt.Errorf("variable %s: %w", v.name, ErrReadOnly) + } + + buf, err := sysenc.Marshal(in, int(v.size)) + if err != nil { + return fmt.Errorf("marshaling value %s: %w", v.name, err) + } + + if _, err := v.mm.WriteAt(buf.Bytes(), int64(v.offset)); err != nil { + return fmt.Errorf("writing value to %s: %w", v.name, err) + } + + return nil +} + +// Get writes the value of the Variable to the provided output. The output must +// be a pointer to a value whose size matches the Variable. +func (v *Variable) Get(out any) error { + if v.mm == nil { + return fmt.Errorf("variable %s: direct access requires Linux 5.5 or later: %w", v.name, ErrNotSupported) + } + + if !v.mm.bounds(v.offset, v.size) { + return fmt.Errorf("variable %s: access out of bounds: %w", v.name, io.EOF) + } + + if err := sysenc.Unmarshal(out, v.mm.b[v.offset:v.offset+v.size]); err != nil { + return fmt.Errorf("unmarshaling value %s: %w", v.name, err) + } + + return nil +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/cpu.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/cpu.go new file mode 100644 index 0000000000..dcb253db56 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/cpu.go @@ -0,0 +1,83 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "math" + "strconv" + "strings" +) + +type CPUMax string + +func NewCPUMax(quota *int64, period *uint64) CPUMax { + max := "max" + if quota != nil { + max = strconv.FormatInt(*quota, 10) + } + return CPUMax(strings.Join([]string{max, strconv.FormatUint(*period, 10)}, " ")) +} + +type CPU struct { + Weight *uint64 + Max CPUMax + Cpus string + Mems string +} + +func (c CPUMax) extractQuotaAndPeriod() (int64, uint64) { + var ( + quota int64 + period uint64 + ) + values := strings.Split(string(c), " ") + if values[0] == "max" { + quota = math.MaxInt64 + } else { + quota, _ = strconv.ParseInt(values[0], 10, 64) + } + period, _ = strconv.ParseUint(values[1], 10, 64) + return quota, period +} + +func (r *CPU) Values() (o []Value) { + if r.Weight != nil { + o = append(o, Value{ + filename: "cpu.weight", + value: *r.Weight, + }) + } + if r.Max != "" { + o = append(o, Value{ + filename: "cpu.max", + value: r.Max, + }) + } + if r.Cpus != "" { + o = append(o, Value{ + filename: "cpuset.cpus", + value: r.Cpus, + }) + } + if r.Mems != "" { + o = append(o, Value{ + filename: "cpuset.mems", + value: r.Mems, + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/devicefilter.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/devicefilter.go new file mode 100644 index 0000000000..94a2854c8d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/devicefilter.go @@ -0,0 +1,200 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Devicefilter contains eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +// +// This particular Go implementation based on runc version +// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go + +package cgroup2 + +import ( + "errors" + "fmt" + "math" + + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns eBPF device filter program and its license string +func DeviceFilter(devices []specs.LinuxDeviceCgroup) (asm.Instructions, string, error) { + p := &program{} + p.init() + for i := len(devices) - 1; i >= 0; i-- { + if err := p.appendDevice(devices[i]); err != nil { + return nil, "", err + } + } + insts, err := p.finalize() + return insts, license, err +} + +type program struct { + insts asm.Instructions + hasWildCard bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Half)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. +func (p *program) appendDevice(dev specs.LinuxDeviceCgroup) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + if p.hasWildCard { + // All entries after wildcard entry are ignored + return nil + } + + bpfType := int32(-1) + hasType := true + switch dev.Type { + case string('c'): + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case string('b'): + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + case string('a'): + hasType = false + default: + // if not specified in OCI json, typ is set to DeviceTypeAll + return fmt.Errorf("invalid DeviceType %q", dev.Type) + } + if *dev.Major > math.MaxUint32 { + return fmt.Errorf("invalid major %d", *dev.Major) + } + if *dev.Minor > math.MaxUint32 { + return fmt.Errorf("invalid minor %d", *dev.Major) + } + hasMajor := *dev.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := *dev.Minor >= 0 + bpfAccess := int32(0) + for _, r := range dev.Access { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return fmt.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. + hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + blockSym := fmt.Sprintf("block-%d", p.blockID) + nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1) + prevBlockLastIdx := len(p.insts) - 1 + if hasType { + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + } + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JEq.Imm(asm.R1, 0, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(*dev.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(*dev.Minor), nextBlockSym), + ) + } + if !hasType && !hasAccess && !hasMajor && !hasMinor { + p.hasWildCard = true + } + p.insts = append(p.insts, acceptBlock(dev.Allow)...) + // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() (asm.Instructions, error) { + if p.hasWildCard { + // acceptBlock with asm.Return() is already inserted + return p.insts, nil + } + blockSym := fmt.Sprintf("block-%d", p.blockID) + p.insts = append(p.insts, + // R0 <- 0 + asm.Mov.Imm32(asm.R0, 0).WithSymbol(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts, nil +} + +func acceptBlock(accept bool) asm.Instructions { + v := int32(0) + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/ebpf.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/ebpf.go new file mode 100644 index 0000000000..503a147bbd --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/ebpf.go @@ -0,0 +1,96 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { + nilCloser := func() error { + return nil + } + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + Target: dirFD, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI, + }) + if err != nil { + return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) + } + closer := func() error { + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFD, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) + } + return nil + } + return closer, nil +} + +func isRWM(cgroupPermissions string) bool { + r := false + w := false + m := false + for _, rn := range cgroupPermissions { + switch rn { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// the logic is from runc +// https://github.com/opencontainers/runc/blob/master/libcontainer/cgroups/fs/devices_v2.go#L44 +func canSkipEBPFError(devices []specs.LinuxDeviceCgroup) bool { + for _, dev := range devices { + if dev.Allow || !isRWM(dev.Access) { + return false + } + } + return true +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/errors.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/errors.go new file mode 100644 index 0000000000..f57e15e54d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/errors.go @@ -0,0 +1,26 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "errors" +) + +var ( + ErrInvalidFormat = errors.New("cgroups: parsing file with invalid format failed") + ErrInvalidGroupPath = errors.New("cgroups: invalid group path") +) diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/hugetlb.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/hugetlb.go new file mode 100644 index 0000000000..b476b49ff2 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/hugetlb.go @@ -0,0 +1,37 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import "strings" + +type HugeTlb []HugeTlbEntry + +type HugeTlbEntry struct { + HugePageSize string + Limit uint64 +} + +func (r *HugeTlb) Values() (o []Value) { + for _, e := range *r { + o = append(o, Value{ + filename: strings.Join([]string{"hugetlb", e.HugePageSize, "max"}, "."), + value: e.Limit, + }) + } + + return o +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/io.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/io.go new file mode 100644 index 0000000000..b70dd8ef5d --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/io.go @@ -0,0 +1,64 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import "fmt" + +type IOType string + +const ( + ReadBPS IOType = "rbps" + WriteBPS IOType = "wbps" + ReadIOPS IOType = "riops" + WriteIOPS IOType = "wiops" +) + +type BFQ struct { + Weight uint16 +} + +type Entry struct { + Type IOType + Major int64 + Minor int64 + Rate uint64 +} + +func (e Entry) String() string { + return fmt.Sprintf("%d:%d %s=%d", e.Major, e.Minor, e.Type, e.Rate) +} + +type IO struct { + BFQ BFQ + Max []Entry +} + +func (i *IO) Values() (o []Value) { + if i.BFQ.Weight != 0 { + o = append(o, Value{ + filename: "io.bfq.weight", + value: i.BFQ.Weight, + }) + } + for _, e := range i.Max { + o = append(o, Value{ + filename: "io.max", + value: e.String(), + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go new file mode 100644 index 0000000000..b446939d0b --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/manager.go @@ -0,0 +1,1007 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "bufio" + "context" + "errors" + "fmt" + "io/fs" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/containerd/cgroups/v3/cgroup2/stats" + + "github.com/containerd/log" + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +const ( + subtreeControl = "cgroup.subtree_control" + controllersFile = "cgroup.controllers" + killFile = "cgroup.kill" + typeFile = "cgroup.type" + defaultCgroup2Path = "/sys/fs/cgroup" + defaultSlice = "system.slice" +) + +var canDelegate bool + +type Event struct { + Low uint64 + High uint64 + Max uint64 + OOM uint64 + OOMKill uint64 +} + +// Resources for a cgroups v2 unified hierarchy +type Resources struct { + CPU *CPU + Memory *Memory + Pids *Pids + IO *IO + RDMA *RDMA + HugeTlb *HugeTlb + // When len(Devices) is zero, devices are not controlled + Devices []specs.LinuxDeviceCgroup +} + +// Values returns the raw filenames and values that +// can be written to the unified hierarchy +func (r *Resources) Values() (o []Value) { + if r.CPU != nil { + o = append(o, r.CPU.Values()...) + } + if r.Memory != nil { + o = append(o, r.Memory.Values()...) + } + if r.Pids != nil { + o = append(o, r.Pids.Values()...) + } + if r.IO != nil { + o = append(o, r.IO.Values()...) + } + if r.RDMA != nil { + o = append(o, r.RDMA.Values()...) + } + if r.HugeTlb != nil { + o = append(o, r.HugeTlb.Values()...) + } + return o +} + +// EnabledControllers returns the list of all not nil resource controllers +func (r *Resources) EnabledControllers() (c []string) { + if r.CPU != nil { + c = append(c, "cpu") + if r.CPU.Cpus != "" || r.CPU.Mems != "" { + c = append(c, "cpuset") + } + } + if r.Memory != nil { + c = append(c, "memory") + } + if r.Pids != nil { + c = append(c, "pids") + } + if r.IO != nil { + c = append(c, "io") + } + if r.RDMA != nil { + c = append(c, "rdma") + } + if r.HugeTlb != nil { + c = append(c, "hugetlb") + } + return +} + +// Value of a cgroup setting +type Value struct { + filename string + value interface{} +} + +// write the value to the full, absolute path, of a unified hierarchy +func (c *Value) write(path string, perm os.FileMode) error { + var data []byte + switch t := c.value.(type) { + case uint64: + data = []byte(strconv.FormatUint(t, 10)) + case uint16: + data = []byte(strconv.FormatUint(uint64(t), 10)) + case int64: + data = []byte(strconv.FormatInt(t, 10)) + case []byte: + data = t + case string: + data = []byte(t) + case CPUMax: + data = []byte(t) + default: + return ErrInvalidFormat + } + + return os.WriteFile( + filepath.Join(path, c.filename), + data, + perm, + ) +} + +func writeValues(path string, values []Value) error { + for _, o := range values { + if err := o.write(path, defaultFilePerm); err != nil { + return err + } + } + return nil +} + +func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) { + if resources == nil { + return nil, errors.New("resources reference is nil") + } + if err := VerifyGroupPath(group); err != nil { + return nil, err + } + path := filepath.Join(mountpoint, group) + if err := os.MkdirAll(path, defaultDirPerm); err != nil { + return nil, err + } + m := Manager{ + unifiedMountpoint: mountpoint, + path: path, + } + if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil { + // clean up cgroup dir on failure + os.Remove(path) + return nil, err + } + if err := setResources(path, resources); err != nil { + os.Remove(path) + return nil, err + } + return &m, nil +} + +type InitConfig struct { + mountpoint string +} + +type InitOpts func(c *InitConfig) error + +// WithMountpoint sets the unified mountpoint. The default path is /sys/fs/cgroup. +func WithMountpoint(path string) InitOpts { + return func(c *InitConfig) error { + c.mountpoint = path + return nil + } +} + +// Load a cgroup. +func Load(group string, opts ...InitOpts) (*Manager, error) { + c := InitConfig{mountpoint: defaultCgroup2Path} + for _, opt := range opts { + if err := opt(&c); err != nil { + return nil, err + } + } + + if err := VerifyGroupPath(group); err != nil { + return nil, err + } + path := filepath.Join(c.mountpoint, group) + return &Manager{ + unifiedMountpoint: c.mountpoint, + path: path, + }, nil +} + +type Manager struct { + unifiedMountpoint string + path string +} + +func setResources(path string, resources *Resources) error { + if resources != nil { + if err := writeValues(path, resources.Values()); err != nil { + return err + } + if err := setDevices(path, resources.Devices); err != nil { + return err + } + } + return nil +} + +// CgroupType represents the types a cgroup can be. +type CgroupType string + +const ( + Domain CgroupType = "domain" + DomainThreaded CgroupType = "domain threaded" + DomainInvalid CgroupType = "domain invalid" + Threaded CgroupType = "threaded" +) + +func (c *Manager) GetType() (CgroupType, error) { + val, err := os.ReadFile(filepath.Join(c.path, typeFile)) + if err != nil { + return "", err + } + trimmed := strings.TrimSpace(string(val)) + return CgroupType(trimmed), nil +} + +func (c *Manager) SetType(cgType CgroupType) error { + // NOTE: We could abort if cgType != Threaded here as currently + // it's not possible to revert back to domain, but not sure + // it's worth being that opinionated, especially if that may + // ever change. + v := Value{ + filename: typeFile, + value: string(cgType), + } + return writeValues(c.path, []Value{v}) +} + +func (c *Manager) RootControllers() ([]string, error) { + b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile)) + if err != nil { + return nil, err + } + return strings.Fields(string(b)), nil +} + +func (c *Manager) Controllers() ([]string, error) { + b, err := os.ReadFile(filepath.Join(c.path, controllersFile)) + if err != nil { + return nil, err + } + return strings.Fields(string(b)), nil +} + +func (c *Manager) Update(resources *Resources) error { + return setResources(c.path, resources) +} + +type ControllerToggle int + +const ( + Enable ControllerToggle = iota + 1 + Disable +) + +func toggleFunc(controllers []string, prefix string) []string { + out := make([]string, len(controllers)) + for i, c := range controllers { + out[i] = prefix + c + } + return out +} + +func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error { + // when c.path is like /foo/bar/baz, the following files need to be written: + // * /sys/fs/cgroup/cgroup.subtree_control + // * /sys/fs/cgroup/foo/cgroup.subtree_control + // * /sys/fs/cgroup/foo/bar/cgroup.subtree_control + // Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written. + split := strings.Split(c.path, "/") + var lastErr error + for i := range split { + f := strings.Join(split[:i], "/") + if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path { + continue + } + filePath := filepath.Join(f, subtreeControl) + if err := c.writeSubtreeControl(filePath, controllers, t); err != nil { + // When running as rootless, the user may face EPERM on parent groups, but it is negligible when the + // controller is already written. + // So we only return the last error. + lastErr = fmt.Errorf("failed to write subtree controllers %+v to %q: %w", controllers, filePath, err) + } else { + lastErr = nil + } + } + return lastErr +} + +func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error { + f, err := os.OpenFile(filePath, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + switch t { + case Enable: + controllers = toggleFunc(controllers, "+") + case Disable: + controllers = toggleFunc(controllers, "-") + } + _, err = f.WriteString(strings.Join(controllers, " ")) + return err +} + +func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) { + if strings.HasPrefix(name, "/") { + return nil, errors.New("name must be relative") + } + path := filepath.Join(c.path, name) + if err := os.MkdirAll(path, defaultDirPerm); err != nil { + return nil, err + } + m := Manager{ + unifiedMountpoint: c.unifiedMountpoint, + path: path, + } + if resources != nil { + if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil { + // clean up cgroup dir on failure + os.Remove(path) + return nil, err + } + } + if err := setResources(path, resources); err != nil { + // clean up cgroup dir on failure + os.Remove(path) + return nil, err + } + return &m, nil +} + +func (c *Manager) AddProc(pid uint64) error { + v := Value{ + filename: cgroupProcs, + value: pid, + } + return writeValues(c.path, []Value{v}) +} + +func (c *Manager) AddThread(tid uint64) error { + v := Value{ + filename: cgroupThreads, + value: tid, + } + return writeValues(c.path, []Value{v}) +} + +// Kill will try to forcibly exit all of the processes in the cgroup. This is +// equivalent to sending a SIGKILL to every process. On kernels 5.14 and greater +// this will use the cgroup.kill file, on anything that doesn't have the cgroup.kill +// file, a manual process of freezing -> sending a SIGKILL to every process -> thawing +// will be used. +func (c *Manager) Kill() error { + v := Value{ + filename: killFile, + value: "1", + } + err := writeValues(c.path, []Value{v}) + if err == nil { + return nil + } + log.L.Warnf("falling back to slower kill implementation: %s", err) + // Fallback to slow method. + return c.fallbackKill() +} + +// fallbackKill is a slower fallback to the more modern (kernels 5.14+) +// approach of writing to the cgroup.kill file. This is heavily pulled +// from runc's same approach (in signalAllProcesses), with the only differences +// being this is just tailored to the API exposed in this library, and we don't +// need to care about signals other than SIGKILL. +// +// https://github.com/opencontainers/runc/blob/8da0a0b5675764feaaaaad466f6567a9983fcd08/libcontainer/init_linux.go#L523-L529 +func (c *Manager) fallbackKill() error { + logger := log.G(context.TODO()).WithFields(log.Fields{"path": c.path}) + + if err := c.Freeze(); err != nil { + logger.WithError(err).Warn("freezing cgroup2.manager") + } + pids, err := c.Procs(true) + if err != nil { + if err := c.Thaw(); err != nil { + logger.WithError(err).Warn("thawing cgroup2.manager") + } + return err + } + var procs []*os.Process + for _, pid := range pids { + p, err := os.FindProcess(int(pid)) + if err != nil { + logger.WithFields(log.Fields{"error": err, "pid": int(pid)}).Warnf("finding process") + continue + } + procs = append(procs, p) + if err := p.Signal(unix.SIGKILL); err != nil { + logger.WithFields(log.Fields{"error": err, "pid": int(pid)}).Warnf("signaling process") + } + } + if err := c.Thaw(); err != nil { + logger.WithError(err).Warn("thawing cgroup2.manager") + } + + subreaper, err := getSubreaper() + if err != nil { + // The error here means that PR_GET_CHILD_SUBREAPER is not + // supported because this code might run on a kernel older + // than 3.4. We don't want to throw an error in that case, + // and we simplify things, considering there is no subreaper + // set. + subreaper = 0 + } + + for _, p := range procs { + // In case a subreaper has been setup, this code must not + // wait for the process. Otherwise, we cannot be sure the + // current process will be reaped by the subreaper, while + // the subreaper might be waiting for this process in order + // to retrieve its exit code. + if subreaper == 0 { + if _, err := p.Wait(); err != nil { + if !errors.Is(err, unix.ECHILD) { + logger.WithFields(log.Fields{"error": err, "pid": p.Pid}).Warn("waiting on process") + } + } + } + } + return nil +} + +func (c *Manager) Delete() error { + // Kernel prevents cgroups with running process from being removed, + // check the tree is empty. + // + // Pick the right file to read based on the cgs type. + cgType, err := c.GetType() + if err != nil { + if !os.IsNotExist(err) { + return err + } + } + + var tasks []uint64 + switch cgType { + case Threaded, DomainThreaded: + tasks, err = c.Threads(true) + default: + tasks, err = c.Procs(true) + } + if err != nil { + return err + } + if len(tasks) > 0 { + return fmt.Errorf("cgroups: unable to remove path %q: still contains running tasks", c.path) + } + return remove(c.path) +} + +func (c *Manager) getTasks(recursive bool, tType string) ([]uint64, error) { + var tasks []uint64 + err := filepath.Walk(c.path, func(p string, info fs.FileInfo, err error) error { + if err != nil { + return err + } + if !recursive && info.IsDir() { + if p == c.path { + return nil + } + return filepath.SkipDir + } + _, name := filepath.Split(p) + if name != tType { + return nil + } + curTasks, err := parseCgroupTasksFile(p) + if err != nil { + return err + } + tasks = append(tasks, curTasks...) + return nil + }) + return tasks, err +} + +func (c *Manager) Procs(recursive bool) ([]uint64, error) { + return c.getTasks(recursive, cgroupProcs) +} + +func (c *Manager) Threads(recursive bool) ([]uint64, error) { + return c.getTasks(recursive, cgroupThreads) +} + +func (c *Manager) MoveTo(destination *Manager) error { + processes, err := c.Procs(true) + if err != nil { + return err + } + for _, p := range processes { + if err := destination.AddProc(p); err != nil { + if strings.Contains(err.Error(), "no such process") { + continue + } + return err + } + } + return nil +} + +func (c *Manager) Stat() (*stats.Metrics, error) { + controllers, err := c.Controllers() + if err != nil { + return nil, err + } + // Sizing this avoids an allocation to increase the map at runtime; + // currently the default bucket size is 8 and we put 40+ elements + // in it so we'd always end up allocating. + out := make(map[string]uint64, 50) + for _, controller := range controllers { + switch controller { + case "cpu", "memory": + if err := readKVStatsFile(c.path, controller+".stat", out); err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + } + } + memoryEvents := make(map[string]uint64) + if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil { + if !os.IsNotExist(err) { + return nil, err + } + } + + var metrics stats.Metrics + metrics.Pids = &stats.PidsStat{ + Current: getStatFileContentUint64(filepath.Join(c.path, "pids.current")), + Limit: getStatFileContentUint64(filepath.Join(c.path, "pids.max")), + } + metrics.CPU = &stats.CPUStat{ + UsageUsec: out["usage_usec"], + UserUsec: out["user_usec"], + SystemUsec: out["system_usec"], + NrPeriods: out["nr_periods"], + NrThrottled: out["nr_throttled"], + ThrottledUsec: out["throttled_usec"], + PSI: getStatPSIFromFile(filepath.Join(c.path, "cpu.pressure")), + } + metrics.Memory = &stats.MemoryStat{ + Anon: out["anon"], + File: out["file"], + KernelStack: out["kernel_stack"], + Slab: out["slab"], + Sock: out["sock"], + Shmem: out["shmem"], + FileMapped: out["file_mapped"], + FileDirty: out["file_dirty"], + FileWriteback: out["file_writeback"], + AnonThp: out["anon_thp"], + InactiveAnon: out["inactive_anon"], + ActiveAnon: out["active_anon"], + InactiveFile: out["inactive_file"], + ActiveFile: out["active_file"], + Unevictable: out["unevictable"], + SlabReclaimable: out["slab_reclaimable"], + SlabUnreclaimable: out["slab_unreclaimable"], + Pgfault: out["pgfault"], + Pgmajfault: out["pgmajfault"], + WorkingsetRefault: out["workingset_refault"], + WorkingsetActivate: out["workingset_activate"], + WorkingsetNodereclaim: out["workingset_nodereclaim"], + Pgrefill: out["pgrefill"], + Pgscan: out["pgscan"], + Pgsteal: out["pgsteal"], + Pgactivate: out["pgactivate"], + Pgdeactivate: out["pgdeactivate"], + Pglazyfree: out["pglazyfree"], + Pglazyfreed: out["pglazyfreed"], + ThpFaultAlloc: out["thp_fault_alloc"], + ThpCollapseAlloc: out["thp_collapse_alloc"], + Usage: getStatFileContentUint64(filepath.Join(c.path, "memory.current")), + UsageLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.max")), + MaxUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.peak")), + SwapUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")), + SwapLimit: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")), + SwapMaxUsage: getStatFileContentUint64(filepath.Join(c.path, "memory.swap.peak")), + PSI: getStatPSIFromFile(filepath.Join(c.path, "memory.pressure")), + } + if len(memoryEvents) > 0 { + metrics.MemoryEvents = &stats.MemoryEvents{ + Low: memoryEvents["low"], + High: memoryEvents["high"], + Max: memoryEvents["max"], + Oom: memoryEvents["oom"], + OomKill: memoryEvents["oom_kill"], + } + } + metrics.Io = &stats.IOStat{ + Usage: readIoStats(c.path), + PSI: getStatPSIFromFile(filepath.Join(c.path, "io.pressure")), + } + metrics.Rdma = &stats.RdmaStat{ + Current: rdmaStats(filepath.Join(c.path, "rdma.current")), + Limit: rdmaStats(filepath.Join(c.path, "rdma.max")), + } + metrics.Hugetlb = readHugeTlbStats(c.path) + + return &metrics, nil +} + +func readKVStatsFile(path string, file string, out map[string]uint64) error { + f, err := os.Open(filepath.Join(path, file)) + if err != nil { + return err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + name, value, err := parseKV(s.Text()) + if err != nil { + return fmt.Errorf("error while parsing %s (line=%q): %w", filepath.Join(path, file), s.Text(), err) + } + out[name] = value + } + return s.Err() +} + +func (c *Manager) Freeze() error { + return c.freeze(c.path, Frozen) +} + +func (c *Manager) Thaw() error { + return c.freeze(c.path, Thawed) +} + +func (c *Manager) freeze(path string, state State) error { + values := state.Values() + for { + if err := writeValues(path, values); err != nil { + return err + } + current, err := fetchState(path) + if err != nil { + return err + } + if current == state { + return nil + } + time.Sleep(1 * time.Millisecond) + } +} + +func (c *Manager) isCgroupEmpty() bool { + // In case of any error we return true so that we exit and don't leak resources + out := make(map[string]uint64) + if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil { + return true + } + if v, ok := out["populated"]; ok { + return v == 0 + } + return true +} + +// MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor +func (c *Manager) MemoryEventFD() (int, uint32, error) { + fpath := filepath.Join(c.path, "memory.events") + fd, err := unix.InotifyInit() + if err != nil { + return 0, 0, fmt.Errorf("failed to create inotify fd: %w", err) + } + wd, err := unix.InotifyAddWatch(fd, fpath, unix.IN_MODIFY) + if err != nil { + unix.Close(fd) + return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err) + } + // monitor to detect process exit/cgroup deletion + evpath := filepath.Join(c.path, "cgroup.events") + if _, err = unix.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil { + unix.Close(fd) + return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err) + } + + return fd, uint32(wd), nil +} + +func (c *Manager) EventChan() (<-chan Event, <-chan error) { + ec := make(chan Event) + errCh := make(chan error, 1) + go c.waitForEvents(ec, errCh) + + return ec, errCh +} + +func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) { + defer close(errCh) + + fd, _, err := c.MemoryEventFD() + if err != nil { + errCh <- err + return + } + defer unix.Close(fd) + + for { + buffer := make([]byte, unix.SizeofInotifyEvent*10) + bytesRead, err := unix.Read(fd, buffer) + if err != nil { + errCh <- err + return + } + if bytesRead >= unix.SizeofInotifyEvent { + out := make(map[string]uint64) + if err := readKVStatsFile(c.path, "memory.events", out); err != nil { + // When cgroup is deleted read may return -ENODEV instead of -ENOENT from open. + if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) { + errCh <- err + } + return + } + ec <- Event{ + Low: out["low"], + High: out["high"], + Max: out["max"], + OOM: out["oom"], + OOMKill: out["oom_kill"], + } + if c.isCgroupEmpty() { + return + } + } + } +} + +func setDevices(path string, devices []specs.LinuxDeviceCgroup) error { + if len(devices) == 0 { + return nil + } + insts, license, err := DeviceFilter(devices) + if err != nil { + return err + } + dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0o600) + if err != nil { + return fmt.Errorf("cannot get dir FD for %s", path) + } + defer unix.Close(dirFD) + if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(devices) { + return err + } + } + return nil +} + +// getSystemdFullPath returns the full systemd path when creating a systemd slice group. +// the reason this is necessary is because the "-" character has a special meaning in +// systemd slice. For example, when creating a slice called "my-group-112233.slice", +// systemd will create a hierarchy like this: +// +// /sys/fs/cgroup/my.slice/my-group.slice/my-group-112233.slice +func getSystemdFullPath(slice, group string) string { + return filepath.Join(defaultCgroup2Path, dashesToPath(slice), dashesToPath(group)) +} + +// dashesToPath converts a slice name with dashes to it's corresponding systemd filesystem path. +func dashesToPath(in string) string { + path := "" + if strings.HasSuffix(in, ".slice") && strings.Contains(in, "-") { + parts := strings.Split(in, "-") + for i := range parts { + s := strings.Join(parts[0:i+1], "-") + if !strings.HasSuffix(s, ".slice") { + s += ".slice" + } + path = filepath.Join(path, s) + } + } else { + path = filepath.Join(path, in) + } + return path +} + +func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) { + if slice == "" { + slice = defaultSlice + } + ctx := context.TODO() + path := getSystemdFullPath(slice, group) + conn, err := systemdDbus.NewWithContext(ctx) + if err != nil { + return &Manager{}, err + } + defer conn.Close() + + properties := []systemdDbus.Property{ + systemdDbus.PropDescription("cgroup " + group), + newSystemdProperty("DefaultDependencies", false), + newSystemdProperty("MemoryAccounting", true), + newSystemdProperty("CPUAccounting", true), + newSystemdProperty("IOAccounting", true), + } + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(group, ".slice") { + properties = append(properties, systemdDbus.PropWants(defaultSlice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(defaultSlice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)})) + } + + if resources.Memory != nil && resources.Memory.Min != nil && *resources.Memory.Min != 0 { + properties = append(properties, + newSystemdProperty("MemoryMin", uint64(*resources.Memory.Min))) + } + + if resources.Memory != nil && resources.Memory.Max != nil && *resources.Memory.Max != 0 { + properties = append(properties, + newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max))) + } + + if resources.CPU != nil && resources.CPU.Weight != nil && *resources.CPU.Weight != 0 { + properties = append(properties, + newSystemdProperty("CPUWeight", *resources.CPU.Weight)) + } + + if resources.CPU != nil && resources.CPU.Max != "" { + quota, period := resources.CPU.Max.extractQuotaAndPeriod() + // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. + // corresponds to USEC_INFINITY in systemd + // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd + // always setting a property value ensures we can apply a quota and remove it later + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + properties = append(properties, + newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } + + // If we can delegate, we add the property back in + if canDelegate { + properties = append(properties, newSystemdProperty("Delegate", true)) + } + + if resources.Pids != nil && resources.Pids.Max > 0 { + properties = append(properties, + newSystemdProperty("TasksAccounting", true), + newSystemdProperty("TasksMax", uint64(resources.Pids.Max))) + } + + if err := startUnit(conn, group, properties, pid == -1); err != nil { + return &Manager{}, err + } + + return &Manager{ + path: path, + }, nil +} + +func startUnit(conn *systemdDbus.Conn, group string, properties []systemdDbus.Property, ignoreExists bool) error { + ctx := context.TODO() + + statusChan := make(chan string, 1) + defer close(statusChan) + + retry := true + started := false + + for !started { + if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err != nil { + if !isUnitExists(err) { + return err + } + + if ignoreExists { + return nil + } + + if retry { + retry = false + // When a unit of the same name already exists, it may be a leftover failed unit. + // If we reset it once, systemd can try to remove it. + attemptFailedUnitReset(conn, group) + continue + } + + return err + } else { + started = true + } + } + + systemdStartUnitTimeout := 30 * time.Second + select { + case s := <-statusChan: + if s != "done" { + attemptFailedUnitReset(conn, group) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", group, s) + } + case <-time.After(systemdStartUnitTimeout): + attemptFailedUnitReset(conn, group) + return fmt.Errorf("timed out while waiting for StartTransientUnit(%s) completion signal from dbus after %v", group, systemdStartUnitTimeout) + } + + return nil +} + +func attemptFailedUnitReset(conn *systemdDbus.Conn, group string) { + ctx := context.TODO() + err := conn.ResetFailedUnitContext(ctx, group) + + if err != nil { + log.G(ctx).Warnf("Unable to reset failed unit: %v", err) + } +} + +func LoadSystemd(slice, group string) (*Manager, error) { + if slice == "" { + slice = defaultSlice + } + path := getSystemdFullPath(slice, group) + return &Manager{ + path: path, + }, nil +} + +func (c *Manager) DeleteSystemd() error { + ctx := context.TODO() + conn, err := systemdDbus.NewWithContext(ctx) + if err != nil { + return err + } + defer conn.Close() + group := systemdUnitFromPath(c.path) + ch := make(chan string) + _, err = conn.StopUnitContext(ctx, group, "replace", ch) + if err != nil { + return err + } + <-ch + return nil +} + +func newSystemdProperty(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/memory.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/memory.go new file mode 100644 index 0000000000..8cc8962bd8 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/memory.go @@ -0,0 +1,59 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +type Memory struct { + Swap *int64 + Min *int64 + Max *int64 + Low *int64 + High *int64 +} + +func (r *Memory) Values() (o []Value) { + if r.Swap != nil { + o = append(o, Value{ + filename: "memory.swap.max", + value: *r.Swap, + }) + } + if r.Min != nil { + o = append(o, Value{ + filename: "memory.min", + value: *r.Min, + }) + } + if r.Max != nil { + o = append(o, Value{ + filename: "memory.max", + value: *r.Max, + }) + } + if r.Low != nil { + o = append(o, Value{ + filename: "memory.low", + value: *r.Low, + }) + } + if r.High != nil { + o = append(o, Value{ + filename: "memory.high", + value: *r.High, + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/paths.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/paths.go new file mode 100644 index 0000000000..074bb83bc4 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/paths.go @@ -0,0 +1,60 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "fmt" + "path/filepath" + "strings" +) + +// NestedGroupPath will nest the cgroups based on the calling processes cgroup +// placing its child processes inside its own path +func NestedGroupPath(suffix string) (string, error) { + path, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + return filepath.Join(path, suffix), nil +} + +// PidGroupPath will return the correct cgroup paths for an existing process running inside a cgroup +// This is commonly used for the Load function to restore an existing container +func PidGroupPath(pid int) (string, error) { + p := fmt.Sprintf("/proc/%d/cgroup", pid) + return parseCgroupFile(p) +} + +// VerifyGroupPath verifies the format of group path string g. +// The format is same as the third field in /proc/PID/cgroup. +// e.g. "/user.slice/user-1001.slice/session-1.scope" +// +// g must be a "clean" absolute path starts with "/", and must not contain "/sys/fs/cgroup" prefix. +// +// VerifyGroupPath doesn't verify whether g actually exists on the system. +func VerifyGroupPath(g string) error { + if !strings.HasPrefix(g, "/") { + return ErrInvalidGroupPath + } + if filepath.Clean(g) != g { + return ErrInvalidGroupPath + } + if strings.HasPrefix(g, "/sys/fs/cgroup") { + return ErrInvalidGroupPath + } + return nil +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/pids.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/pids.go new file mode 100644 index 0000000000..7de2632606 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/pids.go @@ -0,0 +1,37 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import "strconv" + +type Pids struct { + Max int64 +} + +func (r *Pids) Values() (o []Value) { + if r.Max != 0 { + limit := "max" + if r.Max > 0 { + limit = strconv.FormatInt(r.Max, 10) + } + o = append(o, Value{ + filename: "pids.max", + value: limit, + }) + } + return o +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/rdma.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/rdma.go new file mode 100644 index 0000000000..2a65117f10 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/rdma.go @@ -0,0 +1,46 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "fmt" +) + +type RDMA struct { + Limit []RDMAEntry +} + +type RDMAEntry struct { + Device string + HcaHandles uint32 + HcaObjects uint32 +} + +func (r RDMAEntry) String() string { + return fmt.Sprintf("%s hca_handle=%d hca_object=%d", r.Device, r.HcaHandles, r.HcaObjects) +} + +func (r *RDMA) Values() (o []Value) { + for _, e := range r.Limit { + o = append(o, Value{ + filename: "rdma.max", + value: e.String(), + }) + } + + return o +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/state.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/state.go new file mode 100644 index 0000000000..886e94b291 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/state.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "os" + "path/filepath" + "strings" +) + +// State is a type that represents the state of the current cgroup +type State string + +const ( + Unknown State = "" + Thawed State = "thawed" + Frozen State = "frozen" + Deleted State = "deleted" + + cgroupFreeze = "cgroup.freeze" +) + +func (s State) Values() []Value { + v := Value{ + filename: cgroupFreeze, + } + switch s { + case Frozen: + v.value = "1" + case Thawed: + v.value = "0" + } + return []Value{ + v, + } +} + +func fetchState(path string) (State, error) { + current, err := os.ReadFile(filepath.Join(path, cgroupFreeze)) + if err != nil { + return Unknown, err + } + switch strings.TrimSpace(string(current)) { + case "1": + return Frozen, nil + case "0": + return Thawed, nil + default: + return Unknown, nil + } +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/doc.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/doc.go new file mode 100644 index 0000000000..e51e12f800 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/doc.go @@ -0,0 +1,17 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package stats diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.pb.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.pb.go new file mode 100644 index 0000000000..3d53c224ce --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.pb.go @@ -0,0 +1,1558 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.21.5 +// source: github.com/containerd/cgroups/cgroup2/stats/metrics.proto + +package stats + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type Metrics struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Pids *PidsStat `protobuf:"bytes,1,opt,name=pids,proto3" json:"pids,omitempty"` + CPU *CPUStat `protobuf:"bytes,2,opt,name=cpu,proto3" json:"cpu,omitempty"` + Memory *MemoryStat `protobuf:"bytes,4,opt,name=memory,proto3" json:"memory,omitempty"` + Rdma *RdmaStat `protobuf:"bytes,5,opt,name=rdma,proto3" json:"rdma,omitempty"` + Io *IOStat `protobuf:"bytes,6,opt,name=io,proto3" json:"io,omitempty"` + Hugetlb []*HugeTlbStat `protobuf:"bytes,7,rep,name=hugetlb,proto3" json:"hugetlb,omitempty"` + MemoryEvents *MemoryEvents `protobuf:"bytes,8,opt,name=memory_events,json=memoryEvents,proto3" json:"memory_events,omitempty"` +} + +func (x *Metrics) Reset() { + *x = Metrics{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Metrics) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Metrics) ProtoMessage() {} + +func (x *Metrics) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Metrics.ProtoReflect.Descriptor instead. +func (*Metrics) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{0} +} + +func (x *Metrics) GetPids() *PidsStat { + if x != nil { + return x.Pids + } + return nil +} + +func (x *Metrics) GetCPU() *CPUStat { + if x != nil { + return x.CPU + } + return nil +} + +func (x *Metrics) GetMemory() *MemoryStat { + if x != nil { + return x.Memory + } + return nil +} + +func (x *Metrics) GetRdma() *RdmaStat { + if x != nil { + return x.Rdma + } + return nil +} + +func (x *Metrics) GetIo() *IOStat { + if x != nil { + return x.Io + } + return nil +} + +func (x *Metrics) GetHugetlb() []*HugeTlbStat { + if x != nil { + return x.Hugetlb + } + return nil +} + +func (x *Metrics) GetMemoryEvents() *MemoryEvents { + if x != nil { + return x.MemoryEvents + } + return nil +} + +type PSIData struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Avg10 float64 `protobuf:"fixed64,1,opt,name=avg10,proto3" json:"avg10,omitempty"` + Avg60 float64 `protobuf:"fixed64,2,opt,name=avg60,proto3" json:"avg60,omitempty"` + Avg300 float64 `protobuf:"fixed64,3,opt,name=avg300,proto3" json:"avg300,omitempty"` + Total uint64 `protobuf:"varint,4,opt,name=total,proto3" json:"total,omitempty"` +} + +func (x *PSIData) Reset() { + *x = PSIData{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *PSIData) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PSIData) ProtoMessage() {} + +func (x *PSIData) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PSIData.ProtoReflect.Descriptor instead. +func (*PSIData) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{1} +} + +func (x *PSIData) GetAvg10() float64 { + if x != nil { + return x.Avg10 + } + return 0 +} + +func (x *PSIData) GetAvg60() float64 { + if x != nil { + return x.Avg60 + } + return 0 +} + +func (x *PSIData) GetAvg300() float64 { + if x != nil { + return x.Avg300 + } + return 0 +} + +func (x *PSIData) GetTotal() uint64 { + if x != nil { + return x.Total + } + return 0 +} + +type PSIStats struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Some *PSIData `protobuf:"bytes,1,opt,name=some,proto3" json:"some,omitempty"` + Full *PSIData `protobuf:"bytes,2,opt,name=full,proto3" json:"full,omitempty"` +} + +func (x *PSIStats) Reset() { + *x = PSIStats{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *PSIStats) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PSIStats) ProtoMessage() {} + +func (x *PSIStats) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PSIStats.ProtoReflect.Descriptor instead. +func (*PSIStats) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{2} +} + +func (x *PSIStats) GetSome() *PSIData { + if x != nil { + return x.Some + } + return nil +} + +func (x *PSIStats) GetFull() *PSIData { + if x != nil { + return x.Full + } + return nil +} + +type PidsStat struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Current uint64 `protobuf:"varint,1,opt,name=current,proto3" json:"current,omitempty"` + Limit uint64 `protobuf:"varint,2,opt,name=limit,proto3" json:"limit,omitempty"` +} + +func (x *PidsStat) Reset() { + *x = PidsStat{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *PidsStat) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PidsStat) ProtoMessage() {} + +func (x *PidsStat) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PidsStat.ProtoReflect.Descriptor instead. +func (*PidsStat) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{3} +} + +func (x *PidsStat) GetCurrent() uint64 { + if x != nil { + return x.Current + } + return 0 +} + +func (x *PidsStat) GetLimit() uint64 { + if x != nil { + return x.Limit + } + return 0 +} + +type CPUStat struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + UsageUsec uint64 `protobuf:"varint,1,opt,name=usage_usec,json=usageUsec,proto3" json:"usage_usec,omitempty"` + UserUsec uint64 `protobuf:"varint,2,opt,name=user_usec,json=userUsec,proto3" json:"user_usec,omitempty"` + SystemUsec uint64 `protobuf:"varint,3,opt,name=system_usec,json=systemUsec,proto3" json:"system_usec,omitempty"` + NrPeriods uint64 `protobuf:"varint,4,opt,name=nr_periods,json=nrPeriods,proto3" json:"nr_periods,omitempty"` + NrThrottled uint64 `protobuf:"varint,5,opt,name=nr_throttled,json=nrThrottled,proto3" json:"nr_throttled,omitempty"` + ThrottledUsec uint64 `protobuf:"varint,6,opt,name=throttled_usec,json=throttledUsec,proto3" json:"throttled_usec,omitempty"` + PSI *PSIStats `protobuf:"bytes,7,opt,name=psi,proto3" json:"psi,omitempty"` +} + +func (x *CPUStat) Reset() { + *x = CPUStat{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CPUStat) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CPUStat) ProtoMessage() {} + +func (x *CPUStat) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CPUStat.ProtoReflect.Descriptor instead. +func (*CPUStat) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{4} +} + +func (x *CPUStat) GetUsageUsec() uint64 { + if x != nil { + return x.UsageUsec + } + return 0 +} + +func (x *CPUStat) GetUserUsec() uint64 { + if x != nil { + return x.UserUsec + } + return 0 +} + +func (x *CPUStat) GetSystemUsec() uint64 { + if x != nil { + return x.SystemUsec + } + return 0 +} + +func (x *CPUStat) GetNrPeriods() uint64 { + if x != nil { + return x.NrPeriods + } + return 0 +} + +func (x *CPUStat) GetNrThrottled() uint64 { + if x != nil { + return x.NrThrottled + } + return 0 +} + +func (x *CPUStat) GetThrottledUsec() uint64 { + if x != nil { + return x.ThrottledUsec + } + return 0 +} + +func (x *CPUStat) GetPSI() *PSIStats { + if x != nil { + return x.PSI + } + return nil +} + +type MemoryStat struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Anon uint64 `protobuf:"varint,1,opt,name=anon,proto3" json:"anon,omitempty"` + File uint64 `protobuf:"varint,2,opt,name=file,proto3" json:"file,omitempty"` + KernelStack uint64 `protobuf:"varint,3,opt,name=kernel_stack,json=kernelStack,proto3" json:"kernel_stack,omitempty"` + Slab uint64 `protobuf:"varint,4,opt,name=slab,proto3" json:"slab,omitempty"` + Sock uint64 `protobuf:"varint,5,opt,name=sock,proto3" json:"sock,omitempty"` + Shmem uint64 `protobuf:"varint,6,opt,name=shmem,proto3" json:"shmem,omitempty"` + FileMapped uint64 `protobuf:"varint,7,opt,name=file_mapped,json=fileMapped,proto3" json:"file_mapped,omitempty"` + FileDirty uint64 `protobuf:"varint,8,opt,name=file_dirty,json=fileDirty,proto3" json:"file_dirty,omitempty"` + FileWriteback uint64 `protobuf:"varint,9,opt,name=file_writeback,json=fileWriteback,proto3" json:"file_writeback,omitempty"` + AnonThp uint64 `protobuf:"varint,10,opt,name=anon_thp,json=anonThp,proto3" json:"anon_thp,omitempty"` + InactiveAnon uint64 `protobuf:"varint,11,opt,name=inactive_anon,json=inactiveAnon,proto3" json:"inactive_anon,omitempty"` + ActiveAnon uint64 `protobuf:"varint,12,opt,name=active_anon,json=activeAnon,proto3" json:"active_anon,omitempty"` + InactiveFile uint64 `protobuf:"varint,13,opt,name=inactive_file,json=inactiveFile,proto3" json:"inactive_file,omitempty"` + ActiveFile uint64 `protobuf:"varint,14,opt,name=active_file,json=activeFile,proto3" json:"active_file,omitempty"` + Unevictable uint64 `protobuf:"varint,15,opt,name=unevictable,proto3" json:"unevictable,omitempty"` + SlabReclaimable uint64 `protobuf:"varint,16,opt,name=slab_reclaimable,json=slabReclaimable,proto3" json:"slab_reclaimable,omitempty"` + SlabUnreclaimable uint64 `protobuf:"varint,17,opt,name=slab_unreclaimable,json=slabUnreclaimable,proto3" json:"slab_unreclaimable,omitempty"` + Pgfault uint64 `protobuf:"varint,18,opt,name=pgfault,proto3" json:"pgfault,omitempty"` + Pgmajfault uint64 `protobuf:"varint,19,opt,name=pgmajfault,proto3" json:"pgmajfault,omitempty"` + WorkingsetRefault uint64 `protobuf:"varint,20,opt,name=workingset_refault,json=workingsetRefault,proto3" json:"workingset_refault,omitempty"` + WorkingsetActivate uint64 `protobuf:"varint,21,opt,name=workingset_activate,json=workingsetActivate,proto3" json:"workingset_activate,omitempty"` + WorkingsetNodereclaim uint64 `protobuf:"varint,22,opt,name=workingset_nodereclaim,json=workingsetNodereclaim,proto3" json:"workingset_nodereclaim,omitempty"` + Pgrefill uint64 `protobuf:"varint,23,opt,name=pgrefill,proto3" json:"pgrefill,omitempty"` + Pgscan uint64 `protobuf:"varint,24,opt,name=pgscan,proto3" json:"pgscan,omitempty"` + Pgsteal uint64 `protobuf:"varint,25,opt,name=pgsteal,proto3" json:"pgsteal,omitempty"` + Pgactivate uint64 `protobuf:"varint,26,opt,name=pgactivate,proto3" json:"pgactivate,omitempty"` + Pgdeactivate uint64 `protobuf:"varint,27,opt,name=pgdeactivate,proto3" json:"pgdeactivate,omitempty"` + Pglazyfree uint64 `protobuf:"varint,28,opt,name=pglazyfree,proto3" json:"pglazyfree,omitempty"` + Pglazyfreed uint64 `protobuf:"varint,29,opt,name=pglazyfreed,proto3" json:"pglazyfreed,omitempty"` + ThpFaultAlloc uint64 `protobuf:"varint,30,opt,name=thp_fault_alloc,json=thpFaultAlloc,proto3" json:"thp_fault_alloc,omitempty"` + ThpCollapseAlloc uint64 `protobuf:"varint,31,opt,name=thp_collapse_alloc,json=thpCollapseAlloc,proto3" json:"thp_collapse_alloc,omitempty"` + Usage uint64 `protobuf:"varint,32,opt,name=usage,proto3" json:"usage,omitempty"` + UsageLimit uint64 `protobuf:"varint,33,opt,name=usage_limit,json=usageLimit,proto3" json:"usage_limit,omitempty"` + SwapUsage uint64 `protobuf:"varint,34,opt,name=swap_usage,json=swapUsage,proto3" json:"swap_usage,omitempty"` + SwapLimit uint64 `protobuf:"varint,35,opt,name=swap_limit,json=swapLimit,proto3" json:"swap_limit,omitempty"` + MaxUsage uint64 `protobuf:"varint,36,opt,name=max_usage,json=maxUsage,proto3" json:"max_usage,omitempty"` + SwapMaxUsage uint64 `protobuf:"varint,37,opt,name=swap_max_usage,json=swapMaxUsage,proto3" json:"swap_max_usage,omitempty"` + PSI *PSIStats `protobuf:"bytes,38,opt,name=psi,proto3" json:"psi,omitempty"` +} + +func (x *MemoryStat) Reset() { + *x = MemoryStat{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *MemoryStat) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*MemoryStat) ProtoMessage() {} + +func (x *MemoryStat) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use MemoryStat.ProtoReflect.Descriptor instead. +func (*MemoryStat) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{5} +} + +func (x *MemoryStat) GetAnon() uint64 { + if x != nil { + return x.Anon + } + return 0 +} + +func (x *MemoryStat) GetFile() uint64 { + if x != nil { + return x.File + } + return 0 +} + +func (x *MemoryStat) GetKernelStack() uint64 { + if x != nil { + return x.KernelStack + } + return 0 +} + +func (x *MemoryStat) GetSlab() uint64 { + if x != nil { + return x.Slab + } + return 0 +} + +func (x *MemoryStat) GetSock() uint64 { + if x != nil { + return x.Sock + } + return 0 +} + +func (x *MemoryStat) GetShmem() uint64 { + if x != nil { + return x.Shmem + } + return 0 +} + +func (x *MemoryStat) GetFileMapped() uint64 { + if x != nil { + return x.FileMapped + } + return 0 +} + +func (x *MemoryStat) GetFileDirty() uint64 { + if x != nil { + return x.FileDirty + } + return 0 +} + +func (x *MemoryStat) GetFileWriteback() uint64 { + if x != nil { + return x.FileWriteback + } + return 0 +} + +func (x *MemoryStat) GetAnonThp() uint64 { + if x != nil { + return x.AnonThp + } + return 0 +} + +func (x *MemoryStat) GetInactiveAnon() uint64 { + if x != nil { + return x.InactiveAnon + } + return 0 +} + +func (x *MemoryStat) GetActiveAnon() uint64 { + if x != nil { + return x.ActiveAnon + } + return 0 +} + +func (x *MemoryStat) GetInactiveFile() uint64 { + if x != nil { + return x.InactiveFile + } + return 0 +} + +func (x *MemoryStat) GetActiveFile() uint64 { + if x != nil { + return x.ActiveFile + } + return 0 +} + +func (x *MemoryStat) GetUnevictable() uint64 { + if x != nil { + return x.Unevictable + } + return 0 +} + +func (x *MemoryStat) GetSlabReclaimable() uint64 { + if x != nil { + return x.SlabReclaimable + } + return 0 +} + +func (x *MemoryStat) GetSlabUnreclaimable() uint64 { + if x != nil { + return x.SlabUnreclaimable + } + return 0 +} + +func (x *MemoryStat) GetPgfault() uint64 { + if x != nil { + return x.Pgfault + } + return 0 +} + +func (x *MemoryStat) GetPgmajfault() uint64 { + if x != nil { + return x.Pgmajfault + } + return 0 +} + +func (x *MemoryStat) GetWorkingsetRefault() uint64 { + if x != nil { + return x.WorkingsetRefault + } + return 0 +} + +func (x *MemoryStat) GetWorkingsetActivate() uint64 { + if x != nil { + return x.WorkingsetActivate + } + return 0 +} + +func (x *MemoryStat) GetWorkingsetNodereclaim() uint64 { + if x != nil { + return x.WorkingsetNodereclaim + } + return 0 +} + +func (x *MemoryStat) GetPgrefill() uint64 { + if x != nil { + return x.Pgrefill + } + return 0 +} + +func (x *MemoryStat) GetPgscan() uint64 { + if x != nil { + return x.Pgscan + } + return 0 +} + +func (x *MemoryStat) GetPgsteal() uint64 { + if x != nil { + return x.Pgsteal + } + return 0 +} + +func (x *MemoryStat) GetPgactivate() uint64 { + if x != nil { + return x.Pgactivate + } + return 0 +} + +func (x *MemoryStat) GetPgdeactivate() uint64 { + if x != nil { + return x.Pgdeactivate + } + return 0 +} + +func (x *MemoryStat) GetPglazyfree() uint64 { + if x != nil { + return x.Pglazyfree + } + return 0 +} + +func (x *MemoryStat) GetPglazyfreed() uint64 { + if x != nil { + return x.Pglazyfreed + } + return 0 +} + +func (x *MemoryStat) GetThpFaultAlloc() uint64 { + if x != nil { + return x.ThpFaultAlloc + } + return 0 +} + +func (x *MemoryStat) GetThpCollapseAlloc() uint64 { + if x != nil { + return x.ThpCollapseAlloc + } + return 0 +} + +func (x *MemoryStat) GetUsage() uint64 { + if x != nil { + return x.Usage + } + return 0 +} + +func (x *MemoryStat) GetUsageLimit() uint64 { + if x != nil { + return x.UsageLimit + } + return 0 +} + +func (x *MemoryStat) GetSwapUsage() uint64 { + if x != nil { + return x.SwapUsage + } + return 0 +} + +func (x *MemoryStat) GetSwapLimit() uint64 { + if x != nil { + return x.SwapLimit + } + return 0 +} + +func (x *MemoryStat) GetMaxUsage() uint64 { + if x != nil { + return x.MaxUsage + } + return 0 +} + +func (x *MemoryStat) GetSwapMaxUsage() uint64 { + if x != nil { + return x.SwapMaxUsage + } + return 0 +} + +func (x *MemoryStat) GetPSI() *PSIStats { + if x != nil { + return x.PSI + } + return nil +} + +type MemoryEvents struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Low uint64 `protobuf:"varint,1,opt,name=low,proto3" json:"low,omitempty"` + High uint64 `protobuf:"varint,2,opt,name=high,proto3" json:"high,omitempty"` + Max uint64 `protobuf:"varint,3,opt,name=max,proto3" json:"max,omitempty"` + Oom uint64 `protobuf:"varint,4,opt,name=oom,proto3" json:"oom,omitempty"` + OomKill uint64 `protobuf:"varint,5,opt,name=oom_kill,json=oomKill,proto3" json:"oom_kill,omitempty"` +} + +func (x *MemoryEvents) Reset() { + *x = MemoryEvents{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *MemoryEvents) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*MemoryEvents) ProtoMessage() {} + +func (x *MemoryEvents) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use MemoryEvents.ProtoReflect.Descriptor instead. +func (*MemoryEvents) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{6} +} + +func (x *MemoryEvents) GetLow() uint64 { + if x != nil { + return x.Low + } + return 0 +} + +func (x *MemoryEvents) GetHigh() uint64 { + if x != nil { + return x.High + } + return 0 +} + +func (x *MemoryEvents) GetMax() uint64 { + if x != nil { + return x.Max + } + return 0 +} + +func (x *MemoryEvents) GetOom() uint64 { + if x != nil { + return x.Oom + } + return 0 +} + +func (x *MemoryEvents) GetOomKill() uint64 { + if x != nil { + return x.OomKill + } + return 0 +} + +type RdmaStat struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Current []*RdmaEntry `protobuf:"bytes,1,rep,name=current,proto3" json:"current,omitempty"` + Limit []*RdmaEntry `protobuf:"bytes,2,rep,name=limit,proto3" json:"limit,omitempty"` +} + +func (x *RdmaStat) Reset() { + *x = RdmaStat{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *RdmaStat) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RdmaStat) ProtoMessage() {} + +func (x *RdmaStat) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[7] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RdmaStat.ProtoReflect.Descriptor instead. +func (*RdmaStat) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{7} +} + +func (x *RdmaStat) GetCurrent() []*RdmaEntry { + if x != nil { + return x.Current + } + return nil +} + +func (x *RdmaStat) GetLimit() []*RdmaEntry { + if x != nil { + return x.Limit + } + return nil +} + +type RdmaEntry struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Device string `protobuf:"bytes,1,opt,name=device,proto3" json:"device,omitempty"` + HcaHandles uint32 `protobuf:"varint,2,opt,name=hca_handles,json=hcaHandles,proto3" json:"hca_handles,omitempty"` + HcaObjects uint32 `protobuf:"varint,3,opt,name=hca_objects,json=hcaObjects,proto3" json:"hca_objects,omitempty"` +} + +func (x *RdmaEntry) Reset() { + *x = RdmaEntry{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *RdmaEntry) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RdmaEntry) ProtoMessage() {} + +func (x *RdmaEntry) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[8] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RdmaEntry.ProtoReflect.Descriptor instead. +func (*RdmaEntry) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{8} +} + +func (x *RdmaEntry) GetDevice() string { + if x != nil { + return x.Device + } + return "" +} + +func (x *RdmaEntry) GetHcaHandles() uint32 { + if x != nil { + return x.HcaHandles + } + return 0 +} + +func (x *RdmaEntry) GetHcaObjects() uint32 { + if x != nil { + return x.HcaObjects + } + return 0 +} + +type IOStat struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Usage []*IOEntry `protobuf:"bytes,1,rep,name=usage,proto3" json:"usage,omitempty"` + PSI *PSIStats `protobuf:"bytes,2,opt,name=psi,proto3" json:"psi,omitempty"` +} + +func (x *IOStat) Reset() { + *x = IOStat{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[9] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *IOStat) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IOStat) ProtoMessage() {} + +func (x *IOStat) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[9] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IOStat.ProtoReflect.Descriptor instead. +func (*IOStat) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{9} +} + +func (x *IOStat) GetUsage() []*IOEntry { + if x != nil { + return x.Usage + } + return nil +} + +func (x *IOStat) GetPSI() *PSIStats { + if x != nil { + return x.PSI + } + return nil +} + +type IOEntry struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Major uint64 `protobuf:"varint,1,opt,name=major,proto3" json:"major,omitempty"` + Minor uint64 `protobuf:"varint,2,opt,name=minor,proto3" json:"minor,omitempty"` + Rbytes uint64 `protobuf:"varint,3,opt,name=rbytes,proto3" json:"rbytes,omitempty"` + Wbytes uint64 `protobuf:"varint,4,opt,name=wbytes,proto3" json:"wbytes,omitempty"` + Rios uint64 `protobuf:"varint,5,opt,name=rios,proto3" json:"rios,omitempty"` + Wios uint64 `protobuf:"varint,6,opt,name=wios,proto3" json:"wios,omitempty"` +} + +func (x *IOEntry) Reset() { + *x = IOEntry{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[10] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *IOEntry) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IOEntry) ProtoMessage() {} + +func (x *IOEntry) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[10] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IOEntry.ProtoReflect.Descriptor instead. +func (*IOEntry) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{10} +} + +func (x *IOEntry) GetMajor() uint64 { + if x != nil { + return x.Major + } + return 0 +} + +func (x *IOEntry) GetMinor() uint64 { + if x != nil { + return x.Minor + } + return 0 +} + +func (x *IOEntry) GetRbytes() uint64 { + if x != nil { + return x.Rbytes + } + return 0 +} + +func (x *IOEntry) GetWbytes() uint64 { + if x != nil { + return x.Wbytes + } + return 0 +} + +func (x *IOEntry) GetRios() uint64 { + if x != nil { + return x.Rios + } + return 0 +} + +func (x *IOEntry) GetWios() uint64 { + if x != nil { + return x.Wios + } + return 0 +} + +type HugeTlbStat struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Current uint64 `protobuf:"varint,1,opt,name=current,proto3" json:"current,omitempty"` + Max uint64 `protobuf:"varint,2,opt,name=max,proto3" json:"max,omitempty"` + Pagesize string `protobuf:"bytes,3,opt,name=pagesize,proto3" json:"pagesize,omitempty"` +} + +func (x *HugeTlbStat) Reset() { + *x = HugeTlbStat{} + if protoimpl.UnsafeEnabled { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[11] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *HugeTlbStat) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*HugeTlbStat) ProtoMessage() {} + +func (x *HugeTlbStat) ProtoReflect() protoreflect.Message { + mi := &file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[11] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use HugeTlbStat.ProtoReflect.Descriptor instead. +func (*HugeTlbStat) Descriptor() ([]byte, []int) { + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP(), []int{11} +} + +func (x *HugeTlbStat) GetCurrent() uint64 { + if x != nil { + return x.Current + } + return 0 +} + +func (x *HugeTlbStat) GetMax() uint64 { + if x != nil { + return x.Max + } + return 0 +} + +func (x *HugeTlbStat) GetPagesize() string { + if x != nil { + return x.Pagesize + } + return "" +} + +var File_github_com_containerd_cgroups_cgroup2_stats_metrics_proto protoreflect.FileDescriptor + +var file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDesc = []byte{ + 0x0a, 0x39, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x63, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2f, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2f, + 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x32, 0x2f, 0x73, 0x74, 0x61, 0x74, 0x73, 0x2f, 0x6d, 0x65, + 0x74, 0x72, 0x69, 0x63, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x18, 0x69, 0x6f, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, + 0x70, 0x73, 0x2e, 0x76, 0x32, 0x22, 0xac, 0x03, 0x0a, 0x07, 0x4d, 0x65, 0x74, 0x72, 0x69, 0x63, + 0x73, 0x12, 0x36, 0x0a, 0x04, 0x70, 0x69, 0x64, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, + 0x22, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, + 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x50, 0x69, 0x64, 0x73, 0x53, + 0x74, 0x61, 0x74, 0x52, 0x04, 0x70, 0x69, 0x64, 0x73, 0x12, 0x33, 0x0a, 0x03, 0x63, 0x70, 0x75, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, + 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, + 0x32, 0x2e, 0x43, 0x50, 0x55, 0x53, 0x74, 0x61, 0x74, 0x52, 0x03, 0x63, 0x70, 0x75, 0x12, 0x3c, + 0x0a, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x24, + 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, + 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, + 0x53, 0x74, 0x61, 0x74, 0x52, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x36, 0x0a, 0x04, + 0x72, 0x64, 0x6d, 0x61, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x69, 0x6f, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, + 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x52, 0x64, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x52, 0x04, + 0x72, 0x64, 0x6d, 0x61, 0x12, 0x30, 0x0a, 0x02, 0x69, 0x6f, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0b, + 0x32, 0x20, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, + 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x49, 0x4f, 0x53, 0x74, + 0x61, 0x74, 0x52, 0x02, 0x69, 0x6f, 0x12, 0x3f, 0x0a, 0x07, 0x68, 0x75, 0x67, 0x65, 0x74, 0x6c, + 0x62, 0x18, 0x07, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x25, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, + 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, + 0x76, 0x32, 0x2e, 0x48, 0x75, 0x67, 0x65, 0x54, 0x6c, 0x62, 0x53, 0x74, 0x61, 0x74, 0x52, 0x07, + 0x68, 0x75, 0x67, 0x65, 0x74, 0x6c, 0x62, 0x12, 0x4b, 0x0a, 0x0d, 0x6d, 0x65, 0x6d, 0x6f, 0x72, + 0x79, 0x5f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x18, 0x08, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, + 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, + 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x52, 0x0c, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x45, 0x76, + 0x65, 0x6e, 0x74, 0x73, 0x22, 0x63, 0x0a, 0x07, 0x50, 0x53, 0x49, 0x44, 0x61, 0x74, 0x61, 0x12, + 0x14, 0x0a, 0x05, 0x61, 0x76, 0x67, 0x31, 0x30, 0x18, 0x01, 0x20, 0x01, 0x28, 0x01, 0x52, 0x05, + 0x61, 0x76, 0x67, 0x31, 0x30, 0x12, 0x14, 0x0a, 0x05, 0x61, 0x76, 0x67, 0x36, 0x30, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x01, 0x52, 0x05, 0x61, 0x76, 0x67, 0x36, 0x30, 0x12, 0x16, 0x0a, 0x06, 0x61, + 0x76, 0x67, 0x33, 0x30, 0x30, 0x18, 0x03, 0x20, 0x01, 0x28, 0x01, 0x52, 0x06, 0x61, 0x76, 0x67, + 0x33, 0x30, 0x30, 0x12, 0x14, 0x0a, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x18, 0x04, 0x20, 0x01, + 0x28, 0x04, 0x52, 0x05, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x22, 0x78, 0x0a, 0x08, 0x50, 0x53, 0x49, + 0x53, 0x74, 0x61, 0x74, 0x73, 0x12, 0x35, 0x0a, 0x04, 0x73, 0x6f, 0x6d, 0x65, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x50, + 0x53, 0x49, 0x44, 0x61, 0x74, 0x61, 0x52, 0x04, 0x73, 0x6f, 0x6d, 0x65, 0x12, 0x35, 0x0a, 0x04, + 0x66, 0x75, 0x6c, 0x6c, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x21, 0x2e, 0x69, 0x6f, 0x2e, + 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, + 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x50, 0x53, 0x49, 0x44, 0x61, 0x74, 0x61, 0x52, 0x04, 0x66, + 0x75, 0x6c, 0x6c, 0x22, 0x3a, 0x0a, 0x08, 0x50, 0x69, 0x64, 0x73, 0x53, 0x74, 0x61, 0x74, 0x12, + 0x18, 0x0a, 0x07, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, + 0x52, 0x07, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x6c, 0x69, 0x6d, + 0x69, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x22, + 0x85, 0x02, 0x0a, 0x07, 0x43, 0x50, 0x55, 0x53, 0x74, 0x61, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x75, + 0x73, 0x61, 0x67, 0x65, 0x5f, 0x75, 0x73, 0x65, 0x63, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x09, 0x75, 0x73, 0x61, 0x67, 0x65, 0x55, 0x73, 0x65, 0x63, 0x12, 0x1b, 0x0a, 0x09, 0x75, 0x73, + 0x65, 0x72, 0x5f, 0x75, 0x73, 0x65, 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x75, + 0x73, 0x65, 0x72, 0x55, 0x73, 0x65, 0x63, 0x12, 0x1f, 0x0a, 0x0b, 0x73, 0x79, 0x73, 0x74, 0x65, + 0x6d, 0x5f, 0x75, 0x73, 0x65, 0x63, 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0a, 0x73, 0x79, + 0x73, 0x74, 0x65, 0x6d, 0x55, 0x73, 0x65, 0x63, 0x12, 0x1d, 0x0a, 0x0a, 0x6e, 0x72, 0x5f, 0x70, + 0x65, 0x72, 0x69, 0x6f, 0x64, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x6e, 0x72, + 0x50, 0x65, 0x72, 0x69, 0x6f, 0x64, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6e, 0x72, 0x5f, 0x74, 0x68, + 0x72, 0x6f, 0x74, 0x74, 0x6c, 0x65, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0b, 0x6e, + 0x72, 0x54, 0x68, 0x72, 0x6f, 0x74, 0x74, 0x6c, 0x65, 0x64, 0x12, 0x25, 0x0a, 0x0e, 0x74, 0x68, + 0x72, 0x6f, 0x74, 0x74, 0x6c, 0x65, 0x64, 0x5f, 0x75, 0x73, 0x65, 0x63, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x04, 0x52, 0x0d, 0x74, 0x68, 0x72, 0x6f, 0x74, 0x74, 0x6c, 0x65, 0x64, 0x55, 0x73, 0x65, + 0x63, 0x12, 0x34, 0x0a, 0x03, 0x70, 0x73, 0x69, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x22, + 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, + 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x50, 0x53, 0x49, 0x53, 0x74, 0x61, + 0x74, 0x73, 0x52, 0x03, 0x70, 0x73, 0x69, 0x22, 0x88, 0x0a, 0x0a, 0x0a, 0x4d, 0x65, 0x6d, 0x6f, + 0x72, 0x79, 0x53, 0x74, 0x61, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x61, 0x6e, 0x6f, 0x6e, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x61, 0x6e, 0x6f, 0x6e, 0x12, 0x12, 0x0a, 0x04, 0x66, 0x69, + 0x6c, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x66, 0x69, 0x6c, 0x65, 0x12, 0x21, + 0x0a, 0x0c, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x73, 0x74, 0x61, 0x63, 0x6b, 0x18, 0x03, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x0b, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x63, + 0x6b, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x6c, 0x61, 0x62, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x04, 0x73, 0x6c, 0x61, 0x62, 0x12, 0x12, 0x0a, 0x04, 0x73, 0x6f, 0x63, 0x6b, 0x18, 0x05, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x04, 0x73, 0x6f, 0x63, 0x6b, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x68, 0x6d, + 0x65, 0x6d, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x73, 0x68, 0x6d, 0x65, 0x6d, 0x12, + 0x1f, 0x0a, 0x0b, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x6d, 0x61, 0x70, 0x70, 0x65, 0x64, 0x18, 0x07, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x0a, 0x66, 0x69, 0x6c, 0x65, 0x4d, 0x61, 0x70, 0x70, 0x65, 0x64, + 0x12, 0x1d, 0x0a, 0x0a, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x64, 0x69, 0x72, 0x74, 0x79, 0x18, 0x08, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x66, 0x69, 0x6c, 0x65, 0x44, 0x69, 0x72, 0x74, 0x79, 0x12, + 0x25, 0x0a, 0x0e, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x77, 0x72, 0x69, 0x74, 0x65, 0x62, 0x61, 0x63, + 0x6b, 0x18, 0x09, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0d, 0x66, 0x69, 0x6c, 0x65, 0x57, 0x72, 0x69, + 0x74, 0x65, 0x62, 0x61, 0x63, 0x6b, 0x12, 0x19, 0x0a, 0x08, 0x61, 0x6e, 0x6f, 0x6e, 0x5f, 0x74, + 0x68, 0x70, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x61, 0x6e, 0x6f, 0x6e, 0x54, 0x68, + 0x70, 0x12, 0x23, 0x0a, 0x0d, 0x69, 0x6e, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x61, 0x6e, + 0x6f, 0x6e, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, 0x69, 0x6e, 0x61, 0x63, 0x74, 0x69, + 0x76, 0x65, 0x41, 0x6e, 0x6f, 0x6e, 0x12, 0x1f, 0x0a, 0x0b, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, + 0x5f, 0x61, 0x6e, 0x6f, 0x6e, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0a, 0x61, 0x63, 0x74, + 0x69, 0x76, 0x65, 0x41, 0x6e, 0x6f, 0x6e, 0x12, 0x23, 0x0a, 0x0d, 0x69, 0x6e, 0x61, 0x63, 0x74, + 0x69, 0x76, 0x65, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, + 0x69, 0x6e, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x1f, 0x0a, 0x0b, + 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x0e, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x0a, 0x61, 0x63, 0x74, 0x69, 0x76, 0x65, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x20, 0x0a, + 0x0b, 0x75, 0x6e, 0x65, 0x76, 0x69, 0x63, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x18, 0x0f, 0x20, 0x01, + 0x28, 0x04, 0x52, 0x0b, 0x75, 0x6e, 0x65, 0x76, 0x69, 0x63, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x12, + 0x29, 0x0a, 0x10, 0x73, 0x6c, 0x61, 0x62, 0x5f, 0x72, 0x65, 0x63, 0x6c, 0x61, 0x69, 0x6d, 0x61, + 0x62, 0x6c, 0x65, 0x18, 0x10, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0f, 0x73, 0x6c, 0x61, 0x62, 0x52, + 0x65, 0x63, 0x6c, 0x61, 0x69, 0x6d, 0x61, 0x62, 0x6c, 0x65, 0x12, 0x2d, 0x0a, 0x12, 0x73, 0x6c, + 0x61, 0x62, 0x5f, 0x75, 0x6e, 0x72, 0x65, 0x63, 0x6c, 0x61, 0x69, 0x6d, 0x61, 0x62, 0x6c, 0x65, + 0x18, 0x11, 0x20, 0x01, 0x28, 0x04, 0x52, 0x11, 0x73, 0x6c, 0x61, 0x62, 0x55, 0x6e, 0x72, 0x65, + 0x63, 0x6c, 0x61, 0x69, 0x6d, 0x61, 0x62, 0x6c, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x70, 0x67, 0x66, + 0x61, 0x75, 0x6c, 0x74, 0x18, 0x12, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, 0x70, 0x67, 0x66, 0x61, + 0x75, 0x6c, 0x74, 0x12, 0x1e, 0x0a, 0x0a, 0x70, 0x67, 0x6d, 0x61, 0x6a, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x18, 0x13, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0a, 0x70, 0x67, 0x6d, 0x61, 0x6a, 0x66, 0x61, + 0x75, 0x6c, 0x74, 0x12, 0x2d, 0x0a, 0x12, 0x77, 0x6f, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x73, 0x65, + 0x74, 0x5f, 0x72, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x18, 0x14, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x11, 0x77, 0x6f, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x73, 0x65, 0x74, 0x52, 0x65, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x12, 0x2f, 0x0a, 0x13, 0x77, 0x6f, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x73, 0x65, 0x74, + 0x5f, 0x61, 0x63, 0x74, 0x69, 0x76, 0x61, 0x74, 0x65, 0x18, 0x15, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x12, 0x77, 0x6f, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x73, 0x65, 0x74, 0x41, 0x63, 0x74, 0x69, 0x76, + 0x61, 0x74, 0x65, 0x12, 0x35, 0x0a, 0x16, 0x77, 0x6f, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x73, 0x65, + 0x74, 0x5f, 0x6e, 0x6f, 0x64, 0x65, 0x72, 0x65, 0x63, 0x6c, 0x61, 0x69, 0x6d, 0x18, 0x16, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x15, 0x77, 0x6f, 0x72, 0x6b, 0x69, 0x6e, 0x67, 0x73, 0x65, 0x74, 0x4e, + 0x6f, 0x64, 0x65, 0x72, 0x65, 0x63, 0x6c, 0x61, 0x69, 0x6d, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x67, + 0x72, 0x65, 0x66, 0x69, 0x6c, 0x6c, 0x18, 0x17, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x70, 0x67, + 0x72, 0x65, 0x66, 0x69, 0x6c, 0x6c, 0x12, 0x16, 0x0a, 0x06, 0x70, 0x67, 0x73, 0x63, 0x61, 0x6e, + 0x18, 0x18, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x70, 0x67, 0x73, 0x63, 0x61, 0x6e, 0x12, 0x18, + 0x0a, 0x07, 0x70, 0x67, 0x73, 0x74, 0x65, 0x61, 0x6c, 0x18, 0x19, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x07, 0x70, 0x67, 0x73, 0x74, 0x65, 0x61, 0x6c, 0x12, 0x1e, 0x0a, 0x0a, 0x70, 0x67, 0x61, 0x63, + 0x74, 0x69, 0x76, 0x61, 0x74, 0x65, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0a, 0x70, 0x67, + 0x61, 0x63, 0x74, 0x69, 0x76, 0x61, 0x74, 0x65, 0x12, 0x22, 0x0a, 0x0c, 0x70, 0x67, 0x64, 0x65, + 0x61, 0x63, 0x74, 0x69, 0x76, 0x61, 0x74, 0x65, 0x18, 0x1b, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, + 0x70, 0x67, 0x64, 0x65, 0x61, 0x63, 0x74, 0x69, 0x76, 0x61, 0x74, 0x65, 0x12, 0x1e, 0x0a, 0x0a, + 0x70, 0x67, 0x6c, 0x61, 0x7a, 0x79, 0x66, 0x72, 0x65, 0x65, 0x18, 0x1c, 0x20, 0x01, 0x28, 0x04, + 0x52, 0x0a, 0x70, 0x67, 0x6c, 0x61, 0x7a, 0x79, 0x66, 0x72, 0x65, 0x65, 0x12, 0x20, 0x0a, 0x0b, + 0x70, 0x67, 0x6c, 0x61, 0x7a, 0x79, 0x66, 0x72, 0x65, 0x65, 0x64, 0x18, 0x1d, 0x20, 0x01, 0x28, + 0x04, 0x52, 0x0b, 0x70, 0x67, 0x6c, 0x61, 0x7a, 0x79, 0x66, 0x72, 0x65, 0x65, 0x64, 0x12, 0x26, + 0x0a, 0x0f, 0x74, 0x68, 0x70, 0x5f, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x61, 0x6c, 0x6c, 0x6f, + 0x63, 0x18, 0x1e, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0d, 0x74, 0x68, 0x70, 0x46, 0x61, 0x75, 0x6c, + 0x74, 0x41, 0x6c, 0x6c, 0x6f, 0x63, 0x12, 0x2c, 0x0a, 0x12, 0x74, 0x68, 0x70, 0x5f, 0x63, 0x6f, + 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x5f, 0x61, 0x6c, 0x6c, 0x6f, 0x63, 0x18, 0x1f, 0x20, 0x01, + 0x28, 0x04, 0x52, 0x10, 0x74, 0x68, 0x70, 0x43, 0x6f, 0x6c, 0x6c, 0x61, 0x70, 0x73, 0x65, 0x41, + 0x6c, 0x6c, 0x6f, 0x63, 0x12, 0x14, 0x0a, 0x05, 0x75, 0x73, 0x61, 0x67, 0x65, 0x18, 0x20, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x05, 0x75, 0x73, 0x61, 0x67, 0x65, 0x12, 0x1f, 0x0a, 0x0b, 0x75, 0x73, + 0x61, 0x67, 0x65, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x21, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x0a, 0x75, 0x73, 0x61, 0x67, 0x65, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x73, + 0x77, 0x61, 0x70, 0x5f, 0x75, 0x73, 0x61, 0x67, 0x65, 0x18, 0x22, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x09, 0x73, 0x77, 0x61, 0x70, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x1d, 0x0a, 0x0a, 0x73, 0x77, + 0x61, 0x70, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x23, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, + 0x73, 0x77, 0x61, 0x70, 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x6d, 0x61, 0x78, + 0x5f, 0x75, 0x73, 0x61, 0x67, 0x65, 0x18, 0x24, 0x20, 0x01, 0x28, 0x04, 0x52, 0x08, 0x6d, 0x61, + 0x78, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x73, 0x77, 0x61, 0x70, 0x5f, 0x6d, + 0x61, 0x78, 0x5f, 0x75, 0x73, 0x61, 0x67, 0x65, 0x18, 0x25, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0c, + 0x73, 0x77, 0x61, 0x70, 0x4d, 0x61, 0x78, 0x55, 0x73, 0x61, 0x67, 0x65, 0x12, 0x34, 0x0a, 0x03, + 0x70, 0x73, 0x69, 0x18, 0x26, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x69, 0x6f, 0x2e, 0x63, + 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, + 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x50, 0x53, 0x49, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, 0x03, 0x70, + 0x73, 0x69, 0x22, 0x73, 0x0a, 0x0c, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x45, 0x76, 0x65, 0x6e, + 0x74, 0x73, 0x12, 0x10, 0x0a, 0x03, 0x6c, 0x6f, 0x77, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x03, 0x6c, 0x6f, 0x77, 0x12, 0x12, 0x0a, 0x04, 0x68, 0x69, 0x67, 0x68, 0x18, 0x02, 0x20, 0x01, + 0x28, 0x04, 0x52, 0x04, 0x68, 0x69, 0x67, 0x68, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x12, 0x10, 0x0a, 0x03, 0x6f, 0x6f, + 0x6d, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6f, 0x6f, 0x6d, 0x12, 0x19, 0x0a, 0x08, + 0x6f, 0x6f, 0x6d, 0x5f, 0x6b, 0x69, 0x6c, 0x6c, 0x18, 0x05, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, + 0x6f, 0x6f, 0x6d, 0x4b, 0x69, 0x6c, 0x6c, 0x22, 0x84, 0x01, 0x0a, 0x08, 0x52, 0x64, 0x6d, 0x61, + 0x53, 0x74, 0x61, 0x74, 0x12, 0x3d, 0x0a, 0x07, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x18, + 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, + 0x2e, 0x52, 0x64, 0x6d, 0x61, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x07, 0x63, 0x75, 0x72, 0x72, + 0x65, 0x6e, 0x74, 0x12, 0x39, 0x0a, 0x05, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x02, 0x20, 0x03, + 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, + 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x52, 0x64, + 0x6d, 0x61, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x05, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x22, 0x65, + 0x0a, 0x09, 0x52, 0x64, 0x6d, 0x61, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x64, + 0x65, 0x76, 0x69, 0x63, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x64, 0x65, 0x76, + 0x69, 0x63, 0x65, 0x12, 0x1f, 0x0a, 0x0b, 0x68, 0x63, 0x61, 0x5f, 0x68, 0x61, 0x6e, 0x64, 0x6c, + 0x65, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0a, 0x68, 0x63, 0x61, 0x48, 0x61, 0x6e, + 0x64, 0x6c, 0x65, 0x73, 0x12, 0x1f, 0x0a, 0x0b, 0x68, 0x63, 0x61, 0x5f, 0x6f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0a, 0x68, 0x63, 0x61, 0x4f, 0x62, + 0x6a, 0x65, 0x63, 0x74, 0x73, 0x22, 0x77, 0x0a, 0x06, 0x49, 0x4f, 0x53, 0x74, 0x61, 0x74, 0x12, + 0x37, 0x0a, 0x05, 0x75, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x21, + 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, + 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, 0x2e, 0x49, 0x4f, 0x45, 0x6e, 0x74, 0x72, + 0x79, 0x52, 0x05, 0x75, 0x73, 0x61, 0x67, 0x65, 0x12, 0x34, 0x0a, 0x03, 0x70, 0x73, 0x69, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x69, 0x6f, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2e, 0x76, 0x32, + 0x2e, 0x50, 0x53, 0x49, 0x53, 0x74, 0x61, 0x74, 0x73, 0x52, 0x03, 0x70, 0x73, 0x69, 0x22, 0x8d, + 0x01, 0x0a, 0x07, 0x49, 0x4f, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x6d, 0x61, + 0x6a, 0x6f, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x05, 0x6d, 0x61, 0x6a, 0x6f, 0x72, + 0x12, 0x14, 0x0a, 0x05, 0x6d, 0x69, 0x6e, 0x6f, 0x72, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, + 0x05, 0x6d, 0x69, 0x6e, 0x6f, 0x72, 0x12, 0x16, 0x0a, 0x06, 0x72, 0x62, 0x79, 0x74, 0x65, 0x73, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x72, 0x62, 0x79, 0x74, 0x65, 0x73, 0x12, 0x16, + 0x0a, 0x06, 0x77, 0x62, 0x79, 0x74, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, + 0x77, 0x62, 0x79, 0x74, 0x65, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x72, 0x69, 0x6f, 0x73, 0x18, 0x05, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x72, 0x69, 0x6f, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x77, 0x69, + 0x6f, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x04, 0x52, 0x04, 0x77, 0x69, 0x6f, 0x73, 0x22, 0x55, + 0x0a, 0x0b, 0x48, 0x75, 0x67, 0x65, 0x54, 0x6c, 0x62, 0x53, 0x74, 0x61, 0x74, 0x12, 0x18, 0x0a, + 0x07, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x04, 0x52, 0x07, + 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x6d, 0x61, 0x78, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x03, 0x6d, 0x61, 0x78, 0x12, 0x1a, 0x0a, 0x08, 0x70, 0x61, 0x67, + 0x65, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x70, 0x61, 0x67, + 0x65, 0x73, 0x69, 0x7a, 0x65, 0x42, 0x2d, 0x5a, 0x2b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, + 0x63, 0x6f, 0x6d, 0x2f, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x64, 0x2f, 0x63, + 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x2f, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x32, 0x2f, 0x73, + 0x74, 0x61, 0x74, 0x73, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescOnce sync.Once + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescData = file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDesc +) + +func file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescGZIP() []byte { + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescOnce.Do(func() { + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescData = protoimpl.X.CompressGZIP(file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescData) + }) + return file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDescData +} + +var file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes = make([]protoimpl.MessageInfo, 12) +var file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_goTypes = []interface{}{ + (*Metrics)(nil), // 0: io.containerd.cgroups.v2.Metrics + (*PSIData)(nil), // 1: io.containerd.cgroups.v2.PSIData + (*PSIStats)(nil), // 2: io.containerd.cgroups.v2.PSIStats + (*PidsStat)(nil), // 3: io.containerd.cgroups.v2.PidsStat + (*CPUStat)(nil), // 4: io.containerd.cgroups.v2.CPUStat + (*MemoryStat)(nil), // 5: io.containerd.cgroups.v2.MemoryStat + (*MemoryEvents)(nil), // 6: io.containerd.cgroups.v2.MemoryEvents + (*RdmaStat)(nil), // 7: io.containerd.cgroups.v2.RdmaStat + (*RdmaEntry)(nil), // 8: io.containerd.cgroups.v2.RdmaEntry + (*IOStat)(nil), // 9: io.containerd.cgroups.v2.IOStat + (*IOEntry)(nil), // 10: io.containerd.cgroups.v2.IOEntry + (*HugeTlbStat)(nil), // 11: io.containerd.cgroups.v2.HugeTlbStat +} +var file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_depIdxs = []int32{ + 3, // 0: io.containerd.cgroups.v2.Metrics.pids:type_name -> io.containerd.cgroups.v2.PidsStat + 4, // 1: io.containerd.cgroups.v2.Metrics.cpu:type_name -> io.containerd.cgroups.v2.CPUStat + 5, // 2: io.containerd.cgroups.v2.Metrics.memory:type_name -> io.containerd.cgroups.v2.MemoryStat + 7, // 3: io.containerd.cgroups.v2.Metrics.rdma:type_name -> io.containerd.cgroups.v2.RdmaStat + 9, // 4: io.containerd.cgroups.v2.Metrics.io:type_name -> io.containerd.cgroups.v2.IOStat + 11, // 5: io.containerd.cgroups.v2.Metrics.hugetlb:type_name -> io.containerd.cgroups.v2.HugeTlbStat + 6, // 6: io.containerd.cgroups.v2.Metrics.memory_events:type_name -> io.containerd.cgroups.v2.MemoryEvents + 1, // 7: io.containerd.cgroups.v2.PSIStats.some:type_name -> io.containerd.cgroups.v2.PSIData + 1, // 8: io.containerd.cgroups.v2.PSIStats.full:type_name -> io.containerd.cgroups.v2.PSIData + 2, // 9: io.containerd.cgroups.v2.CPUStat.psi:type_name -> io.containerd.cgroups.v2.PSIStats + 2, // 10: io.containerd.cgroups.v2.MemoryStat.psi:type_name -> io.containerd.cgroups.v2.PSIStats + 8, // 11: io.containerd.cgroups.v2.RdmaStat.current:type_name -> io.containerd.cgroups.v2.RdmaEntry + 8, // 12: io.containerd.cgroups.v2.RdmaStat.limit:type_name -> io.containerd.cgroups.v2.RdmaEntry + 10, // 13: io.containerd.cgroups.v2.IOStat.usage:type_name -> io.containerd.cgroups.v2.IOEntry + 2, // 14: io.containerd.cgroups.v2.IOStat.psi:type_name -> io.containerd.cgroups.v2.PSIStats + 15, // [15:15] is the sub-list for method output_type + 15, // [15:15] is the sub-list for method input_type + 15, // [15:15] is the sub-list for extension type_name + 15, // [15:15] is the sub-list for extension extendee + 0, // [0:15] is the sub-list for field type_name +} + +func init() { file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_init() } +func file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_init() { + if File_github_com_containerd_cgroups_cgroup2_stats_metrics_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*Metrics); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*PSIData); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*PSIStats); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*PidsStat); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CPUStat); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*MemoryStat); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*MemoryEvents); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*RdmaStat); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*RdmaEntry); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[9].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*IOStat); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[10].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*IOEntry); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes[11].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*HugeTlbStat); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDesc, + NumEnums: 0, + NumMessages: 12, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_goTypes, + DependencyIndexes: file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_depIdxs, + MessageInfos: file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_msgTypes, + }.Build() + File_github_com_containerd_cgroups_cgroup2_stats_metrics_proto = out.File + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_rawDesc = nil + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_goTypes = nil + file_github_com_containerd_cgroups_cgroup2_stats_metrics_proto_depIdxs = nil +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.pb.txt b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.pb.txt new file mode 100644 index 0000000000..26f5ba5de7 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.pb.txt @@ -0,0 +1,626 @@ +file { + name: "github.com/containerd/cgroups/cgroup2/stats/metrics.proto" + package: "io.containerd.cgroups.v2" + message_type { + name: "Metrics" + field { + name: "pids" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.PidsStat" + json_name: "pids" + } + field { + name: "cpu" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.CPUStat" + json_name: "cpu" + } + field { + name: "memory" + number: 4 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.MemoryStat" + json_name: "memory" + } + field { + name: "rdma" + number: 5 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.RdmaStat" + json_name: "rdma" + } + field { + name: "io" + number: 6 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.IOStat" + json_name: "io" + } + field { + name: "hugetlb" + number: 7 + label: LABEL_REPEATED + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.HugeTlbStat" + json_name: "hugetlb" + } + field { + name: "memory_events" + number: 8 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.MemoryEvents" + json_name: "memoryEvents" + } + } + message_type { + name: "PSIData" + field { + name: "avg10" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_DOUBLE + json_name: "avg10" + } + field { + name: "avg60" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_DOUBLE + json_name: "avg60" + } + field { + name: "avg300" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_DOUBLE + json_name: "avg300" + } + field { + name: "total" + number: 4 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "total" + } + } + message_type { + name: "PSIStats" + field { + name: "some" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.PSIData" + json_name: "some" + } + field { + name: "full" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.PSIData" + json_name: "full" + } + } + message_type { + name: "PidsStat" + field { + name: "current" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "current" + } + field { + name: "limit" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "limit" + } + } + message_type { + name: "CPUStat" + field { + name: "usage_usec" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "usageUsec" + } + field { + name: "user_usec" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "userUsec" + } + field { + name: "system_usec" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "systemUsec" + } + field { + name: "nr_periods" + number: 4 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "nrPeriods" + } + field { + name: "nr_throttled" + number: 5 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "nrThrottled" + } + field { + name: "throttled_usec" + number: 6 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "throttledUsec" + } + field { + name: "psi" + number: 7 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.PSIStats" + json_name: "psi" + } + } + message_type { + name: "MemoryStat" + field { + name: "anon" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "anon" + } + field { + name: "file" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "file" + } + field { + name: "kernel_stack" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "kernelStack" + } + field { + name: "slab" + number: 4 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "slab" + } + field { + name: "sock" + number: 5 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "sock" + } + field { + name: "shmem" + number: 6 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "shmem" + } + field { + name: "file_mapped" + number: 7 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "fileMapped" + } + field { + name: "file_dirty" + number: 8 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "fileDirty" + } + field { + name: "file_writeback" + number: 9 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "fileWriteback" + } + field { + name: "anon_thp" + number: 10 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "anonThp" + } + field { + name: "inactive_anon" + number: 11 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "inactiveAnon" + } + field { + name: "active_anon" + number: 12 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "activeAnon" + } + field { + name: "inactive_file" + number: 13 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "inactiveFile" + } + field { + name: "active_file" + number: 14 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "activeFile" + } + field { + name: "unevictable" + number: 15 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "unevictable" + } + field { + name: "slab_reclaimable" + number: 16 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "slabReclaimable" + } + field { + name: "slab_unreclaimable" + number: 17 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "slabUnreclaimable" + } + field { + name: "pgfault" + number: 18 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgfault" + } + field { + name: "pgmajfault" + number: 19 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgmajfault" + } + field { + name: "workingset_refault" + number: 20 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "workingsetRefault" + } + field { + name: "workingset_activate" + number: 21 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "workingsetActivate" + } + field { + name: "workingset_nodereclaim" + number: 22 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "workingsetNodereclaim" + } + field { + name: "pgrefill" + number: 23 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgrefill" + } + field { + name: "pgscan" + number: 24 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgscan" + } + field { + name: "pgsteal" + number: 25 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgsteal" + } + field { + name: "pgactivate" + number: 26 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgactivate" + } + field { + name: "pgdeactivate" + number: 27 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pgdeactivate" + } + field { + name: "pglazyfree" + number: 28 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pglazyfree" + } + field { + name: "pglazyfreed" + number: 29 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "pglazyfreed" + } + field { + name: "thp_fault_alloc" + number: 30 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "thpFaultAlloc" + } + field { + name: "thp_collapse_alloc" + number: 31 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "thpCollapseAlloc" + } + field { + name: "usage" + number: 32 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "usage" + } + field { + name: "usage_limit" + number: 33 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "usageLimit" + } + field { + name: "swap_usage" + number: 34 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "swapUsage" + } + field { + name: "swap_limit" + number: 35 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "swapLimit" + } + field { + name: "max_usage" + number: 36 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "maxUsage" + } + field { + name: "swap_max_usage" + number: 37 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "swapMaxUsage" + } + field { + name: "psi" + number: 38 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.PSIStats" + json_name: "psi" + } + } + message_type { + name: "MemoryEvents" + field { + name: "low" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "low" + } + field { + name: "high" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "high" + } + field { + name: "max" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "max" + } + field { + name: "oom" + number: 4 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "oom" + } + field { + name: "oom_kill" + number: 5 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "oomKill" + } + } + message_type { + name: "RdmaStat" + field { + name: "current" + number: 1 + label: LABEL_REPEATED + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.RdmaEntry" + json_name: "current" + } + field { + name: "limit" + number: 2 + label: LABEL_REPEATED + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.RdmaEntry" + json_name: "limit" + } + } + message_type { + name: "RdmaEntry" + field { + name: "device" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_STRING + json_name: "device" + } + field { + name: "hca_handles" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT32 + json_name: "hcaHandles" + } + field { + name: "hca_objects" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_UINT32 + json_name: "hcaObjects" + } + } + message_type { + name: "IOStat" + field { + name: "usage" + number: 1 + label: LABEL_REPEATED + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.IOEntry" + json_name: "usage" + } + field { + name: "psi" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_MESSAGE + type_name: ".io.containerd.cgroups.v2.PSIStats" + json_name: "psi" + } + } + message_type { + name: "IOEntry" + field { + name: "major" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "major" + } + field { + name: "minor" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "minor" + } + field { + name: "rbytes" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "rbytes" + } + field { + name: "wbytes" + number: 4 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "wbytes" + } + field { + name: "rios" + number: 5 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "rios" + } + field { + name: "wios" + number: 6 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "wios" + } + } + message_type { + name: "HugeTlbStat" + field { + name: "current" + number: 1 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "current" + } + field { + name: "max" + number: 2 + label: LABEL_OPTIONAL + type: TYPE_UINT64 + json_name: "max" + } + field { + name: "pagesize" + number: 3 + label: LABEL_OPTIONAL + type: TYPE_STRING + json_name: "pagesize" + } + } + options { + go_package: "github.com/containerd/cgroups/cgroup2/stats" + } + syntax: "proto3" +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.proto b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.proto new file mode 100644 index 0000000000..a4eae7a4e1 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/stats/metrics.proto @@ -0,0 +1,122 @@ +syntax = "proto3"; + +package io.containerd.cgroups.v2; + +option go_package = "github.com/containerd/cgroups/cgroup2/stats"; + +message Metrics { + PidsStat pids = 1; + CPUStat cpu = 2; + MemoryStat memory = 4; + RdmaStat rdma = 5; + IOStat io = 6; + repeated HugeTlbStat hugetlb = 7; + MemoryEvents memory_events = 8; +} + +message PSIData { + double avg10 = 1; + double avg60 = 2; + double avg300 = 3; + uint64 total = 4; +} + +message PSIStats { + PSIData some = 1; + PSIData full = 2; +} + +message PidsStat { + uint64 current = 1; + uint64 limit = 2; +} + +message CPUStat { + uint64 usage_usec = 1; + uint64 user_usec = 2; + uint64 system_usec = 3; + uint64 nr_periods = 4; + uint64 nr_throttled = 5; + uint64 throttled_usec = 6; + PSIStats psi = 7; +} + +message MemoryStat { + uint64 anon = 1; + uint64 file = 2; + uint64 kernel_stack = 3; + uint64 slab = 4; + uint64 sock = 5; + uint64 shmem = 6; + uint64 file_mapped = 7; + uint64 file_dirty = 8; + uint64 file_writeback = 9; + uint64 anon_thp = 10; + uint64 inactive_anon = 11; + uint64 active_anon = 12; + uint64 inactive_file = 13; + uint64 active_file = 14; + uint64 unevictable = 15; + uint64 slab_reclaimable = 16; + uint64 slab_unreclaimable = 17; + uint64 pgfault = 18; + uint64 pgmajfault = 19; + uint64 workingset_refault = 20; + uint64 workingset_activate = 21; + uint64 workingset_nodereclaim = 22; + uint64 pgrefill = 23; + uint64 pgscan = 24; + uint64 pgsteal = 25; + uint64 pgactivate = 26; + uint64 pgdeactivate = 27; + uint64 pglazyfree = 28; + uint64 pglazyfreed = 29; + uint64 thp_fault_alloc = 30; + uint64 thp_collapse_alloc = 31; + uint64 usage = 32; + uint64 usage_limit = 33; + uint64 swap_usage = 34; + uint64 swap_limit = 35; + uint64 max_usage = 36; + uint64 swap_max_usage = 37; + PSIStats psi = 38; +} + +message MemoryEvents { + uint64 low = 1; + uint64 high = 2; + uint64 max = 3; + uint64 oom = 4; + uint64 oom_kill = 5; +} + +message RdmaStat { + repeated RdmaEntry current = 1; + repeated RdmaEntry limit = 2; +} + +message RdmaEntry { + string device = 1; + uint32 hca_handles = 2; + uint32 hca_objects = 3; +} + +message IOStat { + repeated IOEntry usage = 1; + PSIStats psi = 2; +} + +message IOEntry { + uint64 major = 1; + uint64 minor = 2; + uint64 rbytes = 3; + uint64 wbytes = 4; + uint64 rios = 5; + uint64 wios = 6; +} + +message HugeTlbStat { + uint64 current = 1; + uint64 max = 2; + string pagesize = 3; +} diff --git a/vendor/github.com/containerd/cgroups/v3/cgroup2/utils.go b/vendor/github.com/containerd/cgroups/v3/cgroup2/utils.go new file mode 100644 index 0000000000..0974ce6a43 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v3/cgroup2/utils.go @@ -0,0 +1,563 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroup2 + +import ( + "bufio" + "errors" + "fmt" + "io" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + "unsafe" + + "github.com/containerd/cgroups/v3/cgroup2/stats" + + "github.com/containerd/log" + "github.com/godbus/dbus/v5" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +const ( + cgroupProcs = "cgroup.procs" + cgroupThreads = "cgroup.threads" + defaultDirPerm = 0o755 +) + +// defaultFilePerm is a var so that the test framework can change the filemode +// of all files created when the tests are running. The difference between the +// tests and real world use is that files like "cgroup.procs" will exist when writing +// to a read cgroup filesystem and do not exist prior when running in the tests. +// this is set to a non 0 value in the test code +var defaultFilePerm = os.FileMode(0) + +// remove will remove a cgroup path handling EAGAIN and EBUSY errors and +// retrying the remove after a exp timeout +func remove(path string) error { + var err error + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + if err = os.RemoveAll(path); err == nil { + return nil + } + } + return fmt.Errorf("cgroups: unable to remove path %q: %w", path, err) +} + +// parseCgroupTasksFile parses /sys/fs/cgroup/$GROUPPATH/cgroup.procs or +// /sys/fs/cgroup/$GROUPPATH/cgroup.threads +func parseCgroupTasksFile(path string) ([]uint64, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + var ( + out []uint64 + s = bufio.NewScanner(f) + ) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.ParseUint(t, 10, 0) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + if err := s.Err(); err != nil { + return nil, err + } + return out, nil +} + +func parseKV(raw string) (string, uint64, error) { + parts := strings.Fields(raw) + if len(parts) != 2 { + return "", 0, ErrInvalidFormat + } + v, err := parseUint(parts[1], 10, 64) + return parts[0], v, err +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + v, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && + intErr.(*strconv.NumError).Err == strconv.ErrRange && + intValue < 0 { + return 0, nil + } + return 0, err + } + return v, nil +} + +// parseCgroupFile parses /proc/PID/cgroup file and return string +func parseCgroupFile(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (string, error) { + s := bufio.NewScanner(r) + for s.Scan() { + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return "", fmt.Errorf("invalid cgroup entry: %q", text) + } + // text is like "0::/user.slice/user-1001.slice/session-1.scope" + if parts[0] == "0" && parts[1] == "" { + return parts[2], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + return "", fmt.Errorf("cgroup path not found") +} + +// ToResources converts the oci LinuxResources struct into a +// v2 Resources type for use with this package. +// +// converting cgroups configuration from v1 to v2 +// ref: https://github.com/containers/crun/blob/master/crun.1.md#cgroup-v2 +func ToResources(spec *specs.LinuxResources) *Resources { + var resources Resources + if cpu := spec.CPU; cpu != nil { + resources.CPU = &CPU{ + Cpus: cpu.Cpus, + Mems: cpu.Mems, + } + if shares := cpu.Shares; shares != nil { + convertedWeight := 1 + ((*shares-2)*9999)/262142 + resources.CPU.Weight = &convertedWeight + } + if period := cpu.Period; period != nil { + resources.CPU.Max = NewCPUMax(cpu.Quota, period) + } + } + if mem := spec.Memory; mem != nil { + resources.Memory = &Memory{} + if swap := mem.Swap; swap != nil { + resources.Memory.Swap = swap + if l := mem.Limit; l != nil { + reduce := *swap - *l + resources.Memory.Swap = &reduce + } + } + if l := mem.Limit; l != nil { + resources.Memory.Max = l + } + if l := mem.Reservation; l != nil { + resources.Memory.Low = l + } + } + if hugetlbs := spec.HugepageLimits; hugetlbs != nil { + hugeTlbUsage := HugeTlb{} + for _, hugetlb := range hugetlbs { + hugeTlbUsage = append(hugeTlbUsage, HugeTlbEntry{ + HugePageSize: hugetlb.Pagesize, + Limit: hugetlb.Limit, + }) + } + resources.HugeTlb = &hugeTlbUsage + } + if pids := spec.Pids; pids != nil { + resources.Pids = &Pids{ + Max: pids.Limit, + } + } + if i := spec.BlockIO; i != nil { + resources.IO = &IO{} + if i.Weight != nil { + resources.IO.BFQ.Weight = 1 + (*i.Weight-10)*9999/990 + } + for t, devices := range map[IOType][]specs.LinuxThrottleDevice{ + ReadBPS: i.ThrottleReadBpsDevice, + WriteBPS: i.ThrottleWriteBpsDevice, + ReadIOPS: i.ThrottleReadIOPSDevice, + WriteIOPS: i.ThrottleWriteIOPSDevice, + } { + for _, d := range devices { + resources.IO.Max = append(resources.IO.Max, Entry{ + Type: t, + Major: d.Major, + Minor: d.Minor, + Rate: d.Rate, + }) + } + } + } + if i := spec.Rdma; i != nil { + resources.RDMA = &RDMA{} + for device, value := range spec.Rdma { + if device != "" && (value.HcaHandles != nil && value.HcaObjects != nil) { + resources.RDMA.Limit = append(resources.RDMA.Limit, RDMAEntry{ + Device: device, + HcaHandles: *value.HcaHandles, + HcaObjects: *value.HcaObjects, + }) + } + } + } + + return &resources +} + +// Gets uint64 parsed content of single value cgroup stat file +func getStatFileContentUint64(filePath string) uint64 { + f, err := os.Open(filePath) + if err != nil { + return 0 + } + defer f.Close() + + // We expect an unsigned 64 bit integer, or a "max" string + // in some cases. + buf := make([]byte, 32) + n, err := f.Read(buf) + if err != nil { + return 0 + } + + trimmed := strings.TrimSpace(string(buf[:n])) + if trimmed == "max" { + return math.MaxUint64 + } + + res, err := parseUint(trimmed, 10, 64) + if err != nil { + log.L.Errorf("unable to parse %q as a uint from Cgroup file %q", trimmed, filePath) + return res + } + + return res +} + +func readIoStats(path string) []*stats.IOEntry { + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var usage []*stats.IOEntry + fpath := filepath.Join(path, "io.stat") + currentData, err := os.ReadFile(fpath) + if err != nil { + return usage + } + entries := strings.Split(string(currentData), "\n") + + for _, entry := range entries { + parts := strings.Split(entry, " ") + if len(parts) < 2 { + continue + } + majmin := strings.Split(parts[0], ":") + if len(majmin) != 2 { + continue + } + major, err := strconv.ParseUint(majmin[0], 10, 0) + if err != nil { + return usage + } + minor, err := strconv.ParseUint(majmin[1], 10, 0) + if err != nil { + return usage + } + parts = parts[1:] + ioEntry := stats.IOEntry{ + Major: major, + Minor: minor, + } + for _, s := range parts { + keyPairValue := strings.Split(s, "=") + if len(keyPairValue) != 2 { + continue + } + v, err := strconv.ParseUint(keyPairValue[1], 10, 0) + if err != nil { + continue + } + switch keyPairValue[0] { + case "rbytes": + ioEntry.Rbytes = v + case "wbytes": + ioEntry.Wbytes = v + case "rios": + ioEntry.Rios = v + case "wios": + ioEntry.Wios = v + } + } + usage = append(usage, &ioEntry) + } + return usage +} + +func rdmaStats(filepath string) []*stats.RdmaEntry { + currentData, err := os.ReadFile(filepath) + if err != nil { + return []*stats.RdmaEntry{} + } + return toRdmaEntry(strings.Split(string(currentData), "\n")) +} + +func parseRdmaKV(raw string, entry *stats.RdmaEntry) { + var value uint64 + var err error + + parts := strings.Split(raw, "=") + switch len(parts) { + case 2: + if parts[1] == "max" { + value = math.MaxUint32 + } else { + value, err = parseUint(parts[1], 10, 32) + if err != nil { + return + } + } + if parts[0] == "hca_handle" { + entry.HcaHandles = uint32(value) + } else if parts[0] == "hca_object" { + entry.HcaObjects = uint32(value) + } + } +} + +func toRdmaEntry(strEntries []string) []*stats.RdmaEntry { + var rdmaEntries []*stats.RdmaEntry + for i := range strEntries { + parts := strings.Fields(strEntries[i]) + switch len(parts) { + case 3: + entry := new(stats.RdmaEntry) + entry.Device = parts[0] + parseRdmaKV(parts[1], entry) + parseRdmaKV(parts[2], entry) + + rdmaEntries = append(rdmaEntries, entry) + default: + continue + } + } + return rdmaEntries +} + +// isUnitExists returns true if the error is that a systemd unit already exists. +func isUnitExists(err error) bool { + if err != nil { + if dbusError, ok := err.(dbus.Error); ok { + return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") + } + } + return false +} + +func systemdUnitFromPath(path string) string { + _, unit := filepath.Split(path) + return unit +} + +func readHugeTlbStats(path string) []*stats.HugeTlbStat { + hpSizes := hugePageSizes() + usage := make([]*stats.HugeTlbStat, len(hpSizes)) + for idx, pagesize := range hpSizes { + usage[idx] = &stats.HugeTlbStat{ + Max: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".max")), + Current: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".current")), + Pagesize: pagesize, + } + } + return usage +} + +var ( + hPageSizes []string + initHPSOnce sync.Once +) + +// The following idea and implementation is taken pretty much line for line from +// runc. Because the hugetlb files are well known, and the only variable thrown in +// the mix is what huge page sizes you have on your host, this lends itself well +// to doing the work to find the files present once, and then re-using this. This +// saves a os.Readdirnames(0) call to search for hugeltb files on every `manager.Stat` +// call. +// https://github.com/opencontainers/runc/blob/3a2c0c2565644d8a7e0f1dd594a060b21fa96cf1/libcontainer/cgroups/utils.go#L301 +func hugePageSizes() []string { + initHPSOnce.Do(func() { + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return + } + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return + } + + hPageSizes, err = getHugePageSizeFromFilenames(files) + if err != nil { + log.L.Warnf("hugePageSizes: %s", err) + } + }) + + return hPageSizes +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + pageSizes := make([]string, 0, len(fileNames)) + var warn error + + for _, file := range fileNames { + // example: hugepages-1048576kB + val := strings.TrimPrefix(file, "hugepages-") + if len(val) == len(file) { + // Unexpected file name: no prefix found, ignore it. + continue + } + // In all known versions of Linux up to 6.3 the suffix is always + // "kB". If we find something else, produce an error but keep going. + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + // Highly unlikely. + if warn == nil { + warn = errors.New(file + `: invalid suffix (expected "kB")`) + } + continue + } + size, err := strconv.Atoi(val) + if err != nil { + // Highly unlikely. + if warn == nil { + warn = fmt.Errorf("%s: %w", file, err) + } + continue + } + // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. + if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) + } + + return pageSizes, warn +} + +func getStatPSIFromFile(path string) *stats.PSIStats { + f, err := os.Open(path) + if err != nil { + return nil + } + defer f.Close() + + psistats := &stats.PSIStats{} + sc := bufio.NewScanner(f) + for sc.Scan() { + parts := strings.Fields(sc.Text()) + var pv *stats.PSIData + switch parts[0] { + case "some": + psistats.Some = &stats.PSIData{} + pv = psistats.Some + case "full": + psistats.Full = &stats.PSIData{} + pv = psistats.Full + } + if pv != nil { + err = parsePSIData(parts[1:], pv) + if err != nil { + log.L.WithError(err).Errorf("failed to read file %s", path) + return nil + } + } + } + + if err := sc.Err(); err != nil { + if !errors.Is(err, unix.ENOTSUP) && !errors.Is(err, unix.EOPNOTSUPP) { + log.L.WithError(err).Error("unable to parse PSI data") + } + return nil + } + return psistats +} + +func parsePSIData(psi []string, data *stats.PSIData) error { + for _, f := range psi { + kv := strings.SplitN(f, "=", 2) + if len(kv) != 2 { + return fmt.Errorf("invalid PSI data: %q", f) + } + var pv *float64 + switch kv[0] { + case "avg10": + pv = &data.Avg10 + case "avg60": + pv = &data.Avg60 + case "avg300": + pv = &data.Avg300 + case "total": + v, err := strconv.ParseUint(kv[1], 10, 64) + if err != nil { + return fmt.Errorf("invalid %s PSI value: %w", kv[0], err) + } + data.Total = v + } + if pv != nil { + v, err := strconv.ParseFloat(kv[1], 64) + if err != nil { + return fmt.Errorf("invalid %s PSI value: %w", kv[0], err) + } + *pv = v + } + } + return nil +} + +func getSubreaper() (int, error) { + var i uintptr + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + return int(i), nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index b82e3fafd5..516285b3ea 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -52,11 +52,28 @@ github.com/cespare/xxhash/v2 ## explicit; go 1.16 github.com/checkpoint-restore/go-criu/v6 github.com/checkpoint-restore/go-criu/v6/rpc +# github.com/cilium/ebpf v0.17.3 +## explicit; go 1.22 +github.com/cilium/ebpf +github.com/cilium/ebpf/asm +github.com/cilium/ebpf/btf +github.com/cilium/ebpf/internal +github.com/cilium/ebpf/internal/kallsyms +github.com/cilium/ebpf/internal/kconfig +github.com/cilium/ebpf/internal/linux +github.com/cilium/ebpf/internal/sys +github.com/cilium/ebpf/internal/sysenc +github.com/cilium/ebpf/internal/testutils/fdtrace +github.com/cilium/ebpf/internal/tracefs +github.com/cilium/ebpf/internal/unix +github.com/cilium/ebpf/link # github.com/containerd/cgroups/v3 v3.0.5 ## explicit; go 1.22.0 github.com/containerd/cgroups/v3 github.com/containerd/cgroups/v3/cgroup1 github.com/containerd/cgroups/v3/cgroup1/stats +github.com/containerd/cgroups/v3/cgroup2 +github.com/containerd/cgroups/v3/cgroup2/stats # github.com/containerd/console v1.0.5 ## explicit; go 1.13 github.com/containerd/console