From 403a849e9b9c7b7bd290585e2f3502c09f7fadf3 Mon Sep 17 00:00:00 2001 From: Victor Boglea Date: Mon, 23 Feb 2026 18:20:02 +0100 Subject: [PATCH 1/4] feat: add cached VLAN and LAG Redis collectors --- README.md | 12 + cmd/sonic-exporter/main.go | 8 + fixtures/test/appl_db_data.json | 27 +++ fixtures/test/config_db_data.json | 22 ++ internal/collector/collector_test.go | 99 ++++++++ internal/collector/lag_collector.go | 331 +++++++++++++++++++++++++++ internal/collector/vlan_collector.go | 314 +++++++++++++++++++++++++ 7 files changed, 813 insertions(+) create mode 100644 internal/collector/lag_collector.go create mode 100644 internal/collector/vlan_collector.go diff --git a/README.md b/README.md index 3ea4f52..9211607 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ Currently supported collectors: - [CRM collector](internal/collector/crm_collector.go): collects Critial Resource Monitoring metrics. - [Queue collector](internal/collector/queue_collector.go): collects metrics about queues. - [LLDP collector](internal/collector/lldp_collector.go): collects LLDP neighbor information from SONiC Redis. +- [VLAN collector](internal/collector/vlan_collector.go): collects VLAN and VLAN member state from SONiC Redis. +- [LAG collector](internal/collector/lag_collector.go): collects PortChannel and member state from SONiC Redis. # Usage @@ -33,6 +35,16 @@ Environment variables: - `LLDP_REFRESH_INTERVAL` - LLDP cache refresh interval. Default: `30s`. - `LLDP_TIMEOUT` - timeout for one LLDP refresh cycle. Default: `2s`. - `LLDP_MAX_NEIGHBORS` - maximum number of LLDP neighbors exported per refresh. Default: `512`. +- `VLAN_ENABLED` - enable VLAN collector. Default: `true`. +- `VLAN_REFRESH_INTERVAL` - VLAN cache refresh interval. Default: `30s`. +- `VLAN_TIMEOUT` - timeout for one VLAN refresh cycle. Default: `2s`. +- `VLAN_MAX_VLANS` - maximum number of VLANs exported per refresh. Default: `1024`. +- `VLAN_MAX_MEMBERS` - maximum number of VLAN members exported per refresh. Default: `8192`. +- `LAG_ENABLED` - enable LAG collector. Default: `true`. +- `LAG_REFRESH_INTERVAL` - LAG cache refresh interval. Default: `30s`. +- `LAG_TIMEOUT` - timeout for one LAG refresh cycle. Default: `2s`. +- `LAG_MAX_LAGS` - maximum number of LAGs exported per refresh. Default: `512`. +- `LAG_MAX_MEMBERS` - maximum number of LAG members exported per refresh. Default: `4096`. 
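+For example, setting `VLAN_REFRESH_INTERVAL=10s` together with `VLAN_MAX_MEMBERS=2048` trades a fresher cache for a hard cap on the number of exported member series; the values are purely illustrative.
+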
## Validated Platforms diff --git a/cmd/sonic-exporter/main.go b/cmd/sonic-exporter/main.go index 2c87cc6..8ad27ca 100644 --- a/cmd/sonic-exporter/main.go +++ b/cmd/sonic-exporter/main.go @@ -50,6 +50,8 @@ func main() { crmCollector := collector.NewCrmCollector(logger) queueCollector := collector.NewQueueCollector(logger) lldpCollector := collector.NewLldpCollector(logger) + vlanCollector := collector.NewVlanCollector(logger) + lagCollector := collector.NewLagCollector(logger) prometheus.MustRegister(interfaceCollector) prometheus.MustRegister(hwCollector) prometheus.MustRegister(crmCollector) @@ -57,6 +59,12 @@ func main() { if lldpCollector.IsEnabled() { prometheus.MustRegister(lldpCollector) } + if vlanCollector.IsEnabled() { + prometheus.MustRegister(vlanCollector) + } + if lagCollector.IsEnabled() { + prometheus.MustRegister(lagCollector) + } // Node exporter collectors nodeCollector, err := nodecollector.NewNodeCollector(logger, diff --git a/fixtures/test/appl_db_data.json b/fixtures/test/appl_db_data.json index 0020cfb..8ad8e0f 100644 --- a/fixtures/test/appl_db_data.json +++ b/fixtures/test/appl_db_data.json @@ -51,6 +51,33 @@ }, "LLDP_ENTRY_TABLE:Ethernet0": { "lldp_rem_index": "1" + }, + "VLAN_TABLE:Vlan1000": { + "admin_status": "up", + "oper_status": "up" + }, + "VLAN_TABLE:Vlan2000": { + "admin_status": "down", + "oper_status": "down" + }, + "LAG_TABLE:PortChannel1": { + "mtu": "9100", + "admin_status": "up", + "oper_status": "up" + }, + "LAG_TABLE:PortChannel2": { + "mtu": "9100", + "admin_status": "up", + "oper_status": "down" + }, + "LAG_MEMBER_TABLE:PortChannel1:Ethernet24": { + "status": "enabled" + }, + "LAG_MEMBER_TABLE:PortChannel1:Ethernet28": { + "status": "disabled" + }, + "LAG_MEMBER_TABLE:PortChannel2:Ethernet92": { + "status": "enabled" } } } diff --git a/fixtures/test/config_db_data.json b/fixtures/test/config_db_data.json index bd79b59..5746717 100644 --- a/fixtures/test/config_db_data.json +++ b/fixtures/test/config_db_data.json @@ -32,6 +32,28 @@ "lanes": "125,126,127,128", "mtu": "9100", "speed": "100000" + }, + "VLAN|Vlan1000": { + "admin_status": "up", + "vlanid": "1000" + }, + "VLAN|Vlan2000": { + "admin_status": "down", + "vlanid": "2000" + }, + "VLAN_MEMBER|Vlan1000|Ethernet0": { + "tagging_mode": "untagged" + }, + "VLAN_MEMBER|Vlan1000|PortChannel1": { + "tagging_mode": "tagged" + }, + "PORTCHANNEL|PortChannel1": { + "admin_status": "up", + "mtu": "9100" + }, + "PORTCHANNEL|PortChannel2": { + "admin_status": "up", + "mtu": "9100" } } } diff --git a/internal/collector/collector_test.go b/internal/collector/collector_test.go index 0b4d634..7498239 100644 --- a/internal/collector/collector_test.go +++ b/internal/collector/collector_test.go @@ -77,6 +77,8 @@ func TestMain(m *testing.M) { os.Setenv("REDIS_ADDRESS", s.Addr()) os.Setenv("LLDP_ENABLED", "true") os.Setenv("LLDP_INCLUDE_MGMT", "true") + os.Setenv("VLAN_ENABLED", "true") + os.Setenv("LAG_ENABLED", "true") err = populateRedisData() if err != nil { slog.Error("failed to populate redis data", "error", err) @@ -89,6 +91,8 @@ func TestMain(m *testing.M) { os.Unsetenv("REDIS_ADDRESS") os.Unsetenv("LLDP_ENABLED") os.Unsetenv("LLDP_INCLUDE_MGMT") + os.Unsetenv("VLAN_ENABLED") + os.Unsetenv("LAG_ENABLED") os.Exit(exitCode) } @@ -276,3 +280,98 @@ func TestLldpCollector(t *testing.T) { t.Errorf("unexpected collecting result:\n%s", err) } } + +func TestVlanCollector(t *testing.T) { + promslogConfig := &promslog.Config{} + logger := promslog.New(promslogConfig) + + vlanCollector := NewVlanCollector(logger) + 
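+	// CollectAndLint gathers the collector's metrics once and runs the
+	// default promlint checks over them.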
+ problems, err := testutil.CollectAndLint(vlanCollector) + if err != nil { + t.Error("metric lint completed with errors") + } + + for _, problem := range problems { + t.Errorf("metric %v has a problem: %v", problem.Metric, problem.Text) + } + + metadata := ` + # HELP sonic_vlan_collector_success Whether VLAN collector succeeded + # TYPE sonic_vlan_collector_success gauge + # HELP sonic_vlan_members Number of VLAN members + # TYPE sonic_vlan_members gauge + ` + + expected := ` + sonic_vlan_collector_success 1 + sonic_vlan_members{vlan="Vlan1000"} 2 + sonic_vlan_members{vlan="Vlan2000"} 0 + ` + + if err := testutil.CollectAndCompare(vlanCollector, strings.NewReader(metadata+expected), "sonic_vlan_collector_success", "sonic_vlan_members"); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } + + memberMetadata := ` + # HELP sonic_vlan_member_info Non-numeric data about VLAN member, value is always 1 + # TYPE sonic_vlan_member_info gauge + ` + + memberExpected := ` + sonic_vlan_member_info{member="Ethernet0",tagging_mode="untagged",vlan="Vlan1000"} 1 + sonic_vlan_member_info{member="PortChannel1",tagging_mode="tagged",vlan="Vlan1000"} 1 + ` + + if err := testutil.CollectAndCompare(vlanCollector, strings.NewReader(memberMetadata+memberExpected), "sonic_vlan_member_info"); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } +} + +func TestLagCollector(t *testing.T) { + promslogConfig := &promslog.Config{} + logger := promslog.New(promslogConfig) + + lagCollector := NewLagCollector(logger) + + problems, err := testutil.CollectAndLint(lagCollector) + if err != nil { + t.Error("metric lint completed with errors") + } + + for _, problem := range problems { + t.Errorf("metric %v has a problem: %v", problem.Metric, problem.Text) + } + + metadata := ` + # HELP sonic_lag_collector_success Whether LAG collector succeeded + # TYPE sonic_lag_collector_success gauge + # HELP sonic_lag_members Number of LAG member interfaces + # TYPE sonic_lag_members gauge + ` + + expected := ` + sonic_lag_collector_success 1 + sonic_lag_members{lag="PortChannel1"} 2 + sonic_lag_members{lag="PortChannel2"} 1 + ` + + if err := testutil.CollectAndCompare(lagCollector, strings.NewReader(metadata+expected), "sonic_lag_collector_success", "sonic_lag_members"); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } + + memberMetadata := ` + # HELP sonic_lag_member_status Status of LAG member interface (1=enabled, 0=disabled) + # TYPE sonic_lag_member_status gauge + ` + + memberExpected := ` + sonic_lag_member_status{lag="PortChannel1",member="Ethernet24"} 1 + sonic_lag_member_status{lag="PortChannel1",member="Ethernet28"} 0 + sonic_lag_member_status{lag="PortChannel2",member="Ethernet92"} 1 + ` + + if err := testutil.CollectAndCompare(lagCollector, strings.NewReader(memberMetadata+memberExpected), "sonic_lag_member_status"); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } +} diff --git a/internal/collector/lag_collector.go b/internal/collector/lag_collector.go new file mode 100644 index 0000000..3027ad4 --- /dev/null +++ b/internal/collector/lag_collector.go @@ -0,0 +1,331 @@ +package collector + +import ( + "context" + "fmt" + "log/slog" + "sort" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/vinted/sonic-exporter/pkg/redis" +) + +type lagCollectorConfig struct { + enabled bool + refreshInterval time.Duration + timeout time.Duration + maxLags int + maxMembers int + redisScanCount int64 +} + +type 
lagCollector struct { + lagInfo *prometheus.Desc + lagAdminStatus *prometheus.Desc + lagOperStatus *prometheus.Desc + lagMembers *prometheus.Desc + lagMemberStatus *prometheus.Desc + scrapeDuration *prometheus.Desc + scrapeCollectorSuccess *prometheus.Desc + cacheAge *prometheus.Desc + skippedEntries *prometheus.Desc + + logger *slog.Logger + config lagCollectorConfig + + mu sync.RWMutex + cachedMetrics []prometheus.Metric + lastSuccess float64 + lastScrapeDuration float64 + lastSkippedEntries float64 + lastRefreshTime time.Time +} + +type lagMemberEntry struct { + name string + status string +} + +func NewLagCollector(logger *slog.Logger) *lagCollector { + const ( + namespace = "sonic" + subsystem = "lag" + ) + + collector := &lagCollector{ + lagInfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "info"), + "Non-numeric data about LAG, value is always 1", []string{"lag"}, nil), + lagAdminStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "admin_status"), + "Administrative state of LAG (1=up, 0=down)", []string{"lag"}, nil), + lagOperStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "oper_status"), + "Operational state of LAG (1=up, 0=down)", []string{"lag"}, nil), + lagMembers: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "members"), + "Number of LAG member interfaces", []string{"lag"}, nil), + lagMemberStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "member_status"), + "Status of LAG member interface (1=enabled, 0=disabled)", []string{"lag", "member"}, nil), + scrapeDuration: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "scrape_duration_seconds"), + "Time it took for exporter to refresh LAG metrics", nil, nil), + scrapeCollectorSuccess: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "collector_success"), + "Whether LAG collector succeeded", nil, nil), + cacheAge: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "cache_age_seconds"), + "Age of latest LAG cache refresh", nil, nil), + skippedEntries: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "entries_skipped"), + "Number of LAG entries skipped during latest refresh", nil, nil), + logger: logger, + config: loadLagCollectorConfig(logger), + } + + if !collector.config.enabled { + collector.logger.Info("LAG collector is disabled") + return collector + } + + collector.refreshMetrics() + go collector.refreshLoop() + + return collector +} + +func (collector *lagCollector) IsEnabled() bool { + return collector.config.enabled +} + +func (collector *lagCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- collector.lagInfo + ch <- collector.lagAdminStatus + ch <- collector.lagOperStatus + ch <- collector.lagMembers + ch <- collector.lagMemberStatus + ch <- collector.scrapeDuration + ch <- collector.scrapeCollectorSuccess + ch <- collector.cacheAge + ch <- collector.skippedEntries +} + +func (collector *lagCollector) Collect(ch chan<- prometheus.Metric) { + if !collector.config.enabled { + return + } + + collector.mu.RLock() + cachedMetrics := append([]prometheus.Metric{}, collector.cachedMetrics...) 
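+	// Snapshot the scalar fields under the same read lock so Collect
+	// never observes a half-applied refresh.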
+ lastSkippedEntries := collector.lastSkippedEntries + lastScrapeDuration := collector.lastScrapeDuration + lastSuccess := collector.lastSuccess + lastRefreshTime := collector.lastRefreshTime + collector.mu.RUnlock() + + for _, metric := range cachedMetrics { + ch <- metric + } + + cacheAge := 0.0 + if !lastRefreshTime.IsZero() { + cacheAge = time.Since(lastRefreshTime).Seconds() + } + + ch <- prometheus.MustNewConstMetric(collector.scrapeDuration, prometheus.GaugeValue, lastScrapeDuration) + ch <- prometheus.MustNewConstMetric(collector.scrapeCollectorSuccess, prometheus.GaugeValue, lastSuccess) + ch <- prometheus.MustNewConstMetric(collector.cacheAge, prometheus.GaugeValue, cacheAge) + ch <- prometheus.MustNewConstMetric(collector.skippedEntries, prometheus.GaugeValue, lastSkippedEntries) +} + +func (collector *lagCollector) refreshLoop() { + ticker := time.NewTicker(collector.config.refreshInterval) + defer ticker.Stop() + + for range ticker.C { + collector.refreshMetrics() + } +} + +func (collector *lagCollector) refreshMetrics() { + start := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), collector.config.timeout) + defer cancel() + + metrics, skippedEntries, err := collector.scrapeMetrics(ctx) + scrapeDuration := time.Since(start).Seconds() + + collector.mu.Lock() + defer collector.mu.Unlock() + + collector.lastScrapeDuration = scrapeDuration + + if err != nil { + collector.lastSuccess = 0 + collector.logger.Error("Error refreshing LAG metrics", "error", err) + return + } + + collector.cachedMetrics = metrics + collector.lastSkippedEntries = float64(skippedEntries) + collector.lastSuccess = 1 + collector.lastRefreshTime = time.Now() +} + +func (collector *lagCollector) scrapeMetrics(ctx context.Context) ([]prometheus.Metric, int, error) { + redisClient, err := redis.NewClient() + if err != nil { + return nil, 0, fmt.Errorf("redis client initialization failed: %w", err) + } + defer redisClient.Close() + + configLags, err := redisClient.ScanKeysFromDb(ctx, "CONFIG_DB", "PORTCHANNEL|*", collector.config.redisScanCount) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan PORTCHANNEL keys: %w", err) + } + + applLags, err := redisClient.ScanKeysFromDb(ctx, "APPL_DB", "LAG_TABLE:*", collector.config.redisScanCount) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan LAG_TABLE keys: %w", err) + } + + memberKeys, err := redisClient.ScanKeysFromDb(ctx, "APPL_DB", "LAG_MEMBER_TABLE:*", collector.config.redisScanCount) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan LAG_MEMBER_TABLE keys: %w", err) + } + + lags := map[string]struct{}{} + skippedEntries := 0 + + for _, key := range configLags { + lagName := strings.TrimPrefix(key, "PORTCHANNEL|") + if lagName == "" || lagName == key { + skippedEntries++ + continue + } + lags[lagName] = struct{}{} + } + + for _, key := range applLags { + lagName := strings.TrimPrefix(key, "LAG_TABLE:") + if lagName == "" || lagName == key { + skippedEntries++ + continue + } + lags[lagName] = struct{}{} + } + + lagNames := make([]string, 0, len(lags)) + for lagName := range lags { + lagNames = append(lagNames, lagName) + } + sort.Strings(lagNames) + + membersByLag := map[string][]lagMemberEntry{} + for _, key := range memberKeys { + memberPath := strings.TrimPrefix(key, "LAG_MEMBER_TABLE:") + if memberPath == "" || memberPath == key { + skippedEntries++ + continue + } + + parts := strings.SplitN(memberPath, ":", 2) + if len(parts) != 2 || parts[0] == "" || parts[1] == "" { + skippedEntries++ + continue + } + + 
memberData, err := redisClient.HgetAllFromDb(ctx, "APPL_DB", key) + if err != nil { + return nil, 0, fmt.Errorf("failed to read %s: %w", key, err) + } + + membersByLag[parts[0]] = append(membersByLag[parts[0]], lagMemberEntry{name: parts[1], status: memberData["status"]}) + } + + metrics := make([]prometheus.Metric, 0) + processedLags := 0 + processedMembers := 0 + + for _, lagName := range lagNames { + if processedLags >= collector.config.maxLags { + skippedEntries++ + continue + } + + configData, err := redisClient.HgetAllFromDb(ctx, "CONFIG_DB", "PORTCHANNEL|"+lagName) + if err != nil { + return nil, 0, fmt.Errorf("failed to read CONFIG_DB PORTCHANNEL|%s: %w", lagName, err) + } + + applData, err := redisClient.HgetAllFromDb(ctx, "APPL_DB", "LAG_TABLE:"+lagName) + if err != nil { + return nil, 0, fmt.Errorf("failed to read APPL_DB LAG_TABLE:%s: %w", lagName, err) + } + + metrics = append(metrics, prometheus.MustNewConstMetric(collector.lagInfo, prometheus.GaugeValue, 1, lagName)) + + adminStatus := firstNonEmpty(applData["admin_status"], configData["admin_status"]) + if adminStatus != "" { + metrics = append(metrics, prometheus.MustNewConstMetric(collector.lagAdminStatus, prometheus.GaugeValue, statusToGauge(adminStatus), lagName)) + } + + if operStatus := applData["oper_status"]; operStatus != "" { + metrics = append(metrics, prometheus.MustNewConstMetric(collector.lagOperStatus, prometheus.GaugeValue, statusToGauge(operStatus), lagName)) + } + + members := membersByLag[lagName] + sort.Slice(members, func(i, j int) bool { + return members[i].name < members[j].name + }) + + memberCount := 0 + for _, member := range members { + if processedMembers >= collector.config.maxMembers { + skippedEntries++ + continue + } + + metrics = append(metrics, prometheus.MustNewConstMetric( + collector.lagMemberStatus, + prometheus.GaugeValue, + statusToGauge(member.status), + lagName, + member.name, + )) + + processedMembers++ + memberCount++ + } + + metrics = append(metrics, prometheus.MustNewConstMetric(collector.lagMembers, prometheus.GaugeValue, float64(memberCount), lagName)) + processedLags++ + } + + return metrics, skippedEntries, nil +} + +func loadLagCollectorConfig(logger *slog.Logger) lagCollectorConfig { + return lagCollectorConfig{ + enabled: parseBoolEnv(logger, "LAG_ENABLED", true), + refreshInterval: parseDurationEnv(logger, "LAG_REFRESH_INTERVAL", 30*time.Second), + timeout: parseDurationEnv(logger, "LAG_TIMEOUT", 2*time.Second), + maxLags: parseIntEnv(logger, "LAG_MAX_LAGS", 512), + maxMembers: parseIntEnv(logger, "LAG_MAX_MEMBERS", 4096), + redisScanCount: 256, + } +} + +func statusToGauge(status string) float64 { + value := strings.ToLower(strings.TrimSpace(status)) + if value == "up" || value == "enabled" || value == "selected" || value == "active" || value == "true" || value == "ok" { + return 1 + } + + return 0 +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if strings.TrimSpace(value) != "" { + return value + } + } + + return "" +} diff --git a/internal/collector/vlan_collector.go b/internal/collector/vlan_collector.go new file mode 100644 index 0000000..601abe7 --- /dev/null +++ b/internal/collector/vlan_collector.go @@ -0,0 +1,314 @@ +package collector + +import ( + "context" + "fmt" + "log/slog" + "sort" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/vinted/sonic-exporter/pkg/redis" +) + +type vlanCollectorConfig struct { + enabled bool + refreshInterval time.Duration + timeout time.Duration 
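+	// maxVlans and maxMembers bound how many series one refresh may export.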
+ maxVlans int + maxMembers int + redisScanCount int64 +} + +type vlanCollector struct { + vlanInfo *prometheus.Desc + vlanAdminStatus *prometheus.Desc + vlanOperStatus *prometheus.Desc + vlanMembers *prometheus.Desc + vlanMemberInfo *prometheus.Desc + scrapeDuration *prometheus.Desc + scrapeCollectorSuccess *prometheus.Desc + cacheAge *prometheus.Desc + skippedEntries *prometheus.Desc + + logger *slog.Logger + config vlanCollectorConfig + + mu sync.RWMutex + cachedMetrics []prometheus.Metric + lastSuccess float64 + lastScrapeDuration float64 + lastSkippedEntries float64 + lastRefreshTime time.Time +} + +type vlanMemberEntry struct { + name string + taggingMode string +} + +func NewVlanCollector(logger *slog.Logger) *vlanCollector { + const ( + namespace = "sonic" + subsystem = "vlan" + ) + + collector := &vlanCollector{ + vlanInfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "info"), + "Non-numeric data about VLAN, value is always 1", []string{"vlan", "vlan_id"}, nil), + vlanAdminStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "admin_status"), + "Administrative state of VLAN (1=up, 0=down)", []string{"vlan"}, nil), + vlanOperStatus: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "oper_status"), + "Operational state of VLAN (1=up, 0=down)", []string{"vlan"}, nil), + vlanMembers: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "members"), + "Number of VLAN members", []string{"vlan"}, nil), + vlanMemberInfo: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "member_info"), + "Non-numeric data about VLAN member, value is always 1", []string{"vlan", "member", "tagging_mode"}, nil), + scrapeDuration: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "scrape_duration_seconds"), + "Time it took for exporter to refresh VLAN metrics", nil, nil), + scrapeCollectorSuccess: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "collector_success"), + "Whether VLAN collector succeeded", nil, nil), + cacheAge: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "cache_age_seconds"), + "Age of latest VLAN cache refresh", nil, nil), + skippedEntries: prometheus.NewDesc(prometheus.BuildFQName(namespace, subsystem, "entries_skipped"), + "Number of VLAN entries skipped during latest refresh", nil, nil), + logger: logger, + config: loadVlanCollectorConfig(logger), + } + + if !collector.config.enabled { + collector.logger.Info("VLAN collector is disabled") + return collector + } + + collector.refreshMetrics() + go collector.refreshLoop() + + return collector +} + +func (collector *vlanCollector) IsEnabled() bool { + return collector.config.enabled +} + +func (collector *vlanCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- collector.vlanInfo + ch <- collector.vlanAdminStatus + ch <- collector.vlanOperStatus + ch <- collector.vlanMembers + ch <- collector.vlanMemberInfo + ch <- collector.scrapeDuration + ch <- collector.scrapeCollectorSuccess + ch <- collector.cacheAge + ch <- collector.skippedEntries +} + +func (collector *vlanCollector) Collect(ch chan<- prometheus.Metric) { + if !collector.config.enabled { + return + } + + collector.mu.RLock() + cachedMetrics := append([]prometheus.Metric{}, collector.cachedMetrics...) 
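+	// As in the LAG collector, copy these scalars before dropping the
+	// read lock so a concurrent refresh cannot tear the snapshot.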
+ lastSkippedEntries := collector.lastSkippedEntries + lastScrapeDuration := collector.lastScrapeDuration + lastSuccess := collector.lastSuccess + lastRefreshTime := collector.lastRefreshTime + collector.mu.RUnlock() + + for _, metric := range cachedMetrics { + ch <- metric + } + + cacheAge := 0.0 + if !lastRefreshTime.IsZero() { + cacheAge = time.Since(lastRefreshTime).Seconds() + } + + ch <- prometheus.MustNewConstMetric(collector.scrapeDuration, prometheus.GaugeValue, lastScrapeDuration) + ch <- prometheus.MustNewConstMetric(collector.scrapeCollectorSuccess, prometheus.GaugeValue, lastSuccess) + ch <- prometheus.MustNewConstMetric(collector.cacheAge, prometheus.GaugeValue, cacheAge) + ch <- prometheus.MustNewConstMetric(collector.skippedEntries, prometheus.GaugeValue, lastSkippedEntries) +} + +func (collector *vlanCollector) refreshLoop() { + ticker := time.NewTicker(collector.config.refreshInterval) + defer ticker.Stop() + + for range ticker.C { + collector.refreshMetrics() + } +} + +func (collector *vlanCollector) refreshMetrics() { + start := time.Now() + ctx, cancel := context.WithTimeout(context.Background(), collector.config.timeout) + defer cancel() + + metrics, skippedEntries, err := collector.scrapeMetrics(ctx) + scrapeDuration := time.Since(start).Seconds() + + collector.mu.Lock() + defer collector.mu.Unlock() + + collector.lastScrapeDuration = scrapeDuration + + if err != nil { + collector.lastSuccess = 0 + collector.logger.Error("Error refreshing VLAN metrics", "error", err) + return + } + + collector.cachedMetrics = metrics + collector.lastSkippedEntries = float64(skippedEntries) + collector.lastSuccess = 1 + collector.lastRefreshTime = time.Now() +} + +func (collector *vlanCollector) scrapeMetrics(ctx context.Context) ([]prometheus.Metric, int, error) { + redisClient, err := redis.NewClient() + if err != nil { + return nil, 0, fmt.Errorf("redis client initialization failed: %w", err) + } + defer redisClient.Close() + + configVlans, err := redisClient.ScanKeysFromDb(ctx, "CONFIG_DB", "VLAN|*", collector.config.redisScanCount) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan VLAN keys: %w", err) + } + + applVlans, err := redisClient.ScanKeysFromDb(ctx, "APPL_DB", "VLAN_TABLE:*", collector.config.redisScanCount) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan VLAN_TABLE keys: %w", err) + } + + memberKeys, err := redisClient.ScanKeysFromDb(ctx, "CONFIG_DB", "VLAN_MEMBER|*", collector.config.redisScanCount) + if err != nil { + return nil, 0, fmt.Errorf("failed to scan VLAN_MEMBER keys: %w", err) + } + + vlans := map[string]struct{}{} + skippedEntries := 0 + + for _, key := range configVlans { + vlanName := strings.TrimPrefix(key, "VLAN|") + if vlanName == "" || vlanName == key { + skippedEntries++ + continue + } + vlans[vlanName] = struct{}{} + } + + for _, key := range applVlans { + vlanName := strings.TrimPrefix(key, "VLAN_TABLE:") + if vlanName == "" || vlanName == key { + skippedEntries++ + continue + } + vlans[vlanName] = struct{}{} + } + + vlanNames := make([]string, 0, len(vlans)) + for vlanName := range vlans { + vlanNames = append(vlanNames, vlanName) + } + sort.Strings(vlanNames) + + membersByVlan := map[string][]vlanMemberEntry{} + for _, key := range memberKeys { + memberPath := strings.TrimPrefix(key, "VLAN_MEMBER|") + if memberPath == "" || memberPath == key { + skippedEntries++ + continue + } + + parts := strings.SplitN(memberPath, "|", 2) + if len(parts) != 2 || parts[0] == "" || parts[1] == "" { + skippedEntries++ + continue + } + + 
memberData, err := redisClient.HgetAllFromDb(ctx, "CONFIG_DB", key) + if err != nil { + return nil, 0, fmt.Errorf("failed to read %s: %w", key, err) + } + + taggingMode := firstNonEmpty(memberData["tagging_mode"], "unknown") + membersByVlan[parts[0]] = append(membersByVlan[parts[0]], vlanMemberEntry{name: parts[1], taggingMode: taggingMode}) + } + + metrics := make([]prometheus.Metric, 0) + processedVlans := 0 + processedMembers := 0 + + for _, vlanName := range vlanNames { + if processedVlans >= collector.config.maxVlans { + skippedEntries++ + continue + } + + configData, err := redisClient.HgetAllFromDb(ctx, "CONFIG_DB", "VLAN|"+vlanName) + if err != nil { + return nil, 0, fmt.Errorf("failed to read CONFIG_DB VLAN|%s: %w", vlanName, err) + } + + applData, err := redisClient.HgetAllFromDb(ctx, "APPL_DB", "VLAN_TABLE:"+vlanName) + if err != nil { + return nil, 0, fmt.Errorf("failed to read APPL_DB VLAN_TABLE:%s: %w", vlanName, err) + } + + vlanID := firstNonEmpty(configData["vlanid"], strings.TrimPrefix(vlanName, "Vlan")) + metrics = append(metrics, prometheus.MustNewConstMetric(collector.vlanInfo, prometheus.GaugeValue, 1, vlanName, vlanID)) + + if adminStatus := firstNonEmpty(applData["admin_status"], configData["admin_status"]); adminStatus != "" { + metrics = append(metrics, prometheus.MustNewConstMetric(collector.vlanAdminStatus, prometheus.GaugeValue, statusToGauge(adminStatus), vlanName)) + } + + if operStatus := applData["oper_status"]; operStatus != "" { + metrics = append(metrics, prometheus.MustNewConstMetric(collector.vlanOperStatus, prometheus.GaugeValue, statusToGauge(operStatus), vlanName)) + } + + members := membersByVlan[vlanName] + sort.Slice(members, func(i, j int) bool { + return members[i].name < members[j].name + }) + + memberCount := 0 + for _, member := range members { + if processedMembers >= collector.config.maxMembers { + skippedEntries++ + continue + } + + metrics = append(metrics, prometheus.MustNewConstMetric( + collector.vlanMemberInfo, + prometheus.GaugeValue, + 1, + vlanName, + member.name, + member.taggingMode, + )) + + processedMembers++ + memberCount++ + } + + metrics = append(metrics, prometheus.MustNewConstMetric(collector.vlanMembers, prometheus.GaugeValue, float64(memberCount), vlanName)) + processedVlans++ + } + + return metrics, skippedEntries, nil +} + +func loadVlanCollectorConfig(logger *slog.Logger) vlanCollectorConfig { + return vlanCollectorConfig{ + enabled: parseBoolEnv(logger, "VLAN_ENABLED", true), + refreshInterval: parseDurationEnv(logger, "VLAN_REFRESH_INTERVAL", 30*time.Second), + timeout: parseDurationEnv(logger, "VLAN_TIMEOUT", 2*time.Second), + maxVlans: parseIntEnv(logger, "VLAN_MAX_VLANS", 1024), + maxMembers: parseIntEnv(logger, "VLAN_MAX_MEMBERS", 8192), + redisScanCount: 256, + } +} From f84915b6a3a341fd1ada4a9d9f913e95586fbf0b Mon Sep 17 00:00:00 2001 From: Victor Boglea Date: Mon, 23 Feb 2026 18:24:24 +0100 Subject: [PATCH 2/4] ci: checkout repository before setup-go --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f920ce2..c247955 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,11 +9,11 @@ jobs: go-test: runs-on: ubuntu-latest steps: + - uses: actions/checkout@v5 - name: setup Go uses: actions/setup-go@v6 with: go-version-file: 'go.mod' - - uses: actions/checkout@v5 - run: go test -v ./... 
golangci-lint: runs-on: ubuntu-latest From 6f0d0c0107c552e2de130dfd528a9df2480a9adc Mon Sep 17 00:00:00 2001 From: Victor Boglea Date: Mon, 23 Feb 2026 18:31:04 +0100 Subject: [PATCH 3/4] fix: handle kingpin parse errors for lint compliance --- cmd/sonic-exporter/main.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cmd/sonic-exporter/main.go b/cmd/sonic-exporter/main.go index 8ad27ca..56c8688 100644 --- a/cmd/sonic-exporter/main.go +++ b/cmd/sonic-exporter/main.go @@ -1,6 +1,7 @@ package main import ( + "log/slog" "net/http" "os" @@ -17,7 +18,7 @@ import ( func main() { // setup node exporter collectors through global kingpin flags - kingpin.CommandLine.Parse([]string{ + _, err := kingpin.CommandLine.Parse([]string{ "--collector.disable-defaults", "--collector.loadavg", "--collector.cpu", @@ -27,6 +28,10 @@ func main() { "--collector.time", "--collector.stat", }) + if err != nil { + slog.Error("failed to parse node exporter collector defaults", "error", err) + os.Exit(1) + } // New kingpin instance to prevent imported code from adding flags (node exporter) kp := kingpin.New("sonic-exporter", "Prometheus exporter for SONiC network switches") @@ -40,7 +45,10 @@ func main() { flag.AddFlags(kp, promslogConfig) kp.HelpFlag.Short('h') kp.UsageWriter(os.Stdout) - kp.Parse(os.Args[1:]) + if _, err = kp.Parse(os.Args[1:]); err != nil { + slog.Error("failed to parse command line arguments", "error", err) + os.Exit(1) + } logger := promslog.New(promslogConfig) From 8fd68127a1110e604fa7de1172772fb96719ca7a Mon Sep 17 00:00:00 2001 From: Victor Boglea Date: Mon, 23 Feb 2026 18:36:16 +0100 Subject: [PATCH 4/4] ci: run golangci-lint via go run for Go 1.25 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c247955..aeeb9f9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,4 +24,4 @@ jobs: go-version-file: 'go.mod' cache: false - name: golangci-lint - uses: golangci/golangci-lint-action@v3 + run: go run github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.8 run
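Note on the config loaders: `loadVlanCollectorConfig` and `loadLagCollectorConfig` call `parseBoolEnv`, `parseDurationEnv` and `parseIntEnv`, which are not part of this series and are assumed to already exist in `internal/collector` (the LLDP collector's knobs follow the same pattern). A minimal sketch of what such helpers could look like, assuming they log a warning and fall back to the default on a bad value — the bodies below are illustrative, not the repository's actual implementation:

```go
package collector

import (
	"log/slog"
	"os"
	"strconv"
	"time"
)

// parseDurationEnv reads the named environment variable and falls back to
// def when it is unset or not a valid time.Duration.
func parseDurationEnv(logger *slog.Logger, name string, def time.Duration) time.Duration {
	raw, ok := os.LookupEnv(name)
	if !ok {
		return def
	}
	value, err := time.ParseDuration(raw)
	if err != nil {
		logger.Warn("invalid duration, using default", "variable", name, "value", raw, "default", def)
		return def
	}
	return value
}

// parseBoolEnv does the same for booleans ("true", "1", "false", ...).
func parseBoolEnv(logger *slog.Logger, name string, def bool) bool {
	raw, ok := os.LookupEnv(name)
	if !ok {
		return def
	}
	value, err := strconv.ParseBool(raw)
	if err != nil {
		logger.Warn("invalid bool, using default", "variable", name, "value", raw, "default", def)
		return def
	}
	return value
}

// parseIntEnv mirrors the pattern for integer limits such as LAG_MAX_LAGS.
func parseIntEnv(logger *slog.Logger, name string, def int) int {
	raw, ok := os.LookupEnv(name)
	if !ok {
		return def
	}
	value, err := strconv.Atoi(raw)
	if err != nil {
		logger.Warn("invalid integer, using default", "variable", name, "value", raw, "default", def)
		return def
	}
	return value
}
```

Warning and falling back, rather than exiting, keeps a typo in one knob from taking the whole exporter down, which matches how the collectors already tolerate missing Redis fields.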