From f0e00dd40def32a9c388a9b3dab1f325ef2df6ad Mon Sep 17 00:00:00 2001 From: tbuchaillot Date: Thu, 13 Nov 2025 18:20:03 +0100 Subject: [PATCH] POC: Prometheus metrics --- IMPLEMENTATION_PLAN.md | 785 +++++++++++++++++++++ PROMETHEUS_METRICS.md | 466 ++++++++++++ config/config.go | 30 + gateway/handler_success.go | 14 + gateway/instrumentation_handlers.go | 68 ++ gateway/instrumentation_prometheus.go | 462 ++++++++++++ gateway/instrumentation_prometheus_sink.go | 56 ++ gateway/server.go | 27 + generate-traffic.sh | 257 +++++++ internal/redis/redis.go | 2 + storage/connection_handler.go | 32 + storage/redis_cluster.go | 12 + tyk.conf.example | 9 + 13 files changed, 2220 insertions(+) create mode 100644 IMPLEMENTATION_PLAN.md create mode 100644 PROMETHEUS_METRICS.md create mode 100644 gateway/instrumentation_prometheus.go create mode 100644 gateway/instrumentation_prometheus_sink.go create mode 100755 generate-traffic.sh diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md new file mode 100644 index 00000000000..4d7083674b8 --- /dev/null +++ b/IMPLEMENTATION_PLAN.md @@ -0,0 +1,785 @@ +# Prometheus Metrics Integration Implementation Plan + +## Overview +This document outlines the implementation plan for adding Prometheus metrics support to Tyk Gateway, following the existing patterns from StatsD and NewRelic integrations. + +## Requirements (from jira.md) + +### Metrics Categories +1. **System Metrics**: CPU, memory, goroutines, connections +2. **Gateway Metrics**: # APIs loaded, # Policies loaded, Requests RED metrics (Rate, Error, Duration) +3. **Redis Metrics**: pool size, active connections, latency +4. **Request Processing Metrics**: queue depth, throughput, latency, middleware execution and latency + +## Architecture Analysis + +### Existing Instrumentation Patterns + +#### 1. StatsD Integration (`gateway/instrumentation_statsd_sink.go`) +- **Location**: `gateway/instrumentation_statsd_sink.go` +- **Sink Pattern**: Implements `health.Sink` interface +- **UDP Transport**: Uses UDP for sending metrics +- **Metrics Types**: Events, Timings, Gauges, Complete +- **Prefix Support**: Configurable prefix for all metrics +- **Buffer Management**: Batches metrics before sending + +#### 2. NewRelic Integration (`internal/service/newrelic/`) +- **Location**: `internal/service/newrelic/newrelic.go`, `internal/service/newrelic/sink.go` +- **Sink Pattern**: Also implements `health.Sink` interface +- **Integration**: Uses NewRelic SDK +- **Middleware**: Router-level middleware (`nrgorilla.Middleware`) + +#### 3. 
Current Instrumentation Hook (`gateway/instrumentation_handlers.go:21-88`) +- **Entry Point**: `setupInstrumentation()` function +- **Health Stream**: Uses `health.NewStream()` from gocraft/health +- **Monitoring**: `MonitorApplicationInstrumentation()` for GC stats and RPS +- **Activation**: Via `TYK_INSTRUMENTATION` env var or `--log-instrumentation` flag + +### Key Data Points + +#### Redis Pool Stats +- Connection handler: `storage/connection_handler.go` +- Redis connector interface in `storage` package +- Need to expose pool statistics from underlying Redis connection + +#### System Metrics +- GC Stats: Already monitored in `MonitorApplicationInstrumentation()` (line 68-88) +- CPU/Memory: Available via `runtime` package +- Goroutines: `runtime.NumGoroutine()` +- Connections: Tracked in `ConnectionWatcher` (`internal/httputil/connection_watcher.go`) + +#### Gateway Metrics +- APIs Count: `len(gw.apisByID)` in Gateway struct +- Policies Count: `len(gw.policiesByID)` in Gateway struct +- Request metrics: Already tracked via `instrument.NewJob()` calls + +## Implementation Plan + +### Phase 1: Configuration Structure + +#### 1.1 Add Prometheus Config to `config/config.go` + +**Location**: `config/config.go` (after line 1187, near StatsdPrefix) + +```go +// PrometheusConfig holds configuration for Prometheus metrics exposure +type PrometheusConfig struct { + // Enabled activates Prometheus metrics endpoint + Enabled bool `json:"enabled"` + // ListenAddress is the address to expose metrics (e.g., ":9090") + ListenAddress string `json:"listen_address"` + // Path is the HTTP path for metrics endpoint (default: "/metrics") + Path string `json:"path"` + // MetricPrefix is the prefix for all Tyk metrics (default: "tyk_gateway") + MetricPrefix string `json:"metric_prefix"` + // EnableGoCollector enables Go runtime metrics + EnableGoCollector bool `json:"enable_go_collector"` + // EnableProcessCollector enables process metrics + EnableProcessCollector bool `json:"enable_process_collector"` +} + +// Add to Config struct (after line 1187) +Prometheus PrometheusConfig `json:"prometheus"` +``` + +#### 1.2 Configuration Defaults + +```go +// In config defaults +Prometheus: PrometheusConfig{ + Enabled: false, + ListenAddress: ":9090", + Path: "/metrics", + MetricPrefix: "tyk_gateway", + EnableGoCollector: true, + EnableProcessCollector: true, +} +``` + +### Phase 2: Prometheus Metrics Registry + +#### 2.1 Create Prometheus Handler (`gateway/instrumentation_prometheus.go`) + +```go +package gateway + +import ( + "context" + "net/http" + "runtime" + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" +) + +type PrometheusMetrics struct { + // System Metrics + goroutines prometheus.Gauge + memoryAlloc prometheus.Gauge + memoryTotal prometheus.Gauge + cpuUsage prometheus.Gauge + connections prometheus.Gauge + + // Gateway Metrics + apisLoaded prometheus.Gauge + policiesLoaded prometheus.Gauge + requestsTotal *prometheus.CounterVec + requestDuration *prometheus.HistogramVec + requestErrors *prometheus.CounterVec + + // Redis Metrics + redisPoolSize *prometheus.GaugeVec + redisActiveConns *prometheus.GaugeVec + redisLatency prometheus.Histogram + + // Request Processing Metrics + queueDepth prometheus.Gauge + throughput prometheus.Gauge + middlewareExecTime *prometheus.HistogramVec + + registry *prometheus.Registry + gw *Gateway +} + +func NewPrometheusMetrics(gw *Gateway, prefix string) 
*PrometheusMetrics { + registry := prometheus.NewRegistry() + + pm := &PrometheusMetrics{ + gw: gw, + registry: registry, + } + + // System Metrics + pm.goroutines = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "goroutines", + Help: "Number of active goroutines", + }) + + pm.memoryAlloc = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "memory_alloc_bytes", + Help: "Bytes of allocated heap objects", + }) + + pm.memoryTotal = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "memory_total_bytes", + Help: "Total bytes obtained from OS", + }) + + pm.cpuUsage = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "cpu_usage_percent", + Help: "CPU usage percentage", + }) + + pm.connections = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "open_connections", + Help: "Number of open connections", + }) + + // Gateway Metrics + pm.apisLoaded = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "apis_loaded", + Help: "Number of APIs currently loaded", + }) + + pm.policiesLoaded = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "policies_loaded", + Help: "Number of policies currently loaded", + }) + + pm.requestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "requests_total", + Help: "Total number of requests processed (RED: Rate)", + }, + []string{"api_id", "api_name", "method", "status_code"}, + ) + + pm.requestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "request_duration_seconds", + Help: "Request duration in seconds (RED: Duration)", + Buckets: prometheus.DefBuckets, + }, + []string{"api_id", "api_name", "method"}, + ) + + pm.requestErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "request_errors_total", + Help: "Total number of request errors (RED: Errors)", + }, + []string{"api_id", "api_name", "method", "error_type"}, + ) + + // Redis Metrics + pm.redisPoolSize = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "redis", + Name: "pool_size", + Help: "Redis connection pool size", + }, + []string{"type"}, + ) + + pm.redisActiveConns = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "redis", + Name: "active_connections", + Help: "Number of active Redis connections", + }, + []string{"type"}, + ) + + pm.redisLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "redis", + Name: "operation_duration_seconds", + Help: "Redis operation latency in seconds", + Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1}, + }) + + // Request Processing Metrics + pm.queueDepth = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "processing", + Name: "queue_depth", + Help: "Current request queue depth", + }) + + pm.throughput = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "processing", + Name: "throughput_rps", + Help: "Current throughput in requests per second", + }) + + pm.middlewareExecTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "processing", + Name: 
"middleware_execution_seconds", + Help: "Middleware execution time in seconds", + Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1}, + }, + []string{"middleware_name", "api_id"}, + ) + + // Register all metrics + registry.MustRegister( + pm.goroutines, + pm.memoryAlloc, + pm.memoryTotal, + pm.cpuUsage, + pm.connections, + pm.apisLoaded, + pm.policiesLoaded, + pm.requestsTotal, + pm.requestDuration, + pm.requestErrors, + pm.redisPoolSize, + pm.redisActiveConns, + pm.redisLatency, + pm.queueDepth, + pm.throughput, + pm.middlewareExecTime, + ) + + return pm +} + +// UpdateSystemMetrics updates system-level metrics +func (pm *PrometheusMetrics) UpdateSystemMetrics() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + + pm.goroutines.Set(float64(runtime.NumGoroutine())) + pm.memoryAlloc.Set(float64(m.Alloc)) + pm.memoryTotal.Set(float64(m.TotalAlloc)) + + if pm.gw.ConnectionWatcher != nil { + pm.connections.Set(float64(pm.gw.ConnectionWatcher.Count())) + } +} + +// UpdateGatewayMetrics updates gateway-specific metrics +func (pm *PrometheusMetrics) UpdateGatewayMetrics() { + pm.gw.apisMu.RLock() + pm.apisLoaded.Set(float64(len(pm.gw.apisByID))) + pm.gw.apisMu.RUnlock() + + pm.gw.policiesMu.RLock() + pm.policiesLoaded.Set(float64(len(pm.gw.policiesByID))) + pm.gw.policiesMu.RUnlock() + + // Throughput from GlobalRate + pm.throughput.Set(float64(GlobalRate.Rate())) +} + +// UpdateRedisMetrics updates Redis connection metrics +func (pm *PrometheusMetrics) UpdateRedisMetrics() { + if pm.gw.StorageConnectionHandler != nil { + for _, connType := range []string{storage.DefaultConn, storage.CacheConn, storage.AnalyticsConn} { + stats := pm.gw.StorageConnectionHandler.GetRedisStats(connType) + if stats != nil { + labels := prometheus.Labels{"type": connType} + pm.redisPoolSize.With(labels).Set(float64(stats.TotalConns)) + pm.redisActiveConns.With(labels).Set(float64(stats.ActiveConns)) + } + } + } +} + +// RecordRequest records request metrics (called from middleware/handler) +func (pm *PrometheusMetrics) RecordRequest(apiID, apiName, method string, statusCode int, duration float64) { + labels := prometheus.Labels{ + "api_id": apiID, + "api_name": apiName, + "method": method, + "status_code": strconv.Itoa(statusCode), + } + + pm.requestsTotal.With(labels).Inc() + + durLabels := prometheus.Labels{ + "api_id": apiID, + "api_name": apiName, + "method": method, + } + pm.requestDuration.With(durLabels).Observe(duration) + + if statusCode >= 400 { + errorType := "client_error" + if statusCode >= 500 { + errorType = "server_error" + } + + errLabels := prometheus.Labels{ + "api_id": apiID, + "api_name": apiName, + "method": method, + "error_type": errorType, + } + pm.requestErrors.With(errLabels).Inc() + } +} + +// RecordMiddlewareExecution records middleware execution time +func (pm *PrometheusMetrics) RecordMiddlewareExecution(middlewareName, apiID string, duration float64) { + pm.middlewareExecTime.With(prometheus.Labels{ + "middleware_name": middlewareName, + "api_id": apiID, + }).Observe(duration) +} + +// Handler returns the HTTP handler for metrics endpoint +func (pm *PrometheusMetrics) Handler() http.Handler { + return promhttp.HandlerFor(pm.registry, promhttp.HandlerOpts{}) +} + +// StartMetricsCollection starts background metrics collection +func (pm *PrometheusMetrics) StartMetricsCollection(ctx context.Context) { + ticker := time.NewTicker(5 * time.Second) + + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + 
pm.UpdateSystemMetrics() + pm.UpdateGatewayMetrics() + pm.UpdateRedisMetrics() + } + } + }() +} +``` + +#### 2.2 Create Prometheus Sink (`gateway/instrumentation_prometheus_sink.go`) + +```go +package gateway + +import ( + "github.com/gocraft/health" +) + +// PrometheusSink implements health.Sink interface for Prometheus +type PrometheusSink struct { + metrics *PrometheusMetrics +} + +func NewPrometheusSink(metrics *PrometheusMetrics) *PrometheusSink { + return &PrometheusSink{ + metrics: metrics, + } +} + +func (s *PrometheusSink) EmitEvent(job, event string, kvs map[string]string) { + // Convert events to Prometheus metrics if needed +} + +func (s *PrometheusSink) EmitEventErr(job, event string, err error, kvs map[string]string) { + // Track errors in Prometheus +} + +func (s *PrometheusSink) EmitTiming(job, event string, nanos int64, kvs map[string]string) { + // Convert timing events to histograms + seconds := float64(nanos) / 1e9 + + if job == "MiddlewareCall" { + if mwName, ok := kvs["mw_name"]; ok { + apiID := kvs["api_id"] + s.metrics.RecordMiddlewareExecution(mwName, apiID, seconds) + } + } +} + +func (s *PrometheusSink) EmitGauge(job, event string, value float64, kvs map[string]string) { + // Handle gauge metrics +} + +func (s *PrometheusSink) EmitComplete(job string, status health.CompletionStatus, nanos int64, kvs map[string]string) { + // Complete status tracking +} +``` + +### Phase 3: Gateway Integration + +#### 3.1 Update `gateway/instrumentation_handlers.go` + +Add Prometheus setup alongside StatsD: + +```go +// Add to Gateway struct in gateway/server.go +PrometheusMetrics *PrometheusMetrics +prometheusServer *http.Server + +// Update setupInstrumentation() function +func (gw *Gateway) setupInstrumentation() { + gwConfig := gw.GetConfig() + + // Existing StatsD setup... + + // Prometheus Setup + if gwConfig.Prometheus.Enabled { + log.Info("Initializing Prometheus metrics...") + + gw.PrometheusMetrics = NewPrometheusMetrics(gw, gwConfig.Prometheus.MetricPrefix) + + // Add Prometheus sink to instrument stream + prometheusSink := NewPrometheusSink(gw.PrometheusMetrics) + instrument.AddSink(prometheusSink) + + // Start metrics collection + gw.PrometheusMetrics.StartMetricsCollection(gw.ctx) + + // Start Prometheus HTTP server + gw.startPrometheusServer() + + log.WithFields(logrus.Fields{ + "listen_address": gwConfig.Prometheus.ListenAddress, + "path": gwConfig.Prometheus.Path, + }).Info("Prometheus metrics endpoint started") + } + + // Existing monitoring... 
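+	// Note: MonitorApplicationInstrumentation below keeps emitting its GC/RPS
+	// jobs on the same instrument stream, so any sinks registered above
+	// (StatsD, Prometheus when enabled) receive those events as well.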
+ gw.MonitorApplicationInstrumentation() +} + +// Add Prometheus HTTP server +func (gw *Gateway) startPrometheusServer() { + gwConfig := gw.GetConfig() + + mux := http.NewServeMux() + mux.Handle(gwConfig.Prometheus.Path, gw.PrometheusMetrics.Handler()) + + gw.prometheusServer = &http.Server{ + Addr: gwConfig.Prometheus.ListenAddress, + Handler: mux, + } + + go func() { + if err := gw.prometheusServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.WithError(err).Error("Prometheus metrics server error") + } + }() +} + +// Add shutdown logic in Gateway.gracefulShutdown() +if gw.prometheusServer != nil { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := gw.prometheusServer.Shutdown(ctx); err != nil { + log.WithError(err).Error("Error shutting down Prometheus server") + } +} +``` + +#### 3.2 Integrate Request Metrics in Success Handler + +Update `gateway/handler_success.go` RecordHit function (around line 300): + +```go +// Add after existing analytics recording +if gw.PrometheusMetrics != nil { + duration := float64(latency.Total) / 1e9 // Convert nanoseconds to seconds + gw.PrometheusMetrics.RecordRequest( + s.Spec.APIID, + s.Spec.Name, + r.Method, + code, + duration, + ) +} +``` + +#### 3.3 Integrate Middleware Metrics + +Update `gateway/middleware.go` createMiddleware function (around line 150-180): + +```go +// Add after existing instrumentation +if gw.PrometheusMetrics != nil { + finishTime := time.Since(startTime) + gw.PrometheusMetrics.RecordMiddlewareExecution( + mw.Name(), + spec.APIID, + float64(finishTime.Nanoseconds())/1e9, + ) +} +``` + +### Phase 4: Redis Metrics Enhancement + +#### 4.1 Expose Redis Pool Stats + +Update `storage/redis.go` to expose pool statistics: + +```go +// Add method to RedisCluster +func (r *RedisCluster) PoolStats() *redis.PoolStats { + if r.singleton != nil { + return r.singleton.PoolStats() + } + return nil +} + +// Add method to ConnectionHandler in storage/connection_handler.go +func (rc *ConnectionHandler) GetRedisStats(connType string) *redis.PoolStats { + rc.connectionsMu.RLock() + defer rc.connectionsMu.RUnlock() + + if conn, ok := rc.connections[connType]; ok { + if redisConn, ok := conn.(*RedisCluster); ok { + return redisConn.PoolStats() + } + } + return nil +} +``` + +#### 4.2 Update Prometheus Metrics Collection + +```go +// In PrometheusMetrics.UpdateRedisMetrics() +func (pm *PrometheusMetrics) UpdateRedisMetrics() { + if pm.gw.StorageConnectionHandler != nil { + for _, connType := range []string{storage.DefaultConn, storage.CacheConn, storage.AnalyticsConn} { + stats := pm.gw.StorageConnectionHandler.GetRedisStats(connType) + if stats != nil { + pm.redisPoolSize.WithLabelValues(connType).Set(float64(stats.TotalConns)) + pm.redisActiveConns.WithLabelValues(connType).Set(float64(stats.ActiveConns)) + } + } + } +} +``` + +### Phase 5: Documentation and Configuration + +#### 5.1 Configuration Example + +Add to `tyk.conf.example`: + +```json +{ + "prometheus": { + "enabled": true, + "listen_address": ":9090", + "path": "/metrics", + "metric_prefix": "tyk_gateway", + "enable_go_collector": true, + "enable_process_collector": true + } +} +``` + +#### 5.2 Environment Variables + +Support environment variable configuration: +- `TYK_GW_PROMETHEUS_ENABLED` +- `TYK_GW_PROMETHEUS_LISTENADDRESS` +- `TYK_GW_PROMETHEUS_PATH` +- `TYK_GW_PROMETHEUS_METRICPREFIX` + +### Phase 6: Testing + +#### 6.1 Unit Tests + +Create `gateway/instrumentation_prometheus_test.go`: + +```go +package 
gateway + +import ( + "testing" + "github.com/prometheus/client_golang/prometheus/testutil" +) + +func TestPrometheusMetrics(t *testing.T) { + // Test metric registration + // Test metric updates + // Test HTTP handler +} + +func TestPrometheusSink(t *testing.T) { + // Test sink integration with health.Stream +} +``` + +#### 6.2 Integration Tests + +- Test metrics endpoint accessibility +- Verify metric values match expected gateway state +- Test with high request load +- Verify graceful shutdown + +### Phase 7: Additional Enhancements + +#### 7.1 Optional Go Runtime Collectors + +```go +// In NewPrometheusMetrics, optionally register Go collectors +if gwConfig.Prometheus.EnableGoCollector { + registry.MustRegister(prometheus.NewGoCollector()) +} + +if gwConfig.Prometheus.EnableProcessCollector { + registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{})) +} +``` + +#### 7.2 Custom Labels Support + +Add configuration for custom labels: + +```go +type PrometheusConfig struct { + // ... existing fields ... + CustomLabels map[string]string `json:"custom_labels"` +} +``` + +## Implementation Timeline + +### Week 1: Foundation +- [ ] Day 1-2: Configuration structure (Phase 1) +- [ ] Day 3-5: Prometheus metrics registry and handler (Phase 2.1, 2.2) + +### Week 2: Integration +- [ ] Day 1-2: Gateway integration (Phase 3.1, 3.2) +- [ ] Day 3-4: Middleware integration (Phase 3.3) +- [ ] Day 5: Redis metrics enhancement (Phase 4) + +### Week 3: Testing and Documentation +- [ ] Day 1-2: Unit tests (Phase 6.1) +- [ ] Day 3-4: Integration tests (Phase 6.2) +- [ ] Day 5: Documentation and examples (Phase 5) + +### Week 4: Enhancements and Review +- [ ] Day 1-2: Additional enhancements (Phase 7) +- [ ] Day 3-4: Code review and refinements +- [ ] Day 5: Final testing and deployment preparation + +## Metrics Exposed (Summary) + +### System Metrics +- `tyk_gateway_system_goroutines` - Number of active goroutines +- `tyk_gateway_system_memory_alloc_bytes` - Allocated heap memory +- `tyk_gateway_system_memory_total_bytes` - Total memory from OS +- `tyk_gateway_system_cpu_usage_percent` - CPU usage +- `tyk_gateway_system_open_connections` - Open connections + +### Gateway Metrics +- `tyk_gateway_gateway_apis_loaded` - Number of loaded APIs +- `tyk_gateway_gateway_policies_loaded` - Number of loaded policies +- `tyk_gateway_gateway_requests_total{api_id, api_name, method, status_code}` - Total requests (RED: Rate) +- `tyk_gateway_gateway_request_duration_seconds{api_id, api_name, method}` - Request duration histogram (RED: Duration) +- `tyk_gateway_gateway_request_errors_total{api_id, api_name, method, error_type}` - Request errors (RED: Errors) + +### Redis Metrics +- `tyk_gateway_redis_pool_size{type}` - Connection pool size (types: default, cache, analytics) +- `tyk_gateway_redis_active_connections{type}` - Active connections (types: default, cache, analytics) +- `tyk_gateway_redis_operation_duration_seconds` - Operation latency histogram + +### Request Processing Metrics +- `tyk_gateway_processing_queue_depth` - Request queue depth +- `tyk_gateway_processing_throughput_rps` - Throughput in RPS +- `tyk_gateway_processing_middleware_execution_seconds{middleware_name, api_id}` - Middleware execution time + +## Migration and Compatibility + +- Prometheus metrics run independently alongside existing StatsD/NewRelic +- No breaking changes to existing instrumentation +- Can be enabled/disabled via configuration +- Uses separate HTTP server to avoid conflicts with main gateway ports + 
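+The "runs independently alongside" point follows from the gocraft/health fan-out: every sink attached to the instrument stream receives the same events, so the Prometheus sink can simply be appended without touching the StatsD or NewRelic paths. A minimal wiring sketch, reusing the constructors proposed earlier in this plan (illustrative only; the existing sinks are assumed to be registered exactly as they are today):
+
+```go
+stream := health.NewStream()
+
+// ...existing StatsD / NewRelic sinks are registered here, unchanged...
+
+// Append the Prometheus sink: every EmitTiming/EmitComplete call on the
+// stream now reaches both the existing sinks and the new one.
+promMetrics := NewPrometheusMetrics(gw, "tyk_gateway")
+stream.AddSink(NewPrometheusSink(promMetrics))
+```
+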
+## Security Considerations + +1. **Separate Metrics Port**: Run Prometheus endpoint on dedicated port +2. **Access Control**: Document firewall rules for metrics port +3. **Sensitive Data**: Ensure no API keys or sensitive data in labels +4. **Rate Limiting**: Consider adding rate limiting to metrics endpoint for production + +## Deployment Recommendations + +1. **Production**: Enable with dedicated metrics port (e.g., 9090) +2. **Kubernetes**: Use ServiceMonitor for automatic discovery +3. **Docker**: Expose metrics port in container configuration +4. **Monitoring**: Set up Prometheus scraping with 15-30s interval + +## References + +- Existing implementations: + - `gateway/instrumentation_statsd_sink.go` + - `internal/service/newrelic/` + - `gateway/instrumentation_handlers.go` +- Prometheus client library: `github.com/prometheus/client_golang` +- Health stream: `github.com/gocraft/health` diff --git a/PROMETHEUS_METRICS.md b/PROMETHEUS_METRICS.md new file mode 100644 index 00000000000..68158e09a52 --- /dev/null +++ b/PROMETHEUS_METRICS.md @@ -0,0 +1,466 @@ +# Prometheus Metrics Integration for Tyk Gateway + +This document describes the Prometheus metrics integration for Tyk Gateway, providing comprehensive monitoring and observability capabilities. + +## Table of Contents + +- [Overview](#overview) +- [Configuration](#configuration) +- [Environment Variables](#environment-variables) +- [Available Metrics](#available-metrics) +- [Usage Examples](#usage-examples) +- [Integration with Monitoring Systems](#integration-with-monitoring-systems) +- [Best Practices](#best-practices) + +## Overview + +Tyk Gateway now supports native Prometheus metrics export through a dedicated HTTP endpoint. This integration provides detailed metrics across four key categories: + +1. **System Metrics**: Runtime performance and resource utilization +2. **Gateway Metrics**: API and policy management, request processing (RED metrics) +3. **Redis Metrics**: Connection pool statistics and performance +4. 
**Request Processing Metrics**: Queue depth, throughput, and middleware performance + +## Configuration + +### Configuration File + +Add the following section to your `tyk.conf`: + +```json +{ + "prometheus": { + "enabled": true, + "listen_address": ":9090", + "path": "/metrics", + "metric_prefix": "tyk_gateway", + "enable_go_collector": true, + "enable_process_collector": true, + "enable_per_api_metrics": false + } +} +``` + +### Configuration Options + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `enabled` | boolean | `false` | Enable/disable Prometheus metrics endpoint | +| `listen_address` | string | `":9090"` | Address and port for metrics endpoint | +| `path` | string | `"/metrics"` | HTTP path for metrics endpoint | +| `metric_prefix` | string | `"tyk_gateway"` | Prefix for all Tyk metrics | +| `enable_go_collector` | boolean | `true` | Enable Go runtime metrics collection | +| `enable_process_collector` | boolean | `true` | Enable process-level metrics collection | +| `enable_per_api_metrics` | boolean | `false` | Enable per-API metrics with `api_id` label (increases cardinality) | + +## Environment Variables + +All configuration options can be set via environment variables using the `TYK_GW_PROMETHEUS_` prefix: + +```bash +export TYK_GW_PROMETHEUS_ENABLED=true +export TYK_GW_PROMETHEUS_LISTENADDRESS=":9090" +export TYK_GW_PROMETHEUS_PATH="/metrics" +export TYK_GW_PROMETHEUS_METRICPREFIX="tyk_gateway" +export TYK_GW_PROMETHEUS_ENABLEGOCOLLECTOR=true +export TYK_GW_PROMETHEUS_ENABLEPROCESSCOLLECTOR=true +export TYK_GW_PROMETHEUS_ENABLEPERAPIMETRICS=false +``` + +## Available Metrics + +### System Metrics + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `tyk_gateway_system_goroutines` | Gauge | Number of active goroutines | - | +| `tyk_gateway_system_memory_alloc_bytes` | Gauge | Bytes of allocated heap objects | - | +| `tyk_gateway_system_memory_total_bytes` | Gauge | Total bytes obtained from OS | - | +| `tyk_gateway_system_cpu_usage_percent` | Gauge | CPU usage percentage | - | +| `tyk_gateway_system_open_connections` | Gauge | Number of open connections | - | + +### Gateway Metrics (RED) + +#### Overall Gateway Metrics (Always Available) + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `tyk_gateway_gateway_apis_loaded` | Gauge | Number of APIs currently loaded | - | +| `tyk_gateway_gateway_policies_loaded` | Gauge | Number of policies currently loaded | - | +| `tyk_gateway_gateway_http_requests_total` | Counter | Total HTTP requests across all APIs (RED: Rate) | `method`, `status_class` | +| `tyk_gateway_gateway_http_request_duration_seconds` | Histogram | Total HTTP request duration across all APIs (RED: Duration) | `method`, `status_class` | +| `tyk_gateway_gateway_http_request_upstream_latency_seconds` | Histogram | Upstream service latency (time waiting for upstream response) | `method`, `status_class` | +| `tyk_gateway_gateway_http_request_gateway_latency_seconds` | Histogram | Gateway processing latency (time spent in gateway middleware) | `method`, `status_class` | + +**Note**: `status_class` groups status codes as `2xx`, `3xx`, `4xx`, `5xx` to reduce cardinality. + +**Latency Breakdown**: +- **Total Duration** = Upstream Latency + Gateway Latency +- **Upstream Latency**: Time waiting for the upstream service to respond +- **Gateway Latency**: Time spent in Tyk gateway processing (authentication, rate limiting, transformations, etc.) 
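+
+How this breakdown is produced, as a minimal sketch (it mirrors the `RecordRequest` helper added in this patch, which receives total and upstream latency in nanoseconds from the analytics latency record):
+
+```go
+// latencyBreakdown converts per-request nanosecond timings into the three
+// values observed by the duration/upstream/gateway histograms above.
+func latencyBreakdown(totalNs, upstreamNs int64) (total, upstream, gateway float64) {
+	total = float64(totalNs) / 1e9
+	upstream = float64(upstreamNs) / 1e9
+	gateway = float64(totalNs-upstreamNs) / 1e9 // gateway latency = total - upstream
+	return total, upstream, gateway
+}
+```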
+ +**Error Metrics**: Calculate errors from `status_class=~"4xx|5xx"` in the requests metrics above. + +#### Per-API Metrics (Optional - `enable_per_api_metrics: true`) + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `tyk_gateway_gateway_requests_total` | Counter | Total requests per API (RED: Rate) | `api_id`, `method`, `status_class` | +| `tyk_gateway_gateway_request_duration_seconds` | Histogram | Request duration per API (RED: Duration) | `api_id`, `method` | +| `tyk_gateway_gateway_request_errors_total` | Counter | Request errors per API (RED: Errors) | `api_id`, `method`, `status_class` | + +**Warning**: Per-API metrics can create high cardinality with many APIs. Enable only when needed for detailed per-API monitoring. + +### Redis Metrics + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `tyk_gateway_redis_pool_size` | Gauge | Redis connection pool size | `type` (default, cache, analytics) | +| `tyk_gateway_redis_active_connections` | Gauge | Active Redis connections | `type` (default, cache, analytics) | +| `tyk_gateway_redis_operation_duration_seconds` | Histogram | Redis operation latency | - | + +### Request Processing Metrics + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `tyk_gateway_processing_queue_depth` | Gauge | Current request queue depth | - | +| `tyk_gateway_processing_throughput_rps` | Gauge | Throughput in requests per second | - | +| `tyk_gateway_processing_middleware_execution_seconds` | Histogram | Middleware execution time | `middleware_name`, `api_id` | + +## Usage Examples + +### Basic Curl Test + +```bash +curl http://localhost:9090/metrics +``` + +### Sample Prometheus Configuration + +Add this to your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: 'tyk-gateway' + static_configs: + - targets: ['localhost:9090'] + scrape_interval: 15s + scrape_timeout: 10s +``` + +### Example Metrics Output + +```promql +# System Metrics +tyk_gateway_system_goroutines 245 +tyk_gateway_system_memory_alloc_bytes 45678912 +tyk_gateway_system_open_connections 12 + +# Gateway Metrics (Overall - Always Available) +tyk_gateway_gateway_apis_loaded 5 +tyk_gateway_gateway_policies_loaded 3 +tyk_gateway_gateway_http_requests_total{method="GET",status_class="2xx"} 1523 +tyk_gateway_gateway_http_requests_total{method="POST",status_class="4xx"} 12 +tyk_gateway_gateway_http_request_duration_seconds_bucket{method="GET",status_class="2xx",le="0.1"} 1450 +tyk_gateway_gateway_http_request_upstream_latency_seconds_bucket{method="GET",status_class="2xx",le="0.05"} 1400 +tyk_gateway_gateway_http_request_gateway_latency_seconds_bucket{method="GET",status_class="2xx",le="0.005"} 1500 + +# Gateway Metrics (Per-API - Only if enable_per_api_metrics=true) +tyk_gateway_gateway_requests_total{api_id="api1",method="GET",status_class="2xx"} 1000 +tyk_gateway_gateway_request_errors_total{api_id="api1",method="GET",status_class="4xx"} 5 + +# Redis Metrics +tyk_gateway_redis_pool_size{type="default"} 100 +tyk_gateway_redis_active_connections{type="default"} 45 +``` + +## Integration with Monitoring Systems + +### Grafana Dashboard + +Create a Grafana dashboard using these example queries: + +**Overall Request Rate (RED: Rate)** +```promql +sum(rate(tyk_gateway_gateway_http_requests_total[5m])) by (method, status_class) +``` + +**Overall Request Duration P95 (RED: Duration)** +```promql +histogram_quantile(0.95, 
sum(rate(tyk_gateway_gateway_http_request_duration_seconds_bucket[5m])) by (le, method)) +``` + +**Overall Error Rate (RED: Errors)** +```promql +sum(rate(tyk_gateway_gateway_http_requests_total{status_class=~"4xx|5xx"}[5m])) by (status_class) +``` + +**Upstream Latency P95** +```promql +histogram_quantile(0.95, sum(rate(tyk_gateway_gateway_http_request_upstream_latency_seconds_bucket[5m])) by (le, method)) +``` + +**Gateway Processing Latency P95** +```promql +histogram_quantile(0.95, sum(rate(tyk_gateway_gateway_http_request_gateway_latency_seconds_bucket[5m])) by (le, method)) +``` + +**Latency Breakdown Comparison** +```promql +# Compare upstream vs gateway latency +sum(rate(tyk_gateway_gateway_http_request_upstream_latency_seconds_sum[5m])) / sum(rate(tyk_gateway_gateway_http_request_upstream_latency_seconds_count[5m])) +/ +sum(rate(tyk_gateway_gateway_http_request_gateway_latency_seconds_sum[5m])) / sum(rate(tyk_gateway_gateway_http_request_gateway_latency_seconds_count[5m])) +``` + +**Per-API Request Rate (if `enable_per_api_metrics=true`)** +```promql +sum(rate(tyk_gateway_gateway_requests_total[5m])) by (api_id, method, status_class) +``` + +**Per-API Error Rate (if `enable_per_api_metrics=true`)** +```promql +sum(rate(tyk_gateway_gateway_request_errors_total[5m])) by (api_id, status_class) +``` + +**Redis Pool Utilization** +```promql +(tyk_gateway_redis_active_connections / tyk_gateway_redis_pool_size) * 100 +``` + +### Kubernetes ServiceMonitor + +For automatic Prometheus discovery in Kubernetes: + +```yaml +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: tyk-gateway +spec: + selector: + matchLabels: + app: tyk-gateway + endpoints: + - port: prometheus + interval: 15s + path: /metrics +``` + +### Docker Compose + +```yaml +version: '3.8' +services: + tyk-gateway: + image: tykio/tyk-gateway:latest + ports: + - "8080:8080" + - "9090:9090" + environment: + - TYK_GW_PROMETHEUS_ENABLED=true + - TYK_GW_PROMETHEUS_LISTENADDRESS=:9090 + + prometheus: + image: prom/prometheus:latest + ports: + - "9091:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml +``` + +## Latency Analysis + +### Using Latency Breakdown + +The gateway provides detailed latency breakdown to help identify performance bottlenecks: + +**Identify Bottlenecks**: +```promql +# If upstream latency is high, the problem is in your backend services +histogram_quantile(0.95, rate(tyk_gateway_gateway_http_request_upstream_latency_seconds_bucket[5m])) + +# If gateway latency is high, the problem is in gateway middleware/processing +histogram_quantile(0.95, rate(tyk_gateway_gateway_http_request_gateway_latency_seconds_bucket[5m])) +``` + +**Calculate Overhead Percentage**: +```promql +# What percentage of total time is spent in the gateway? +( + sum(rate(tyk_gateway_gateway_http_request_gateway_latency_seconds_sum[5m])) + / + sum(rate(tyk_gateway_gateway_http_request_duration_seconds_sum[5m])) +) * 100 +``` + +**Common Patterns**: +- **High Upstream, Low Gateway**: Backend services are slow - optimize your APIs +- **Low Upstream, High Gateway**: Gateway processing is slow - review middleware, rate limits, plugins +- **Both High**: Multiple bottlenecks - prioritize based on which contributes more to total latency + +### Troubleshooting Scenarios + +**Scenario 1: Slow Response Times** +1. Check total P95: `histogram_quantile(0.95, rate(tyk_gateway_gateway_http_request_duration_seconds_bucket[5m]))` +2. Compare upstream vs gateway latency to identify where time is spent +3. 
If upstream is slow: Review backend service performance +4. If gateway is slow: Review middleware configuration, plugins, rate limiting + +**Scenario 2: Increased Latency After Deployment** +```promql +# Compare gateway latency before and after deployment +histogram_quantile(0.95, + sum(rate(tyk_gateway_gateway_http_request_gateway_latency_seconds_bucket[5m] offset 1h)) by (le) +) +``` + +**Scenario 3: Inconsistent Performance** +```promql +# High variance in latency +stddev_over_time( + histogram_quantile(0.95, + rate(tyk_gateway_gateway_http_request_duration_seconds_bucket[5m]) + )[10m:1m] +) +``` + +## Best Practices + +### Cardinality Management + +**Understanding Per-API Metrics**: +- Per-API metrics add the `api_id` label to metrics +- With 100 APIs × 5 methods × 5 status classes = 2,500 time series per metric +- High cardinality can impact Prometheus performance and storage + +**When to Enable Per-API Metrics**: +- ✅ **Development/Testing**: Detailed debugging and analysis +- ✅ **Small Deployments**: <50 APIs with low request volume +- ✅ **Specific Monitoring**: Temporary troubleshooting of specific APIs +- ❌ **Large Deployments**: >100 APIs or high request volume +- ❌ **Production (default)**: Use overall gateway metrics instead + +**Recommended Configuration**: +```json +{ + "prometheus": { + "enabled": true, + "enable_per_api_metrics": false, // Default: use overall metrics + "enable_go_collector": true, + "enable_process_collector": true + } +} +``` + +**Alternative Approaches**: +- Use overall gateway metrics (`gateway_http_requests_total`) for alerting +- Enable per-API metrics temporarily for debugging specific issues +- Use API analytics/logging for detailed per-API investigation + +### Performance Considerations + +1. **Scrape Interval**: Use 15-30 second intervals for production +2. **Metrics Port**: Run on a separate port (9090) from the gateway (8080) +3. **Network Security**: Restrict metrics endpoint access via firewall rules +4. **Cardinality**: Keep per-API metrics disabled unless specifically needed + +### Security Recommendations + +1. **Separate Network**: Expose metrics port only on internal network +2. **Authentication**: Use network-level authentication (VPN, IP whitelisting) +3. 
**Sensitive Data**: Metrics do not include API keys or sensitive request data + +### Troubleshooting + +**Issue**: Metrics endpoint not accessible +```bash +# Check if Prometheus is enabled in configuration +grep -A 5 "prometheus" /opt/tyk-gateway/tyk.conf + +# Check if metrics port is listening +netstat -tln | grep 9090 +``` + +**Issue**: No data in metrics +- Ensure gateway is receiving traffic +- Check gateway logs for Prometheus initialization messages +- Verify metrics collection goroutine is running + +**Issue**: High memory usage in Prometheus +- Check cardinality: `curl -s http://localhost:9090/api/v1/status/tsdb | jq` +- **If per-API metrics are enabled**: Disable `enable_per_api_metrics` to reduce cardinality +- Reduce scrape frequency from 15s to 30s or 60s +- Consider disabling `enable_go_collector` and `enable_process_collector` if not needed +- Use recording rules to pre-aggregate high-cardinality metrics + +**Issue**: Too many time series +- Verify per-API metrics setting: `grep enable_per_api_metrics /opt/tyk-gateway/tyk.conf` +- Calculate expected cardinality: `(number of APIs) × (methods) × (status classes)` +- Consider using overall gateway metrics instead of per-API metrics +- Use Prometheus relabeling to drop unnecessary labels + +### Alerting Rules + +Example Prometheus alerting rules: + +```yaml +groups: + - name: tyk_gateway + rules: + - alert: TykHighErrorRate + expr: | + sum(rate(tyk_gateway_gateway_http_requests_total{status_class=~"4xx|5xx"}[5m])) + / + sum(rate(tyk_gateway_gateway_http_requests_total[5m])) + > 0.05 + for: 5m + annotations: + summary: "High error rate detected (>5%)" + description: "Gateway error rate is {{ $value | humanizePercentage }}" + + - alert: TykHighP95Latency + expr: | + histogram_quantile(0.95, + sum(rate(tyk_gateway_gateway_http_request_duration_seconds_bucket[5m])) by (le) + ) > 1 + for: 5m + annotations: + summary: "High P95 latency detected" + description: "P95 latency is {{ $value }}s (threshold: 1s)" + + - alert: TykRedisPoolExhaustion + expr: | + (tyk_gateway_redis_active_connections / tyk_gateway_redis_pool_size) > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "Redis connection pool nearly exhausted" + description: "Redis pool utilization is {{ $value | humanizePercentage }} for {{ $labels.type }}" + + - alert: TykHighMemoryUsage + expr: tyk_gateway_system_memory_alloc_bytes > 1e9 + for: 10m + labels: + severity: warning + annotations: + summary: "High memory usage detected" + description: "Memory usage is {{ $value | humanize1024 }}" +``` + +## Migration from StatsD + +Prometheus can run alongside existing StatsD instrumentation without conflicts. Both systems collect metrics independently and can be gradually migrated: + +1. Enable Prometheus alongside StatsD +2. Validate metrics accuracy in Prometheus +3. Update monitoring dashboards to use Prometheus +4. 
Disable StatsD once migration is complete + +## Support + +For issues, feature requests, or questions: +- GitHub Issues: https://github.com/TykTechnologies/tyk/issues +- Documentation: https://tyk.io/docs +- Community Forum: https://community.tyk.io diff --git a/config/config.go b/config/config.go index a9c5d2db75f..5a107959d41 100644 --- a/config/config.go +++ b/config/config.go @@ -59,6 +59,15 @@ var ( Enabled: false, AllowUnsafe: []string{}, }, + Prometheus: PrometheusConfig{ + Enabled: false, + ListenAddress: ":9090", + Path: "/metrics", + MetricPrefix: "tyk_gateway", + EnableGoCollector: true, + EnableProcessCollector: true, + EnablePerAPIMetrics: false, + }, PIDFileLocation: "/var/run/tyk/tyk-gateway.pid", Security: SecurityConfig{ CertificateExpiryMonitor: CertificateExpiryMonitorConfig{ @@ -772,6 +781,24 @@ type StreamingConfig struct { AllowUnsafe []string `json:"allow_unsafe"` } +// PrometheusConfig holds configuration for Prometheus metrics exposure +type PrometheusConfig struct { + // Enabled activates Prometheus metrics endpoint + Enabled bool `json:"enabled"` + // ListenAddress is the address to expose metrics (e.g., ":9090") + ListenAddress string `json:"listen_address"` + // Path is the HTTP path for metrics endpoint (default: "/metrics") + Path string `json:"path"` + // MetricPrefix is the prefix for all Tyk metrics (default: "tyk_gateway") + MetricPrefix string `json:"metric_prefix"` + // EnableGoCollector enables Go runtime metrics + EnableGoCollector bool `json:"enable_go_collector"` + // EnableProcessCollector enables process metrics + EnableProcessCollector bool `json:"enable_process_collector"` + // EnablePerAPIMetrics enables per-API metrics with api_id label (can increase cardinality) + EnablePerAPIMetrics bool `json:"enable_per_api_metrics"` +} + // Config is the configuration object used by Tyk to set up various parameters. type Config struct { // Force your Gateway to work only on a specific domain name. Can be overridden by API custom domain. @@ -1186,6 +1213,9 @@ type Config struct { // StatsD prefix StatsdPrefix string `json:"statsd_prefix"` + // Prometheus metrics configuration + Prometheus PrometheusConfig `json:"prometheus"` + // Event System EventHandlers apidef.EventHandlerMetaConfig `json:"event_handlers"` EventTriggers map[apidef.TykEvent][]TykEventHandler `json:"event_trigers_defunct"` // Deprecated: Config.GetEventTriggers instead. 
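The `TYK_GW_PROMETHEUS_*` environment variables listed in PROMETHEUS_METRICS.md follow the usual envconfig naming: the `TYK_GW` prefix plus the upper-cased nested field names joined by underscores (hence `TYK_GW_PROMETHEUS_LISTENADDRESS`). A standalone sketch of that mapping, assuming an envconfig-based loader and using a trimmed copy of the new struct rather than Tyk's real config package:

```go
package main

import (
	"fmt"

	"github.com/kelseyhightower/envconfig"
)

// Trimmed copy of the new PrometheusConfig, for illustration only.
type PrometheusConfig struct {
	Enabled       bool   `json:"enabled"`
	ListenAddress string `json:"listen_address"`
	Path          string `json:"path"`
	MetricPrefix  string `json:"metric_prefix"`
}

type Config struct {
	Prometheus PrometheusConfig `json:"prometheus"`
}

func main() {
	// e.g. TYK_GW_PROMETHEUS_ENABLED=true TYK_GW_PROMETHEUS_LISTENADDRESS=":9090"
	var cfg Config
	if err := envconfig.Process("TYK_GW", &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg.Prometheus)
}
```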
diff --git a/gateway/handler_success.go b/gateway/handler_success.go index 5ad74aadfb5..96092c51d0e 100644 --- a/gateway/handler_success.go +++ b/gateway/handler_success.go @@ -182,8 +182,22 @@ func (s *SuccessHandler) addTraceIDTag(reqCtx context.Context, tags []string) [] } func (s *SuccessHandler) RecordHit(r *http.Request, timing analytics.Latency, code int, responseCopy *http.Response, cached bool) { + // Record Prometheus metrics (independent of analytics) + if s.Gw.PrometheusMetrics != nil { + s.Gw.PrometheusMetrics.RecordRequest( + s.Spec.APIID, + s.Spec.Name, + r.Method, + code, + timing.Total, + timing.Upstream, + ) + } else { + log.Debug("PrometheusMetrics is nil, skipping metrics recording") + } if s.Spec.DoNotTrack || ctxGetDoNotTrack(r) { + log.Debug("Skipping RecordHit: DoNotTrack enabled") return } diff --git a/gateway/instrumentation_handlers.go b/gateway/instrumentation_handlers.go index 2d96f5fcb58..79d65167ab9 100644 --- a/gateway/instrumentation_handlers.go +++ b/gateway/instrumentation_handlers.go @@ -8,6 +8,7 @@ import ( "time" "github.com/gocraft/health" + "github.com/sirupsen/logrus" "github.com/TykTechnologies/tyk/cli" "github.com/TykTechnologies/tyk/request" @@ -48,6 +49,73 @@ func (gw *Gateway) setupInstrumentation() { gw.MonitorApplicationInstrumentation() } +// setupPrometheusInstrumentation initializes Prometheus metrics collection and HTTP endpoint +func (gw *Gateway) setupPrometheusInstrumentation() { + gwConfig := gw.GetConfig() + + if !gwConfig.Prometheus.Enabled { + return + } + + log.WithFields(logrus.Fields{ + "per_api_metrics": gwConfig.Prometheus.EnablePerAPIMetrics, + }).Info("Initializing Prometheus metrics...") + + gw.PrometheusMetrics = NewPrometheusMetrics(gw, gwConfig.Prometheus.MetricPrefix, gwConfig.Prometheus.EnablePerAPIMetrics) + + // Register optional Go and process collectors + gw.PrometheusMetrics.RegisterGoCollectors( + gwConfig.Prometheus.EnableGoCollector, + gwConfig.Prometheus.EnableProcessCollector, + ) + + // Add Prometheus sink to instrument stream + prometheusSink := NewPrometheusSink(gw.PrometheusMetrics) + instrument.AddSink(prometheusSink) + + // Start metrics collection + gw.PrometheusMetrics.StartMetricsCollection(gw.ctx) + + // Start Prometheus HTTP server + gw.startPrometheusServer() + + log.WithFields(logrus.Fields{ + "listen_address": gwConfig.Prometheus.ListenAddress, + "path": gwConfig.Prometheus.Path, + "prefix": gwConfig.Prometheus.MetricPrefix, + }).Info("Prometheus metrics endpoint started") +} + +// startPrometheusServer starts the HTTP server for Prometheus metrics endpoint +func (gw *Gateway) startPrometheusServer() { + gwConfig := gw.GetConfig() + + mux := http.NewServeMux() + mux.Handle(gwConfig.Prometheus.Path, gw.PrometheusMetrics.Handler()) + + server := &http.Server{ + Addr: gwConfig.Prometheus.ListenAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + IdleTimeout: 120 * time.Second, + } + + gw.prometheusServerMu.Lock() + gw.prometheusServer = server + gw.prometheusServerMu.Unlock() + + go func() { + log.WithFields(logrus.Fields{ + "address": gwConfig.Prometheus.ListenAddress, + }).Info("Starting Prometheus metrics server...") + + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.WithError(err).Fatal("Prometheus metrics server failed to start") + } + }() +} + // InstrumentationMW will set basic instrumentation events, variables and timers on API jobs func InstrumentationMW(next http.Handler) http.Handler { return 
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/gateway/instrumentation_prometheus.go b/gateway/instrumentation_prometheus.go new file mode 100644 index 00000000000..81e9b8f3f50 --- /dev/null +++ b/gateway/instrumentation_prometheus.go @@ -0,0 +1,462 @@ +package gateway + +import ( + "context" + "fmt" + "net/http" + "runtime" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/sirupsen/logrus" + + "github.com/TykTechnologies/tyk/storage" +) + +// PrometheusMetrics holds all Prometheus metric collectors for the Gateway +type PrometheusMetrics struct { + // System Metrics + goroutines prometheus.Gauge + memoryAlloc prometheus.Gauge + memoryTotal prometheus.Gauge + cpuUsage prometheus.Gauge + connections prometheus.Gauge + + // Gateway Metrics + apisLoaded prometheus.Gauge + policiesLoaded prometheus.Gauge + requestsTotal *prometheus.CounterVec + requestDuration *prometheus.HistogramVec + requestErrors *prometheus.CounterVec + + // Overall Gateway RED metrics (no api_id dimension) + gatewayRequestsTotal *prometheus.CounterVec + gatewayRequestDuration *prometheus.HistogramVec + gatewayRequestUpstreamLatency *prometheus.HistogramVec + gatewayRequestGatewayLatency *prometheus.HistogramVec + + // Redis Metrics + redisPoolSize *prometheus.GaugeVec + redisActiveConns *prometheus.GaugeVec + redisLatency prometheus.Histogram + + // Request Processing Metrics + queueDepth prometheus.Gauge + throughput prometheus.Gauge + middlewareExecTime *prometheus.HistogramVec + + registry *prometheus.Registry + gw *Gateway + collectionDone chan struct{} + + // Configuration + enablePerAPIMetrics bool + + // CPU tracking + lastCPUTime time.Duration + lastCheckTime time.Time +} + +// NewPrometheusMetrics creates and registers all Prometheus metrics +func NewPrometheusMetrics(gw *Gateway, prefix string, enablePerAPIMetrics bool) *PrometheusMetrics { + registry := prometheus.NewRegistry() + + pm := &PrometheusMetrics{ + gw: gw, + registry: registry, + collectionDone: make(chan struct{}), + lastCheckTime: time.Now(), + enablePerAPIMetrics: enablePerAPIMetrics, + } + + // System Metrics + pm.goroutines = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "goroutines", + Help: "Number of active goroutines", + }) + + pm.memoryAlloc = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "memory_alloc_bytes", + Help: "Bytes of allocated heap objects", + }) + + pm.memoryTotal = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "memory_total_bytes", + Help: "Total bytes obtained from OS", + }) + + pm.cpuUsage = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "cpu_usage_percent", + Help: "CPU usage percentage", + }) + + pm.connections = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "system", + Name: "open_connections", + Help: "Number of open connections", + }) + + // Gateway Metrics + pm.apisLoaded = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "apis_loaded", + Help: "Number of APIs currently loaded", + }) + + pm.policiesLoaded = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "policies_loaded", + Help: "Number of policies currently loaded", + }) + + // 
Per-API metrics (optional, controlled by config) + if enablePerAPIMetrics { + pm.requestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "requests_total", + Help: "Total number of requests processed per API (RED: Rate)", + }, + []string{"api_id", "method", "status_class"}, + ) + + pm.requestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "request_duration_seconds", + Help: "Request duration in seconds per API (RED: Duration)", + Buckets: prometheus.DefBuckets, + }, + []string{"api_id", "method"}, + ) + + pm.requestErrors = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "request_errors_total", + Help: "Total number of request errors per API (RED: Errors)", + }, + []string{"api_id", "method", "status_class"}, + ) + } + + // Overall Gateway RED metrics (no api_id dimension) + pm.gatewayRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "http_requests_total", + Help: "Total number of HTTP requests processed by the gateway (RED: Rate)", + }, + []string{"method", "status_class"}, + ) + + pm.gatewayRequestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "http_request_duration_seconds", + Help: "Total HTTP request duration in seconds (RED: Duration)", + Buckets: prometheus.DefBuckets, + }, + []string{"method", "status_class"}, + ) + + pm.gatewayRequestUpstreamLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "http_request_upstream_latency_seconds", + Help: "Upstream response latency in seconds (time waiting for upstream service)", + Buckets: prometheus.DefBuckets, + }, + []string{"method", "status_class"}, + ) + + pm.gatewayRequestGatewayLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "gateway", + Name: "http_request_gateway_latency_seconds", + Help: "Gateway processing latency in seconds (time spent in gateway middleware)", + Buckets: prometheus.DefBuckets, + }, + []string{"method", "status_class"}, + ) + + // Redis Metrics + pm.redisPoolSize = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "redis", + Name: "pool_size", + Help: "Redis connection pool size", + }, + []string{"type"}, + ) + + pm.redisActiveConns = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "redis", + Name: "active_connections", + Help: "Number of active Redis connections", + }, + []string{"type"}, + ) + + pm.redisLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "redis", + Name: "operation_duration_seconds", + Help: "Redis operation latency in seconds", + Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1}, + }) + + // Request Processing Metrics + pm.queueDepth = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "processing", + Name: "queue_depth", + Help: "Current request queue depth", + }) + + pm.throughput = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: prefix, + Subsystem: "processing", + Name: "throughput_rps", + Help: "Current throughput in requests per second", + }) + + pm.middlewareExecTime = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: prefix, + Subsystem: "processing", + Name: 
"middleware_execution_seconds", + Help: "Middleware execution time in seconds", + Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1}, + }, + []string{"middleware_name", "api_id"}, + ) + + // Register metrics + collectors := []prometheus.Collector{ + pm.goroutines, + pm.memoryAlloc, + pm.memoryTotal, + pm.cpuUsage, + pm.connections, + pm.apisLoaded, + pm.policiesLoaded, + pm.gatewayRequestsTotal, + pm.gatewayRequestDuration, + pm.gatewayRequestUpstreamLatency, + pm.gatewayRequestGatewayLatency, + pm.redisPoolSize, + pm.redisActiveConns, + pm.redisLatency, + pm.queueDepth, + pm.throughput, + pm.middlewareExecTime, + } + + // Conditionally add per-API metrics + if enablePerAPIMetrics { + collectors = append(collectors, + pm.requestsTotal, + pm.requestDuration, + pm.requestErrors, + ) + } + + registry.MustRegister(collectors...) + + return pm +} + +// RegisterGoCollectors registers optional Go runtime and process collectors +func (pm *PrometheusMetrics) RegisterGoCollectors(enableGoCollector, enableProcessCollector bool) { + if enableGoCollector { + pm.registry.MustRegister(collectors.NewGoCollector()) + log.Debug("Registered Prometheus Go runtime collector") + } + + if enableProcessCollector { + pm.registry.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})) + log.Debug("Registered Prometheus process collector") + } +} + +// UpdateSystemMetrics updates system-level metrics +func (pm *PrometheusMetrics) UpdateSystemMetrics() { + var m runtime.MemStats + runtime.ReadMemStats(&m) + + pm.goroutines.Set(float64(runtime.NumGoroutine())) + pm.memoryAlloc.Set(float64(m.Alloc)) + pm.memoryTotal.Set(float64(m.TotalAlloc)) + + // Calculate CPU usage based on GC pause times + now := time.Now() + elapsed := now.Sub(pm.lastCheckTime).Seconds() + + if elapsed > 0 { + // Calculate CPU time from GC pauses + currentCPUTime := time.Duration(m.PauseTotalNs) + cpuDelta := currentCPUTime - pm.lastCPUTime + + // Calculate CPU usage as percentage (GC pause time / elapsed time * 100) + // Note: This is a simplified metric showing GC CPU impact + // For production, consider using more comprehensive CPU tracking + cpuUsagePercent := (cpuDelta.Seconds() / elapsed) * 100.0 / float64(runtime.NumCPU()) + + // Cap at 100% to handle edge cases + if cpuUsagePercent > 100.0 { + cpuUsagePercent = 100.0 + } + + pm.cpuUsage.Set(cpuUsagePercent) + pm.lastCPUTime = currentCPUTime + pm.lastCheckTime = now + } + + if pm.gw.ConnectionWatcher != nil { + pm.connections.Set(float64(pm.gw.ConnectionWatcher.Count())) + } +} + +// UpdateGatewayMetrics updates gateway-specific metrics +func (pm *PrometheusMetrics) UpdateGatewayMetrics() { + pm.gw.apisMu.RLock() + pm.apisLoaded.Set(float64(len(pm.gw.apisByID))) + pm.gw.apisMu.RUnlock() + + pm.gw.policiesMu.RLock() + pm.policiesLoaded.Set(float64(len(pm.gw.policiesByID))) + pm.gw.policiesMu.RUnlock() + + // Throughput from GlobalRate + pm.throughput.Set(float64(GlobalRate.Rate())) +} + +// UpdateRedisMetrics updates Redis connection metrics +func (pm *PrometheusMetrics) UpdateRedisMetrics() { + if pm.gw.StorageConnectionHandler != nil { + for _, connType := range []string{storage.DefaultConn, storage.CacheConn, storage.AnalyticsConn} { + stats := pm.gw.StorageConnectionHandler.GetRedisStats(connType) + if stats != nil { + labels := prometheus.Labels{"type": connType} + pm.redisPoolSize.With(labels).Set(float64(stats.TotalConns)) + // Active connections = Total - Idle + activeConns := stats.TotalConns - stats.IdleConns + 
pm.redisActiveConns.With(labels).Set(float64(activeConns)) + } + } + } +} + +// RecordRequest records request metrics (called from middleware/handler) +func (pm *PrometheusMetrics) RecordRequest(apiID, apiName, method string, statusCode int, totalNs, upstreamNs int64) { + // Group status codes to reduce cardinality (2xx, 3xx, 4xx, 5xx) + statusClass := fmt.Sprintf("%dxx", statusCode/100) + + // Convert nanoseconds to seconds for Prometheus + totalSeconds := float64(totalNs) / 1e9 + upstreamSeconds := float64(upstreamNs) / 1e9 + gatewaySeconds := float64(totalNs-upstreamNs) / 1e9 + + // Per-API metrics (only if enabled) + if pm.enablePerAPIMetrics { + labels := prometheus.Labels{ + "api_id": apiID, + "method": method, + "status_class": statusClass, + } + pm.requestsTotal.With(labels).Inc() + + durLabels := prometheus.Labels{ + "api_id": apiID, + "method": method, + } + pm.requestDuration.With(durLabels).Observe(totalSeconds) + + // Record errors with status_class instead of error_type + if statusCode >= 400 { + errLabels := prometheus.Labels{ + "api_id": apiID, + "method": method, + "status_class": statusClass, + } + pm.requestErrors.With(errLabels).Inc() + } + } + + // Overall Gateway RED metrics (always recorded) + gatewayLabels := prometheus.Labels{ + "method": method, + "status_class": statusClass, + } + pm.gatewayRequestsTotal.With(gatewayLabels).Inc() + pm.gatewayRequestDuration.With(gatewayLabels).Observe(totalSeconds) + pm.gatewayRequestUpstreamLatency.With(gatewayLabels).Observe(upstreamSeconds) + pm.gatewayRequestGatewayLatency.With(gatewayLabels).Observe(gatewaySeconds) +} + +// RecordMiddlewareExecution records middleware execution time +func (pm *PrometheusMetrics) RecordMiddlewareExecution(middlewareName, apiID string, duration float64) { + pm.middlewareExecTime.With(prometheus.Labels{ + "middleware_name": middlewareName, + "api_id": apiID, + }).Observe(duration) +} + +// Handler returns the HTTP handler for metrics endpoint +func (pm *PrometheusMetrics) Handler() http.Handler { + return promhttp.HandlerFor(pm.registry, promhttp.HandlerOpts{}) +} + +// StartMetricsCollection starts background metrics collection +func (pm *PrometheusMetrics) StartMetricsCollection(ctx context.Context) { + ticker := time.NewTicker(5 * time.Second) + + go func() { + defer ticker.Stop() + defer close(pm.collectionDone) + for { + select { + case <-ctx.Done(): + log.WithFields(logrus.Fields{ + "component": "prometheus", + }).Info("Stopping Prometheus metrics collection") + return + case <-ticker.C: + pm.UpdateSystemMetrics() + pm.UpdateGatewayMetrics() + pm.UpdateRedisMetrics() + } + } + }() +} + +// WaitForShutdown waits for the metrics collection goroutine to finish +func (pm *PrometheusMetrics) WaitForShutdown() { + if pm.collectionDone != nil { + <-pm.collectionDone + } +} diff --git a/gateway/instrumentation_prometheus_sink.go b/gateway/instrumentation_prometheus_sink.go new file mode 100644 index 00000000000..0c8532d98b6 --- /dev/null +++ b/gateway/instrumentation_prometheus_sink.go @@ -0,0 +1,56 @@ +package gateway + +import ( + "github.com/gocraft/health" +) + +// PrometheusSink implements health.Sink interface for Prometheus +type PrometheusSink struct { + metrics *PrometheusMetrics +} + +// NewPrometheusSink creates a new Prometheus sink +func NewPrometheusSink(metrics *PrometheusMetrics) *PrometheusSink { + return &PrometheusSink{ + metrics: metrics, + } +} + +// EmitEvent converts events to Prometheus metrics +func (s *PrometheusSink) EmitEvent(job, event string, kvs map[string]string) { 
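+	// NOTE: of the five health.Sink callbacks on this type, only EmitTiming currently
+	// feeds a Prometheus series (middleware execution time); EmitEvent and the other
+	// callbacks are deliberate no-ops so HTTP traffic is measured once, via RecordRequest.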
+ // Events are tracked through specific metric collectors + // Can be extended if specific event tracking is needed +} + +// EmitEventErr tracks errors in Prometheus +func (s *PrometheusSink) EmitEventErr(job, event string, err error, kvs map[string]string) { + // Error tracking is handled by RecordRequest for HTTP errors + // Can be extended for non-HTTP errors if needed +} + +// EmitTiming converts timing events to histograms +func (s *PrometheusSink) EmitTiming(job, event string, nanos int64, kvs map[string]string) { + seconds := float64(nanos) / 1e9 + + // Track middleware execution timing + if job == "MiddlewareCall" { + if mwName, ok := kvs["mw_name"]; ok { + apiID := kvs["api_id"] + s.metrics.RecordMiddlewareExecution(mwName, apiID, seconds) + } + } + + // Can be extended for other timing events +} + +// EmitGauge handles gauge metrics +func (s *PrometheusSink) EmitGauge(job, event string, value float64, kvs map[string]string) { + // Gauge metrics are updated periodically by UpdateSystemMetrics/UpdateGatewayMetrics + // Can be extended if specific gauge tracking is needed beyond the existing collectors +} + +// EmitComplete tracks completion status +func (s *PrometheusSink) EmitComplete(job string, status health.CompletionStatus, nanos int64, kvs map[string]string) { + // Completion status is tracked through RecordRequest + // Can be extended if additional completion tracking is needed +} diff --git a/gateway/server.go b/gateway/server.go index fd7551f3c4a..00821d566ec 100644 --- a/gateway/server.go +++ b/gateway/server.go @@ -132,6 +132,9 @@ type Gateway struct { HostCheckerClient *http.Client TracerProvider otel.TracerProvider NewRelicApplication *newrelic.Application + PrometheusMetrics *PrometheusMetrics + prometheusServer *http.Server + prometheusServerMu sync.RWMutex keyGen DefaultKeyGenerator @@ -1502,6 +1505,7 @@ func (gw *Gateway) initSystem() error { config.Global = gw.GetConfig gw.getHostDetails() gw.setupInstrumentation() + gw.setupPrometheusInstrumentation() // cleanIdleMemConnProviders checks memconn.Provider (a part of internal API handling) // instances periodically and deletes idle items, closes net.Listener instances to @@ -2241,6 +2245,29 @@ func (gw *Gateway) gracefulShutdown(ctx context.Context) error { <-serverShutdownDone } + // Shutdown Prometheus metrics collection and server if enabled + if gw.PrometheusMetrics != nil { + mainLog.Info("Waiting for Prometheus metrics collection to stop...") + gw.PrometheusMetrics.WaitForShutdown() + mainLog.Info("Prometheus metrics collection stopped") + } + + gw.prometheusServerMu.RLock() + server := gw.prometheusServer + gw.prometheusServerMu.RUnlock() + + if server != nil { + mainLog.Info("Shutting down Prometheus metrics server...") + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := server.Shutdown(shutdownCtx); err != nil { + mainLog.WithError(err).Error("Error shutting down Prometheus server") + errChan <- err + } else { + mainLog.Info("Prometheus metrics server shut down gracefully") + } + } + // Close all cache stores and other resources mainLog.Info("Closing cache stores and other resources...") gw.cacheClose() diff --git a/generate-traffic.sh b/generate-traffic.sh new file mode 100755 index 00000000000..d4bededa40c --- /dev/null +++ b/generate-traffic.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# Traffic Generator for Tyk Quickstart Endpoint +# Usage: ./generate-traffic.sh [OPTIONS] + +set -e + +# Default configuration 
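+# Each default below can be overridden via the matching environment variable or
+# command-line flag (see usage below). Note that DELAY is only honoured in
+# sequential mode (CONCURRENT=1); with multiple workers, pacing is governed by
+# the concurrency limit instead.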
+ENDPOINT="${ENDPOINT:-http://localhost:9009/quickstart/}" +REQUESTS="${REQUESTS:-100}" +CONCURRENT="${CONCURRENT:-1}" +DELAY="${DELAY:-0.1}" +VERBOSE="${VERBOSE:-false}" + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Display usage +usage() { + cat << EOF +Traffic Generator for Tyk Gateway + +Usage: $0 [OPTIONS] + +Options: + -e, --endpoint URL Target endpoint (default: http://localhost:9009/quickstart/) + -n, --requests NUM Number of requests to make (default: 100) + -c, --concurrent NUM Number of concurrent requests (default: 1) + -d, --delay SECONDS Delay between requests in seconds (default: 0.1) + -v, --verbose Enable verbose output + -h, --help Show this help message + +Environment Variables: + ENDPOINT Same as --endpoint + REQUESTS Same as --requests + CONCURRENT Same as --concurrent + DELAY Same as --delay + VERBOSE Set to 'true' for verbose output + +Examples: + # Generate 100 requests with default settings + $0 + + # Generate 1000 requests with 10 concurrent connections + $0 -n 1000 -c 10 + + # Custom endpoint with verbose output + $0 -e http://localhost:8080/api/test -v + + # High traffic simulation (1000 requests, 50 concurrent, no delay) + $0 -n 1000 -c 50 -d 0 + +EOF + exit 0 +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -e|--endpoint) + ENDPOINT="$2" + shift 2 + ;; + -n|--requests) + REQUESTS="$2" + shift 2 + ;; + -c|--concurrent) + CONCURRENT="$2" + shift 2 + ;; + -d|--delay) + DELAY="$2" + shift 2 + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + -h|--help) + usage + ;; + *) + echo -e "${RED}Error: Unknown option $1${NC}" + usage + ;; + esac +done + +# Validate inputs +if ! [[ "$REQUESTS" =~ ^[0-9]+$ ]] || [ "$REQUESTS" -lt 1 ]; then + echo -e "${RED}Error: REQUESTS must be a positive integer${NC}" + exit 1 +fi + +if ! 
[[ "$CONCURRENT" =~ ^[0-9]+$ ]] || [ "$CONCURRENT" -lt 1 ]; then + echo -e "${RED}Error: CONCURRENT must be a positive integer${NC}" + exit 1 +fi + +# Statistics +SUCCESS_COUNT=0 +FAIL_COUNT=0 +TOTAL_TIME=0 +START_TIME=$(date +%s) + +# Temporary file for parallel processing +TEMP_DIR=$(mktemp -d) +trap "rm -rf $TEMP_DIR" EXIT + +# Function to make a single request +make_request() { + local request_id=$1 + local start=$(date +%s.%N) + + if [ "$VERBOSE" = true ]; then + response=$(curl -s -w "\n%{http_code}" -X GET "$ENDPOINT" 2>&1) + status_code=$(echo "$response" | tail -n1) + body=$(echo "$response" | sed '$d') + + end=$(date +%s.%N) + duration=$(echo "$end - $start" | bc) + + echo -e "${BLUE}[Request $request_id]${NC} Status: $status_code | Duration: ${duration}s" + if [ ${#body} -gt 0 ] && [ ${#body} -lt 200 ]; then + echo -e "${YELLOW}Response: $body${NC}" + fi + else + status_code=$(curl -s -o /dev/null -w "%{http_code}" -X GET "$ENDPOINT" 2>&1) + end=$(date +%s.%N) + duration=$(echo "$end - $start" | bc) + fi + + # Write result to temp file + echo "$status_code|$duration" > "$TEMP_DIR/$request_id" +} + +# Print configuration +echo -e "${GREEN}=== Tyk Traffic Generator ===${NC}" +echo -e "Endpoint: ${BLUE}$ENDPOINT${NC}" +echo -e "Requests: ${BLUE}$REQUESTS${NC}" +echo -e "Concurrent: ${BLUE}$CONCURRENT${NC}" +echo -e "Delay: ${BLUE}${DELAY}s${NC}" +echo -e "Verbose: ${BLUE}$VERBOSE${NC}" +echo "" + +# Generate traffic +echo -e "${YELLOW}Generating traffic...${NC}" +echo "" + +for ((i=1; i<=REQUESTS; i++)); do + # Launch request in background if concurrent + if [ "$CONCURRENT" -gt 1 ]; then + # Wait if we've reached concurrent limit + while [ $(jobs -r | wc -l) -ge "$CONCURRENT" ]; do + sleep 0.01 + done + make_request $i & + else + make_request $i + fi + + # Progress indicator (every 10%) + if [ $((i % (REQUESTS / 10 + 1))) -eq 0 ] && [ "$VERBOSE" = false ]; then + progress=$((i * 100 / REQUESTS)) + echo -ne "\rProgress: ${progress}% ($i/$REQUESTS)" + fi + + # Delay between requests + if [ "$DELAY" != "0" ] && [ "$CONCURRENT" -eq 1 ]; then + sleep "$DELAY" + fi +done + +# Wait for all background jobs to complete +wait + +if [ "$VERBOSE" = false ]; then + echo -ne "\rProgress: 100% ($REQUESTS/$REQUESTS)\n" +fi + +echo "" +echo -e "${YELLOW}Processing results...${NC}" + +# Collect statistics from temp files +TOTAL_DURATION=0 +MIN_DURATION=999999 +MAX_DURATION=0 + +for file in "$TEMP_DIR"/*; do + if [ -f "$file" ]; then + IFS='|' read -r status duration < "$file" + + if [ "$status" = "200" ]; then + ((SUCCESS_COUNT++)) + else + ((FAIL_COUNT++)) + fi + + TOTAL_DURATION=$(echo "$TOTAL_DURATION + $duration" | bc) + + if [ $(echo "$duration < $MIN_DURATION" | bc) -eq 1 ]; then + MIN_DURATION=$duration + fi + + if [ $(echo "$duration > $MAX_DURATION" | bc) -eq 1 ]; then + MAX_DURATION=$duration + fi + fi +done + +END_TIME=$(date +%s) +ELAPSED=$((END_TIME - START_TIME)) + +# Calculate average +if [ "$SUCCESS_COUNT" -gt 0 ]; then + AVG_DURATION=$(echo "scale=4; $TOTAL_DURATION / $SUCCESS_COUNT" | bc) +else + AVG_DURATION=0 +fi + +# Calculate requests per second +if [ "$ELAPSED" -gt 0 ]; then + RPS=$(echo "scale=2; $REQUESTS / $ELAPSED" | bc) +else + RPS=0 +fi + +# Display results +echo "" +echo -e "${GREEN}=== Results ===${NC}" +echo -e "Total Requests: ${BLUE}$REQUESTS${NC}" +echo -e "Successful: ${GREEN}$SUCCESS_COUNT${NC}" +echo -e "Failed: ${RED}$FAIL_COUNT${NC}" +echo -e "Success Rate: ${BLUE}$(echo "scale=2; $SUCCESS_COUNT * 100 / $REQUESTS" | bc)%${NC}" +echo "" +echo -e "${GREEN}=== 
Performance ===${NC}" +echo -e "Total Time: ${BLUE}${ELAPSED}s${NC}" +echo -e "Requests/sec: ${BLUE}$RPS${NC}" +if [ "$SUCCESS_COUNT" -gt 0 ]; then + echo -e "Avg Duration: ${BLUE}${AVG_DURATION}s${NC}" + echo -e "Min Duration: ${BLUE}${MIN_DURATION}s${NC}" + echo -e "Max Duration: ${BLUE}${MAX_DURATION}s${NC}" +fi +echo "" + +# Exit with error if any requests failed +if [ "$FAIL_COUNT" -gt 0 ]; then + echo -e "${RED}Warning: Some requests failed!${NC}" + exit 1 +fi + +echo -e "${GREEN}Traffic generation completed successfully!${NC}" diff --git a/internal/redis/redis.go b/internal/redis/redis.go index 2a3feb0afd5..37f3e1ac292 100644 --- a/internal/redis/redis.go +++ b/internal/redis/redis.go @@ -38,4 +38,6 @@ type ( IntCmd = redis.IntCmd StringCmd = redis.StringCmd StringSliceCmd = redis.StringSliceCmd + + PoolStats = redis.PoolStats ) diff --git a/storage/connection_handler.go b/storage/connection_handler.go index de22cd7d05b..ca11addd1f4 100644 --- a/storage/connection_handler.go +++ b/storage/connection_handler.go @@ -294,3 +294,35 @@ func getExponentialBackoff() *backoff.ExponentialBackOff { return exponentialBackoff } + +// GetRedisStats returns Redis connection pool statistics for the specified connection type +func (rc *ConnectionHandler) GetRedisStats(connType string) *RedisPoolStats { + rc.connectionsMu.RLock() + defer rc.connectionsMu.RUnlock() + + conn, ok := rc.connections[connType] + if !ok || conn == nil { + return nil + } + + // Try to get the connector as a RedisCluster + var redisCluster *RedisCluster + if ok := conn.As(&redisCluster); ok && redisCluster != nil { + if stats := redisCluster.PoolStats(); stats != nil { + return &RedisPoolStats{ + TotalConns: stats.TotalConns, + IdleConns: stats.IdleConns, + StaleConns: stats.StaleConns, + } + } + } + + return nil +} + +// RedisPoolStats holds simplified Redis pool statistics for Prometheus metrics +type RedisPoolStats struct { + TotalConns uint32 + IdleConns uint32 + StaleConns uint32 +} diff --git a/storage/redis_cluster.go b/storage/redis_cluster.go index eaaf3f22a4c..5fcd75c42d4 100644 --- a/storage/redis_cluster.go +++ b/storage/redis_cluster.go @@ -1086,3 +1086,15 @@ func (r *RedisCluster) ScanKeys(pattern string) ([]string, error) { return storage.Keys(context.Background(), pattern) } + +// PoolStats returns the connection pool statistics for this Redis cluster +func (r *RedisCluster) PoolStats() *redis.PoolStats { + client, err := r.Client() + if err != nil { + log.WithError(err).Debug("Failed to get Redis client for pool stats") + return nil + } + + stats := client.PoolStats() + return stats +} diff --git a/tyk.conf.example b/tyk.conf.example index 2ee6c6647f7..2df9dc236ff 100644 --- a/tyk.conf.example +++ b/tyk.conf.example @@ -21,6 +21,15 @@ "type": "", "ignored_ips": [] }, + "prometheus": { + "enabled": false, + "listen_address": ":9090", + "path": "/metrics", + "metric_prefix": "tyk_gateway", + "enable_go_collector": true, + "enable_process_collector": true, + "enable_per_api_metrics": false + }, "dns_cache": { "enabled": false, "ttl": 3600,
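
The `gateway/instrumentation_handlers.go` hunk that defines `setupPrometheusInstrumentation()` (called from `initSystem()` above) is not shown here, so the following is only a minimal sketch of how the pieces added by this patch could be wired together. The `PrometheusConfig` field names, the `NewPrometheusMetrics` signature, the `instrument` health-stream variable and `gw.ctx` are assumptions; `Handler`, `StartMetricsCollection`, `RegisterGoCollectors`, `NewPrometheusSink` and the `prometheusServer`/`prometheusServerMu` fields all appear in the diff itself.

```go
// Illustrative sketch only, not the actual hunk from instrumentation_handlers.go.
// Assumes net/http and promhttp are already imported by package gateway.
func (gw *Gateway) setupPrometheusInstrumentation() {
	conf := gw.GetConfig().Prometheus // assumed field name on config.Config
	if !conf.Enabled {
		return
	}

	pm := NewPrometheusMetrics(gw, conf.MetricPrefix, conf.EnablePerAPIMetrics) // assumed signature
	pm.RegisterGoCollectors(conf.EnableGoCollector, conf.EnableProcessCollector)
	gw.PrometheusMetrics = pm

	// Bridge gocraft/health events (e.g. MiddlewareCall timings) into Prometheus.
	instrument.AddSink(NewPrometheusSink(pm)) // assumes instrument is the shared health stream

	// Periodically refresh system, gateway and Redis gauges until the context is cancelled.
	pm.StartMetricsCollection(gw.ctx) // assumed context source

	mux := http.NewServeMux()
	mux.Handle(conf.Path, pm.Handler())

	srv := &http.Server{Addr: conf.ListenAddress, Handler: mux}
	gw.prometheusServerMu.Lock()
	gw.prometheusServer = srv
	gw.prometheusServerMu.Unlock()

	// Serve /metrics on its own listener; gracefulShutdown stops it via srv.Shutdown.
	go func() {
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			mainLog.WithError(err).Error("Prometheus metrics server exited")
		}
	}()
}
```

With the default `metric_prefix` of `tyk_gateway`, the gateway-level RED series defined in this patch are exported as `tyk_gateway_gateway_http_requests_total`, `tyk_gateway_gateway_http_request_duration_seconds`, `tyk_gateway_gateway_http_request_upstream_latency_seconds` and `tyk_gateway_gateway_http_request_gateway_latency_seconds` (client_golang joins namespace, subsystem and name with underscores).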