diff --git a/.github/workflows/ci-main.yml b/.github/workflows/ci-main.yml index 86ef14f47..c4b32703c 100644 --- a/.github/workflows/ci-main.yml +++ b/.github/workflows/ci-main.yml @@ -137,6 +137,7 @@ jobs: context: proxy - component: mimir context: services/mimir + image_description: Mimir alertmanager backend for My Nethesis steps: - uses: actions/checkout@v4 @@ -170,7 +171,9 @@ jobs: platforms: linux/amd64,linux/arm64 push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + labels: | + ${{ steps.meta.outputs.labels }} + ${{ matrix.image_description != '' && format('org.opencontainers.image.description={0}', matrix.image_description) || '' }} cache-from: type=gha,scope=${{ matrix.component }} cache-to: type=gha,mode=max,scope=${{ matrix.component }} build-args: | diff --git a/backend/openapi.yaml b/backend/openapi.yaml index 25b413e79..4b6f794c6 100644 --- a/backend/openapi.yaml +++ b/backend/openapi.yaml @@ -62,7 +62,8 @@ tags: description: Collect service system management and inventory collection - name: Collect - Rebranding description: Collect service rebranding endpoints for systems - + - name: Collect - Metrics + description: Collect service metrics proxy to Mimir (Prometheus remote_write and query) security: - BearerAuth: [] @@ -8231,3 +8232,127 @@ paths: format: binary '404': $ref: '#/components/responses/NotFound' + + # =========================================== + # METRICS ENDPOINTS (Collect - Mimir Proxy) + # =========================================== + + /api/services/mimir/{path}: + parameters: + - name: path + in: path + required: true + schema: + type: string + description: Wildcard path forwarded to Mimir (e.g. 
`api/v1/push`, `prometheus/api/v1/query`) + get: + operationId: mimirProxyGet + tags: + - Collect - Metrics + summary: Proxy GET request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. Typical use: Grafana PromQL queries via + `GET /api/services/mimir/prometheus/api/v1/query`. + security: + - BasicAuth: [] + responses: + '200': + description: Proxied response from Mimir + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + post: + operationId: mimirProxyPost + tags: + - Collect - Metrics + summary: Proxy POST request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. Primary use case: Prometheus `remote_write` ingestion + via `POST /api/services/mimir/api/v1/push` from NethServer systems. + security: + - BasicAuth: [] + requestBody: + description: Request body forwarded as-is to Mimir (e.g. 
Prometheus remote_write protobuf payload) + required: false + content: + application/x-protobuf: + schema: + type: string + format: binary + application/json: + schema: + type: object + responses: + '200': + description: Proxied response from Mimir + '204': + description: No content — Mimir acknowledged the write with no response body + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + put: + operationId: mimirProxyPut + tags: + - Collect - Metrics + summary: Proxy PUT request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. + security: + - BasicAuth: [] + requestBody: + description: Request body forwarded as-is to Mimir + required: false + content: + application/json: + schema: + type: object + responses: + '200': + description: Proxied response from Mimir + '204': + description: No content + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + delete: + operationId: mimirProxyDelete + tags: + - Collect - Metrics + summary: Proxy DELETE request to Mimir + description: | + Authenticates the system using HTTP Basic Auth (`system_key` as username, + `system_secret` as password), injects the `X-Scope-OrgID` header with the + system's `organization_id` for multi-tenant isolation, and reverse-proxies + the request to Mimir. 
+ security: + - BasicAuth: [] + responses: + '200': + description: Proxied response from Mimir + '204': + description: No content + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' + diff --git a/collect/.env.example b/collect/.env.example index 789f16aec..f7950befa 100644 --- a/collect/.env.example +++ b/collect/.env.example @@ -86,6 +86,9 @@ REDIS_URL=redis://localhost:6379 #CIRCUIT_BREAKER_THRESHOLD=10 #CIRCUIT_BREAKER_TIMEOUT=60s +# Mimir metrics storage +#MIMIR_URL=http://localhost:9009 + # Logging configuration #LOG_LEVEL=info #LOG_FORMAT=json diff --git a/collect/configuration/configuration.go b/collect/configuration/configuration.go index 537d50126..b8d172b6c 100644 --- a/collect/configuration/configuration.go +++ b/collect/configuration/configuration.go @@ -80,6 +80,9 @@ type Configuration struct { // Heartbeat monitoring configuration HeartbeatTimeoutMinutes int `json:"heartbeat_timeout_minutes"` + + // Mimir configuration + MimirURL string `json:"mimir_url"` } var Config = Configuration{} @@ -161,6 +164,13 @@ func Init() { // Heartbeat monitoring configuration Config.HeartbeatTimeoutMinutes = parseIntWithDefault("HEARTBEAT_TIMEOUT_MINUTES", 10) + // Mimir configuration + if mimirURL := os.Getenv("MIMIR_URL"); mimirURL != "" { + Config.MimirURL = mimirURL + } else { + Config.MimirURL = "http://localhost:9009" + } + // Log successful configuration load logger.LogConfigLoad("env", "configuration", true, nil) } diff --git a/collect/main.go b/collect/main.go index a015f2e20..26a05366c 100644 --- a/collect/main.go +++ b/collect/main.go @@ -105,8 +105,11 @@ func main() { // Add security monitoring middleware router.Use(logger.SecurityMiddleware()) - // Add compression - router.Use(gzip.Gzip(gzip.DefaultCompression)) + // Add compression (excluding Mimir proxy endpoints to avoid double-compression) + router.Use(gzip.Gzip( + 
gzip.DefaultCompression, + gzip.WithExcludedPathsRegexs([]string{"^/api/services/mimir"}), + )) // CORS configuration in debug mode if gin.Mode() == gin.DebugMode { @@ -158,6 +161,15 @@ func main() { systemsGroup.GET("/rebranding/:product_id/:asset", methods.GetSystemRebrandingAsset) } + // =========================================== + // EXTERNAL SERVICES PROXY + // =========================================== + servicesGroup := api.Group("/services", middleware.BasicAuthMiddleware()) + { + mimirProxy := servicesGroup.Group("/mimir") + mimirProxy.Any("/*path", methods.ProxyMimir) + } + // Handle missing endpoints router.NoRoute(func(c *gin.Context) { c.JSON(http.StatusNotFound, response.NotFound("api not found", nil)) diff --git a/collect/methods/mimir.go b/collect/methods/mimir.go new file mode 100644 index 000000000..7586c674c --- /dev/null +++ b/collect/methods/mimir.go @@ -0,0 +1,108 @@ +/* +Copyright (C) 2026 Nethesis S.r.l. +SPDX-License-Identifier: AGPL-3.0-or-later +*/ + +package methods + +import ( + "bytes" + "database/sql" + "fmt" + "io" + "net/http" + + "github.com/gin-gonic/gin" + + "github.com/nethesis/my/collect/configuration" + "github.com/nethesis/my/collect/database" + "github.com/nethesis/my/collect/logger" + "github.com/nethesis/my/collect/response" +) + +// ProxyMimir handles ANY /api/services/mimir/* — the BasicAuthMiddleware has +// already validated system credentials and placed system_id in the context. +// This handler resolves the organization_id, sets X-Scope-OrgID, and +// reverse-proxies the request to Mimir with HA support across multiple instances. 
+func ProxyMimir(c *gin.Context) { + // Step 1: Get system_id from context (set by BasicAuthMiddleware) + systemID, ok := getAuthenticatedSystemID(c) + if !ok { + logger.Warn().Str("reason", "missing system_id in context").Msg("mimir proxy auth failed") + c.JSON(http.StatusUnauthorized, response.Unauthorized("unauthorized", nil)) + return + } + + // Step 2: Query organization_id for this system + var organizationID string + err := database.DB.QueryRow( + `SELECT organization_id FROM systems WHERE id = $1`, + systemID, + ).Scan(&organizationID) + + if err == sql.ErrNoRows { + logger.Warn().Str("system_id", systemID).Str("reason", "system not found").Msg("mimir proxy: system lookup failed") + c.JSON(http.StatusUnauthorized, response.Unauthorized("unauthorized", nil)) + return + } + if err != nil { + logger.Error().Err(err).Str("system_id", systemID).Msg("mimir proxy: db query failed") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + // Step 3: Buffer request body once so it can be replayed across retry attempts + bodyBytes, err := io.ReadAll(c.Request.Body) + if err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to read request body") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + subPath := c.Param("path") + rawQuery := c.Request.URL.RawQuery + + // Step 4: Forward request to Mimir + targetURL := fmt.Sprintf("%s%s", configuration.Config.MimirURL, subPath) + if rawQuery != "" { + targetURL += "?" 
+ rawQuery + } + + logger.Info().Str("target", targetURL).Msg("mimir proxy: forwarding request") + + req, err := http.NewRequest(c.Request.Method, targetURL, bytes.NewReader(bodyBytes)) + if err != nil { + logger.Error().Err(err).Str("target", targetURL).Msg("mimir proxy: failed to create upstream request") + c.JSON(http.StatusInternalServerError, response.InternalServerError("internal server error", nil)) + return + } + + for _, header := range []string{"Content-Type", "Content-Encoding", "Accept", "User-Agent"} { + if val := c.GetHeader(header); val != "" { + req.Header.Set(header, val) + } + } + // Remove Accept-Encoding so Mimir sends plain JSON, not gzip + req.Header.Del("Accept-Encoding") + req.Header.Set("X-Scope-OrgID", organizationID) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + logger.Error().Err(err).Str("target", targetURL).Msg("mimir proxy: network error") + c.JSON(http.StatusBadGateway, response.InternalServerError("mimir is unavailable", nil)) + return + } + defer func() { + if err := resp.Body.Close(); err != nil { + logger.Error().Err(err).Msg("mimir proxy: failed to close upstream response body") + } + }() + + if ct := resp.Header.Get("Content-Type"); ct != "" { + c.Header("Content-Type", ct) + } + c.Status(resp.StatusCode) + if _, err := io.Copy(c.Writer, resp.Body); err != nil { + logger.Error().Err(err).Msg("mimir proxy: error streaming response body") + } +} diff --git a/docs/en/08-alerting.md b/docs/en/08-alerting.md new file mode 100644 index 000000000..76b0ba696 --- /dev/null +++ b/docs/en/08-alerting.md @@ -0,0 +1,93 @@ +# Alerting + +Learn how My platform manages alert rules and sends notifications per organization using Grafana Mimir's multi-tenant Alertmanager. + +## Overview + +My platform uses [Grafana Mimir](https://grafana.com/oss/mimir/)'s built-in multi-tenant Alertmanager to manage alert rules and route notifications. 
Each organization has its own isolated Alertmanager configuration — alert rules and notification receivers (e.g. email, PagerDuty, webhook) are fully scoped to the organization that owns them. + +## How It Works + +### Multi-Tenancy + +Each system belongs to an organization. The collect service resolves the system's `organization_id` from its credentials and injects it as the `X-Scope-OrgID` header before forwarding to Mimir. This ensures alert rules and notifications are fully isolated between organizations — each organization only manages and receives its own alerts. + +## Authentication + +Alertmanager API calls use the same credentials as system registration and inventory: + +| Field | Value | +|-------|-------| +| **Username** | `system_key` (e.g. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (e.g. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Method** | HTTP Basic Auth | + +No separate registration is needed — any system that has completed registration can immediately interact with the Alertmanager API. See [System Registration](05-system-registration.md) for how to obtain credentials. + +## Alertmanager API + +The collect service proxies Alertmanager API calls and automatically injects the `X-Scope-OrgID` header based on the authenticated system's organization. The base path is: + +``` +https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/ +``` + +This maps directly to the [Alertmanager v2 API](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml). All standard endpoints are available. 
+ +### Example: List Active Alerts + +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" +``` + +### Example: Get Alertmanager Status + +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/status \ + -u "<system_key>:<system_secret>" +``` + +### Example: Create or Update a Silence + +```bash +curl -X POST https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -u "<system_key>:<system_secret>" \ + -H "Content-Type: application/json" \ + -d '{ + "matchers": [{"name": "alertname", "value": "WatchdogDown", "isRegex": false}], + "startsAt": "2024-01-01T00:00:00Z", + "endsAt": "2024-01-02T00:00:00Z", + "createdBy": "admin", + "comment": "Planned maintenance" + }' +``` + +!!! tip + Replace `<system_key>` and `<system_secret>` with the actual credentials stored on the system. The `X-Scope-OrgID` header is injected automatically by the collect service — do not set it manually. + +## Troubleshooting + +### HTTP 401 Unauthorized + +**Cause:** Incorrect `system_key` or `system_secret`. + +**Solutions:** +1. Verify credentials match what is stored on the system +2. Ensure the system has completed registration (see [System Registration](05-system-registration.md)) +3. Check for leading/trailing spaces in the credentials + +### HTTP 500 Internal Server Error / 502 Bad Gateway + +**Cause:** `502 Bad Gateway` is returned when the Mimir Alertmanager backend is unreachable or misconfigured; `500` indicates an internal error in the collect service itself. + +**Solutions:** +1. This is a platform-side issue — contact your administrator +2. Check platform status page or monitoring alerts +3. 
Retry after a few minutes; Mimir may be restarting + +## Related Documentation + +- [System Registration](05-system-registration.md) +- [Inventory and Heartbeat](06-inventory-heartbeat.md) +- [Systems Management](04-systems.md) diff --git a/docs/it/08-alerting.md b/docs/it/08-alerting.md new file mode 100644 index 000000000..8bd177cda --- /dev/null +++ b/docs/it/08-alerting.md @@ -0,0 +1,107 @@ +# Alerting + +Scopri come la piattaforma My gestisce le regole di alerting e le notifiche per organizzazione tramite Grafana Mimir Alertmanager. + +## Panoramica + +La piattaforma My utilizza l'Alertmanager multi-tenant di [Grafana Mimir](https://grafana.com/oss/mimir/) per gestire regole di alert e inviare notifiche. Ogni organizzazione dispone di un proprio insieme isolato di regole di alerting e configurazioni di notifica: nessuna organizzazione può vedere o modificare le regole delle altre. + +## Come Funziona + +### Multi-Tenancy + +Ogni sistema appartiene a un'organizzazione. Il servizio collect risolve l'`organization_id` del sistema dalle sue credenziali e lo inietta come header `X-Scope-OrgID` prima di inoltrare la richiesta a Mimir. Questo garantisce che le regole di alert e le notifiche siano completamente isolate tra le organizzazioni — ogni organizzazione gestisce e riceve solo i propri alert. + +## Autenticazione + +L'accesso all'API Alertmanager usa le stesse credenziali della registrazione del sistema e dell'inventario: + +| Campo | Valore | +|-------|--------| +| **Username** | `system_key` (es. `NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE`) | +| **Password** | `system_secret` (es. `my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0`) | +| **Metodo** | HTTP Basic Auth | + +Non è necessaria una registrazione separata — qualsiasi sistema che ha completato la registrazione può interagire con l'API Alertmanager immediatamente. Consulta [Registrazione Sistema](05-system-registration.md) per come ottenere le credenziali. 
+ +## API Alertmanager + +L'Alertmanager è esposto tramite il proxy della piattaforma al percorso: + +``` +/api/services/mimir/alertmanager/api/v2/ +``` + +È compatibile con l'[API standard di Alertmanager v2](https://github.com/prometheus/alertmanager/blob/main/api/v2/openapi.yaml). + +### Esempi di utilizzo + +**Recuperare gli alert attivi:** +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" +``` + +**Recuperare i gruppi di alert:** +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts/groups \ + -u "<system_key>:<system_secret>" +``` + +**Creare un silenzio:** +```bash +curl -X POST https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -u "<system_key>:<system_secret>" \ + -H "Content-Type: application/json" \ + -d '{ + "matchers": [{"name": "alertname", "value": "HighCPU", "isRegex": false}], + "startsAt": "2024-01-01T00:00:00Z", + "endsAt": "2024-01-02T00:00:00Z", + "createdBy": "admin", + "comment": "Manutenzione pianificata" + }' +``` + +**Recuperare i silenzi attivi:** +```bash +curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silences \ + -u "<system_key>:<system_secret>" +``` + +**Eliminare un silenzio:** +```bash +curl -X DELETE https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/silence/<silence_id> \ + -u "<system_key>:<system_secret>" +``` + +## Risoluzione Problemi + +### HTTP 401 Unauthorized + +**Causa:** `system_key` o `system_secret` non corretti. + +**Soluzioni:** +1. Verifica che le credenziali corrispondano a quelle memorizzate sul sistema +2. Assicurati che il sistema abbia completato la registrazione (vedi [Registrazione Sistema](05-system-registration.md)) +3. Controlla eventuali spazi iniziali o finali nelle credenziali +4. 
Testa manualmente: + ```bash + curl https://my.nethesis.it/api/services/mimir/alertmanager/api/v2/alerts \ + -u "NOC-F64B-A989-C9E7-45B9-A55D-59EC-6545-40EE:my_a1b2c3d4e5f6g7h8i9j0.k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6a7b8c9d0" + ``` + Una risposta `200 OK` o `404 Not Found` (non 401) conferma che l'autenticazione funziona. + +### HTTP 500 Internal Server Error / 502 Bad Gateway + +**Causa:** `502 Bad Gateway` viene restituito quando il backend Mimir non è raggiungibile o è configurato in modo errato; `500` indica un errore interno del servizio collect. + +**Soluzioni:** +1. Si tratta di un problema lato piattaforma — contatta il tuo amministratore +2. Controlla la pagina di stato della piattaforma o gli avvisi di monitoraggio +3. Riprova dopo qualche minuto; Mimir potrebbe essere in fase di riavvio + +## Documentazione Correlata + +- [Registrazione Sistema](05-system-registration.md) +- [Inventario e Heartbeat](06-inventory-heartbeat.md) +- [Gestione Sistemi](04-systems.md) diff --git a/mkdocs.yml b/mkdocs.yml index 60966e595..864f3bc73 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -131,6 +131,7 @@ plugins: Systems Management: Gestione Sistemi System Registration: Registrazione Sistema Inventory & Heartbeat: Inventario e Heartbeat + Alerting: Alerting Developer Documentation: Documentazione Sviluppatori Main Project: Progetto Principale Backend API: API Backend @@ -172,6 +173,7 @@ nav: - Systems Management: 04-systems.md - System Registration: 05-system-registration.md - Inventory & Heartbeat: 06-inventory-heartbeat.md + - Alerting: 08-alerting.md - Developer Documentation: - Main Project: https://github.com/NethServer/my/blob/main/README.md - Backend API: https://github.com/NethServer/my/blob/main/backend/README.md diff --git a/proxy/entrypoint.sh b/proxy/entrypoint.sh index f32d6afff..d53291687 100644 --- a/proxy/entrypoint.sh +++ b/proxy/entrypoint.sh @@ -35,4 +35,4 @@ echo '==> Generated upstream URLs:' grep -E 'set.*upstream' /tmp/nginx.conf || true echo '==> Starting nginx...' 
-exec nginx -c /tmp/nginx.conf -g 'daemon off;' \ No newline at end of file +exec nginx -c /tmp/nginx.conf -g 'daemon off;' diff --git a/proxy/nginx.conf b/proxy/nginx.conf index b65793d6c..e4b36a28a 100644 --- a/proxy/nginx.conf +++ b/proxy/nginx.conf @@ -87,6 +87,26 @@ http { proxy_read_timeout 30s; } + # Mimir metrics - proxied through collect for auth + X-Scope-OrgID + location /services/mimir/ { + set $mimir_collect_upstream https://${COLLECT_SERVICE_NAME}.onrender.com; + # A proxy_pass built from a variable sends its URI part as-is (the sub-path would be lost), so the prefix is mapped with rewrite + rewrite ^/services/mimir/(.*)$ /api/services/mimir/$1 break; + proxy_pass $mimir_collect_upstream; + proxy_ssl_server_name on; + proxy_ssl_verify off; + proxy_set_header Host ${COLLECT_SERVICE_NAME}.onrender.com; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + + # Timeouts + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 30s; + } + # Frontend routes - everything else location / { set $frontend_upstream https://${FRONTEND_SERVICE_NAME}.onrender.com; @@ -105,4 +125,4 @@ http { proxy_read_timeout 30s; } } -} \ No newline at end of file +} diff --git a/proxy/nginx.conf.local b/proxy/nginx.conf.local index 39ec91f27..7eaa765b6 100644 --- a/proxy/nginx.conf.local +++ b/proxy/nginx.conf.local @@ -77,6 +77,21 @@ http { proxy_read_timeout 30s; } + # Mimir metrics - proxied through collect for auth + X-Scope-OrgID + location /services/mimir/ { + proxy_pass http://collect-full:8080/api/services/mimir/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + + # Timeouts + proxy_connect_timeout 30s; + proxy_send_timeout 30s; + proxy_read_timeout 30s; + } + # Frontend routes - everything else location / { proxy_pass http://frontend-full:8080; @@ -92,4 +107,4 @@ http { proxy_read_timeout 30s; } } -} \ No newline at end of file +} 
diff --git a/render.yaml b/render.yaml index e2e9ca8c7..041def0c6 100644 --- a/render.yaml +++ b/render.yaml @@ -52,12 +52,8 @@ services: sync: false - key: MIMIR_S3_SECRET_KEY sync: false - - key: MIMIR_S3_BUCKET - sync: false - key: MIMIR_S3_ALERTMANAGER_BUCKET sync: false - - key: MIMIR_S3_RULER_BUCKET - sync: false # Production Backend API Server (Private Service) - type: web @@ -227,12 +223,8 @@ services: sync: false - key: MIMIR_S3_SECRET_KEY sync: false - - key: MIMIR_S3_BUCKET - sync: false - key: MIMIR_S3_ALERTMANAGER_BUCKET sync: false - - key: MIMIR_S3_RULER_BUCKET - sync: false autoDeploy: true branch: main pullRequestPreviewsEnabled: true @@ -397,4 +389,4 @@ services: property: host autoDeploy: true # Auto-deploy on every commit branch: main - pullRequestPreviewsEnabled: true # PR previews enabled \ No newline at end of file + pullRequestPreviewsEnabled: true # PR previews enabled diff --git a/services/mimir/.env.example b/services/mimir/.env.example index 14910d081..1d15ba8ef 100644 --- a/services/mimir/.env.example +++ b/services/mimir/.env.example @@ -1,10 +1,8 @@ -# Mimir / Metrics Stack - environment variables +# Mimir / Alerting Stack - environment variables # Copy to mimir/.env and fill in actual values # S3-compatible storage credentials (DigitalOcean Spaces or AWS S3) MIMIR_S3_ENDPOINT=ams3.digitaloceanspaces.com MIMIR_S3_ACCESS_KEY=your-access-key MIMIR_S3_SECRET_KEY=your-secret-key -MIMIR_S3_BUCKET=your-mimir-blocks-bucket MIMIR_S3_ALERTMANAGER_BUCKET=your-mimir-alertmanager-bucket -MIMIR_S3_RULER_BUCKET=your-mimir-ruler-bucket diff --git a/services/mimir/Containerfile b/services/mimir/Containerfile index 7996785b0..3b4b631da 100644 --- a/services/mimir/Containerfile +++ b/services/mimir/Containerfile @@ -15,6 +15,9 @@ COPY .render-build-trigger /tmp/build-trigger # Copy Mimir config template COPY my.yaml /etc/mimir/my.yaml.template +# Copy default runtime configuration (per-tenant overrides, reloaded every 60s per runtime_config.period in my.yaml) +COPY runtime_config.yaml 
/etc/mimir/runtime_config.yaml + # Copy entrypoint script (must be executable in the repo) COPY entrypoint.sh /entrypoint.sh diff --git a/services/mimir/README.md b/services/mimir/README.md index d6e77f08a..5360b6a49 100644 --- a/services/mimir/README.md +++ b/services/mimir/README.md @@ -1,16 +1,16 @@ -# Mimir — Metrics Infrastructure +# Mimir — Alerting Infrastructure -Grafana Mimir provides long-term metrics storage for the MY platform, deployed as a single node on a dedicated VM (Server B). The collect service on Server A writes metrics to Mimir and proxies read queries. +Grafana Mimir runs as a multi-tenant **Alertmanager** (`-target=alertmanager`) for the MY platform, deployed on a dedicated VM (Server B). It does **not** ingest metrics. The collect service on Server A routes alert notifications through Mimir's Alertmanager API. ## Topology ``` ┌──────────────────────────────────┐ ┌──────────────────────────────────┐ -│ Server A (main app) │ │ Server B (metrics VM) │ +│ Server A (main app) │ │ Server B (alerting VM) │ │ │ │ │ -│ collect ──/api/services/mimir──►│◄────│ mimir (port 19009) │ -│ backend │ │ └── S3 storage │ -│ frontend │ │ │ +│ collect ──/api/services/mimir──►──────► mimir (port 19009) │ +│ backend │ │ -target=alertmanager │ +│ frontend │ │ └── S3 alertmanager state │ │ nginx proxy │ │ │ └──────────────────────────────────┘ └──────────────────────────────────┘ ``` @@ -55,15 +55,13 @@ Should return `ready`. 
| `MIMIR_S3_ENDPOINT` | S3-compatible storage endpoint | `ams3.digitaloceanspaces.com` | | `MIMIR_S3_ACCESS_KEY` | S3 access key | `your-access-key` | | `MIMIR_S3_SECRET_KEY` | S3 secret key | `your-secret-key` | -| `MIMIR_S3_BUCKET` | Bucket for blocks (TSDB chunks) | `my-mimir-blocks` | | `MIMIR_S3_ALERTMANAGER_BUCKET` | Bucket for Alertmanager state | `my-mimir-alertmanager` | -| `MIMIR_S3_RULER_BUCKET` | Bucket for recording/alert rules | `my-mimir-ruler` | Copy `services/mimir/.env.example` to `services/mimir/.env` and fill in every value before starting the stack. ## Architecture -Mimir runs as a single node with `replication_factor: 1`. It uses three S3 buckets (blocks, alertmanager, ruler) for persistent storage. Multitenancy is enabled; all writes from `collect` include the tenant ID resolved from the system's organization. +Mimir runs as an alertmanager-only target (`-target=alertmanager`). It uses a single S3 bucket for persistent Alertmanager state. Multitenancy is enabled; all requests from `collect` include the tenant ID resolved from the system's organization. The config template (`services/mimir/my.yaml`) uses `${VAR}` placeholders that are expanded at container startup by `entrypoint.sh` via `envsubst`. diff --git a/services/mimir/docker-compose.yml b/services/mimir/docker-compose.yml index 4fd62afc1..732662de0 100644 --- a/services/mimir/docker-compose.yml +++ b/services/mimir/docker-compose.yml @@ -1,4 +1,4 @@ -# Metrics Infrastructure - Dedicated VM deployment +# Alerting Infrastructure - Dedicated VM deployment # # Run on a separate server from the main application stack. 
# @@ -10,7 +10,7 @@ # # ⚙️ Required environment variables (set in mimir/.env or shell): # MIMIR_S3_ENDPOINT, MIMIR_S3_ACCESS_KEY, MIMIR_S3_SECRET_KEY -# MIMIR_S3_BUCKET, MIMIR_S3_ALERTMANAGER_BUCKET, MIMIR_S3_RULER_BUCKET +# MIMIR_S3_ALERTMANAGER_BUCKET version: '3.8' @@ -30,9 +30,7 @@ services: MIMIR_S3_ENDPOINT: ${MIMIR_S3_ENDPOINT} MIMIR_S3_ACCESS_KEY: ${MIMIR_S3_ACCESS_KEY} MIMIR_S3_SECRET_KEY: ${MIMIR_S3_SECRET_KEY} - MIMIR_S3_BUCKET: ${MIMIR_S3_BUCKET} MIMIR_S3_ALERTMANAGER_BUCKET: ${MIMIR_S3_ALERTMANAGER_BUCKET} - MIMIR_S3_RULER_BUCKET: ${MIMIR_S3_RULER_BUCKET} ports: - "19009:9009" networks: diff --git a/services/mimir/entrypoint.sh b/services/mimir/entrypoint.sh index 843c153b3..a3fc165e3 100755 --- a/services/mimir/entrypoint.sh +++ b/services/mimir/entrypoint.sh @@ -7,5 +7,5 @@ export PORT echo "==> Expanding Mimir config..." envsubst < /etc/mimir/my.yaml.template > /tmp/mimir-config.yaml -echo "==> Starting Mimir on port ${PORT}..." -exec /bin/mimir --config.file=/tmp/mimir-config.yaml +echo "==> Starting Mimir alertmanager on port ${PORT}..." 
+exec /bin/mimir -target=alertmanager -config.file=/tmp/mimir-config.yaml diff --git a/services/mimir/my.yaml b/services/mimir/my.yaml index 769b6dead..9c4457fd1 100644 --- a/services/mimir/my.yaml +++ b/services/mimir/my.yaml @@ -1,37 +1,26 @@ multitenancy_enabled: true -target: all - -common: - storage: - backend: s3 - s3: - endpoint: ${MIMIR_S3_ENDPOINT} - secret_access_key: ${MIMIR_S3_SECRET_KEY} - access_key_id: ${MIMIR_S3_ACCESS_KEY} - -blocks_storage: - s3: - bucket_name: ${MIMIR_S3_BUCKET} alertmanager_storage: + backend: s3 s3: + endpoint: ${MIMIR_S3_ENDPOINT} + secret_access_key: ${MIMIR_S3_SECRET_KEY} + access_key_id: ${MIMIR_S3_ACCESS_KEY} bucket_name: ${MIMIR_S3_ALERTMANAGER_BUCKET} -ruler_storage: - s3: - bucket_name: ${MIMIR_S3_RULER_BUCKET} - -compactor: - data_dir: /tmp/mimir/compactor - -ingester: - ring: - replication_factor: 1 - -store_gateway: - sharding_ring: - replication_factor: 1 +alertmanager: + data_dir: /tmp/mimir/alertmanager + poll_interval: 15s + max_config_size_bytes: 1048576 # 1MB + enable_api: true + persist_interval: 15m + retention: 120h + external_url: https://my-collect-qa-pr-41.onrender.com/api/services/mimir/alertmanager server: http_listen_port: ${PORT} log_level: info + +runtime_config: + file: /etc/mimir/runtime_config.yaml + period: 60s diff --git a/services/mimir/runtime_config.yaml b/services/mimir/runtime_config.yaml new file mode 100644 index 000000000..ba1404072 --- /dev/null +++ b/services/mimir/runtime_config.yaml @@ -0,0 +1,9 @@ +# Mimir runtime configuration — reloaded every 60s (`runtime_config.period` in my.yaml) without restart. +# Use this file to set per-tenant limit overrides. +# +# Example: +# overrides: +# my-tenant-id: +# ingestion_rate: 10000 +# max_label_names_per_series: 30 +# compactor_blocks_retention_period: 48h