telemetry

import "github.com/afreidah/s3-orchestrator/internal/observe/telemetry"

Package telemetry provides Prometheus metrics registration and OpenTelemetry tracing initialization for the S3 orchestrator.

Index

Constants

TracerName is the tracer name constant exported by this package.

// TracerName is the name under which this service creates its spans.
const TracerName = "s3-orchestrator"

Variables

CircuitBreakerState and related package-level variables: Prometheus metrics for circuit breakers, degraded-mode reads and writes, and write failover.

var (
    // CircuitBreakerState is a per-component gauge of the breaker's current
    // state, keyed by the "name" label. Values: 0=closed (healthy),
    // 1=open (down), 2=half-open (probing).
    CircuitBreakerState = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_circuit_breaker_state",
        Help: "Current circuit breaker state: 0=closed, 1=open, 2=half-open",
    }, []string{"name"})

    // CircuitBreakerTransitionsTotal tallies breaker state transitions per
    // component, labelled with the source ("from") and target ("to") state.
    CircuitBreakerTransitionsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_circuit_breaker_transitions_total",
        Help: "Total number of circuit breaker state transitions",
    }, []string{"name", "from", "to"})

    // CircuitBreakerInternalErrorsTotal tallies errors produced by the
    // breaker's own machinery (PostCheck / state transition helpers).
    // Any non-zero value points at a bookkeeping bug rather than an
    // application error — alert on any increase.
    CircuitBreakerInternalErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_circuit_breaker_internal_errors_total",
        Help: "Errors returned by circuit breaker PostCheck/state transitions",
    }, []string{"name", "operation"})

    // DegradedReadsTotal tallies reads that were served via broadcast while
    // in degraded mode, per operation.
    DegradedReadsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_degraded_reads_total",
        Help: "Total number of read operations served via broadcast during degraded mode",
    }, []string{"operation"})

    // DegradedCacheHitsTotal tallies location cache hits seen during
    // degraded reads.
    DegradedCacheHitsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_degraded_cache_hits_total",
        Help: "Total number of location cache hits during degraded reads",
    })

    // DegradedWriteRejectionsTotal tallies writes that were rejected while in
    // degraded mode, per operation.
    DegradedWriteRejectionsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_degraded_write_rejections_total",
        Help: "Total number of write operations rejected during degraded mode",
    }, []string{"operation"})

    // WriteFailoverTotal tallies writes that failed on one backend and were
    // then retried on another.
    // Labels: operation, failed_backend, success_backend.
    WriteFailoverTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_write_failover_total",
        Help: "Total number of write operations that failed over to a different backend",
    }, []string{"operation", "failed_backend", "success_backend"})

    // DegradedModeActive reads 1 while the DB breaker is open or half-open.
    DegradedModeActive = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_degraded_mode_active",
        Help: "1 when the read path is currently in degraded mode (DB unavailable), 0 otherwise",
    })

    // DegradedBroadcastDuration observes the wall-clock duration of
    // degraded-mode broadcast reads, labelled by operation and outcome.
    // It uses the default Prometheus buckets.
    DegradedBroadcastDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
        Name:    "s3o_degraded_broadcast_duration_seconds",
        Help:    "Wall-clock duration of degraded-mode broadcast reads, terminal outcome labelled.",
        Buckets: prometheus.DefBuckets,
    }, []string{"operation", "outcome"})

    // DegradedBroadcastMixedOutcomesTotal tallies broadcasts in which some
    // backends answered 404 while others failed in a different way.
    DegradedBroadcastMixedOutcomesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_degraded_broadcast_mixed_outcomes_total",
        Help: "Broadcasts where the all-failed terminal saw both 404 and non-404 failures across backends. Surfaces provider divergence or transient backend storms hidden under not_found.",
    }, []string{"operation"})

    // DegradedBroadcastDrainTimeoutTotal tallies loser-drain goroutines that
    // reached their bounded timeout because a probe never returned after
    // it was cancelled.
    DegradedBroadcastDrainTimeoutTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_degraded_broadcast_drain_timeout_total",
        Help: "Loser-drain goroutines that gave up at the bounded timeout because a cancelled probe never returned. Non-zero means a backend is stranding goroutines under degraded fan-out.",
    }, []string{"operation"})
)

CacheHitsTotal and related package-level variables: Prometheus metrics for the object data cache and the Redis counter backend.

var (
    // CacheHitsTotal tallies hits on the object data cache.
    CacheHitsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_cache_hits_total",
        Help: "Object data cache hits",
    })

    // CacheMissesTotal tallies misses on the object data cache.
    CacheMissesTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_cache_misses_total",
        Help: "Object data cache misses",
    })

    // CacheEvictionsTotal tallies cache entries evicted by either LRU or TTL.
    CacheEvictionsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_cache_evictions_total",
        Help: "Cache entries evicted (LRU or TTL)",
    })

    // CacheSizeBytes is the cache's current utilization, in bytes.
    CacheSizeBytes = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_cache_size_bytes",
        Help: "Current object data cache size in bytes",
    })

    // CacheEntries is the number of entries currently held in the cache.
    CacheEntries = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_cache_entries",
        Help: "Number of entries in the object data cache",
    })

    // CacheFlushTotal tallies admin cache-flush invocations. It helps audit
    // how often operators or perf runs reset cache state, and tells
    // explicit flushes apart from organic eviction when cache_size_bytes /
    // cache_entries drop out on dashboards.
    CacheFlushTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_cache_flush_total",
        Help: "Admin-triggered object data cache flushes",
    })

    // CacheAdminInvalidationsTotal tallies single-key invalidations
    // triggered by an admin. These are kept apart from organic
    // invalidations (writes/deletes/replication) so dashboards can
    // separate operator actions from background cache churn.
    CacheAdminInvalidationsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_cache_admin_invalidations_total",
        Help: "Admin-triggered single-key cache invalidations",
    })

    // RedisOperationsTotal tallies operations against the Redis counter
    // backend, labelled by operation and status.
    RedisOperationsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_redis_operations_total",
        Help: "Total Redis counter backend operations",
    }, []string{"operation", "status"})

    // RedisFallbackActive reads 1 while the Redis counter backend runs in
    // local fallback mode because of the circuit breaker, and 0 during
    // normal operation.
    RedisFallbackActive = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_redis_fallback_active",
        Help: "Whether Redis counter backend is in local fallback mode",
    })
)

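CleanupQueueEnqueuedTotal and related package-level variables: Prometheus metrics for the cleanup retry queue and its dead-letter queue, pending PUT intents, lifecycle expiration, and backend drains.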

var (
    // CleanupQueueEnqueuedTotal tallies items added to the cleanup retry
    // queue, labelled by reason.
    CleanupQueueEnqueuedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_cleanup_queue_enqueued_total",
        Help: "Total items added to the cleanup retry queue",
    }, []string{"reason"})

    // CleanupEnqueueFailuresTotal tallies cleanup-queue enqueue attempts
    // that failed even though the backend write had already succeeded:
    // the orphan object is on the backend, but the system lost its chance
    // to track it for retry.
    //
    //   - stage="enqueue": the cleanup_queue row itself did not persist
    //     (worst case — the cleanup-queue worker will never see this
    //     orphan).
    //   - stage="orphan_bytes": the row persisted but the orphan_bytes
    //     counter did not increment (quota accounting drifts, cleanup
    //     still runs).
    //
    // Operators alert on any non-zero rate of stage="enqueue" and run the
    // reconciler (POST /admin/api/reconcile) once DB connectivity returns.
    CleanupEnqueueFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_cleanup_enqueue_failures_total",
        Help: "Cleanup-queue enqueue attempts that failed after a successful backend write",
    }, []string{"backend", "reason", "stage"})

    // CleanupQueueProcessedTotal tallies items processed from the cleanup
    // queue, labelled by status.
    CleanupQueueProcessedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_cleanup_queue_processed_total",
        Help: "Total items processed from the cleanup retry queue",
    }, []string{"status"})

    // CleanupQueueDepth is the current number of pending cleanup items.
    CleanupQueueDepth = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_cleanup_queue_depth",
        Help: "Current number of pending items in the cleanup retry queue",
    })

    // CleanupDLQDepth is the current row count of the cleanup dead-letter
    // table: cleanup_queue rows that used up their retry budget without
    // ever succeeding at the physical backend delete. When non-zero,
    // orphan bytes remain on the backend with no automatic recovery in
    // flight, and operators must investigate.
    CleanupDLQDepth = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_cleanup_dlq_depth",
        Help: "Current number of unrecoverable orphans in the cleanup dead-letter queue",
    })

    // CleanupDLQEnqueuedTotal tallies cleanup_queue rows graduated to the
    // dead-letter table. The backend label lets dashboards pinpoint which
    // backend is failing physical deletes.
    CleanupDLQEnqueuedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_cleanup_dlq_enqueued_total",
        Help: "Total cleanup_queue rows moved to cleanup_dlq after exhausting retries",
    }, []string{"backend"})

    // CleanupQueueStaleClaimsRecoveredTotal tallies cleanup_queue rows whose
    // claim was taken over because the previous holder failed to finalise
    // the row within the configured grace period. A non-zero rate signals
    // that a worker died mid-process, or that the grace period is shorter
    // than the realistic worst-case processing time.
    CleanupQueueStaleClaimsRecoveredTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_cleanup_queue_stale_claims_recovered_total",
        Help: "cleanup_queue rows whose stale claim was reclaimed by a later worker tick",
    }, []string{"backend"})

    // PendingIntentsEnqueuedTotal tallies pending intents that the write
    // path inserted ahead of the backend PUT.
    PendingIntentsEnqueuedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_pending_intents_enqueued_total",
        Help: "Total in-flight PUT intents inserted before the backend write",
    })

    // PendingIntentsResolvedTotal tallies intents resolved either by the
    // reaper or by the synchronous commit path. The status label is one
    // of: committed, promoted, dropped, ambiguous, already_resolved.
    PendingIntentsResolvedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_pending_intents_resolved_total",
        Help: "Total pending PUT intents resolved by status",
    }, []string{"status"})

    // PendingIntentsDepth is the current number of unresolved pending
    // intents stored in the database.
    PendingIntentsDepth = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_pending_intents_depth",
        Help: "Current number of unresolved pending PUT intents",
    })

    // LifecycleDeletedTotal tallies objects removed by lifecycle expiration
    // rules.
    LifecycleDeletedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_lifecycle_deleted_total",
        Help: "Objects deleted by lifecycle expiration rules",
    })

    // LifecycleFailedTotal tallies objects whose lifecycle deletion failed.
    LifecycleFailedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_lifecycle_failed_total",
        Help: "Objects that failed lifecycle deletion",
    })

    // LifecycleRunsTotal tallies lifecycle worker executions, by status.
    LifecycleRunsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_lifecycle_runs_total",
        Help: "Lifecycle worker executions",
    }, []string{"status"})

    // DrainObjectsMoved tallies objects moved by backend drain operations.
    DrainObjectsMoved = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_drain_objects_moved_total",
        Help: "Total number of objects moved during backend drain operations",
    })

    // DrainBytesMoved tallies bytes moved by backend drain operations.
    DrainBytesMoved = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_drain_bytes_moved_total",
        Help: "Total bytes moved during backend drain operations",
    })

    // DrainActive is the live count of drain operations in flight; 0 means
    // none are running. It is Inc'd on StartDrain and Dec'd on completion
    // (success, cancel, or abort), so concurrent drains on different
    // backends do not clobber each other's state the way a Set(0)/Set(1)
    // gauge would.
    DrainActive = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_drain_active",
        Help: "Count of in-flight backend drain operations (Inc/Dec so concurrent drains compose)",
    })

    // DrainRaceAbortedTotal tallies PutObject attempts whose bytes landed on
    // a backend that began draining mid-write. The orchestrator notices
    // the race once the backend PUT completes, deletes the orphaned
    // bytes, and fails the attempt over to the next eligible backend.
    // The counter records how often the race fires in production so the
    // drain timing assumptions can be revisited if it climbs.
    DrainRaceAbortedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_drain_race_aborted_total",
        Help: "Number of PutObject attempts aborted after drain started mid-write",
    })
)

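EncryptionOpsTotal and related package-level variables: Prometheus metrics for encryption, content-integrity checks, key rotation, and the encrypt-existing / decrypt-existing operations.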

var (
    // EncryptionOpsTotal tallies encryption operations, by type ("op").
    EncryptionOpsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_encryption_operations_total",
        Help: "Total encryption operations",
    }, []string{"op"})

    // EncryptionErrorsTotal tallies encryption errors, by operation and
    // error type.
    EncryptionErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_encryption_errors_total",
        Help: "Total encryption errors",
    }, []string{"op", "error_type"})

    // EncryptionUnknownKeyIDTotal tallies decryption attempts whose keyID
    // was absent from the configured keys, which triggers a fallback to
    // the primary key.
    EncryptionUnknownKeyIDTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_encryption_unknown_key_id_total",
        Help: "Decryption attempts with unknown keyID (primary key fallback)",
    })

    // IntegrityErrorsTotal tallies hash mismatches found during read,
    // replication, or background scrubbing.
    IntegrityErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_integrity_errors_total",
        Help: "Content hash mismatches detected",
    }, []string{"operation"})

    // IntegrityChecksTotal tallies the hash verifications performed.
    IntegrityChecksTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_integrity_checks_total",
        Help: "Content hash verifications performed",
    }, []string{"operation"})

    // KeyRotationObjectsTotal tallies objects processed during key rotation,
    // by status.
    KeyRotationObjectsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_key_rotation_objects_total",
        Help: "Total objects processed during key rotation",
    }, []string{"status"})

    // EncryptExistingObjectsTotal tallies objects processed during
    // encrypt-existing, by status.
    EncryptExistingObjectsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_encrypt_existing_objects_total",
        Help: "Total objects processed during encrypt-existing operation",
    }, []string{"status"})

    // DecryptExistingObjectsTotal tallies objects processed during
    // decrypt-existing, by status.
    DecryptExistingObjectsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_decrypt_existing_objects_total",
        Help: "Total objects processed during decrypt-existing operation",
    }, []string{"status"})
)

BuildInfo and related package-level variables: the build-information gauge and Prometheus metrics for webhook notification delivery.

var (
    // BuildInfo publishes version information through its "version" and
    // "go_version" labels.
    BuildInfo = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_build_info",
        Help: "Build information for the S3 proxy",
    }, []string{"version", "go_version"})

    // NotificationSentTotal tallies webhook notifications that were
    // delivered successfully.
    NotificationSentTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_notification_sent_total",
        Help: "Webhook notifications delivered successfully",
    }, []string{"endpoint", "event_type"})

    // NotificationFailedTotal tallies failed webhook deliveries.
    NotificationFailedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_notification_failed_total",
        Help: "Webhook notification delivery failures",
    }, []string{"endpoint", "event_type"})

    // NotificationDroppedTotal tallies events dropped because of dampening
    // or an enqueue failure.
    NotificationDroppedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_notification_dropped_total",
        Help: "Events dropped due to dampening or queue insertion failure",
    })

    // NotificationStoreErrorsTotal tallies outbox-store operation failures
    // hit by the delivery worker (CompleteNotification /
    // RetryNotification). A non-zero value means the worker saw a store
    // error that could lead to duplicate or dropped webhook deliveries —
    // alert on any increase.
    NotificationStoreErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_notification_store_errors_total",
        Help: "Outbox store errors seen by the notification delivery worker",
    }, []string{"operation"})

    // NotificationQueueDepth is the number of notifications pending in the
    // outbox.
    NotificationQueueDepth = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_notification_queue_depth",
        Help: "Pending notifications in the delivery outbox",
    })

    // NotificationDuration observes webhook delivery latency per endpoint,
    // using the default Prometheus buckets.
    NotificationDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
        Name:    "s3o_notification_duration_seconds",
        Help:    "Webhook notification delivery latency",
        Buckets: prometheus.DefBuckets,
    }, []string{"endpoint"})
)

QuotaBytesUsed and related package-level variables: Prometheus metrics for per-backend quota, object and multipart-upload counts, and monthly usage.

var (
    // QuotaBytesUsed is the number of bytes currently used, per backend.
    QuotaBytesUsed = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_quota_bytes_used",
        Help: "Current bytes used on each backend",
    }, []string{"backend"})

    // QuotaBytesLimit is the quota limit, per backend.
    QuotaBytesLimit = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_quota_bytes_limit",
        Help: "Quota limit in bytes for each backend",
    }, []string{"backend"})

    // QuotaBytesAvailable is the number of bytes still available, per
    // backend.
    QuotaBytesAvailable = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_quota_bytes_available",
        Help: "Available bytes (limit - used - orphan) for each backend",
    }, []string{"backend"})

    // QuotaOrphanBytes is the number of bytes awaiting physical deletion,
    // per backend.
    QuotaOrphanBytes = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_quota_orphan_bytes",
        Help: "Bytes pending physical deletion (logically freed but not yet removed from backend)",
    }, []string{"backend"})

    // UsageReconcileCorrectionsTotal tallies per-backend bytes_used
    // corrections applied by usage reconciliation. A value that keeps
    // rising means some write path is leaking the counter; the reconcile
    // dashboard panel reads this metric.
    //
    // NOTE(review): the description says "per-backend", yet this is an
    // unlabelled Counter with no "backend" label — confirm whether a
    // label was intended.
    UsageReconcileCorrectionsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_quota_reconcile_corrections_total",
        Help: "Per-backend bytes_used drift corrections applied by usage reconciliation",
    })

    // ObjectCount is the number of objects stored, per backend.
    ObjectCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_objects_count",
        Help: "Number of objects stored on each backend",
    }, []string{"backend"})

    // ActiveMultipartUploads is the number of multipart uploads in progress,
    // per backend.
    ActiveMultipartUploads = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_active_multipart_uploads",
        Help: "Number of in-progress multipart uploads per backend",
    }, []string{"backend"})

    // UsageAPIRequests is the API request count for the current month, per
    // backend.
    UsageAPIRequests = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_usage_api_requests",
        Help: "Current month API request count per backend (from DB)",
    }, []string{"backend"})

    // UsageEgressBytes is the egress byte count for the current month, per
    // backend.
    UsageEgressBytes = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_usage_egress_bytes",
        Help: "Current month egress bytes per backend (from DB)",
    }, []string{"backend"})

    // UsageIngressBytes is the ingress byte count for the current month, per
    // backend.
    UsageIngressBytes = promauto.NewGaugeVec(prometheus.GaugeOpts{
        Name: "s3o_usage_ingress_bytes",
        Help: "Current month ingress bytes per backend (from DB)",
    }, []string{"backend"})

    // UsageLimitRejectionsTotal tallies operations rejected because of
    // monthly usage limits, by operation and limit type.
    UsageLimitRejectionsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_usage_limit_rejections_total",
        Help: "Total operations rejected due to monthly usage limits",
    }, []string{"operation", "limit_type"})
)

RebalanceObjectsMoved and related package-level variables: Prometheus metrics for the rebalancer.

var (
    // RebalanceObjectsMoved tallies objects the rebalancer has moved, by
    // strategy and status.
    RebalanceObjectsMoved = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_rebalance_objects_moved_total",
        Help: "Total number of objects moved by the rebalancer",
    }, []string{"strategy", "status"})

    // RebalanceBytesMoved tallies bytes the rebalancer has moved, by
    // strategy.
    RebalanceBytesMoved = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_rebalance_bytes_moved_total",
        Help: "Total bytes moved by the rebalancer",
    }, []string{"strategy"})

    // RebalanceRunsTotal tallies rebalancer executions, by strategy and
    // status.
    RebalanceRunsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_rebalance_runs_total",
        Help: "Total number of rebalancer runs",
    }, []string{"strategy", "status"})

    // RebalanceDuration observes rebalancer execution time per strategy,
    // with bucket bounds from 1 to 600 seconds.
    RebalanceDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
        Name:    "s3o_rebalance_duration_seconds",
        Help:    "Rebalancer execution time in seconds",
        Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
    }, []string{"strategy"})

    // RebalanceSkipped tallies rebalancer runs that were skipped, by reason.
    RebalanceSkipped = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_rebalance_skipped_total",
        Help: "Total number of rebalancer runs skipped",
    }, []string{"reason"})

    // RebalancePending is the number of objects planned for rebalance in the
    // current cycle.
    RebalancePending = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_rebalance_pending",
        Help: "Number of objects planned for rebalance",
    })
)

ReplicationPending and related package-level variables: Prometheus metrics for the replication worker and over-replication cleanup.

var (
    // ReplicationPending is the number of objects currently below the
    // target replication factor.
    ReplicationPending = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_replication_pending",
        Help: "Number of objects below the target replication factor",
    })

    // ReplicationCopiesCreatedTotal tallies the replica copies created.
    ReplicationCopiesCreatedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_replication_copies_created_total",
        Help: "Total number of replica copies created",
    })

    // ReplicationErrorsTotal tallies replication errors.
    ReplicationErrorsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_replication_errors_total",
        Help: "Total number of replication errors",
    })

    // ReplicationDuration observes the replication worker's cycle time, with
    // bucket bounds from 1 to 600 seconds.
    ReplicationDuration = promauto.NewHistogram(prometheus.HistogramOpts{
        Name:    "s3o_replication_duration_seconds",
        Help:    "Replication worker cycle time in seconds",
        Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
    })

    // ReplicationRunsTotal tallies replication worker executions, by status.
    ReplicationRunsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_replication_runs_total",
        Help: "Total number of replication worker runs",
    }, []string{"status"})

    // ReplicationHealthCopiesTotal tallies copies created as replacements
    // for copies that live on circuit-broken backends.
    ReplicationHealthCopiesTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_replication_health_copies_total",
        Help: "Replica copies created to replace copies on circuit-broken backends",
    })

    // OverReplicationPending is the number of objects currently above the
    // target replication factor.
    OverReplicationPending = promauto.NewGauge(prometheus.GaugeOpts{
        Name: "s3o_over_replication_pending",
        Help: "Number of objects above the target replication factor",
    })

    // OverReplicationRemovedTotal tallies excess copies the cleaner has
    // removed.
    OverReplicationRemovedTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_over_replication_removed_total",
        Help: "Total excess copies removed by over-replication cleanup",
    })

    // OverReplicationErrorsTotal tallies errors from over-replication
    // cleanup.
    OverReplicationErrorsTotal = promauto.NewCounter(prometheus.CounterOpts{
        Name: "s3o_over_replication_errors_total",
        Help: "Total number of over-replication cleanup errors",
    })

    // OverReplicationRunsTotal tallies executions of the over-replication
    // cleanup worker, by status.
    OverReplicationRunsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
        Name: "s3o_over_replication_runs_total",
        Help: "Total number of over-replication cleanup worker runs",
    }, []string{"status"})

    // OverReplicationDuration observes the over-replication cleanup worker's
    // cycle time, with bucket bounds from 1 to 600 seconds.
    OverReplicationDuration = promauto.NewHistogram(prometheus.HistogramOpts{
        Name:    "s3o_over_replication_duration_seconds",
        Help:    "Over-replication cleanup worker cycle time in seconds",
        Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
    })
)

RequestsTotal and related package-level variables used by this package.

var (

    // RequestsTotal counts all HTTP requests, labelled by "method" and
    // "status_code".
    RequestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_requests_total",
            Help: "Total number of HTTP requests processed",
        },
        []string{"method", "status_code"},
    )

    // RequestDuration tracks the request latency distribution in seconds,
    // labelled by "method". Bucket bounds run from 5 ms to 60 s.
    RequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "s3o_request_duration_seconds",
            Help:    "HTTP request latency in seconds",
            Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60},
        },
        []string{"method"},
    )

    // RequestSize tracks HTTP request body sizes (uploads) in bytes,
    // labelled by "method". Ten exponential buckets with factor 4 cover
    // 1 KiB through 256 MiB.
    RequestSize = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "s3o_request_size_bytes",
            Help:    "HTTP request body size in bytes",
            Buckets: prometheus.ExponentialBuckets(1024, 4, 10),
        },
        []string{"method"},
    )

    // ResponseSize tracks HTTP response body sizes (downloads) in bytes,
    // labelled by "method". Ten exponential buckets with factor 4 cover
    // 1 KiB through 256 MiB.
    ResponseSize = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "s3o_response_size_bytes",
            Help:    "HTTP response body size in bytes",
            Buckets: prometheus.ExponentialBuckets(1024, 4, 10),
        },
        []string{"method"},
    )

    // InflightRequests tracks the number of requests currently being
    // processed, labelled by "method".
    InflightRequests = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "s3o_inflight_requests",
            Help: "Number of requests currently being processed",
        },
        []string{"method"},
    )

    // BackendRequestsTotal counts backend storage operations, labelled by
    // "operation", "backend", and "status".
    BackendRequestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_backend_requests_total",
            Help: "Total number of backend storage operations",
        },
        []string{"operation", "backend", "status"},
    )

    // BackendDuration tracks backend operation latency in seconds, labelled
    // by "operation" and "backend". Bucket bounds run from 10 ms to 120 s.
    BackendDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "s3o_backend_duration_seconds",
            Help:    "Backend operation latency in seconds",
            Buckets: []float64{.01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120},
        },
        []string{"operation", "backend"},
    )

    // ManagerRequestsTotal counts manager-level storage operations, labelled
    // by "operation", "backend", and "status".
    ManagerRequestsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_manager_requests_total",
            Help: "Total number of manager-level storage operations",
        },
        []string{"operation", "backend", "status"},
    )

    // ManagerDuration tracks manager operation latency in seconds, labelled
    // by "operation" and "backend". Bucket bounds match BackendDuration
    // (10 ms to 120 s).
    ManagerDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "s3o_manager_duration_seconds",
            Help:    "Manager operation latency in seconds",
            Buckets: []float64{.01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120},
        },
        []string{"operation", "backend"},
    )

    // RateLimitRejectionsTotal counts requests rejected by the per-IP rate
    // limiter. It carries no labels.
    RateLimitRejectionsTotal = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "s3o_rate_limit_rejections_total",
            Help: "Total requests rejected due to per-IP rate limiting",
        },
    )

    // AdmissionRejectionsTotal counts requests rejected by server-level
    // admission control. It carries no labels.
    AdmissionRejectionsTotal = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "s3o_admission_rejections_total",
            Help: "Total requests rejected due to server-level admission control",
        },
    )

    // AdmissionClientCanceledTotal counts requests where the client closed
    // or cancelled their context while waiting for an admission slot. These
    // are not server-side rejections and do not count against capacity SLOs.
    AdmissionClientCanceledTotal = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "s3o_admission_client_canceled_total",
            Help: "Total admission waits aborted by client context cancellation",
        },
    )

    // WorkerAdmissionRejectionsTotal counts background worker tasks that were
    // skipped because the admission semaphore was full, labelled by "worker".
    WorkerAdmissionRejectionsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_worker_admission_rejections_total",
            Help: "Background worker tasks skipped due to admission control",
        },
        []string{"worker"},
    )

    // LoadShedTotal counts requests probabilistically rejected by active
    // load shedding before reaching the hard admission limit.
    LoadShedTotal = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "s3o_load_shed_total",
            Help: "Requests shed by active load shedding before hard admission limit",
        },
    )

    // EarlyRejectionsTotal counts uploads rejected before body transmission
    // via Expect: 100-Continue pre-flight capacity checks.
    EarlyRejectionsTotal = promauto.NewCounter(
        prometheus.CounterOpts{
            Name: "s3o_early_rejections_total",
            Help: "Uploads rejected before body transmission (no backend capacity)",
        },
    )

    // HTTPPanicRecoveredTotal counts panics caught by the HTTP
    // panic-recovery middleware. A non-zero value means a request handler
    // panicked and the recovery layer translated it into a 500 response;
    // the matching slog.ErrorContext line carries the panic value, the
    // captured stack, and the request id. The "route" label scopes the
    // counter to the route group so dashboards can distinguish a flaking
    // S3 path from a flaking admin or UI path.
    HTTPPanicRecoveredTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_http_panic_recovered_total",
            Help: "HTTP handler panics caught by the recovery middleware, by route group",
        },
        []string{"route"},
    )
)

var (
    // WorkerTicksTotal counts each completed tick of a locked-ticker
    // service, labelled by "service" (the service name) and "result"
    // (the outcome). Outcomes:
    //   success: work returned nil
    //   error:   work returned a non-nil error
    //   skipped: shouldRun gated the tick or the lock was busy
    WorkerTicksTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_worker_ticks_total",
            Help: "Locked-ticker service ticks by service and outcome",
        },
        []string{"service", "result"},
    )

    // WorkerLastSuccessTimestampSeconds records the Unix time of the
    // most recent successful tick for each service. Alert on staleness
    // with time() - s3o_worker_last_success_timestamp_seconds.
    WorkerLastSuccessTimestampSeconds = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "s3o_worker_last_success_timestamp_seconds",
            Help: "Unix time of the most recent successful tick per service",
        },
        []string{"service"},
    )

    // WorkerConsecutiveFailures gauges the run of back-to-back failed
    // ticks for each service, labelled by "service". Resets to 0 on the
    // next success.
    WorkerConsecutiveFailures = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "s3o_worker_consecutive_failures",
            Help: "Consecutive failed ticks per service since the last success",
        },
        []string{"service"},
    )
)

S3 orchestrator-specific attribute keys.

// Attribute keys for spans emitted by this package. Every key lives in
// the "s3o." namespace.
var (
    AttrRequestID         = attribute.Key("s3o.request_id")
    AttrVirtualBucket     = attribute.Key("s3o.bucket.virtual")
    AttrBackendBucket     = attribute.Key("s3o.bucket.backend")
    AttrObjectKey         = attribute.Key("s3o.key")
    AttrBackendName       = attribute.Key("s3o.backend.name")
    AttrBackendEndpoint   = attribute.Key("s3o.backend.endpoint")
    AttrObjectSize        = attribute.Key("s3o.object.size")
    AttrContentType       = attribute.Key("s3o.object.content_type")
    AttrOperation         = attribute.Key("s3o.operation")
    AttrUploadID          = attribute.Key("s3o.upload_id")
    AttrPartNumber        = attribute.Key("s3o.part_number")
    AttrWriteFailover     = attribute.Key("s3o.write_failover")
    // Note: the key is "s3o.write_failover_attempts", not
    // "s3o.failover_attempts" as the identifier might suggest.
    AttrFailoverAttempts  = attribute.Key("s3o.write_failover_attempts")
    AttrFailover          = attribute.Key("s3o.failover")
    AttrDegradedMode      = attribute.Key("s3o.degraded_mode")
    AttrCacheHit          = attribute.Key("s3o.cache_hit")
    AttrParallelBroadcast = attribute.Key("s3o.parallel_broadcast")
    AttrNativeCopy        = attribute.Key("s3o.native_copy")
)

AuditEventsTotal and related package-level variables used by this package.

var (

    // AuditEventsTotal counts emitted audit log entries, labelled by
    // "event" (the event type).
    AuditEventsTotal = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "s3o_audit_events_total",
            Help: "Total number of audit log entries emitted",
        },
        []string{"event"},
    )
)

AuthStreamingRejectionsTotal counts streaming-payload requests rejected due to a chunk- or trailer-validation failure, labeled by reason. Reasons map 1:1 to the auth.Err* sentinels so dashboards can distinguish a tampered body (chunk_signature_mismatch) from a malformed framing attempt (chunk_malformed).

// AuthStreamingRejectionsTotal counts streaming-payload requests rejected
// mid-stream, labelled by "reason".
var AuthStreamingRejectionsTotal = promauto.NewCounterVec(
    prometheus.CounterOpts{
        Name: "s3o_auth_streaming_rejections_total",
        Help: "Total streaming-payload requests rejected mid-stream, by reason.",
    },
    []string{"reason"},
)

AuthStreamingRequestsTotal counts streaming-payload requests received, labeled by AWS streaming variant. Incremented at the moment the transport layer wraps the request body in a chunk-validating reader.

// AuthStreamingRequestsTotal counts streaming-payload SigV4 requests
// received, labelled by "variant" (the AWS streaming variant).
var AuthStreamingRequestsTotal = promauto.NewCounterVec(
    prometheus.CounterOpts{
        Name: "s3o_auth_streaming_requests_total",
        Help: "Total streaming-payload SigV4 requests received, by AWS streaming variant.",
    },
    []string{"variant"},
)

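Version of the service for trace metadata. Set at build time via -ldflags "-X github.com/afreidah/s3-orchestrator/internal/observe/telemetry.Version=…". The -X path must match the package import path exactly; the linker silently ignores a path that does not resolve to a variable.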

// Version holds the service version string; it stays "dev" unless
// overridden at link time.
var Version = "dev"

func BackendAttributes

func BackendAttributes(operation, backendName, endpoint, bucket, key string) []attribute.KeyValue

BackendAttributes returns common attributes for backend operation spans.

func InitTracer

func InitTracer(ctx context.Context, cfg config.TracingConfig) (func(context.Context) error, error)

InitTracer initializes the OpenTelemetry tracer with OTLP export. Returns a shutdown function that should be called on service termination to flush spans.

func NewCircuitBreakerHook

func NewCircuitBreakerHook(name string) func(breaker.StateChangeInfo)

NewCircuitBreakerHook returns the callback to install on a breaker so its state changes drive the CircuitBreakerState gauge, the CircuitBreakerTransitionsTotal counter, and the BackendCircuitOpened / BackendCircuitClosed events.

Initializes the gauge to “closed” up front so Prometheus reports a value before the first transition.

func NewDatabaseBreakerHook

func NewDatabaseBreakerHook(name string) func(breaker.StateChangeInfo)

NewDatabaseBreakerHook chains the standard breaker hook with the DegradedModeActive gauge (1 when not closed, 0 when closed).

func RequestAttributes

func RequestAttributes(method, path, bucket, key, clientIP string) []attribute.KeyValue

RequestAttributes returns common attributes for HTTP request spans.

func StartClientSpan

func StartClientSpan(ctx context.Context, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span)

StartClientSpan creates a span for outbound calls (backend S3 operations).

func StartServerSpan

func StartServerSpan(ctx context.Context, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span)

StartServerSpan creates a span for inbound requests (HTTP handler entry points).

func StartSpan

func StartSpan(ctx context.Context, name string, attrs ...attribute.KeyValue) (context.Context, trace.Span)

StartSpan creates a new internal span with the given name and attributes.

func Tracer

func Tracer() trace.Tracer

Tracer returns the global tracer for this service.

type LogBuffer

LogBuffer is a thread-safe circular buffer of log entries.

type LogBuffer struct {
    // contains filtered or unexported fields
}

func NewLogBuffer

func NewLogBuffer() *LogBuffer

NewLogBuffer creates a ring buffer with the default capacity.

func (*LogBuffer) Add

func (b *LogBuffer) Add(entry LogEntry)

Add appends a log entry to the buffer, overwriting the oldest entry when the buffer is full.

func (*LogBuffer) Entries

func (b *LogBuffer) Entries(opts *LogQueryOpts) []LogEntry

Entries returns buffered log entries matching the query options. Results are returned in chronological order (oldest first).

The lock is held only long enough to snapshot the ring buffer state. Filtering and result construction happen outside the lock so concurrent Add calls are not blocked by slow dashboard queries.

type LogEntry

LogEntry is a single structured log record stored in the buffer.

// LogEntry is a single structured log record. The struct tags fix its
// JSON field names: time, level, message, attrs.
type LogEntry struct {
    Time    time.Time      `json:"time"`
    Level   string         `json:"level"` // severity as text; presumably the slog level name, confirm against the producer
    Message string         `json:"message"`
    Attrs   map[string]any `json:"attrs,omitempty"` // omitted from JSON when nil or empty
}

type LogQueryOpts

LogQueryOpts controls filtering when reading from the buffer.

// LogQueryOpts controls filtering when reading from the buffer.
//
// NOTE(review): the zero value of slog.Level is slog.LevelInfo (0);
// slog.LevelDebug is -4. The previous comment on MinLevel described 0 as
// DEBUG. Confirm whether a zero MinLevel is meant to disable level
// filtering or to filter at INFO.
type LogQueryOpts struct {
    MinLevel  slog.Level // minimum severity; zero value is slog.LevelInfo (see note above)
    Since     time.Time  // only entries after this time
    Before    time.Time  // only entries before this time
    Limit     int        // max entries to return (0 = all)
    Component string     // filter by "component" attribute value
}

type TeeHandler

TeeHandler is an slog.Handler that writes each log record to a primary handler (typically JSON to stdout) and also captures it in a LogBuffer.

type TeeHandler struct {
    // contains filtered or unexported fields
}

func NewTeeHandler

func NewTeeHandler(primary slog.Handler, buf *LogBuffer) *TeeHandler

NewTeeHandler creates a handler that fans out to both the primary handler and the ring buffer.

func (*TeeHandler) Enabled

func (h *TeeHandler) Enabled(ctx context.Context, level slog.Level) bool

Enabled reports whether the handler handles records at the given level. Delegates to the primary handler.

func (*TeeHandler) Handle

func (h *TeeHandler) Handle(ctx context.Context, r slog.Record) error

Handle writes the record to the primary handler and captures it in the buffer. The slog.Handler interface passes slog.Record by value, so Handle receives its own copy of the record.

func (*TeeHandler) WithAttrs

func (h *TeeHandler) WithAttrs(attrs []slog.Attr) slog.Handler

WithAttrs returns a new TeeHandler with the given attributes added.

func (*TeeHandler) WithGroup

func (h *TeeHandler) WithGroup(name string) slog.Handler

WithGroup returns a new TeeHandler with the given group name.

type TraceHandler

TraceHandler wraps an slog.Handler and injects OpenTelemetry trace context (trace_id, span_id) into every log record that has an active span.

type TraceHandler struct {
    // contains filtered or unexported fields
}

func NewTraceHandler

func NewTraceHandler(inner slog.Handler) *TraceHandler

NewTraceHandler creates a handler that injects trace context into log records before delegating to the inner handler.

func (*TraceHandler) Enabled

func (h *TraceHandler) Enabled(ctx context.Context, level slog.Level) bool

Enabled delegates to the inner handler.

func (*TraceHandler) Handle

func (h *TraceHandler) Handle(ctx context.Context, r slog.Record) error

Handle adds trace_id and span_id attributes if the context carries an active span, then delegates to the inner handler.

func (*TraceHandler) WithAttrs

func (h *TraceHandler) WithAttrs(attrs []slog.Attr) slog.Handler

WithAttrs returns a new TraceHandler wrapping the inner handler with attrs.

func (*TraceHandler) WithGroup

func (h *TraceHandler) WithGroup(name string) slog.Handler

WithGroup returns a new TraceHandler wrapping the inner handler with a group.

Generated by gomarkdoc