TracerName and related constants used by this package.
const (
	// TracerName identifies spans created by this service.
	TracerName = "s3-orchestrator"
)
Variables
CircuitBreakerState and related package-level variables used by this package.
var (
// CircuitBreakerState tracks the current circuit breaker state per component.// 0=closed (healthy), 1=open (down), 2=half-open (probing).CircuitBreakerState = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_circuit_breaker_state",
Help: "Current circuit breaker state: 0=closed, 1=open, 2=half-open",
},
[]string{"name"},
)
// CircuitBreakerTransitionsTotal counts state transitions per component.CircuitBreakerTransitionsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_circuit_breaker_transitions_total",
Help: "Total number of circuit breaker state transitions",
},
[]string{"name", "from", "to"},
)
// CircuitBreakerInternalErrorsTotal counts errors returned by the// breaker's own machinery (PostCheck / state transition helpers).// Non-zero values indicate a bookkeeping bug, not an application// error - alert on any increase.CircuitBreakerInternalErrorsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_circuit_breaker_internal_errors_total",
Help: "Errors returned by circuit breaker PostCheck/state transitions",
},
[]string{"name", "operation"},
)
// DegradedReadsTotal counts reads served via broadcast during degraded mode.DegradedReadsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_degraded_reads_total",
Help: "Total number of read operations served via broadcast during degraded mode",
},
[]string{"operation"},
)
// DegradedCacheHitsTotal counts location cache hits during degraded reads.DegradedCacheHitsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_degraded_cache_hits_total",
Help: "Total number of location cache hits during degraded reads",
},
)
// DegradedWriteRejectionsTotal counts writes rejected during degraded mode.DegradedWriteRejectionsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_degraded_write_rejections_total",
Help: "Total number of write operations rejected during degraded mode",
},
[]string{"operation"},
)
// WriteFailoverTotal counts writes that failed on one backend and were// retried on another. Labels: operation, failed_backend, success_backend.WriteFailoverTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_write_failover_total",
Help: "Total number of write operations that failed over to a different backend",
},
[]string{"operation", "failed_backend", "success_backend"},
)
// DegradedModeActive is 1 when the DB breaker is open or half-open.DegradedModeActive = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_degraded_mode_active",
Help: "1 when the read path is currently in degraded mode (DB unavailable), 0 otherwise",
},
)
// DegradedBroadcastDuration is the wall-clock duration of degraded-mode broadcast reads.DegradedBroadcastDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_degraded_broadcast_duration_seconds",
Help: "Wall-clock duration of degraded-mode broadcast reads, terminal outcome labelled.",
Buckets: prometheus.DefBuckets,
},
[]string{"operation", "outcome"},
)
// DegradedBroadcastMixedOutcomesTotal counts broadcasts where some backends returned 404 and others failed differently.DegradedBroadcastMixedOutcomesTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_degraded_broadcast_mixed_outcomes_total",
Help: "Broadcasts where the all-failed terminal saw both 404 and non-404 failures across backends. Surfaces provider divergence or transient backend storms hidden under not_found.",
},
[]string{"operation"},
)
)
CacheHitsTotal and related package-level variables used by this package.
var (
// CacheHitsTotal counts object data cache hits.CacheHitsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_cache_hits_total",
Help: "Object data cache hits",
},
)
// CacheMissesTotal counts object data cache misses.CacheMissesTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_cache_misses_total",
Help: "Object data cache misses",
},
)
// CacheEvictionsTotal counts cache entries evicted by LRU or TTL.CacheEvictionsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_cache_evictions_total",
Help: "Cache entries evicted (LRU or TTL)",
},
)
// CacheSizeBytes tracks current cache utilization in bytes.CacheSizeBytes = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_cache_size_bytes",
Help: "Current object data cache size in bytes",
},
)
// CacheEntries tracks the number of entries in the cache.CacheEntries = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_cache_entries",
Help: "Number of entries in the object data cache",
},
)
// CacheFlushTotal counts admin cache-flush invocations. Useful for// auditing how often operators or perf runs reset cache state, and// for distinguishing organic eviction from explicit flushes when// reading cache_size_bytes / cache_entries dropouts on dashboards.CacheFlushTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_cache_flush_total",
Help: "Admin-triggered object data cache flushes",
},
)
// CacheAdminInvalidationsTotal counts admin-triggered single-key// invalidations. Distinct from organic invalidations driven by// writes/deletes/replication so dashboards can separate operator// actions from background cache churn.CacheAdminInvalidationsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_cache_admin_invalidations_total",
Help: "Admin-triggered single-key cache invalidations",
},
)
// RedisOperationsTotal counts Redis counter backend operations.RedisOperationsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_redis_operations_total",
Help: "Total Redis counter backend operations",
},
[]string{"operation", "status"},
)
// RedisFallbackActive is 1 when the Redis counter backend is in local// fallback mode due to circuit breaker, 0 during normal operation.RedisFallbackActive = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_redis_fallback_active",
Help: "Whether Redis counter backend is in local fallback mode",
},
)
)
CleanupQueueEnqueuedTotal and related package-level variables used by this package.
var (
// CleanupQueueEnqueuedTotal counts items added to the cleanup retry queue.CleanupQueueEnqueuedTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_cleanup_queue_enqueued_total",
Help: "Total items added to the cleanup retry queue",
},
[]string{"reason"},
)
// CleanupEnqueueFailuresTotal counts cleanup-queue enqueue attempts// that failed after a backend write already succeeded. The orphan// object exists on the backend but the system lost the chance to// track it for retry. stage="enqueue" means the cleanup_queue row// itself did not persist (worst case - cleanup-queue worker will// never see this orphan); stage="orphan_bytes" means the row// persisted but the orphan_bytes counter did not increment (quota// accounting drifts but cleanup still runs). Operators alert on any// non-zero rate of stage="enqueue" and run the reconciler// (POST /admin/api/reconcile) once DB connectivity returns.CleanupEnqueueFailuresTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_cleanup_enqueue_failures_total",
Help: "Cleanup-queue enqueue attempts that failed after a successful backend write",
},
[]string{"backend", "reason", "stage"},
)
// CleanupQueueProcessedTotal counts items processed from the cleanup queue.CleanupQueueProcessedTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_cleanup_queue_processed_total",
Help: "Total items processed from the cleanup retry queue",
},
[]string{"status"},
)
// CleanupQueueDepth tracks the current number of pending cleanup items.CleanupQueueDepth = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_cleanup_queue_depth",
Help: "Current number of pending items in the cleanup retry queue",
},
)
// CleanupDLQDepth tracks the current number of rows in the cleanup// dead-letter table - cleanup_queue rows that exhausted their retry// budget without ever succeeding at the physical backend delete. A// non-zero value means orphan bytes are still on the backend with// no automatic recovery in flight; operators must investigate.CleanupDLQDepth = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_cleanup_dlq_depth",
Help: "Current number of unrecoverable orphans in the cleanup dead-letter queue",
},
)
// CleanupDLQEnqueuedTotal counts cleanup_queue rows graduated to the// dead-letter table per backend, labelled so dashboards can pinpoint// which backend is failing physical deletes.CleanupDLQEnqueuedTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_cleanup_dlq_enqueued_total",
Help: "Total cleanup_queue rows moved to cleanup_dlq after exhausting retries",
},
[]string{"backend"},
)
// CleanupQueueStaleClaimsRecoveredTotal counts cleanup_queue rows whose// claim was reclaimed because the previous holder did not finalise the// row within the configured grace period. A non-zero rate is operational// signal that a worker died mid-process or the grace period is too// short for the realistic worst-case processing time.CleanupQueueStaleClaimsRecoveredTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_cleanup_queue_stale_claims_recovered_total",
Help: "cleanup_queue rows whose stale claim was reclaimed by a later worker tick",
},
[]string{"backend"},
)
// PendingIntentsEnqueuedTotal counts pending intents inserted by the// write path before the backend PUT.PendingIntentsEnqueuedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_pending_intents_enqueued_total",
Help: "Total in-flight PUT intents inserted before the backend write",
},
)
// PendingIntentsResolvedTotal counts intents resolved by the reaper or// the synchronous commit path. Status is one of: committed, promoted,// dropped, ambiguous, already_resolved.PendingIntentsResolvedTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_pending_intents_resolved_total",
Help: "Total pending PUT intents resolved by status",
},
[]string{"status"},
)
// PendingIntentsDepth tracks the current number of unresolved pending// intents in the database.PendingIntentsDepth = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_pending_intents_depth",
Help: "Current number of unresolved pending PUT intents",
},
)
// LifecycleDeletedTotal counts objects deleted by lifecycle expiration rules.LifecycleDeletedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_lifecycle_deleted_total",
Help: "Objects deleted by lifecycle expiration rules",
},
)
// LifecycleFailedTotal counts objects that failed lifecycle deletion.LifecycleFailedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_lifecycle_failed_total",
Help: "Objects that failed lifecycle deletion",
},
)
// LifecycleRunsTotal counts lifecycle worker executions.LifecycleRunsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_lifecycle_runs_total",
Help: "Lifecycle worker executions",
},
[]string{"status"},
)
// DrainObjectsMoved counts objects moved during backend drain operations.DrainObjectsMoved = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_drain_objects_moved_total",
Help: "Total number of objects moved during backend drain operations",
},
)
// DrainBytesMoved counts bytes moved during backend drain operations.DrainBytesMoved = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_drain_bytes_moved_total",
Help: "Total bytes moved during backend drain operations",
},
)
// DrainActive is the live count of in-flight drain operations.// Inc'd on StartDrain and Dec'd on completion (success, cancel, or// abort) so concurrent drains across different backends do not// clobber each other's state the way a Set(0)/Set(1) gauge would.// 0 means no drains are running.DrainActive = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_drain_active",
Help: "Count of in-flight backend drain operations (Inc/Dec so concurrent drains compose)",
},
)
// DrainRaceAbortedTotal counts PutObject attempts that landed bytes// on a backend whose drain started mid-write. The orchestrator// detects the race after the backend PUT completes, deletes the// orphaned bytes, and fails the attempt over to the next eligible// backend; this counter pins how often the race fires in production// so the drain timing assumptions can be revisited if it climbs.DrainRaceAbortedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_drain_race_aborted_total",
Help: "Number of PutObject attempts aborted after drain started mid-write",
},
)
)
EncryptionOpsTotal and related package-level variables used by this package.
var (
// EncryptionOpsTotal counts encryption operations by type.EncryptionOpsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_encryption_operations_total",
Help: "Total encryption operations",
},
[]string{"op"},
)
// EncryptionErrorsTotal counts encryption errors by operation and type.EncryptionErrorsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_encryption_errors_total",
Help: "Total encryption errors",
},
[]string{"op", "error_type"},
)
// EncryptionUnknownKeyIDTotal counts decryption attempts where the keyID// was not found in the configured keys, triggering a primary key fallback.EncryptionUnknownKeyIDTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_encryption_unknown_key_id_total",
Help: "Decryption attempts with unknown keyID (primary key fallback)",
},
)
// IntegrityErrorsTotal counts hash mismatches detected during read,// replication, or background scrubbing.IntegrityErrorsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_integrity_errors_total",
Help: "Content hash mismatches detected",
},
[]string{"operation"},
)
// IntegrityChecksTotal counts hash verifications performed.IntegrityChecksTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_integrity_checks_total",
Help: "Content hash verifications performed",
},
[]string{"operation"},
)
// KeyRotationObjectsTotal counts objects processed during key rotation.KeyRotationObjectsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_key_rotation_objects_total",
Help: "Total objects processed during key rotation",
},
[]string{"status"},
)
// EncryptExistingObjectsTotal counts objects processed during encrypt-existing.EncryptExistingObjectsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_encrypt_existing_objects_total",
Help: "Total objects processed during encrypt-existing operation",
},
[]string{"status"},
)
// DecryptExistingObjectsTotal counts objects processed during decrypt-existing.DecryptExistingObjectsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_decrypt_existing_objects_total",
Help: "Total objects processed during decrypt-existing operation",
},
[]string{"status"},
)
)
BuildInfo and related package-level variables used by this package.
var (
// BuildInfo exposes version information.BuildInfo = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_build_info",
Help: "Build information for the S3 proxy",
},
[]string{"version", "go_version"},
)
// NotificationSentTotal counts successfully delivered webhook notifications.NotificationSentTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_notification_sent_total",
Help: "Webhook notifications delivered successfully",
},
[]string{"endpoint", "event_type"},
)
// NotificationFailedTotal counts webhook delivery failures.NotificationFailedTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_notification_failed_total",
Help: "Webhook notification delivery failures",
},
[]string{"endpoint", "event_type"},
)
// NotificationDroppedTotal counts events dropped due to dampening or enqueue failure.NotificationDroppedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_notification_dropped_total",
Help: "Events dropped due to dampening or queue insertion failure",
},
)
// NotificationStoreErrorsTotal counts outbox-store operation failures in// the delivery worker (CompleteNotification / RetryNotification). A// non-zero value means the worker saw a store error that could cause// duplicate or dropped webhook deliveries - alert on any increase.NotificationStoreErrorsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_notification_store_errors_total",
Help: "Outbox store errors seen by the notification delivery worker",
},
[]string{"operation"},
)
// NotificationQueueDepth reports the number of pending notifications in the outbox.NotificationQueueDepth = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_notification_queue_depth",
Help: "Pending notifications in the delivery outbox",
},
)
// NotificationDuration measures webhook delivery latency.NotificationDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_notification_duration_seconds",
Help: "Webhook notification delivery latency",
Buckets: prometheus.DefBuckets,
},
[]string{"endpoint"},
)
)
QuotaBytesUsed and related package-level variables used by this package.
var (
// QuotaBytesUsed tracks current bytes used per backend.QuotaBytesUsed = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_quota_bytes_used",
Help: "Current bytes used on each backend",
},
[]string{"backend"},
)
// QuotaBytesLimit tracks quota limit per backend.QuotaBytesLimit = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_quota_bytes_limit",
Help: "Quota limit in bytes for each backend",
},
[]string{"backend"},
)
// QuotaBytesAvailable tracks available bytes per backend.QuotaBytesAvailable = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_quota_bytes_available",
Help: "Available bytes (limit - used - orphan) for each backend",
},
[]string{"backend"},
)
// QuotaOrphanBytes tracks bytes pending physical deletion per backend.QuotaOrphanBytes = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_quota_orphan_bytes",
Help: "Bytes pending physical deletion (logically freed but not yet removed from backend)",
},
[]string{"backend"},
)
// ObjectCount tracks the number of objects stored per backend.ObjectCount = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_objects_count",
Help: "Number of objects stored on each backend",
},
[]string{"backend"},
)
// ActiveMultipartUploads tracks in-progress multipart uploads per backend.ActiveMultipartUploads = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_active_multipart_uploads",
Help: "Number of in-progress multipart uploads per backend",
},
[]string{"backend"},
)
// UsageAPIRequests tracks the current month's API request count per backend.UsageAPIRequests = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_usage_api_requests",
Help: "Current month API request count per backend (from DB)",
},
[]string{"backend"},
)
// UsageEgressBytes tracks the current month's egress bytes per backend.UsageEgressBytes = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_usage_egress_bytes",
Help: "Current month egress bytes per backend (from DB)",
},
[]string{"backend"},
)
// UsageIngressBytes tracks the current month's ingress bytes per backend.UsageIngressBytes = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_usage_ingress_bytes",
Help: "Current month ingress bytes per backend (from DB)",
},
[]string{"backend"},
)
// UsageLimitRejectionsTotal counts operations rejected due to monthly usage limits.UsageLimitRejectionsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_usage_limit_rejections_total",
Help: "Total operations rejected due to monthly usage limits",
},
[]string{"operation", "limit_type"},
)
)
RebalanceObjectsMoved and related package-level variables used by this package.
var (
// RebalanceObjectsMoved counts objects moved by the rebalancer.RebalanceObjectsMoved = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_rebalance_objects_moved_total",
Help: "Total number of objects moved by the rebalancer",
},
[]string{"strategy", "status"},
)
// RebalanceBytesMoved counts bytes moved by the rebalancer.RebalanceBytesMoved = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_rebalance_bytes_moved_total",
Help: "Total bytes moved by the rebalancer",
},
[]string{"strategy"},
)
// RebalanceRunsTotal counts rebalancer executions.RebalanceRunsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_rebalance_runs_total",
Help: "Total number of rebalancer runs",
},
[]string{"strategy", "status"},
)
// RebalanceDuration tracks rebalancer execution time.RebalanceDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_rebalance_duration_seconds",
Help: "Rebalancer execution time in seconds",
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
},
[]string{"strategy"},
)
// RebalanceSkipped counts rebalancer runs that were skipped.RebalanceSkipped = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_rebalance_skipped_total",
Help: "Total number of rebalancer runs skipped",
},
[]string{"reason"},
)
// RebalancePending tracks objects planned for rebalance in the current cycle.RebalancePending = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_rebalance_pending",
Help: "Number of objects planned for rebalance",
},
)
)
ReplicationPending and related package-level variables used by this package.
var (
// ReplicationPending tracks objects currently below the target replication factor.ReplicationPending = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_replication_pending",
Help: "Number of objects below the target replication factor",
},
)
// ReplicationCopiesCreatedTotal counts replica copies created.ReplicationCopiesCreatedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_replication_copies_created_total",
Help: "Total number of replica copies created",
},
)
// ReplicationErrorsTotal counts replication errors.ReplicationErrorsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_replication_errors_total",
Help: "Total number of replication errors",
},
)
// ReplicationDuration tracks replication worker cycle time.ReplicationDuration = promauto.NewHistogram(
prometheus.HistogramOpts{
Name: "s3o_replication_duration_seconds",
Help: "Replication worker cycle time in seconds",
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
},
)
// ReplicationRunsTotal counts replication worker executions.ReplicationRunsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_replication_runs_total",
Help: "Total number of replication worker runs",
},
[]string{"status"},
)
// ReplicationHealthCopiesTotal counts copies created to replace copies on// circuit-broken backends.ReplicationHealthCopiesTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_replication_health_copies_total",
Help: "Replica copies created to replace copies on circuit-broken backends",
},
)
// OverReplicationPending tracks objects currently above the target replication factor.OverReplicationPending = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "s3o_over_replication_pending",
Help: "Number of objects above the target replication factor",
},
)
// OverReplicationRemovedTotal counts excess copies removed by the cleaner.OverReplicationRemovedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_over_replication_removed_total",
Help: "Total excess copies removed by over-replication cleanup",
},
)
// OverReplicationErrorsTotal counts over-replication cleanup errors.OverReplicationErrorsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_over_replication_errors_total",
Help: "Total number of over-replication cleanup errors",
},
)
// OverReplicationRunsTotal counts over-replication cleanup worker executions.OverReplicationRunsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_over_replication_runs_total",
Help: "Total number of over-replication cleanup worker runs",
},
[]string{"status"},
)
// OverReplicationDuration tracks over-replication cleanup worker cycle time.OverReplicationDuration = promauto.NewHistogram(
prometheus.HistogramOpts{
Name: "s3o_over_replication_duration_seconds",
Help: "Over-replication cleanup worker cycle time in seconds",
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
},
)
)
RequestsTotal and related package-level variables used by this package.
var (
// RequestsTotal counts all HTTP requests by method and status code.RequestsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_requests_total",
Help: "Total number of HTTP requests processed",
},
[]string{"method", "status_code"},
)
// RequestDuration tracks request latency distribution by method.RequestDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_request_duration_seconds",
Help: "HTTP request latency in seconds",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60},
},
[]string{"method"},
)
// RequestSize tracks upload sizes.RequestSize = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_request_size_bytes",
Help: "HTTP request body size in bytes",
Buckets: prometheus.ExponentialBuckets(1024, 4, 10),
},
[]string{"method"},
)
// ResponseSize tracks download sizes.ResponseSize = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_response_size_bytes",
Help: "HTTP response body size in bytes",
Buckets: prometheus.ExponentialBuckets(1024, 4, 10),
},
[]string{"method"},
)
// InflightRequests tracks currently processing requests.InflightRequests = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_inflight_requests",
Help: "Number of requests currently being processed",
},
[]string{"method"},
)
// BackendRequestsTotal counts backend operations by operation type and status.BackendRequestsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_backend_requests_total",
Help: "Total number of backend storage operations",
},
[]string{"operation", "backend", "status"},
)
// BackendDuration tracks backend operation latency.BackendDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_backend_duration_seconds",
Help: "Backend operation latency in seconds",
Buckets: []float64{.01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120},
},
[]string{"operation", "backend"},
)
// ManagerRequestsTotal counts manager-level operations.ManagerRequestsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_manager_requests_total",
Help: "Total number of manager-level storage operations",
},
[]string{"operation", "backend", "status"},
)
// ManagerDuration tracks manager operation latency.ManagerDuration = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "s3o_manager_duration_seconds",
Help: "Manager operation latency in seconds",
Buckets: []float64{.01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120},
},
[]string{"operation", "backend"},
)
// RateLimitRejectionsTotal counts requests rejected by the per-IP rate limiter.RateLimitRejectionsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_rate_limit_rejections_total",
Help: "Total requests rejected due to per-IP rate limiting",
},
)
// AdmissionRejectionsTotal counts requests rejected by server-level admission control.AdmissionRejectionsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_admission_rejections_total",
Help: "Total requests rejected due to server-level admission control",
},
)
// AdmissionClientCanceledTotal counts requests where the client closed// or cancelled their context while waiting for an admission slot. These// are not server-side rejections and do not count against capacity SLOs.AdmissionClientCanceledTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_admission_client_canceled_total",
Help: "Total admission waits aborted by client context cancellation",
},
)
// WorkerAdmissionRejectionsTotal counts background worker tasks that were// skipped because the admission semaphore was full.WorkerAdmissionRejectionsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_worker_admission_rejections_total",
Help: "Background worker tasks skipped due to admission control",
},
[]string{"worker"},
)
// LoadShedTotal counts requests probabilistically rejected by active// load shedding before reaching the hard admission limit.LoadShedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_load_shed_total",
Help: "Requests shed by active load shedding before hard admission limit",
},
)
// EarlyRejectionsTotal counts uploads rejected before body transmission// via Expect: 100-Continue pre-flight capacity checks.EarlyRejectionsTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_early_rejections_total",
Help: "Uploads rejected before body transmission (no backend capacity)",
},
)
// ListPagesCappedTotal counts ListObjects calls that exit because the// per-request DB round-trip cap (listObjectsMaxPages) was reached while// the store still had more pages. A non-zero value means real workloads// hit the cap and the bound may need tuning; persistent zero means the// cap is over-provisioned and safe to leave alone.ListPagesCappedTotal = promauto.NewCounter(
prometheus.CounterOpts{
Name: "s3o_list_pages_capped_total",
Help: "ListObjects calls that exited at the per-request page cap with more pages remaining",
},
)
// HTTPPanicRecoveredTotal counts panics caught by the HTTP panic-// recovery middleware (#798). A non-zero value means a request// handler panicked and the recovery layer translated it into a 500// response; the matching slog.ErrorContext line carries the panic// value, the captured stack, and the request id. Label scopes the// counter to the route group so dashboards can distinguish a// flaking S3 path from a flaking admin or UI path.HTTPPanicRecoveredTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_http_panic_recovered_total",
Help: "HTTP handler panics caught by the recovery middleware, by route group",
},
[]string{"route"},
)
)
var (
// WorkerTicksTotal counts each completed tick of a locked-ticker// service, labelled by service name and outcome. Outcomes:// success - work returned nil// error - work returned a non-nil error// skipped - shouldRun gated the tick or the lock was busyWorkerTicksTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_worker_ticks_total",
Help: "Locked-ticker service ticks by service and outcome",
},
[]string{"service", "result"},
)
// WorkerLastSuccessTimestampSeconds records the Unix time of the// most recent successful tick for each service. Alert on staleness// with time() minus this gauge.WorkerLastSuccessTimestampSeconds = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_worker_last_success_timestamp_seconds",
Help: "Unix time of the most recent successful tick per service",
},
[]string{"service"},
)
// WorkerConsecutiveFailures gauges the run of back-to-back failed// ticks for each service. Resets to 0 on the next success.WorkerConsecutiveFailures = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "s3o_worker_consecutive_failures",
Help: "Consecutive failed ticks per service since the last success",
},
[]string{"service"},
)
)
AuditEventsTotal and related package-level variables used by this package.
var (
// AuditEventsTotal counts audit log entries by event type.AuditEventsTotal = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "s3o_audit_events_total",
Help: "Total number of audit log entries emitted",
},
[]string{"event"},
)
)
AuthStreamingRejectionsTotal counts streaming-payload requests rejected due to a chunk- or trailer-validation failure, labeled by reason. Reasons map 1:1 to the auth.Err* sentinels so dashboards can distinguish a tampered body (chunk_signature_mismatch) from a malformed framing attempt (chunk_malformed).
AuthStreamingRequestsTotal counts streaming-payload requests received, labeled by AWS streaming variant. Incremented at the moment the transport layer wraps the request body in a chunk-validating reader.
InitTracer initializes the OpenTelemetry tracer with OTLP export. Returns a shutdown function that should be called on service termination to flush spans.
NewCircuitBreakerHook returns the callback to install on a breaker so its state changes drive the CircuitBreakerState gauge, the CircuitBreakerTransitionsTotal counter, and the BackendCircuitOpened / BackendCircuitClosed events.
Initializes the gauge to “closed” up front so Prometheus reports a value before the first transition.
Entries returns buffered log entries matching the query options. Results are returned in chronological order (oldest first).
The lock is held only long enough to snapshot the ring buffer state. Filtering and result construction happen outside the lock so concurrent Add calls are not blocked by slow dashboard queries.
type LogEntry
LogEntry is a single structured log record stored in the buffer.
LogQueryOpts controls filtering when reading from the buffer.
// LogQueryOpts controls filtering when reading from the buffer.
type LogQueryOpts struct {
	// MinLevel is the minimum severity to return.
	// NOTE(review): the original comment read "default 0 = DEBUG", but the
	// zero value of slog.Level is slog.LevelInfo (slog.LevelDebug is -4);
	// confirm how the reader treats an unset MinLevel.
	MinLevel  slog.Level
	Since     time.Time // only entries after this time
	Before    time.Time // only entries before this time
	Limit     int       // max entries to return (0 = all)
	Component string    // filter by "component" attribute value
}
type TeeHandler
TeeHandler is an slog.Handler that writes each log record to a primary handler (typically JSON to stdout) and also captures it in a LogBuffer.
// TeeHandler is an slog.Handler that writes each log record to a primary
// handler (typically JSON to stdout) and also captures it in a LogBuffer.
type TeeHandler struct {
	// contains filtered or unexported fields
}