You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
686 lines
22 KiB
686 lines
22 KiB
"""
|
|
OpenTelemetry metrics collector for redis-py.
|
|
|
|
This module defines and manages all metric instruments according to
|
|
OTel semantic conventions for database clients.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from enum import Enum
|
|
from typing import TYPE_CHECKING, Callable, Optional
|
|
|
|
if TYPE_CHECKING:
|
|
from redis.connection import ConnectionPoolInterface
|
|
from redis.multidb.database import SyncDatabase
|
|
|
|
from redis.observability.attributes import (
|
|
REDIS_CLIENT_CONNECTION_CLOSE_REASON,
|
|
REDIS_CLIENT_CONNECTION_NOTIFICATION,
|
|
AttributeBuilder,
|
|
CSCReason,
|
|
CSCResult,
|
|
GeoFailoverReason,
|
|
PubSubDirection,
|
|
get_pool_name,
|
|
)
|
|
from redis.observability.config import MetricGroup, OTelConfig
|
|
from redis.utils import deprecated_args
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Optional imports - OTel SDK may not be installed
try:
    from opentelemetry.metrics import Meter

    OTEL_AVAILABLE = True
except ImportError:
    OTEL_AVAILABLE = False
    # Fallback placeholders so module-level annotations/references resolve
    # even without the OTel SDK installed.
    # NOTE(review): only Meter is imported in the try-branch above, yet
    # Counter/Histogram/UpDownCounter are also stubbed here — presumably
    # leftovers from an earlier import list; confirm they are still
    # referenced anywhere before removing.
    Counter = None
    Histogram = None
    Meter = None
    UpDownCounter = None
|
|
|
|
|
|
class CloseReason(Enum):
    """
    Enum representing the reason why a Redis client connection was closed.

    The `.value` string is exported as the
    `redis.client.connection.close_reason` metric attribute
    (see record_connection_closed).

    Values:
        APPLICATION_CLOSE: The connection was closed intentionally by the application
            (for example, during normal shutdown or explicit cleanup).
        ERROR: The connection was closed due to an unexpected error
            (for example, network failure or protocol error).
        HEALTHCHECK_FAILED: The connection was closed because a health check
            or liveness check for the connection failed.
    """

    APPLICATION_CLOSE = "application_close"
    ERROR = "error"
    HEALTHCHECK_FAILED = "healthcheck_failed"
|
|
|
|
|
|
class RedisMetricsCollector:
    """
    Collects and records OpenTelemetry metrics for Redis operations.

    This class manages all metric instruments and provides methods to record
    various Redis operations including connection pool events, command execution,
    and cluster-specific operations.

    Instruments are created only for the metric groups enabled in the supplied
    config; each `record_*` method silently no-ops when its instrument was not
    created.

    Args:
        meter: OpenTelemetry Meter instance
        config: OTel configuration object
    """

    # Instrumentation-scope identity reported to the OTel SDK.
    METER_NAME = "redis-py"
    METER_VERSION = "1.0.0"
|
|
|
|
def __init__(self, meter: Meter, config: OTelConfig):
|
|
if not OTEL_AVAILABLE:
|
|
raise ImportError(
|
|
"OpenTelemetry API is not installed. "
|
|
"Install it with: pip install opentelemetry-api"
|
|
)
|
|
|
|
self.meter = meter
|
|
self.config = config
|
|
self.attr_builder = AttributeBuilder()
|
|
self.connection_count = None
|
|
|
|
# Initialize enabled metric instruments
|
|
|
|
if MetricGroup.RESILIENCY in self.config.metric_groups:
|
|
self._init_resiliency_metrics()
|
|
|
|
if MetricGroup.COMMAND in self.config.metric_groups:
|
|
self._init_command_metrics()
|
|
|
|
if MetricGroup.CONNECTION_BASIC in self.config.metric_groups:
|
|
self._init_connection_basic_metrics()
|
|
|
|
if MetricGroup.CONNECTION_ADVANCED in self.config.metric_groups:
|
|
self._init_connection_advanced_metrics()
|
|
|
|
if MetricGroup.PUBSUB in self.config.metric_groups:
|
|
self._init_pubsub_metrics()
|
|
|
|
if MetricGroup.STREAMING in self.config.metric_groups:
|
|
self._init_streaming_metrics()
|
|
|
|
if MetricGroup.CSC in self.config.metric_groups:
|
|
self._init_csc_metrics()
|
|
|
|
logger.info("RedisMetricsCollector initialized")
|
|
|
|
def _init_resiliency_metrics(self) -> None:
|
|
"""Initialize resiliency metrics."""
|
|
self.client_errors = self.meter.create_counter(
|
|
name="redis.client.errors",
|
|
unit="{error}",
|
|
description="A counter of all errors (both returned to the user and handled internally in the client library)",
|
|
)
|
|
|
|
self.maintenance_notifications = self.meter.create_counter(
|
|
name="redis.client.maintenance.notifications",
|
|
unit="{notification}",
|
|
description="Tracks server-side maintenance notifications",
|
|
)
|
|
|
|
self.geo_failovers = self.meter.create_counter(
|
|
name="redis.client.geofailover.failovers",
|
|
unit="{geofailover}",
|
|
description="Total count of failovers happened using MultiDbClient.",
|
|
)
|
|
|
|
def _init_connection_basic_metrics(self) -> None:
|
|
"""Initialize basic connection metrics."""
|
|
self.connection_create_time = self.meter.create_histogram(
|
|
name="db.client.connection.create_time",
|
|
unit="s",
|
|
description="Time to create a new connection",
|
|
explicit_bucket_boundaries_advisory=self.config.buckets_connection_create_time,
|
|
)
|
|
|
|
self.connection_relaxed_timeout = self.meter.create_up_down_counter(
|
|
name="redis.client.connection.relaxed_timeout",
|
|
unit="{relaxation}",
|
|
description="Counts up for relaxed timeout, counts down for unrelaxed timeout",
|
|
)
|
|
|
|
self.connection_handoff = self.meter.create_counter(
|
|
name="redis.client.connection.handoff",
|
|
unit="{handoff}",
|
|
description="Connections that have been handed off (e.g., after a MOVING notification)",
|
|
)
|
|
|
|
def _init_connection_advanced_metrics(self) -> None:
|
|
"""Initialize advanced connection metrics."""
|
|
self.connection_timeouts = self.meter.create_counter(
|
|
name="db.client.connection.timeouts",
|
|
unit="{timeout}",
|
|
description="The number of connection timeouts that have occurred trying to obtain a connection from the pool.",
|
|
)
|
|
|
|
self.connection_wait_time = self.meter.create_histogram(
|
|
name="db.client.connection.wait_time",
|
|
unit="s",
|
|
description="Time to obtain an open connection from the pool",
|
|
explicit_bucket_boundaries_advisory=self.config.buckets_connection_wait_time,
|
|
)
|
|
|
|
self.connection_closed = self.meter.create_counter(
|
|
name="redis.client.connection.closed",
|
|
unit="{connection}",
|
|
description="Total number of closed connections",
|
|
)
|
|
|
|
def _init_command_metrics(self) -> None:
|
|
"""Initialize command execution metric instruments."""
|
|
self.operation_duration = self.meter.create_histogram(
|
|
name="db.client.operation.duration",
|
|
unit="s",
|
|
description="Command execution duration",
|
|
explicit_bucket_boundaries_advisory=self.config.buckets_operation_duration,
|
|
)
|
|
|
|
def _init_pubsub_metrics(self) -> None:
|
|
"""Initialize PubSub metric instruments."""
|
|
self.pubsub_messages = self.meter.create_counter(
|
|
name="redis.client.pubsub.messages",
|
|
unit="{message}",
|
|
description="Tracks published and received messages",
|
|
)
|
|
|
|
def _init_streaming_metrics(self) -> None:
|
|
"""Initialize Streaming metric instruments."""
|
|
self.stream_lag = self.meter.create_histogram(
|
|
name="redis.client.stream.lag",
|
|
unit="s",
|
|
description="End-to-end lag per message, showing how stale are the messages when the application starts processing them.",
|
|
explicit_bucket_boundaries_advisory=self.config.buckets_stream_processing_duration,
|
|
)
|
|
|
|
def _init_csc_metrics(self) -> None:
|
|
"""Initialize Client Side Caching (CSC) metric instruments."""
|
|
self.csc_requests = self.meter.create_counter(
|
|
name="redis.client.csc.requests",
|
|
unit="{request}",
|
|
description="The total number of requests to the cache",
|
|
)
|
|
|
|
self.csc_evictions = self.meter.create_counter(
|
|
name="redis.client.csc.evictions",
|
|
unit="{eviction}",
|
|
description="The total number of cache evictions",
|
|
)
|
|
|
|
self.csc_network_saved = self.meter.create_counter(
|
|
name="redis.client.csc.network_saved",
|
|
unit="By",
|
|
description="The total number of bytes saved by using CSC",
|
|
)
|
|
|
|
# Resiliency metric recording methods
|
|
|
|
def record_error_count(
|
|
self,
|
|
server_address: Optional[str] = None,
|
|
server_port: Optional[int] = None,
|
|
network_peer_address: Optional[str] = None,
|
|
network_peer_port: Optional[int] = None,
|
|
error_type: Optional[Exception] = None,
|
|
retry_attempts: Optional[int] = None,
|
|
is_internal: Optional[bool] = None,
|
|
):
|
|
"""
|
|
Record error count
|
|
|
|
Args:
|
|
server_address: Server address
|
|
server_port: Server port
|
|
network_peer_address: Network peer address
|
|
network_peer_port: Network peer port
|
|
error_type: Error type
|
|
retry_attempts: Retry attempts
|
|
is_internal: Whether the error is internal (e.g., timeout, network error)
|
|
"""
|
|
if not hasattr(self, "client_errors"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_base_attributes(
|
|
server_address=server_address,
|
|
server_port=server_port,
|
|
)
|
|
attrs.update(
|
|
self.attr_builder.build_operation_attributes(
|
|
network_peer_address=network_peer_address,
|
|
network_peer_port=network_peer_port,
|
|
retry_attempts=retry_attempts,
|
|
)
|
|
)
|
|
|
|
attrs.update(
|
|
self.attr_builder.build_error_attributes(
|
|
error_type=error_type,
|
|
is_internal=is_internal,
|
|
)
|
|
)
|
|
|
|
self.client_errors.add(1, attributes=attrs)
|
|
|
|
def record_maint_notification_count(
|
|
self,
|
|
server_address: str,
|
|
server_port: int,
|
|
network_peer_address: str,
|
|
network_peer_port: int,
|
|
maint_notification: str,
|
|
):
|
|
"""
|
|
Record maintenance notification count
|
|
|
|
Args:
|
|
server_address: Server address
|
|
server_port: Server port
|
|
network_peer_address: Network peer address
|
|
network_peer_port: Network peer port
|
|
maint_notification: Maintenance notification
|
|
"""
|
|
if not hasattr(self, "maintenance_notifications"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_base_attributes(
|
|
server_address=server_address,
|
|
server_port=server_port,
|
|
)
|
|
|
|
attrs.update(
|
|
self.attr_builder.build_operation_attributes(
|
|
network_peer_address=network_peer_address,
|
|
network_peer_port=network_peer_port,
|
|
)
|
|
)
|
|
|
|
attrs[REDIS_CLIENT_CONNECTION_NOTIFICATION] = maint_notification
|
|
self.maintenance_notifications.add(1, attributes=attrs)
|
|
|
|
def record_geo_failover(
|
|
self,
|
|
fail_from: "SyncDatabase",
|
|
fail_to: "SyncDatabase",
|
|
reason: GeoFailoverReason,
|
|
):
|
|
"""
|
|
Record geo failover
|
|
|
|
Args:
|
|
fail_from: Database failed from
|
|
fail_to: Database failed to
|
|
reason: Reason for the failover
|
|
"""
|
|
|
|
if not hasattr(self, "geo_failovers"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_geo_failover_attributes(
|
|
fail_from=fail_from,
|
|
fail_to=fail_to,
|
|
reason=reason,
|
|
)
|
|
|
|
return self.geo_failovers.add(1, attributes=attrs)
|
|
|
|
    def init_connection_count(
        self,
        callback: Callable,
    ) -> None:
        """
        Initialize observable gauge for connection count metric.

        Args:
            callback: Callback function to retrieve connection count
        """
        # NOTE(review): this guard only bails out when the group is disabled
        # AND the gauge has not been created yet. A second call while the
        # group is enabled re-creates (duplicates) the observable gauge, and
        # a call with the group disabled but connection_count already set
        # would also proceed — confirm whether `or self.connection_count`
        # was the intended condition.
        if (
            MetricGroup.CONNECTION_BASIC not in self.config.metric_groups
            and not self.connection_count
        ):
            return

        self.connection_count = self.meter.create_observable_gauge(
            name="db.client.connection.count",
            unit="{connection}",
            description="Number of connections in the pool",
            callbacks=[callback],
        )
|
|
|
|
def init_csc_items(
|
|
self,
|
|
callback: Callable,
|
|
) -> None:
|
|
"""
|
|
Initialize observable gauge for CSC items metric.
|
|
|
|
Args:
|
|
callback: Callback function to retrieve CSC items count
|
|
"""
|
|
if MetricGroup.CSC not in self.config.metric_groups and not self.csc_items:
|
|
return
|
|
|
|
self.csc_items = self.meter.create_observable_gauge(
|
|
name="redis.client.csc.items",
|
|
unit="{item}",
|
|
description="The total number of cached responses currently stored",
|
|
callbacks=[callback],
|
|
)
|
|
|
|
def record_connection_timeout(self, pool_name: str) -> None:
|
|
"""
|
|
Record a connection timeout event.
|
|
|
|
Args:
|
|
pool_name: Connection pool name
|
|
"""
|
|
if not hasattr(self, "connection_timeouts"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_connection_attributes(pool_name=pool_name)
|
|
self.connection_timeouts.add(1, attributes=attrs)
|
|
|
|
def record_connection_create_time(
|
|
self,
|
|
connection_pool: "ConnectionPoolInterface",
|
|
duration_seconds: float,
|
|
) -> None:
|
|
"""
|
|
Record time taken to create a new connection.
|
|
|
|
Args:
|
|
connection_pool: Connection pool implementation
|
|
duration_seconds: Creation time in seconds
|
|
"""
|
|
if not hasattr(self, "connection_create_time"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_connection_attributes(
|
|
pool_name=get_pool_name(connection_pool)
|
|
)
|
|
self.connection_create_time.record(duration_seconds, attributes=attrs)
|
|
|
|
def record_connection_wait_time(
|
|
self,
|
|
pool_name: str,
|
|
duration_seconds: float,
|
|
) -> None:
|
|
"""
|
|
Record time taken to obtain a connection from the pool.
|
|
|
|
Args:
|
|
pool_name: Connection pool name
|
|
duration_seconds: Wait time in seconds
|
|
"""
|
|
if not hasattr(self, "connection_wait_time"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_connection_attributes(pool_name=pool_name)
|
|
self.connection_wait_time.record(duration_seconds, attributes=attrs)
|
|
|
|
# Command execution metric recording methods
|
|
|
|
@deprecated_args(
|
|
args_to_warn=["batch_size"],
|
|
reason="The batch_size argument is no longer used and will be removed in the next major version.",
|
|
version="7.2.1",
|
|
)
|
|
def record_operation_duration(
|
|
self,
|
|
command_name: str,
|
|
duration_seconds: float,
|
|
server_address: Optional[str] = None,
|
|
server_port: Optional[int] = None,
|
|
db_namespace: Optional[int] = None,
|
|
batch_size: Optional[int] = None, # noqa
|
|
error_type: Optional[Exception] = None,
|
|
network_peer_address: Optional[str] = None,
|
|
network_peer_port: Optional[int] = None,
|
|
retry_attempts: Optional[int] = None,
|
|
is_blocking: Optional[bool] = None,
|
|
) -> None:
|
|
"""
|
|
Record command execution duration.
|
|
|
|
Args:
|
|
command_name: Redis command name (e.g., 'GET', 'SET', 'MULTI')
|
|
duration_seconds: Execution time in seconds
|
|
server_address: Redis server address
|
|
server_port: Redis server port
|
|
db_namespace: Redis database index
|
|
batch_size: Number of commands in batch (for pipelines/transactions)
|
|
error_type: Error type if operation failed
|
|
network_peer_address: Resolved peer address
|
|
network_peer_port: Peer port number
|
|
retry_attempts: Number of retry attempts made
|
|
is_blocking: Whether the operation is a blocking command
|
|
"""
|
|
if not hasattr(self, "operation_duration"):
|
|
return
|
|
|
|
# Check if this command should be tracked
|
|
if not self.config.should_track_command(command_name):
|
|
return
|
|
|
|
# Build attributes
|
|
attrs = self.attr_builder.build_base_attributes(
|
|
server_address=server_address,
|
|
server_port=server_port,
|
|
db_namespace=db_namespace,
|
|
)
|
|
|
|
attrs.update(
|
|
self.attr_builder.build_operation_attributes(
|
|
command_name=command_name,
|
|
network_peer_address=network_peer_address,
|
|
network_peer_port=network_peer_port,
|
|
retry_attempts=retry_attempts,
|
|
is_blocking=is_blocking,
|
|
)
|
|
)
|
|
|
|
attrs.update(
|
|
self.attr_builder.build_error_attributes(
|
|
error_type=error_type,
|
|
)
|
|
)
|
|
self.operation_duration.record(duration_seconds, attributes=attrs)
|
|
|
|
def record_connection_closed(
|
|
self,
|
|
close_reason: Optional[CloseReason] = None,
|
|
error_type: Optional[Exception] = None,
|
|
) -> None:
|
|
"""
|
|
Record a connection closed event.
|
|
|
|
Args:
|
|
close_reason: Reason for closing (e.g. 'error', 'application_close')
|
|
error_type: Error type if closed due to error
|
|
"""
|
|
if not hasattr(self, "connection_closed"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_connection_attributes()
|
|
if close_reason:
|
|
attrs[REDIS_CLIENT_CONNECTION_CLOSE_REASON] = close_reason.value
|
|
|
|
attrs.update(
|
|
self.attr_builder.build_error_attributes(
|
|
error_type=error_type,
|
|
)
|
|
)
|
|
|
|
self.connection_closed.add(1, attributes=attrs)
|
|
|
|
def record_connection_relaxed_timeout(
|
|
self,
|
|
connection_name: str,
|
|
maint_notification: str,
|
|
relaxed: bool,
|
|
) -> None:
|
|
"""
|
|
Record a connection timeout relaxation event.
|
|
|
|
Args:
|
|
connection_name: Connection name
|
|
maint_notification: Maintenance notification type
|
|
relaxed: True to count up (relaxed), False to count down (unrelaxed)
|
|
"""
|
|
if not hasattr(self, "connection_relaxed_timeout"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_connection_attributes(
|
|
connection_name=connection_name
|
|
)
|
|
attrs[REDIS_CLIENT_CONNECTION_NOTIFICATION] = maint_notification
|
|
self.connection_relaxed_timeout.add(1 if relaxed else -1, attributes=attrs)
|
|
|
|
def record_connection_handoff(
|
|
self,
|
|
pool_name: str,
|
|
) -> None:
|
|
"""
|
|
Record a connection handoff event (e.g., after MOVING notification).
|
|
|
|
Args:
|
|
pool_name: Connection pool name
|
|
"""
|
|
if not hasattr(self, "connection_handoff"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_connection_attributes(pool_name=pool_name)
|
|
self.connection_handoff.add(1, attributes=attrs)
|
|
|
|
# PubSub metric recording methods
|
|
|
|
def record_pubsub_message(
|
|
self,
|
|
direction: PubSubDirection,
|
|
channel: Optional[str] = None,
|
|
sharded: Optional[bool] = None,
|
|
) -> None:
|
|
"""
|
|
Record a PubSub message (published or received).
|
|
|
|
Args:
|
|
direction: Message direction ('publish' or 'receive')
|
|
channel: Pub/Sub channel name
|
|
sharded: True if sharded Pub/Sub channel
|
|
"""
|
|
if not hasattr(self, "pubsub_messages"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_pubsub_message_attributes(
|
|
direction=direction,
|
|
channel=channel,
|
|
sharded=sharded,
|
|
)
|
|
self.pubsub_messages.add(1, attributes=attrs)
|
|
|
|
# Streaming metric recording methods
|
|
|
|
@deprecated_args(
|
|
args_to_warn=["consumer_name"],
|
|
reason="The consumer_name argument is no longer used and will be removed in the next major version.",
|
|
version="7.2.1",
|
|
)
|
|
def record_streaming_lag(
|
|
self,
|
|
lag_seconds: float,
|
|
stream_name: Optional[str] = None,
|
|
consumer_group: Optional[str] = None,
|
|
consumer_name: Optional[str] = None, # noqa
|
|
) -> None:
|
|
"""
|
|
Record the lag of a streaming message.
|
|
|
|
Args:
|
|
lag_seconds: Lag in seconds
|
|
stream_name: Stream name
|
|
consumer_group: Consumer group name
|
|
consumer_name: Consumer name
|
|
"""
|
|
if not hasattr(self, "stream_lag"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_streaming_attributes(
|
|
stream_name=stream_name,
|
|
consumer_group=consumer_group,
|
|
)
|
|
self.stream_lag.record(lag_seconds, attributes=attrs)
|
|
|
|
# CSC metric recording methods
|
|
|
|
def record_csc_request(
|
|
self,
|
|
result: Optional[CSCResult] = None,
|
|
) -> None:
|
|
"""
|
|
Record a Client Side Caching (CSC) request.
|
|
|
|
Args:
|
|
result: CSC result ('hit' or 'miss')
|
|
"""
|
|
if not hasattr(self, "csc_requests"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_csc_attributes(result=result)
|
|
self.csc_requests.add(1, attributes=attrs)
|
|
|
|
def record_csc_eviction(
|
|
self,
|
|
count: int,
|
|
reason: Optional[CSCReason] = None,
|
|
) -> None:
|
|
"""
|
|
Record a Client Side Caching (CSC) eviction.
|
|
|
|
Args:
|
|
count: Number of evictions
|
|
reason: Reason for eviction
|
|
"""
|
|
if not hasattr(self, "csc_evictions"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_csc_attributes(reason=reason)
|
|
self.csc_evictions.add(count, attributes=attrs)
|
|
|
|
def record_csc_network_saved(
|
|
self,
|
|
bytes_saved: int,
|
|
) -> None:
|
|
"""
|
|
Record the number of bytes saved by using Client Side Caching (CSC).
|
|
|
|
Args:
|
|
bytes_saved: Number of bytes saved
|
|
"""
|
|
if not hasattr(self, "csc_network_saved"):
|
|
return
|
|
|
|
attrs = self.attr_builder.build_csc_attributes()
|
|
self.csc_network_saved.add(bytes_saved, attributes=attrs)
|
|
|
|
# Utility methods
|
|
|
|
    @staticmethod
    def monotonic_time() -> float:
        """
        Get monotonic time for duration measurements.

        Monotonic, so unaffected by system clock adjustments; only
        differences between two readings are meaningful.

        Returns:
            Current monotonic time in seconds
        """
        return time.monotonic()
|
|
|
|
def __repr__(self) -> str:
|
|
return f"RedisMetricsCollector(meter={self.meter}, config={self.config})"
|