# (Removed repository web-viewer residue: topic-selection hint and the "686 lines / 22 KiB" file banner.)

"""
OpenTelemetry metrics collector for redis-py.
This module defines and manages all metric instruments according to
OTel semantic conventions for database clients.
"""
import logging
import time
from enum import Enum
from typing import TYPE_CHECKING, Callable, Optional
if TYPE_CHECKING:
from redis.connection import ConnectionPoolInterface
from redis.multidb.database import SyncDatabase
from redis.observability.attributes import (
REDIS_CLIENT_CONNECTION_CLOSE_REASON,
REDIS_CLIENT_CONNECTION_NOTIFICATION,
AttributeBuilder,
CSCReason,
CSCResult,
GeoFailoverReason,
PubSubDirection,
get_pool_name,
)
from redis.observability.config import MetricGroup, OTelConfig
from redis.utils import deprecated_args
logger = logging.getLogger(__name__)

# The OpenTelemetry API is an optional dependency. Probe for it once at import
# time and expose the outcome through OTEL_AVAILABLE.
try:
    from opentelemetry.metrics import Meter
except ImportError:
    # Bind placeholders so that references to these names elsewhere in the
    # module resolve even without OpenTelemetry installed. Note that Counter,
    # Histogram and UpDownCounter are only bound on this failure path.
    OTEL_AVAILABLE = False
    Counter = Histogram = Meter = UpDownCounter = None
else:
    OTEL_AVAILABLE = True
class CloseReason(Enum):
    """
    Why a Redis client connection was closed.

    The member value is the string recorded as the close-reason attribute on
    the closed-connections metric.

    Members:
        APPLICATION_CLOSE: Closed on purpose by the application, e.g. during
            normal shutdown or explicit cleanup.
        ERROR: Closed because of an unexpected failure, e.g. a network or
            protocol error.
        HEALTHCHECK_FAILED: Closed because a health/liveness check on the
            connection did not pass.
    """

    # Intentional close initiated by the application.
    APPLICATION_CLOSE = "application_close"
    # Close triggered by an unexpected error.
    ERROR = "error"
    # Close triggered by a failed health check.
    HEALTHCHECK_FAILED = "healthcheck_failed"
class RedisMetricsCollector:
    """
    Collects and records OpenTelemetry metrics for Redis operations.

    This class manages all metric instruments and provides methods to record
    various Redis operations including connection pool events, command
    execution, and cluster-specific operations.

    Instruments are created per metric group: only the groups present in
    ``config.metric_groups`` get their instruments created in ``__init__``.
    Each ``record_*`` method checks for its instrument first and silently
    returns when the instrument was not created, so callers do not need to
    know which groups are enabled.

    Args:
        meter: OpenTelemetry Meter instance
        config: OTel configuration object
    """

    METER_NAME = "redis-py"
    METER_VERSION = "1.0.0"

    def __init__(self, meter: Meter, config: OTelConfig):
        """
        Create the collector and the instruments of every enabled metric group.

        Args:
            meter: OpenTelemetry Meter used to create all instruments
            config: OTel configuration; ``metric_groups`` selects which
                instrument groups are created

        Raises:
            ImportError: If the OpenTelemetry API could not be imported.
        """
        if not OTEL_AVAILABLE:
            raise ImportError(
                "OpenTelemetry API is not installed. "
                "Install it with: pip install opentelemetry-api"
            )

        self.meter = meter
        self.config = config
        self.attr_builder = AttributeBuilder()

        # Observable gauges are created on demand by init_connection_count()
        # and init_csc_items(). Both guards read these attributes, so they
        # must exist up front; otherwise the guard itself raises
        # AttributeError when the corresponding group is disabled.
        self.connection_count = None
        self.csc_items = None

        # Initialize enabled metric instruments
        if MetricGroup.RESILIENCY in self.config.metric_groups:
            self._init_resiliency_metrics()

        if MetricGroup.COMMAND in self.config.metric_groups:
            self._init_command_metrics()

        if MetricGroup.CONNECTION_BASIC in self.config.metric_groups:
            self._init_connection_basic_metrics()

        if MetricGroup.CONNECTION_ADVANCED in self.config.metric_groups:
            self._init_connection_advanced_metrics()

        if MetricGroup.PUBSUB in self.config.metric_groups:
            self._init_pubsub_metrics()

        if MetricGroup.STREAMING in self.config.metric_groups:
            self._init_streaming_metrics()

        if MetricGroup.CSC in self.config.metric_groups:
            self._init_csc_metrics()

        logger.info("RedisMetricsCollector initialized")

    def _init_resiliency_metrics(self) -> None:
        """Initialize resiliency metrics (errors, maintenance, geo failover)."""
        self.client_errors = self.meter.create_counter(
            name="redis.client.errors",
            unit="{error}",
            description="A counter of all errors (both returned to the user and handled internally in the client library)",
        )

        self.maintenance_notifications = self.meter.create_counter(
            name="redis.client.maintenance.notifications",
            unit="{notification}",
            description="Tracks server-side maintenance notifications",
        )

        self.geo_failovers = self.meter.create_counter(
            name="redis.client.geofailover.failovers",
            unit="{geofailover}",
            description="Total count of failovers happened using MultiDbClient.",
        )

    def _init_connection_basic_metrics(self) -> None:
        """Initialize basic connection metrics."""
        self.connection_create_time = self.meter.create_histogram(
            name="db.client.connection.create_time",
            unit="s",
            description="Time to create a new connection",
            explicit_bucket_boundaries_advisory=self.config.buckets_connection_create_time,
        )

        self.connection_relaxed_timeout = self.meter.create_up_down_counter(
            name="redis.client.connection.relaxed_timeout",
            unit="{relaxation}",
            description="Counts up for relaxed timeout, counts down for unrelaxed timeout",
        )

        self.connection_handoff = self.meter.create_counter(
            name="redis.client.connection.handoff",
            unit="{handoff}",
            description="Connections that have been handed off (e.g., after a MOVING notification)",
        )

    def _init_connection_advanced_metrics(self) -> None:
        """Initialize advanced connection metrics."""
        self.connection_timeouts = self.meter.create_counter(
            name="db.client.connection.timeouts",
            unit="{timeout}",
            description="The number of connection timeouts that have occurred trying to obtain a connection from the pool.",
        )

        self.connection_wait_time = self.meter.create_histogram(
            name="db.client.connection.wait_time",
            unit="s",
            description="Time to obtain an open connection from the pool",
            explicit_bucket_boundaries_advisory=self.config.buckets_connection_wait_time,
        )

        self.connection_closed = self.meter.create_counter(
            name="redis.client.connection.closed",
            unit="{connection}",
            description="Total number of closed connections",
        )

    def _init_command_metrics(self) -> None:
        """Initialize command execution metric instruments."""
        self.operation_duration = self.meter.create_histogram(
            name="db.client.operation.duration",
            unit="s",
            description="Command execution duration",
            explicit_bucket_boundaries_advisory=self.config.buckets_operation_duration,
        )

    def _init_pubsub_metrics(self) -> None:
        """Initialize PubSub metric instruments."""
        self.pubsub_messages = self.meter.create_counter(
            name="redis.client.pubsub.messages",
            unit="{message}",
            description="Tracks published and received messages",
        )

    def _init_streaming_metrics(self) -> None:
        """Initialize Streaming metric instruments."""
        self.stream_lag = self.meter.create_histogram(
            name="redis.client.stream.lag",
            unit="s",
            description="End-to-end lag per message, showing how stale are the messages when the application starts processing them.",
            explicit_bucket_boundaries_advisory=self.config.buckets_stream_processing_duration,
        )

    def _init_csc_metrics(self) -> None:
        """Initialize Client Side Caching (CSC) metric instruments."""
        self.csc_requests = self.meter.create_counter(
            name="redis.client.csc.requests",
            unit="{request}",
            description="The total number of requests to the cache",
        )

        self.csc_evictions = self.meter.create_counter(
            name="redis.client.csc.evictions",
            unit="{eviction}",
            description="The total number of cache evictions",
        )

        self.csc_network_saved = self.meter.create_counter(
            name="redis.client.csc.network_saved",
            unit="By",
            description="The total number of bytes saved by using CSC",
        )

    # Resiliency metric recording methods

    def record_error_count(
        self,
        server_address: Optional[str] = None,
        server_port: Optional[int] = None,
        network_peer_address: Optional[str] = None,
        network_peer_port: Optional[int] = None,
        error_type: Optional[Exception] = None,
        retry_attempts: Optional[int] = None,
        is_internal: Optional[bool] = None,
    ) -> None:
        """
        Record error count.

        No-op when the resiliency instruments were not created.

        Args:
            server_address: Server address
            server_port: Server port
            network_peer_address: Network peer address
            network_peer_port: Network peer port
            error_type: Error type
            retry_attempts: Retry attempts
            is_internal: Whether the error is internal (e.g., timeout, network error)
        """
        if not hasattr(self, "client_errors"):
            return

        attrs = self.attr_builder.build_base_attributes(
            server_address=server_address,
            server_port=server_port,
        )
        attrs.update(
            self.attr_builder.build_operation_attributes(
                network_peer_address=network_peer_address,
                network_peer_port=network_peer_port,
                retry_attempts=retry_attempts,
            )
        )
        attrs.update(
            self.attr_builder.build_error_attributes(
                error_type=error_type,
                is_internal=is_internal,
            )
        )

        self.client_errors.add(1, attributes=attrs)

    def record_maint_notification_count(
        self,
        server_address: str,
        server_port: int,
        network_peer_address: str,
        network_peer_port: int,
        maint_notification: str,
    ) -> None:
        """
        Record maintenance notification count.

        No-op when the resiliency instruments were not created.

        Args:
            server_address: Server address
            server_port: Server port
            network_peer_address: Network peer address
            network_peer_port: Network peer port
            maint_notification: Maintenance notification
        """
        if not hasattr(self, "maintenance_notifications"):
            return

        attrs = self.attr_builder.build_base_attributes(
            server_address=server_address,
            server_port=server_port,
        )
        attrs.update(
            self.attr_builder.build_operation_attributes(
                network_peer_address=network_peer_address,
                network_peer_port=network_peer_port,
            )
        )
        attrs[REDIS_CLIENT_CONNECTION_NOTIFICATION] = maint_notification

        self.maintenance_notifications.add(1, attributes=attrs)

    def record_geo_failover(
        self,
        fail_from: "SyncDatabase",
        fail_to: "SyncDatabase",
        reason: GeoFailoverReason,
    ) -> None:
        """
        Record geo failover.

        No-op when the resiliency instruments were not created.

        Args:
            fail_from: Database failed from
            fail_to: Database failed to
            reason: Reason for the failover
        """
        if not hasattr(self, "geo_failovers"):
            return

        attrs = self.attr_builder.build_geo_failover_attributes(
            fail_from=fail_from,
            fail_to=fail_to,
            reason=reason,
        )

        # Like every other recorder, this method has no return value.
        self.geo_failovers.add(1, attributes=attrs)

    def init_connection_count(
        self,
        callback: Callable,
    ) -> None:
        """
        Initialize observable gauge for connection count metric.

        When the CONNECTION_BASIC group is enabled, every call creates the
        gauge and stores it on ``self.connection_count``.

        Args:
            callback: Callback function to retrieve connection count
        """
        if (
            MetricGroup.CONNECTION_BASIC not in self.config.metric_groups
            and not self.connection_count
        ):
            return

        self.connection_count = self.meter.create_observable_gauge(
            name="db.client.connection.count",
            unit="{connection}",
            description="Number of connections in the pool",
            callbacks=[callback],
        )

    def init_csc_items(
        self,
        callback: Callable,
    ) -> None:
        """
        Initialize observable gauge for CSC items metric.

        When the CSC group is enabled, every call creates the gauge and stores
        it on ``self.csc_items``.

        Args:
            callback: Callback function to retrieve CSC items count
        """
        # self.csc_items is pre-set to None in __init__, so this guard is safe
        # to evaluate even when the CSC group is disabled.
        if MetricGroup.CSC not in self.config.metric_groups and not self.csc_items:
            return

        self.csc_items = self.meter.create_observable_gauge(
            name="redis.client.csc.items",
            unit="{item}",
            description="The total number of cached responses currently stored",
            callbacks=[callback],
        )

    def record_connection_timeout(self, pool_name: str) -> None:
        """
        Record a connection timeout event.

        No-op when the advanced connection instruments were not created.

        Args:
            pool_name: Connection pool name
        """
        if not hasattr(self, "connection_timeouts"):
            return

        attrs = self.attr_builder.build_connection_attributes(pool_name=pool_name)
        self.connection_timeouts.add(1, attributes=attrs)

    def record_connection_create_time(
        self,
        connection_pool: "ConnectionPoolInterface",
        duration_seconds: float,
    ) -> None:
        """
        Record time taken to create a new connection.

        No-op when the basic connection instruments were not created.

        Args:
            connection_pool: Connection pool implementation; its name is
                resolved through ``get_pool_name``
            duration_seconds: Creation time in seconds
        """
        if not hasattr(self, "connection_create_time"):
            return

        attrs = self.attr_builder.build_connection_attributes(
            pool_name=get_pool_name(connection_pool)
        )
        self.connection_create_time.record(duration_seconds, attributes=attrs)

    def record_connection_wait_time(
        self,
        pool_name: str,
        duration_seconds: float,
    ) -> None:
        """
        Record time taken to obtain a connection from the pool.

        No-op when the advanced connection instruments were not created.

        Args:
            pool_name: Connection pool name
            duration_seconds: Wait time in seconds
        """
        if not hasattr(self, "connection_wait_time"):
            return

        attrs = self.attr_builder.build_connection_attributes(pool_name=pool_name)
        self.connection_wait_time.record(duration_seconds, attributes=attrs)

    # Command execution metric recording methods

    @deprecated_args(
        args_to_warn=["batch_size"],
        reason="The batch_size argument is no longer used and will be removed in the next major version.",
        version="7.2.1",
    )
    def record_operation_duration(
        self,
        command_name: str,
        duration_seconds: float,
        server_address: Optional[str] = None,
        server_port: Optional[int] = None,
        db_namespace: Optional[int] = None,
        batch_size: Optional[int] = None,  # noqa
        error_type: Optional[Exception] = None,
        network_peer_address: Optional[str] = None,
        network_peer_port: Optional[int] = None,
        retry_attempts: Optional[int] = None,
        is_blocking: Optional[bool] = None,
    ) -> None:
        """
        Record command execution duration.

        No-op when the command instruments were not created or when
        ``config.should_track_command`` rejects the command.

        Args:
            command_name: Redis command name (e.g., 'GET', 'SET', 'MULTI')
            duration_seconds: Execution time in seconds
            server_address: Redis server address
            server_port: Redis server port
            db_namespace: Redis database index
            batch_size: Deprecated and ignored; kept only so existing callers
                keep working
            error_type: Error type if operation failed
            network_peer_address: Resolved peer address
            network_peer_port: Peer port number
            retry_attempts: Number of retry attempts made
            is_blocking: Whether the operation is a blocking command
        """
        if not hasattr(self, "operation_duration"):
            return

        # Check if this command should be tracked
        if not self.config.should_track_command(command_name):
            return

        # Build attributes
        attrs = self.attr_builder.build_base_attributes(
            server_address=server_address,
            server_port=server_port,
            db_namespace=db_namespace,
        )
        attrs.update(
            self.attr_builder.build_operation_attributes(
                command_name=command_name,
                network_peer_address=network_peer_address,
                network_peer_port=network_peer_port,
                retry_attempts=retry_attempts,
                is_blocking=is_blocking,
            )
        )
        attrs.update(
            self.attr_builder.build_error_attributes(
                error_type=error_type,
            )
        )

        self.operation_duration.record(duration_seconds, attributes=attrs)

    def record_connection_closed(
        self,
        close_reason: Optional[CloseReason] = None,
        error_type: Optional[Exception] = None,
    ) -> None:
        """
        Record a connection closed event.

        No-op when the advanced connection instruments were not created.

        Args:
            close_reason: Reason for closing; its ``value`` is recorded as the
                close-reason attribute when given
            error_type: Error type if closed due to error
        """
        if not hasattr(self, "connection_closed"):
            return

        attrs = self.attr_builder.build_connection_attributes()
        if close_reason:
            attrs[REDIS_CLIENT_CONNECTION_CLOSE_REASON] = close_reason.value
        attrs.update(
            self.attr_builder.build_error_attributes(
                error_type=error_type,
            )
        )

        self.connection_closed.add(1, attributes=attrs)

    def record_connection_relaxed_timeout(
        self,
        connection_name: str,
        maint_notification: str,
        relaxed: bool,
    ) -> None:
        """
        Record a connection timeout relaxation event.

        No-op when the basic connection instruments were not created.

        Args:
            connection_name: Connection name
            maint_notification: Maintenance notification type
            relaxed: True to count up (relaxed), False to count down (unrelaxed)
        """
        if not hasattr(self, "connection_relaxed_timeout"):
            return

        attrs = self.attr_builder.build_connection_attributes(
            connection_name=connection_name
        )
        attrs[REDIS_CLIENT_CONNECTION_NOTIFICATION] = maint_notification

        self.connection_relaxed_timeout.add(1 if relaxed else -1, attributes=attrs)

    def record_connection_handoff(
        self,
        pool_name: str,
    ) -> None:
        """
        Record a connection handoff event (e.g., after MOVING notification).

        No-op when the basic connection instruments were not created.

        Args:
            pool_name: Connection pool name
        """
        if not hasattr(self, "connection_handoff"):
            return

        attrs = self.attr_builder.build_connection_attributes(pool_name=pool_name)
        self.connection_handoff.add(1, attributes=attrs)

    # PubSub metric recording methods

    def record_pubsub_message(
        self,
        direction: PubSubDirection,
        channel: Optional[str] = None,
        sharded: Optional[bool] = None,
    ) -> None:
        """
        Record a PubSub message (published or received).

        No-op when the PubSub instruments were not created.

        Args:
            direction: Message direction ('publish' or 'receive')
            channel: Pub/Sub channel name
            sharded: True if sharded Pub/Sub channel
        """
        if not hasattr(self, "pubsub_messages"):
            return

        attrs = self.attr_builder.build_pubsub_message_attributes(
            direction=direction,
            channel=channel,
            sharded=sharded,
        )
        self.pubsub_messages.add(1, attributes=attrs)

    # Streaming metric recording methods

    @deprecated_args(
        args_to_warn=["consumer_name"],
        reason="The consumer_name argument is no longer used and will be removed in the next major version.",
        version="7.2.1",
    )
    def record_streaming_lag(
        self,
        lag_seconds: float,
        stream_name: Optional[str] = None,
        consumer_group: Optional[str] = None,
        consumer_name: Optional[str] = None,  # noqa
    ) -> None:
        """
        Record the lag of a streaming message.

        No-op when the streaming instruments were not created.

        Args:
            lag_seconds: Lag in seconds
            stream_name: Stream name
            consumer_group: Consumer group name
            consumer_name: Deprecated and ignored; kept only so existing
                callers keep working
        """
        if not hasattr(self, "stream_lag"):
            return

        attrs = self.attr_builder.build_streaming_attributes(
            stream_name=stream_name,
            consumer_group=consumer_group,
        )
        self.stream_lag.record(lag_seconds, attributes=attrs)

    # CSC metric recording methods

    def record_csc_request(
        self,
        result: Optional[CSCResult] = None,
    ) -> None:
        """
        Record a Client Side Caching (CSC) request.

        No-op when the CSC instruments were not created.

        Args:
            result: CSC result ('hit' or 'miss')
        """
        if not hasattr(self, "csc_requests"):
            return

        attrs = self.attr_builder.build_csc_attributes(result=result)
        self.csc_requests.add(1, attributes=attrs)

    def record_csc_eviction(
        self,
        count: int,
        reason: Optional[CSCReason] = None,
    ) -> None:
        """
        Record a Client Side Caching (CSC) eviction.

        No-op when the CSC instruments were not created.

        Args:
            count: Number of evictions
            reason: Reason for eviction
        """
        if not hasattr(self, "csc_evictions"):
            return

        attrs = self.attr_builder.build_csc_attributes(reason=reason)
        self.csc_evictions.add(count, attributes=attrs)

    def record_csc_network_saved(
        self,
        bytes_saved: int,
    ) -> None:
        """
        Record the number of bytes saved by using Client Side Caching (CSC).

        No-op when the CSC instruments were not created.

        Args:
            bytes_saved: Number of bytes saved
        """
        if not hasattr(self, "csc_network_saved"):
            return

        attrs = self.attr_builder.build_csc_attributes()
        self.csc_network_saved.add(bytes_saved, attributes=attrs)

    # Utility methods

    @staticmethod
    def monotonic_time() -> float:
        """
        Get monotonic time for duration measurements.

        Returns:
            Current monotonic time in seconds
        """
        return time.monotonic()

    def __repr__(self) -> str:
        """Return a debug representation showing the meter and config."""
        return f"RedisMetricsCollector(meter={self.meter}, config={self.config})"