The Shift Toward Observability-Driven Testing
Traditional testing approaches focus on pre-production validation: unit tests, integration tests, and testing in staging environments. Modern distributed systems, however, exhibit emergent behaviors that only surface in production, under real load, with real user patterns and real infrastructure constraints. Observability-driven testing shifts both left and right: it instruments systems comprehensively while actively testing in production using telemetry data, distributed traces, and SLO validation.
This article explores observability-driven testing strategies using OpenTelemetry, distributed tracing validation, testing in production with synthetic monitoring, SLO-based testing, and integration with observability platforms.
OpenTelemetry Instrumentation for Testing
OpenTelemetry SDK Configuration
# app/telemetry.py
import os

from flask import Flask, jsonify, request
from opentelemetry import trace, metrics
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

def setup_telemetry(service_name: str, service_version: str):
    """Configure OpenTelemetry instrumentation for traces and metrics"""
    resource = Resource.create({
        "service.name": service_name,
        "service.version": service_version,
        "deployment.environment": os.getenv("ENVIRONMENT", "production")
    })

    # Traces: batch-export spans to the OpenTelemetry Collector over OTLP/gRPC
    trace_provider = TracerProvider(resource=resource)
    otlp_trace_exporter = OTLPSpanExporter(
        endpoint="http://otel-collector:4317",
        insecure=True
    )
    trace_provider.add_span_processor(BatchSpanProcessor(otlp_trace_exporter))
    trace.set_tracer_provider(trace_provider)

    # Metrics: export periodically to the same collector so counters are not no-ops
    metric_reader = PeriodicExportingMetricReader(
        OTLPMetricExporter(endpoint="http://otel-collector:4317", insecure=True)
    )
    metrics.set_meter_provider(MeterProvider(resource=resource, metric_readers=[metric_reader]))

    print(f"✓ OpenTelemetry configured for {service_name}")

# Application usage
app = Flask(__name__)
setup_telemetry("payment-service", "1.2.0")
tracer = trace.get_tracer(__name__)
meter = metrics.get_meter(__name__)

payment_counter = meter.create_counter(
    "payments.processed",
    description="Number of payments processed"
)

@app.route('/api/payment', methods=['POST'])
def process_payment():
    with tracer.start_as_current_span("process_payment") as span:
        try:
            span.set_attribute("payment.amount", request.json.get('amount'))
            span.set_attribute("payment.method", request.json.get('method'))

            result = payment_processor.process(request.json)  # domain payment logic (defined elsewhere)

            payment_counter.add(1, {"status": "success"})
            return jsonify(result), 200
        except Exception as e:
            span.set_status(trace.Status(trace.StatusCode.ERROR, str(e)))
            payment_counter.add(1, {"status": "error"})
            raise
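Manual spans cover the business logic, but trace context also has to cross HTTP boundaries for the distributed-tracing tests below to see a single end-to-end trace. A minimal sketch, assuming the opentelemetry-instrumentation-flask and opentelemetry-instrumentation-requests packages are installed, that auto-instruments inbound requests and outbound calls:

# app/telemetry.py (continued) - sketch; assumes the auto-instrumentation packages are installed
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor

def instrument_http(app):
    """Wire up automatic HTTP instrumentation so W3C trace context propagates end to end"""
    FlaskInstrumentor().instrument_app(app)  # creates a server span for each inbound request
    RequestsInstrumentor().instrument()      # injects traceparent headers on outbound calls

instrument_http(app)

With this in place, spans started in the API gateway and in payment-service share the same trace ID, which is exactly what the propagation test below asserts.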
Distributed Tracing Validation
# tests/test_distributed_tracing.py
import time
import pytest
import requests
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

JAEGER_QUERY_URL = 'http://jaeger-query:16686'  # assumed Jaeger query endpoint

class DistributedTracingTester:
    """Validate distributed tracing across microservices"""

    def setup_method(self):
        # Spans created by this test process are captured in memory; spans from
        # other services are fetched from the Jaeger query API instead.
        self.span_exporter = InMemorySpanExporter()
        provider = TracerProvider()
        provider.add_span_processor(SimpleSpanProcessor(self.span_exporter))
        trace.set_tracer_provider(provider)
        self.tracer = trace.get_tracer(__name__)

    def get_traces_from_jaeger(self, trace_id: int) -> dict:
        """Fetch a trace from the Jaeger query API (trace IDs are 128-bit hex strings)"""
        response = requests.get(f'{JAEGER_QUERY_URL}/api/traces/{format(trace_id, "032x")}')
        response.raise_for_status()
        return response.json()

    def test_end_to_end_trace_propagation(self):
        """Verify that trace context propagates across all expected services"""
        with self.tracer.start_as_current_span("test_checkout") as span:
            response = requests.post('http://api-gateway/checkout', json={
                'items': [{'id': '123', 'quantity': 1}]
            })
            assert response.status_code == 200

        # Verify that the trace includes every expected service
        expected_services = [
            'api-gateway',
            'checkout-service',
            'inventory-service',
            'payment-service'
        ]

        time.sleep(5)  # give the collector time to export and Jaeger time to index
        trace_data = self.get_traces_from_jaeger(span.get_span_context().trace_id)['data'][0]
        actual_services = {p['serviceName'] for p in trace_data['processes'].values()}

        for service in expected_services:
            assert service in actual_services, f"Trace is missing spans from {service}"

    def test_span_attributes(self):
        """Verify that captured spans carry the expected attributes"""
        response = requests.post('http://payment-service/process', json={
            'amount': 50.00,
            'method': 'paypal'
        })

        # Only spans recorded by this process's SDK land in the in-memory exporter;
        # spans emitted by a remote service would be fetched from Jaeger as above.
        spans = self.span_exporter.get_finished_spans()
        payment_span = next(s for s in spans if s.name == 'process_payment')

        assert payment_span.attributes.get('payment.amount') == 50.00
        assert payment_span.attributes.get('payment.method') == 'paypal'
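Before asserting on whole traces, it can be worth checking the propagation mechanism itself. A minimal sketch using the standard opentelemetry.propagate API; it only inspects a locally injected W3C traceparent header, so it makes no assumptions about any remote service:

# tests/test_context_propagation.py - sketch; standalone check of W3C header injection
from opentelemetry import trace
from opentelemetry.propagate import inject
from opentelemetry.sdk.trace import TracerProvider

def test_traceparent_header_is_injected():
    """Inject the active span context into a carrier and verify the header format"""
    trace.set_tracer_provider(TracerProvider())  # ignored if an SDK provider is already set
    tracer = trace.get_tracer(__name__)

    with tracer.start_as_current_span("propagation_check"):
        headers = {}
        inject(headers)  # writes traceparent (and tracestate) using the global propagator

    assert 'traceparent' in headers
    # traceparent format: version-trace_id-span_id-flags
    version, trace_id, span_id, flags = headers['traceparent'].split('-')
    assert len(trace_id) == 32 and len(span_id) == 16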
Testing in Production with Synthetic Monitoring
# tests/production/synthetic_tests.py
import pytest
import requests
import time

class SyntheticMonitoringTests:
    """Synthetic tests that run continuously against production"""

    def test_critical_user_journey_checkout(self):
        """Exercise the critical checkout journey in production"""
        start_time = time.time()

        # Step 1: Browse products
        response = requests.get('https://production.example.com/api/products')
        assert response.status_code == 200

        # Step 2: Add to cart
        product_id = response.json()['products'][0]['id']
        response = requests.post('https://production.example.com/api/cart/add', json={
            'product_id': product_id,
            'quantity': 1
        })
        assert response.status_code == 200

        # Step 3: Checkout
        response = requests.post('https://production.example.com/api/checkout', json={
            'cart_id': response.json()['cart_id']
        })
        assert response.status_code == 200

        duration = time.time() - start_time

        # Validate the end-to-end SLA
        assert duration < 3.0, f"Checkout took {duration}s, exceeding the 3s SLA"

    def test_api_latency_percentiles(self):
        """Verify that API latency meets the SLO"""
        latencies = []

        for _ in range(100):
            start = time.time()
            response = requests.get('https://production.example.com/api/products')
            latencies.append((time.time() - start) * 1000)
            assert response.status_code == 200

        latencies.sort()
        p50, p95, p99 = latencies[49], latencies[94], latencies[98]

        print(f"P50: {p50:.2f}ms, P95: {p95:.2f}ms, P99: {p99:.2f}ms")

        assert p50 < 100, f"P50 {p50}ms exceeds the 100ms SLO"
        assert p95 < 500, f"P95 {p95}ms exceeds the 500ms SLO"
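These synthetic checks become much more valuable when their results land in the same observability stack that the SLO queries below read from. A minimal sketch, assuming a Prometheus Pushgateway is reachable at pushgateway:9091 (a placeholder address) and using the prometheus_client library to publish the journey duration and outcome after each run:

# tests/production/report_metrics.py - sketch; Pushgateway address, job and metric names are assumptions
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

def report_synthetic_result(journey: str, duration_seconds: float, success: bool):
    """Push one synthetic-test result so Prometheus can alert on failures or slow journeys"""
    registry = CollectorRegistry()

    duration_gauge = Gauge(
        'synthetic_journey_duration_seconds',
        'Duration of a synthetic user journey',
        ['journey'], registry=registry
    )
    success_gauge = Gauge(
        'synthetic_journey_success',
        '1 if the journey passed, 0 otherwise',
        ['journey'], registry=registry
    )

    duration_gauge.labels(journey=journey).set(duration_seconds)
    success_gauge.labels(journey=journey).set(1 if success else 0)

    push_to_gateway('pushgateway:9091', job='synthetic-monitoring', registry=registry)

Calling report_synthetic_result('checkout', duration, success=True) at the end of the checkout journey makes every synthetic run visible on the same dashboards and alert rules as real traffic.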
Testing Service Level Objectives (SLOs)
# tests/test_slo.py
from prometheus_api_client import PrometheusConnect

class SLOValidator:
    """Validate Service Level Objectives using Prometheus metrics"""

    def __init__(self, prometheus_url: str):
        self.prom = PrometheusConnect(url=prometheus_url)

    def test_availability_slo(self):
        """Verify the 99.9% availability SLO"""
        query = '''
        sum(rate(http_requests_total{status=~"2.."}[30d]))
        /
        sum(rate(http_requests_total[30d]))
        '''

        result = self.prom.custom_query(query)
        availability = float(result[0]['value'][1])

        print(f"30-day availability: {availability * 100:.3f}%")
        assert availability >= 0.999

    def test_latency_slo(self):
        """Verify the P95 latency SLO (<500ms)"""
        query = '''
        histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
        )
        '''

        result = self.prom.custom_query(query)
        p95_latency_ms = float(result[0]['value'][1]) * 1000

        print(f"P95 latency: {p95_latency_ms:.2f}ms")
        assert p95_latency_ms < 500

    def test_error_budget_consumption(self):
        """Verify the error budget burn rate"""
        query = '''
        sum(rate(http_requests_total{status=~"5.."}[1h]))
        /
        sum(rate(http_requests_total[1h]))
        '''

        result = self.prom.custom_query(query)
        error_rate_1h = float(result[0]['value'][1])

        monthly_budget = 0.001  # 0.1% error budget for a 99.9% SLO
        # Burn rate = observed error rate / allowed error rate; at 1x the budget
        # lasts the full 30-day window, at 10x it is exhausted in about 3 days.
        burn_rate = error_rate_1h / monthly_budget

        print(f"Error budget burn rate: {burn_rate:.2f}x")
        assert burn_rate < 10
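A single window is noisy: one bad minute can trip the assertion while a slow, steady burn can pass it. The multi-window approach from the Google SRE Workbook only flags a problem when both a short and a long window burn fast (14.4x over 1h and 5m is its page-level condition). A minimal sketch reusing PrometheusConnect; the Prometheus URL is a placeholder:

# tests/test_slo_burn_rate.py - sketch; the Prometheus URL is an assumption
from prometheus_api_client import PrometheusConnect

ERROR_BUDGET = 0.001  # 0.1% error budget for a 99.9% SLO

def error_rate(prom: PrometheusConnect, window: str) -> float:
    """Ratio of 5xx responses to all responses over the given window"""
    query = (
        f'sum(rate(http_requests_total{{status=~"5.."}}[{window}]))'
        f' / sum(rate(http_requests_total[{window}]))'
    )
    result = prom.custom_query(query)
    return float(result[0]['value'][1]) if result else 0.0

def test_multiwindow_burn_rate():
    prom = PrometheusConnect(url='http://prometheus:9090')

    burn_1h = error_rate(prom, '1h') / ERROR_BUDGET
    burn_5m = error_rate(prom, '5m') / ERROR_BUDGET

    # Fail only when both windows burn fast, which filters out short-lived blips
    assert not (burn_1h > 14.4 and burn_5m > 14.4), (
        f"Error budget burning at {burn_1h:.1f}x (1h) and {burn_5m:.1f}x (5m)"
    )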
Canary Deployment Testing with Metrics
# tests/test_canary.py
from prometheus_api_client import PrometheusConnect

class CanaryAnalyzer:
    """Analyze a canary deployment using metrics"""

    def __init__(self, prometheus_url: str):
        self.prom = PrometheusConnect(url=prometheus_url)

    def test_canary_error_rate(self):
        """Compare error rates between the baseline and the canary"""
        baseline_query = '''
        sum(rate(http_requests_total{version="v1.0", status=~"5.."}[5m]))
        /
        sum(rate(http_requests_total{version="v1.0"}[5m]))
        '''
        canary_query = '''
        sum(rate(http_requests_total{version="v1.1", status=~"5.."}[5m]))
        /
        sum(rate(http_requests_total{version="v1.1"}[5m]))
        '''

        baseline_result = self.prom.custom_query(baseline_query)
        canary_result = self.prom.custom_query(canary_query)

        baseline_error_rate = float(baseline_result[0]['value'][1]) if baseline_result else 0
        canary_error_rate = float(canary_result[0]['value'][1]) if canary_result else 0

        print(f"Baseline error rate: {baseline_error_rate * 100:.3f}%")
        print(f"Canary error rate: {canary_error_rate * 100:.3f}%")

        # Fail the canary if its error rate is more than 50% above the baseline
        assert canary_error_rate <= baseline_error_rate * 1.5
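Error rate alone can hide regressions, so comparing a latency quantile between the two versions is a common complement. A minimal sketch; the version label values, the Prometheus address, and the 10% tolerance are assumptions for illustration:

# tests/test_canary_latency.py - sketch; label values, URL and tolerance are assumptions
from prometheus_api_client import PrometheusConnect

def p95_latency_ms(prom: PrometheusConnect, version: str) -> float:
    """P95 request latency for one deployment version, in milliseconds"""
    query = f'''
    histogram_quantile(0.95,
        sum(rate(http_request_duration_seconds_bucket{{version="{version}"}}[5m])) by (le)
    )
    '''
    result = prom.custom_query(query)
    return float(result[0]['value'][1]) * 1000 if result else 0.0

def test_canary_latency_regression():
    prom = PrometheusConnect(url='http://prometheus:9090')

    baseline_p95 = p95_latency_ms(prom, 'v1.0')
    canary_p95 = p95_latency_ms(prom, 'v1.1')

    print(f"Baseline P95: {baseline_p95:.1f}ms, Canary P95: {canary_p95:.1f}ms")

    # Allow the canary at most a 10% P95 regression relative to the baseline
    assert canary_p95 <= baseline_p95 * 1.10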
GitHub Actions Workflow
# .github/workflows/observability-testing.yml
name: Observability Testing

on:
  schedule:
    - cron: '*/15 * * * *'
  workflow_dispatch:

jobs:
  synthetic-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install pytest requests prometheus-api-client

      - name: Run synthetic production tests
        run: pytest tests/production/synthetic_tests.py -v

      - name: Validate SLOs
        run: pytest tests/test_slo.py -v

      - name: Analyze canary deployment
        if: github.event_name == 'workflow_dispatch'
        run: pytest tests/test_canary.py -v
Conclusion
Observability-driven testing represents a paradigm shift from purely pre-production validation toward continuous testing of production systems using telemetry, traces, and metrics. By implementing OpenTelemetry instrumentation, distributed tracing validation, synthetic monitoring, SLO-based testing, and canary analysis, teams can validate system behavior in production with unprecedented visibility.
The key is to treat observability as a first-class testing strategy: instrument comprehensively, test continuously in production, validate SLOs programmatically, and use metrics to guide deployment decisions.