The Challenge of Testing Feature Flags
Feature flags (also known as feature toggles) have become essential to modern software delivery, enabling progressive rollouts, A/B testing, canary deployments, and instant rollback. However, feature flags introduce testing complexity: QA teams must validate all flag combinations, test gradual rollout scenarios, verify targeting rules, and ensure that flags left in the code indefinitely do not turn into technical debt.
This article explores testing strategies for feature flags using LaunchDarkly, Flagsmith, and open-source alternatives, covering combination testing, A/B validation, and integration with CI/CD pipelines.
Feature Flag Management with LaunchDarkly
LaunchDarkly SDK Integration
# app/feature_flags.py
import os

import ldclient
from ldclient.config import Config


class FeatureFlagManager:
    def __init__(self):
        sdk_key = os.getenv('LAUNCHDARKLY_SDK_KEY')
        ldclient.set_config(Config(sdk_key))
        self.client = ldclient.get()

    def is_feature_enabled(self, flag_key: str, user: dict, default: bool = False):
        """Check whether a feature is enabled for a user"""
        return self.client.variation(flag_key, user, default)

    def get_flag_value(self, flag_key: str, user: dict, default):
        """Get the value of a feature flag"""
        return self.client.variation(flag_key, user, default)


# Usage in the application
feature_flags = FeatureFlagManager()


def process_payment(user_id: str, amount: float):
    user = {
        "key": user_id,
        "custom": {"subscription_tier": "premium"}
    }

    if feature_flags.is_feature_enabled('new-payment-processor', user):
        # new_payment_processor / legacy_payment_processor are application-specific (not shown)
        result = new_payment_processor(amount)
    else:
        result = legacy_payment_processor(amount)

    return result
Testing Feature Flags with LaunchDarkly
# tests/test_feature_flags.py
import pytest
from unittest.mock import Mock, patch

from app.feature_flags import FeatureFlagManager


class TestFeatureFlags:
    @pytest.fixture
    def mock_ld_client(self):
        """Mock the LaunchDarkly client so tests never call the real service"""
        with patch('ldclient.set_config'), patch('ldclient.get') as mock_get:
            mock_client = Mock()
            mock_client.is_initialized.return_value = True
            mock_get.return_value = mock_client
            yield mock_client

    def test_feature_enabled_for_user(self, mock_ld_client):
        """Feature flag is enabled for a specific user"""
        mock_ld_client.variation.return_value = True

        ff_manager = FeatureFlagManager()
        user = {"key": "user123"}

        result = ff_manager.is_feature_enabled('new-payment-processor', user)
        assert result is True

    def test_flag_targeting_by_attribute(self, mock_ld_client):
        """Flag targeting based on user attributes"""
        def variation_side_effect(flag_key, user, default):
            if user.get('custom', {}).get('subscription_tier') == 'premium':
                return True
            return False

        mock_ld_client.variation.side_effect = variation_side_effect
        ff_manager = FeatureFlagManager()

        premium_user = {"key": "premium1", "custom": {"subscription_tier": "premium"}}
        assert ff_manager.is_feature_enabled('premium-feature', premium_user) is True

        free_user = {"key": "free1", "custom": {"subscription_tier": "free"}}
        assert ff_manager.is_feature_enabled('premium-feature', free_user) is False
Flagsmith Integration and Testing
# app/flagsmith_manager.py
import os

from flagsmith import Flagsmith


class FlagsmithManager:
    def __init__(self):
        self.flagsmith = Flagsmith(
            environment_key=os.getenv('FLAGSMITH_ENVIRONMENT_KEY')
        )

    def is_feature_enabled(self, feature_name: str, identifier: str = None):
        """Check whether a feature is enabled"""
        flags = self.flagsmith.get_identity_flags(identifier) if identifier else self.flagsmith.get_environment_flags()
        return flags.is_feature_enabled(feature_name)

    def get_feature_value(self, feature_name: str, identifier: str = None, default=None):
        """Get a feature's value"""
        flags = self.flagsmith.get_identity_flags(identifier) if identifier else self.flagsmith.get_environment_flags()
        value = flags.get_feature_value(feature_name)
        return value if value is not None else default
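As with LaunchDarkly, unit tests should not hit the Flagsmith API. A minimal sketch, assuming the FlagsmithManager above lives in app/flagsmith_manager.py, that replaces the Flagsmith client with a mock:

# tests/test_flagsmith_manager.py
# A minimal sketch assuming the FlagsmithManager above; the Flagsmith client
# is replaced with a mock so no network calls are made.
from unittest.mock import Mock, patch

from app.flagsmith_manager import FlagsmithManager


def test_environment_flag_enabled():
    with patch('app.flagsmith_manager.Flagsmith') as mock_flagsmith_cls:
        mock_flags = Mock()
        mock_flags.is_feature_enabled.return_value = True
        mock_flagsmith_cls.return_value.get_environment_flags.return_value = mock_flags

        manager = FlagsmithManager()
        assert manager.is_feature_enabled('new-checkout-ui') is True


def test_feature_value_falls_back_to_default():
    with patch('app.flagsmith_manager.Flagsmith') as mock_flagsmith_cls:
        mock_flags = Mock()
        mock_flags.get_feature_value.return_value = None
        mock_flagsmith_cls.return_value.get_identity_flags.return_value = mock_flags

        manager = FlagsmithManager()
        value = manager.get_feature_value('checkout-timeout', identifier='user123', default=30)
        assert value == 30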
Testing All Flag Combinations
# tests/test_flag_combinations.py
import pytest
from itertools import product


class FlagCombinationTester:
    """Exercise every possible combination of feature flags"""

    def __init__(self, flags: dict):
        self.flags = flags

    def get_all_combinations(self):
        """Generate every possible combination of flag values"""
        flag_names = list(self.flags.keys())
        flag_values = [self.flags[name] for name in flag_names]

        for combination in product(*flag_values):
            yield dict(zip(flag_names, combination))

    def test_all_combinations(self):
        """Verify behavior under every flag combination"""
        for combination in self.get_all_combinations():
            try:
                self._test_with_flags(combination)  # application-specific check (not shown)
                print(f"✓ Combination passed: {combination}")
            except Exception as e:
                pytest.fail(f"✗ Failed with combination {combination}: {str(e)}")


# Optimized testing (critical paths only)
@pytest.mark.parametrize("new_checkout,express_payment", [
    (True, True),
    (True, False),
    (False, True),
    (False, False)
])
def test_critical_checkout_paths(new_checkout, express_payment):
    """Exercise critical checkout paths with key flag combinations"""
    # patch_feature_flags and complete_checkout are project-specific helpers;
    # a sketch of patch_feature_flags is shown after this block.
    with patch_feature_flags({
        'new-checkout-ui': new_checkout,
        'express-payment': express_payment
    }):
        response = complete_checkout()
        assert response.success is True
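The parametrized test above relies on a patch_feature_flags helper that is project-specific rather than part of any SDK. A minimal sketch of one possible implementation, assuming application code reads flags through the FeatureFlagManager shown earlier:

# tests/conftest.py
# Hypothetical patch_feature_flags helper (an assumption, not an SDK feature),
# assuming flags are evaluated via FeatureFlagManager.is_feature_enabled.
from contextlib import contextmanager
from unittest.mock import patch


@contextmanager
def patch_feature_flags(flag_values: dict):
    """Force specific flag values for the duration of a test."""
    def fake_is_enabled(flag_key, user=None, default=False):
        # Any flag not listed falls back to the default declared at the call site
        return flag_values.get(flag_key, default)

    with patch('app.feature_flags.FeatureFlagManager.is_feature_enabled',
               side_effect=fake_is_enabled):
        yield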
A/B Testing Validation
# tests/test_ab_testing.py
from scipy import stats
import numpy as np


class ABTestValidator:
    """Validate the statistical significance of A/B tests"""

    def __init__(self, alpha: float = 0.05):
        self.alpha = alpha

    def calculate_sample_size(self, baseline_rate: float, minimum_detectable_effect: float, power: float = 0.8):
        """Calculate the required sample size per variant for an A/B test"""
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(power)

        p1 = baseline_rate
        p2 = baseline_rate * (1 + minimum_detectable_effect)
        p_avg = (p1 + p2) / 2

        n = (2 * p_avg * (1 - p_avg) * (z_alpha + z_beta) ** 2) / ((p2 - p1) ** 2)
        return int(np.ceil(n))

    def test_statistical_significance(self, control_conversions: int, control_total: int,
                                      treatment_conversions: int, treatment_total: int):
        """Check whether the difference between variants is statistically significant"""
        observed = np.array([
            [control_conversions, control_total - control_conversions],
            [treatment_conversions, treatment_total - treatment_conversions]
        ])

        chi2, p_value, dof, expected = stats.chi2_contingency(observed)
        is_significant = p_value < self.alpha

        control_rate = control_conversions / control_total
        treatment_rate = treatment_conversions / treatment_total
        lift = ((treatment_rate - control_rate) / control_rate) * 100

        return {
            'is_significant': is_significant,
            'p_value': p_value,
            'control_rate': control_rate,
            'treatment_rate': treatment_rate,
            'lift_percentage': lift
        }


def test_ab_test_new_checkout():
    """Validate the A/B experiment for the new checkout flow"""
    validator = ABTestValidator(alpha=0.05)

    result = validator.test_statistical_significance(
        control_conversions=450, control_total=5000,
        treatment_conversions=520, treatment_total=5000
    )

    print(f"Control conversion rate: {result['control_rate']:.2%}")
    print(f"Treatment conversion rate: {result['treatment_rate']:.2%}")
    print(f"Lift: {result['lift_percentage']:.2f}%")

    assert result['is_significant'] is True
    assert result['lift_percentage'] > 0
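calculate_sample_size is most useful before the experiment starts, to decide how long the flag must keep serving both variants. A short example with illustrative numbers (the 9% baseline conversion rate and 10% relative minimum detectable effect are assumptions, not real data):

# Example: plan the experiment before splitting traffic with the flag.
validator = ABTestValidator(alpha=0.05)

required_n = validator.calculate_sample_size(
    baseline_rate=0.09,              # assumed current conversion rate
    minimum_detectable_effect=0.10,  # smallest relative lift worth detecting
    power=0.8
)

# With these assumed inputs the formula yields roughly 16,600 users per variant;
# results should not be read until both arms reach that size.
print(f"Required sample size per variant: {required_n}")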
Flag Lifecycle Testing
# tests/test_flag_lifecycle.py
from datetime import datetime, timedelta


class TestFlagLifecycle:
    """Test the lifecycle management of feature flags"""
    # get_all_flags() and scan_codebase_for_flag_references() are
    # project-specific helpers; a sketch of the scanner is shown after this block.

    def test_flag_age_tracking(self):
        """Flags must not stay in the code indefinitely"""
        flags = self.get_all_flags()
        old_flags = []
        max_age_days = 90

        for flag in flags:
            created_date = datetime.fromisoformat(flag['created_at'])
            age = datetime.now() - created_date

            if age > timedelta(days=max_age_days):
                old_flags.append({
                    'key': flag['key'],
                    'age_days': age.days,
                    'permanent': flag.get('permanent', False)
                })

        temporary_old_flags = [f for f in old_flags if not f['permanent']]
        assert len(temporary_old_flags) == 0, \
            f"Found {len(temporary_old_flags)} flags older than {max_age_days} days"

    def test_unused_flags_detection(self):
        """Detect flags that are no longer referenced in the code"""
        all_flags = self.get_all_flags()
        code_references = self.scan_codebase_for_flag_references()

        unused_flags = [flag['key'] for flag in all_flags if flag['key'] not in code_references]
        assert len(unused_flags) == 0, f"Found {len(unused_flags)} unreferenced flags"
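test_unused_flags_detection depends on a scan_codebase_for_flag_references helper that is not shown above. A minimal sketch, assuming flag keys appear in the source as kebab-case string literals under an app/ directory (both assumptions):

# tests/flag_scanner.py
# Hypothetical implementation of scan_codebase_for_flag_references: collects
# kebab-case string literals from Python sources as candidate flag keys.
import re
from pathlib import Path

FLAG_KEY_PATTERN = re.compile(r"['\"]([a-z0-9]+(?:-[a-z0-9]+)+)['\"]")


def scan_codebase_for_flag_references(root: str = "app") -> set:
    """Return the set of kebab-case string literals found in Python sources."""
    references = set()
    for path in Path(root).rglob("*.py"):
        references.update(FLAG_KEY_PATTERN.findall(path.read_text(encoding="utf-8")))
    return references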
CI/CD Integration
# .github/workflows/feature-flags-test.yml
name: Feature Flag Testing

on:
  pull_request:
  push:
    branches: [main]

jobs:
  test-flags:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install pytest scipy numpy launchdarkly-server-sdk

      - name: Run flag combination tests
        env:
          LAUNCHDARKLY_SDK_KEY: ${{ secrets.LAUNCHDARKLY_SDK_KEY }}
        run: pytest tests/test_flag_combinations.py -v

      - name: Validate A/B tests
        run: pytest tests/test_ab_testing.py -v

      - name: Check flag lifecycle
        run: pytest tests/test_flag_lifecycle.py -v
Conclusion
Testing feature flags requires a comprehensive strategy that validates all flag combinations, ensures A/B tests are statistically sound, and manages flag lifecycles to prevent technical debt. By implementing systematic combination testing, statistically rigorous A/B validation, lifecycle management, and CI/CD integration, teams can use feature flags confidently for progressive delivery while maintaining code quality.
The key is to treat feature flags as temporary by default, test all critical combinations, validate experiments properly, and automate flag cleanup.