The Kubernetes Testing Challenge
Kubernetes has become the de facto standard for container orchestration, but its complexity introduces unique testing challenges. QA teams must validate not just application code, but also Kubernetes manifests, Helm charts, operators, custom resource definitions (CRDs), network policies, and service mesh configurations. Traditional testing approaches fall short in this distributed, declarative environment.
Effective Kubernetes testing requires a multi-layered strategy that validates everything from individual pod configurations to complex multi-service interactions. This article explores comprehensive testing strategies for Kubernetes environments, covering infrastructure validation, application testing, chaos engineering, and production readiness verification.
Pod and Container Testing Strategies
Pod Configuration Validation
# tests/pod_config_test.py
import pytest
from kubernetes import client, config
from typing import List, Dict


class PodConfigTester:
    def __init__(self, namespace: str = "default"):
        config.load_kube_config()
        self.v1 = client.CoreV1Api()
        self.namespace = namespace

    def test_pod_resource_limits(self):
        """Verify all pods have resource limits defined"""
        pods = self.v1.list_namespaced_pod(namespace=self.namespace)
        for pod in pods.items:
            for container in pod.spec.containers:
                assert container.resources is not None, \
                    f"Pod {pod.metadata.name} container {container.name} missing resources"
                assert container.resources.limits is not None, \
                    f"Pod {pod.metadata.name} container {container.name} missing resource limits"
                # Verify CPU and memory limits
                assert 'cpu' in container.resources.limits, \
                    f"Pod {pod.metadata.name} container {container.name} missing CPU limit"
                assert 'memory' in container.resources.limits, \
                    f"Pod {pod.metadata.name} container {container.name} missing memory limit"

    def test_pod_security_context(self):
        """Verify pods follow security best practices"""
        pods = self.v1.list_namespaced_pod(namespace=self.namespace)
        for pod in pods.items:
            # Check pod-level security context
            if pod.spec.security_context:
                assert pod.spec.security_context.run_as_non_root is True, \
                    f"Pod {pod.metadata.name} allows running as root"
            # Check container-level security context
            for container in pod.spec.containers:
                assert container.security_context is not None, \
                    f"Container {container.name} in pod {pod.metadata.name} missing security context"
                # privileged defaults to None, so only an explicit True is a violation
                assert container.security_context.privileged is not True, \
                    f"Container {container.name} running in privileged mode"
                assert container.security_context.read_only_root_filesystem is True, \
                    f"Container {container.name} has writable root filesystem"

    def test_pod_health_checks(self):
        """Verify pods have proper health checks configured"""
        pods = self.v1.list_namespaced_pod(namespace=self.namespace)
        for pod in pods.items:
            for container in pod.spec.containers:
                assert container.liveness_probe is not None, \
                    f"Container {container.name} in pod {pod.metadata.name} missing liveness probe"
                assert container.readiness_probe is not None, \
                    f"Container {container.name} in pod {pod.metadata.name} missing readiness probe"

    def test_pod_labels_and_annotations(self):
        """Verify required labels and annotations"""
        required_labels = ['app', 'version', 'component']
        required_annotations = ['prometheus.io/scrape']
        pods = self.v1.list_namespaced_pod(namespace=self.namespace)
        for pod in pods.items:
            # metadata.labels / annotations may be None, so fall back to empty dicts
            labels = pod.metadata.labels or {}
            annotations = pod.metadata.annotations or {}
            # Check labels
            for label in required_labels:
                assert label in labels, \
                    f"Pod {pod.metadata.name} missing required label: {label}"
            # Check annotations
            for annotation in required_annotations:
                assert annotation in annotations, \
                    f"Pod {pod.metadata.name} missing required annotation: {annotation}"
Helm Chart Testing
Helm Chart Validation with helm-unittest
# charts/myapp/tests/deployment_test.yaml
suite: test deployment
templates:
  - deployment.yaml
tests:
  - it: should create deployment with correct name
    asserts:
      - isKind:
          of: Deployment
      - equal:
          path: metadata.name
          value: RELEASE-NAME-myapp
  - it: should set correct number of replicas
    set:
      replicaCount: 3
    asserts:
      - equal:
          path: spec.replicas
          value: 3
  - it: should set resource limits
    asserts:
      - exists:
          path: spec.template.spec.containers[0].resources.limits
      - exists:
          path: spec.template.spec.containers[0].resources.requests
  - it: should create service account when enabled
    set:
      serviceAccount.create: true
      serviceAccount.name: test-sa
    asserts:
      - equal:
          path: spec.template.spec.serviceAccountName
          value: test-sa
  - it: should add security context
    asserts:
      - equal:
          path: spec.template.spec.securityContext.runAsNonRoot
          value: true
      - equal:
          path: spec.template.spec.containers[0].securityContext.allowPrivilegeEscalation
          value: false
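These suites run entirely offline with the helm-unittest plugin. If chart checks should live alongside the Python tests above, a minimal sketch like the following can shell out to Helm, assuming the helm binary and the helm-unittest plugin are installed on the test runner and the chart sits at charts/myapp:

# tests/test_helm_chart.py (sketch; chart path and test layout are assumptions)
import subprocess

CHART_PATH = "charts/myapp"


def test_helm_lint():
    # helm lint catches malformed templates and missing required values
    subprocess.run(["helm", "lint", CHART_PATH], check=True)


def test_helm_unittest():
    # runs the YAML suites under charts/myapp/tests/ via the helm-unittest plugin
    subprocess.run(["helm", "unittest", CHART_PATH], check=True)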
Integration Testing with Kind
#!/bin/bash
# scripts/helm-integration-test.sh
set -e
# Create Kind cluster
echo "Creating Kind cluster..."
kind create cluster --name helm-test --config - <<EOF
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
- role: worker
EOF
# Install chart
echo "Installing Helm chart..."
helm install test-release ./charts/myapp \
--values ./charts/myapp/values-test.yaml \
--wait \
--timeout 5m
# Run smoke tests
echo "Running smoke tests..."
kubectl run test-pod --image=curlimages/curl:latest --rm -it --restart=Never -- \
curl -f http://test-release-myapp:80/health
# Test horizontal pod autoscaling
echo "Testing HPA..."
kubectl autoscale deployment test-release-myapp --cpu-percent=50 --min=2 --max=10
kubectl get hpa test-release-myapp
# Test rolling update
echo "Testing rolling update..."
helm upgrade test-release ./charts/myapp \
--set image.tag=v2.0.0 \
--wait
# Verify pods rolled successfully
kubectl rollout status deployment/test-release-myapp
# Cleanup
echo "Cleaning up..."
helm uninstall test-release
kind delete cluster --name helm-test
echo "✓ All tests passed!"
Service Mesh Testing with Istio
Traffic Management Validation
# tests/istio_traffic_test.py
import pytest
import requests
from kubernetes import client, config
import time


class IstioTrafficTester:
    def __init__(self, namespace: str = "default"):
        config.load_kube_config()
        self.custom_api = client.CustomObjectsApi()
        self.namespace = namespace

    def test_virtual_service_routing(self):
        """Test VirtualService routes traffic correctly"""
        # Get VirtualService
        vs = self.custom_api.get_namespaced_custom_object(
            group="networking.istio.io",
            version="v1beta1",
            namespace=self.namespace,
            plural="virtualservices",
            name="myapp-vs"
        )
        # Verify routing rules
        assert len(vs['spec']['http']) > 0
        # Test traffic distribution
        v1_count = 0
        v2_count = 0
        total_requests = 100
        for _ in range(total_requests):
            response = requests.get(
                "http://myapp.example.com/api/version",
                headers={"Host": "myapp.example.com"}
            )
            version = response.json()['version']
            if version == 'v1':
                v1_count += 1
            elif version == 'v2':
                v2_count += 1
        # Verify traffic split (90/10), allowing for sampling noise
        v1_percentage = (v1_count / total_requests) * 100
        v2_percentage = (v2_count / total_requests) * 100
        assert 85 <= v1_percentage <= 95, f"V1 traffic: {v1_percentage}%"
        assert 5 <= v2_percentage <= 15, f"V2 traffic: {v2_percentage}%"

    def test_destination_rule_circuit_breaker(self):
        """Test circuit breaker configuration"""
        dr = self.custom_api.get_namespaced_custom_object(
            group="networking.istio.io",
            version="v1beta1",
            namespace=self.namespace,
            plural="destinationrules",
            name="myapp-dr"
        )
        # Verify circuit breaker settings
        outlier_detection = dr['spec']['trafficPolicy']['outlierDetection']
        assert outlier_detection['consecutiveErrors'] == 5
        assert outlier_detection['interval'] == "30s"
        assert outlier_detection['baseEjectionTime'] == "30s"
        # Test circuit breaker activation
        service_url = "http://myapp.default.svc.cluster.local"
        # Generate errors to trip circuit breaker
        error_count = 0
        for _ in range(10):
            try:
                requests.get(f"{service_url}/error", timeout=1)
            except requests.exceptions.RequestException:
                error_count += 1
        # Verify circuit breaker tripped
        time.sleep(2)
        response = requests.get(f"{service_url}/health")
        assert response.status_code == 503  # Service Unavailable

    def test_mtls_enforcement(self):
        """Test mutual TLS enforcement"""
        peer_auth = self.custom_api.get_namespaced_custom_object(
            group="security.istio.io",
            version="v1beta1",
            namespace=self.namespace,
            plural="peerauthentications",
            name="default"
        )
        # Verify mTLS mode
        assert peer_auth['spec']['mtls']['mode'] == "STRICT"
        # Plaintext (non-mTLS) traffic should be rejected by the sidecar
        with pytest.raises(requests.exceptions.RequestException):
            requests.get("http://myapp.default.svc.cluster.local")
Chaos Engineering for Kubernetes
Pod Failure Testing with Chaos Mesh
# chaos-experiments/pod-failure.yaml
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
  name: pod-failure-test
  namespace: chaos-testing
spec:
  action: pod-failure
  mode: one
  duration: "30s"
  selector:
    namespaces:
      - default
    labelSelectors:
      app: myapp
  scheduler:
    cron: "@every 1h"
Network Chaos Testing
# tests/chaos_test.py
import pytest
import requests
from kubernetes import client, config
import time


class ChaosEngineeringTester:
    def __init__(self):
        config.load_kube_config()
        self.custom_api = client.CustomObjectsApi()

    def test_pod_failure_resilience(self):
        """Test application resilience to pod failures"""
        # Apply pod failure chaos
        chaos_spec = {
            "apiVersion": "chaos-mesh.org/v1alpha1",
            "kind": "PodChaos",
            "metadata": {"name": "pod-failure-test", "namespace": "default"},
            "spec": {
                "action": "pod-failure",
                "mode": "fixed",
                "value": "1",
                "duration": "60s",
                "selector": {
                    "namespaces": ["default"],
                    "labelSelectors": {"app": "myapp"}
                }
            }
        }
        self.custom_api.create_namespaced_custom_object(
            group="chaos-mesh.org",
            version="v1alpha1",
            namespace="default",
            plural="podchaos",
            body=chaos_spec
        )
        # Monitor service availability during chaos
        start_time = time.time()
        error_count = 0
        total_requests = 0
        while time.time() - start_time < 60:
            try:
                response = requests.get("http://myapp/health", timeout=2)
                if response.status_code != 200:
                    error_count += 1
                total_requests += 1
            except requests.exceptions.RequestException:
                error_count += 1
                total_requests += 1
            time.sleep(1)
        # Verify acceptable error rate (< 5%)
        error_rate = (error_count / total_requests) * 100
        assert error_rate < 5, f"Error rate {error_rate}% exceeds threshold"
        # Cleanup chaos experiment
        self.custom_api.delete_namespaced_custom_object(
            group="chaos-mesh.org",
            version="v1alpha1",
            namespace="default",
            plural="podchaos",
            name="pod-failure-test"
        )

    def test_network_delay_resilience(self):
        """Test application resilience to network delays"""
        chaos_spec = {
            "apiVersion": "chaos-mesh.org/v1alpha1",
            "kind": "NetworkChaos",
            "metadata": {"name": "network-delay-test", "namespace": "default"},
            "spec": {
                "action": "delay",
                "mode": "all",
                "selector": {
                    "namespaces": ["default"],
                    "labelSelectors": {"app": "myapp"}
                },
                "delay": {
                    "latency": "200ms",
                    "correlation": "100",
                    "jitter": "50ms"
                },
                "duration": "2m"
            }
        }
        self.custom_api.create_namespaced_custom_object(
            group="chaos-mesh.org",
            version="v1alpha1",
            namespace="default",
            plural="networkchaos",
            body=chaos_spec
        )
        # Test service still responds within SLA
        response_times = []
        for _ in range(20):
            start = time.time()
            requests.get("http://myapp/api/data", timeout=5)
            response_times.append(time.time() - start)
        avg_response_time = sum(response_times) / len(response_times)
        # Nearest-rank p95: with 20 samples this is the 19th-slowest response
        sorted_times = sorted(response_times)
        p95_response_time = sorted_times[max(0, int(len(sorted_times) * 0.95) - 1)]
        # Verify responses within acceptable range despite delay
        assert avg_response_time < 1.0, f"Average response time {avg_response_time}s too high"
        assert p95_response_time < 2.0, f"P95 response time {p95_response_time}s too high"
        # Cleanup chaos experiment
        self.custom_api.delete_namespaced_custom_object(
            group="chaos-mesh.org",
            version="v1alpha1",
            namespace="default",
            plural="networkchaos",
            name="network-delay-test"
        )
Custom Resource Definition (CRD) Testing
CRD Validation Testing
# tests/crd_test.py
import pytest
import time
import yaml
from kubernetes import client, config


class CRDTester:
    def __init__(self):
        config.load_kube_config()
        self.api_extensions = client.ApiextensionsV1Api()
        self.custom_api = client.CustomObjectsApi()

    def test_crd_schema_validation(self):
        """Test CRD validates input correctly"""
        # Valid resource should be accepted
        valid_resource = {
            "apiVersion": "mycompany.com/v1",
            "kind": "MyApp",
            "metadata": {"name": "test-app", "namespace": "default"},
            "spec": {
                "replicas": 3,
                "image": "myapp:v1.0.0",
                "resources": {
                    "limits": {"cpu": "500m", "memory": "512Mi"}
                }
            }
        }
        self.custom_api.create_namespaced_custom_object(
            group="mycompany.com",
            version="v1",
            namespace="default",
            plural="myapps",
            body=valid_resource
        )
        # Invalid resource should be rejected
        invalid_resource = {
            "apiVersion": "mycompany.com/v1",
            "kind": "MyApp",
            "metadata": {"name": "invalid-app", "namespace": "default"},
            "spec": {
                "replicas": "invalid",  # Should be an integer
                "image": "myapp:v1.0.0"
            }
        }
        with pytest.raises(client.exceptions.ApiException):
            self.custom_api.create_namespaced_custom_object(
                group="mycompany.com",
                version="v1",
                namespace="default",
                plural="myapps",
                body=invalid_resource
            )

    def test_operator_reconciliation(self):
        """Test operator reconciles custom resources correctly"""
        # Create custom resource
        resource = {
            "apiVersion": "mycompany.com/v1",
            "kind": "MyApp",
            "metadata": {"name": "reconcile-test", "namespace": "default"},
            "spec": {"replicas": 3, "image": "myapp:v1.0.0"}
        }
        self.custom_api.create_namespaced_custom_object(
            group="mycompany.com",
            version="v1",
            namespace="default",
            plural="myapps",
            body=resource
        )
        # Wait for operator to reconcile
        time.sleep(5)
        # Verify operator created expected resources
        v1 = client.CoreV1Api()
        apps_v1 = client.AppsV1Api()
        # Check deployment was created
        deployment = apps_v1.read_namespaced_deployment(
            name="reconcile-test",
            namespace="default"
        )
        assert deployment.spec.replicas == 3
        # Check service was created
        service = v1.read_namespaced_service(
            name="reconcile-test",
            namespace="default"
        )
        assert service is not None
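The fixed time.sleep(5) makes the reconciliation test timing-dependent: a slow operator fails the test, a fast one wastes time. A common alternative is to poll until the operator has produced the expected objects; the helper below sketches that pattern (the timeout and the 404 handling are assumptions about how the operator surfaces "not created yet"):

# tests/wait_helpers.py (sketch)
import time
from kubernetes import client


def wait_for_deployment(name: str, namespace: str = "default",
                        timeout: int = 60, interval: float = 2.0):
    """Poll until the operator has created the Deployment, or fail after timeout."""
    apps_v1 = client.AppsV1Api()
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            return apps_v1.read_namespaced_deployment(name=name, namespace=namespace)
        except client.exceptions.ApiException as exc:
            if exc.status != 404:  # only "not found yet" is worth retrying here
                raise
        time.sleep(interval)
    raise TimeoutError(f"Deployment {name} was not created within {timeout}s")

# In test_operator_reconciliation, time.sleep(5) could then be replaced with:
# deployment = wait_for_deployment("reconcile-test")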
Conclusion
Kubernetes testing requires a comprehensive, multi-layered approach that validates infrastructure configurations, application behavior, and system resilience. By implementing pod configuration tests, Helm chart validation, service mesh testing, chaos engineering experiments, and CRD validation, teams can build confidence in their Kubernetes deployments.
The key is treating Kubernetes infrastructure as code that deserves the same testing rigor as application code. Automated validation in CI/CD pipelines, combined with regular chaos engineering exercises, ensures Kubernetes environments remain reliable and resilient. With these testing strategies, teams can confidently deploy complex microservices architectures on Kubernetes while maintaining high availability and performance standards.