Testing Computer Vision Systems
Computer vision powers autonomous vehicles, medical diagnostics, security systems, and manufacturing QA. Unlike traditional software, CV models must cope with ambiguity, visual variability, and real-world complexity: a misclassified stop sign could cause an accident, and a missed tumor could cost a life. Like AI-powered test generation, computer vision testing requires understanding how machine learning models behave under varied conditions.
Testing CV systems requires evaluating accuracy across diverse conditions, adversarial robustness, fairness across demographics, and real-time performance constraints.
Core Testing Strategies
1. Accuracy Metrics
```python
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np


class CVModelEvaluator:
    def __init__(self, model):
        self.model = model

    def evaluate_classification(self, test_images, true_labels):
        """Evaluate a classification model."""
        predictions = self.model.predict(test_images)
        predicted_labels = np.argmax(predictions, axis=1)

        # Overall accuracy
        accuracy = accuracy_score(true_labels, predicted_labels)

        # Per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            true_labels,
            predicted_labels,
            average=None
        )

        # Confusion matrix
        cm = confusion_matrix(true_labels, predicted_labels)

        return {
            'accuracy': accuracy,
            'per_class_metrics': {
                self.model.class_names[i]: {
                    'precision': precision[i],
                    'recall': recall[i],
                    'f1_score': f1[i],
                    'support': support[i]
                }
                for i in range(len(self.model.class_names))
            },
            'confusion_matrix': cm
        }

    def evaluate_object_detection(self, test_images, ground_truth_boxes):
        """Evaluate an object detection model with mAP."""
        predictions = self.model.detect(test_images)

        def calculate_iou(box1, box2):
            """Intersection over Union for [x1, y1, x2, y2] boxes."""
            x1 = max(box1[0], box2[0])
            y1 = max(box1[1], box2[1])
            x2 = min(box1[2], box2[2])
            y2 = min(box1[3], box2[3])
            intersection = max(0, x2 - x1) * max(0, y2 - y1)
            area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
            area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
            union = area1 + area2 - intersection
            return intersection / union if union > 0 else 0

        # Calculate mAP (mean Average Precision)
        aps = []
        for class_id in range(len(self.model.class_names)):
            # Get predictions and ground truth for this class
            class_predictions = [
                p for p in predictions if p['class_id'] == class_id
            ]
            class_gt = [
                gt for gt in ground_truth_boxes if gt['class_id'] == class_id
            ]
            num_gt = len(class_gt)  # fixed denominator for recall

            # Sort by confidence
            class_predictions.sort(key=lambda x: x['confidence'], reverse=True)

            # Build the precision-recall curve
            tp = 0
            fp = 0
            precisions = []
            recalls = []

            for pred in class_predictions:
                # Find the best-matching unmatched ground truth
                best_iou = 0
                best_gt_idx = -1
                for idx, gt in enumerate(class_gt):
                    iou = calculate_iou(pred['box'], gt['box'])
                    if iou > best_iou:
                        best_iou = iou
                        best_gt_idx = idx

                if best_iou >= 0.5:  # IoU threshold
                    tp += 1
                    class_gt.pop(best_gt_idx)  # each GT box matches at most once
                else:
                    fp += 1

                precisions.append(tp / (tp + fp))
                # Recall uses the original GT count, not the shrinking match list
                recalls.append(tp / num_gt if num_gt > 0 else 0)

            # AP: area under the precision-recall curve
            ap = np.trapz(precisions, recalls) if precisions else 0.0
            aps.append(ap)

        return {
            'mAP': np.mean(aps),
            'per_class_AP': dict(zip(self.model.class_names, aps))
        }
```
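The evaluator assumes the model object exposes `predict`, `detect`, and a `class_names` attribute. A minimal sketch of how the classification report might be consumed; `keras_model`, `test_images`, and `test_labels` are placeholders for your own model and data:

```python
# Hypothetical wiring: any model with .predict() and .class_names works here.
keras_model.class_names = ['cat', 'dog', 'bird']  # assumed label order

evaluator = CVModelEvaluator(keras_model)
report = evaluator.evaluate_classification(test_images, test_labels)

print(f"Accuracy: {report['accuracy']:.3f}")
for name, metrics in report['per_class_metrics'].items():
    print(f"{name}: precision={metrics['precision']:.2f}, "
          f"recall={metrics['recall']:.2f}, f1={metrics['f1_score']:.2f}")
```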
2. Dataset Validation
```python
import cv2
import numpy as np
from collections import Counter


class DatasetValidator:
    def __init__(self, dataset):
        self.dataset = dataset

    def check_class_balance(self):
        """Detect class imbalance."""
        label_counts = Counter(self.dataset.labels)
        total = len(self.dataset.labels)

        imbalance_report = {}
        for class_name, count in label_counts.items():
            percentage = (count / total) * 100
            imbalance_report[class_name] = {
                'count': count,
                'percentage': percentage,
                'imbalanced': percentage < 5 or percentage > 50
            }
        return imbalance_report

    def detect_duplicate_images(self):
        """Find duplicate or near-duplicate images via perceptual hashing."""
        import imagehash
        from PIL import Image

        hashes = {}
        duplicates = []
        for img_path in self.dataset.image_paths:
            img = Image.open(img_path)
            img_hash = imagehash.average_hash(img)
            if img_hash in hashes:
                duplicates.append({
                    'original': hashes[img_hash],
                    'duplicate': img_path
                })
            else:
                hashes[img_hash] = img_path
        return duplicates

    def analyze_image_quality(self):
        """Check for low-resolution, badly exposed, or blurry images."""
        quality_issues = []
        for img_path in self.dataset.image_paths:
            img = cv2.imread(img_path)
            if img is None:  # unreadable or corrupt file
                quality_issues.append({'image': img_path, 'issue': 'unreadable'})
                continue

            # Check resolution
            height, width = img.shape[:2]
            if height < 224 or width < 224:
                quality_issues.append({
                    'image': img_path,
                    'issue': 'low_resolution',
                    'resolution': f"{width}x{height}"
                })

            # Check brightness
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            brightness = np.mean(gray)
            if brightness < 30 or brightness > 225:
                quality_issues.append({
                    'image': img_path,
                    'issue': 'poor_brightness',
                    'brightness': brightness
                })

            # Check blur via variance of the Laplacian
            laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
            if laplacian_var < 100:  # threshold for blur
                quality_issues.append({
                    'image': img_path,
                    'issue': 'blurry',
                    'blur_score': laplacian_var
                })
        return quality_issues
```
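A quick sketch of driving the validator; the `SimpleNamespace` below stands in for whatever dataset object exposes `labels` and `image_paths` in your pipeline, with placeholder values:

```python
from types import SimpleNamespace

# Hypothetical dataset handle: the validator only needs .labels and .image_paths
dataset = SimpleNamespace(
    labels=['cat', 'cat', 'dog'],  # placeholder labels
    image_paths=['img_001.jpg', 'img_002.jpg', 'img_003.jpg']  # placeholder paths
)

validator = DatasetValidator(dataset)
for class_name, stats in validator.check_class_balance().items():
    if stats['imbalanced']:
        print(f"Class '{class_name}' is imbalanced: {stats['percentage']:.1f}%")
```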
3. Adversarial Testing
Testing model robustness against adversarial attacks is critical, similar to security testing approaches that validate system resilience against malicious inputs.
```python
import numpy as np
import tensorflow as tf


class AdversarialTester:
    def __init__(self, model):
        self.model = model

    def fgsm_attack(self, image, true_label, epsilon=0.01):
        """Fast Gradient Sign Method attack (expects pixels scaled to [0, 1])."""
        image_tensor = tf.convert_to_tensor(image[np.newaxis, ...])

        with tf.GradientTape() as tape:
            tape.watch(image_tensor)
            prediction = self.model(image_tensor)
            loss = tf.keras.losses.sparse_categorical_crossentropy(
                [true_label], prediction
            )

        # Perturb each pixel in the direction that increases the loss
        gradient = tape.gradient(loss, image_tensor)
        signed_grad = tf.sign(gradient)

        adversarial_image = image + epsilon * signed_grad.numpy()[0]
        adversarial_image = np.clip(adversarial_image, 0, 1)

        # Test whether the attack flipped the prediction
        adv_prediction = self.model.predict(adversarial_image[np.newaxis, ...])
        adv_label = np.argmax(adv_prediction)

        return {
            'original_label': true_label,
            'adversarial_label': adv_label,
            'attack_succeeded': adv_label != true_label,
            'confidence_change': float(
                np.max(adv_prediction) - np.max(prediction.numpy())
            ),
            'adversarial_image': adversarial_image
        }

    def test_robustness(self, test_set, epsilon_values=(0.01, 0.05, 0.1)):
        """Measure how often FGSM succeeds at each attack strength."""
        results = {eps: {'successes': 0, 'total': 0} for eps in epsilon_values}

        for image, label in test_set:
            for epsilon in epsilon_values:
                result = self.fgsm_attack(image, label, epsilon)
                results[epsilon]['total'] += 1
                if result['attack_succeeded']:
                    results[epsilon]['successes'] += 1

        # Robustness score = fraction of attacks that failed
        return {
            eps: 1 - (data['successes'] / data['total'])
            for eps, data in results.items()
        }
```
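A usage sketch under stated assumptions: `model` is a Keras-style classifier callable on batches with pixels scaled to [0, 1], and `test_images`/`test_labels` are placeholder arrays:

```python
# Hypothetical run over a small labeled sample
tester = AdversarialTester(model)
samples = [(test_images[i], test_labels[i]) for i in range(20)]

robustness = tester.test_robustness(samples)
for epsilon, score in robustness.items():
    print(f"epsilon={epsilon}: {score:.0%} of predictions survived the attack")
```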
4. Augmentation Testing
```python
import albumentations as A
import numpy as np


class AugmentationTester:
    def __init__(self, model):
        self.model = model

    def test_with_augmentations(self, image, true_label):
        """Test model consistency under common augmentations."""
        augmentations = [
            ('rotation', A.Rotate(limit=15, p=1)),
            # RandomBrightness was folded into RandomBrightnessContrast
            # in newer albumentations releases
            ('brightness', A.RandomBrightnessContrast(
                brightness_limit=0.2, contrast_limit=0, p=1)),
            ('blur', A.Blur(blur_limit=3, p=1)),
            ('noise', A.GaussNoise(var_limit=(10, 50), p=1)),
            ('flip', A.HorizontalFlip(p=1)),
            # Crop, then resize back so the model still receives its
            # expected input size (224x224 assumed here)
            ('crop', A.Compose([
                A.RandomCrop(height=200, width=200, p=1),
                A.Resize(height=224, width=224)
            ]))
        ]

        original_prediction = self.model.predict(image[np.newaxis, ...])[0]
        original_class = np.argmax(original_prediction)
        original_confidence = np.max(original_prediction)

        results = {}
        for aug_name, augmentation in augmentations:
            augmented = augmentation(image=image)['image']
            aug_prediction = self.model.predict(augmented[np.newaxis, ...])[0]
            aug_class = np.argmax(aug_prediction)
            aug_confidence = np.max(aug_prediction)

            results[aug_name] = {
                'prediction_changed': aug_class != original_class,
                'confidence_drop': original_confidence - aug_confidence,
                'still_correct': aug_class == true_label
            }

        # Invariance score: fraction of augmentations that left the prediction unchanged
        invariance_score = sum(
            1 for r in results.values() if not r['prediction_changed']
        ) / len(results)

        return {
            'augmentation_results': results,
            'invariance_score': invariance_score
        }
```
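A single-image check might look like the following; `model`, `sample_image`, and `sample_label` are placeholders for your own model and data:

```python
# Hypothetical consistency check for one labeled image
aug_tester = AugmentationTester(model)
report = aug_tester.test_with_augmentations(sample_image, sample_label)

print(f"Invariance score: {report['invariance_score']:.0%}")
for name, r in report['augmentation_results'].items():
    if r['prediction_changed']:
        print(f"Prediction flipped under {name} "
              f"(confidence drop: {r['confidence_drop']:.2f})")
```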
Performance Testing
Performance validation ensures models meet real-time requirements, applying principles from performance testing guides to measure latency, throughput, and resource usage under load.
```python
import time
import numpy as np


class PerformanceTester:
    def __init__(self, model):
        self.model = model

    def benchmark_inference(self, test_images, batch_sizes=(1, 8, 32)):
        """Benchmark inference latency and throughput per batch size."""
        results = {}
        for batch_size in batch_sizes:
            latencies = []
            for i in range(0, len(test_images), batch_size):
                batch = test_images[i:i + batch_size]
                start = time.perf_counter()  # monotonic, high-resolution timer
                _ = self.model.predict(batch)
                end = time.perf_counter()
                latency_ms = (end - start) * 1000 / len(batch)
                latencies.append(latency_ms)

            results[f'batch_{batch_size}'] = {
                'avg_latency_ms': np.mean(latencies),
                'p95_latency_ms': np.percentile(latencies, 95),
                'p99_latency_ms': np.percentile(latencies, 99),
                'throughput_fps': 1000 / np.mean(latencies)
            }
        return results

    def test_memory_usage(self, test_images):
        """Monitor CPU/GPU memory growth during inference."""
        import psutil
        import GPUtil

        process = psutil.Process()

        # Before inference
        cpu_before = process.memory_info().rss / 1024 / 1024  # MB
        gpus = GPUtil.getGPUs()
        gpu_before = gpus[0].memoryUsed if gpus else 0

        # Run inference
        _ = self.model.predict(test_images[:100])

        # After inference (re-query the GPUs for a fresh snapshot)
        cpu_after = process.memory_info().rss / 1024 / 1024
        gpus = GPUtil.getGPUs()
        gpu_after = gpus[0].memoryUsed if gpus else 0

        return {
            'cpu_memory_mb': cpu_after - cpu_before,
            'gpu_memory_mb': gpu_after - gpu_before
        }
```
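A sketch of a benchmark run; `model` and `test_images` are placeholders, and the thresholds you compare against should come from your own latency budget:

```python
# Hypothetical benchmark and memory check
perf = PerformanceTester(model)

bench = perf.benchmark_inference(test_images)
for batch, stats in bench.items():
    print(f"{batch}: p95={stats['p95_latency_ms']:.1f} ms, "
          f"throughput={stats['throughput_fps']:.0f} FPS")

memory = perf.test_memory_usage(test_images)
print(f"CPU growth: {memory['cpu_memory_mb']:.0f} MB, "
      f"GPU growth: {memory['gpu_memory_mb']:.0f} MB")
```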
Best Practices
| Practice | Description |
|---|---|
| Diverse Test Set | Include various lighting, angles, and backgrounds |
| Edge Case Collection | Occlusions, extreme angles, poor lighting |
| Cross-Dataset Validation | Test on data from different sources |
| Adversarial Hardening | Include adversarial examples in training |
| Continuous Evaluation | Monitor production performance drift |
| Fairness Testing | Test across demographics (skin tones, ages); sketched below |
| Benchmark Standards | Use COCO and ImageNet benchmarks for comparison |
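Fairness testing rarely needs special tooling: slice the test set by demographic attribute and compare per-group accuracy. A minimal sketch, assuming each test sample carries a group annotation (the `groups` array and the 5-point gap threshold are illustrative assumptions, not a standard):

```python
import numpy as np

def accuracy_by_group(predictions, labels, groups, gap_threshold=0.05):
    """Compare per-group accuracy and flag groups that lag behind."""
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    groups = np.asarray(groups)

    per_group = {}
    for group in np.unique(groups):
        mask = groups == group
        per_group[str(group)] = float(np.mean(predictions[mask] == labels[mask]))

    # Flag any group trailing the best-performing group by more than the threshold
    best = max(per_group.values())
    flagged = {g: acc for g, acc in per_group.items() if best - acc > gap_threshold}
    return {'per_group_accuracy': per_group, 'flagged_groups': flagged}
```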
Conclusion
Computer vision testing goes beyond accuracy metrics: it requires robustness testing, dataset validation, adversarial defenses, and fairness evaluation. As CV systems are deployed in safety-critical applications, rigorous testing becomes essential. For visual regression testing of UI components, explore visual testing tools that complement computer vision validation.
Start with comprehensive accuracy evaluation, expand to adversarial robustness, validate dataset quality, and continuously monitor production performance. The goal: reliable vision systems that work across all real-world conditions.