Computer Vision Testing: Validating Image Recognition Systems

Testing Computer Vision Systems

Computer vision powers autonomous vehicles, medical diagnostics, security systems, and manufacturing QA. Unlike traditional software, CV models deal with ambiguity, visual variability, and real-world complexity. A misclassified stop sign could cause an accident. A missed tumor (a false negative) could cost lives. Like AI-powered test generation, computer vision testing requires understanding how machine learning models behave under various conditions.

Testing CV systems requires evaluating accuracy across diverse conditions, adversarial robustness, fairness across demographics, and real-time performance constraints.

Core Testing Strategies

1. Accuracy Metrics

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

class CVModelEvaluator:
    """Accuracy evaluation for classification and object-detection models.

    The wrapped ``model`` must expose ``class_names`` and, depending on the
    method called, ``predict(images)`` or ``detect(images)``.
    """

    def __init__(self, model):
        self.model = model

    def evaluate_classification(self, test_images, true_labels):
        """Evaluate a classification model.

        Predicted labels are the argmax over axis 1 of ``model.predict``, so
        labels are treated as integer indices into ``model.class_names``.

        Returns:
            dict with ``'accuracy'``, ``'per_class_metrics'`` (keyed by class
            name, each holding precision / recall / f1_score / support) and
            ``'confusion_matrix'``.
        """
        predictions = self.model.predict(test_images)
        predicted_labels = np.argmax(predictions, axis=1)

        class_names = self.model.class_names
        # Pin the label set explicitly: without it sklearn only reports the
        # classes that actually occur, and indexing the result arrays by
        # class position raises IndexError when a class is absent.
        label_ids = list(range(len(class_names)))

        # Overall accuracy
        accuracy = accuracy_score(true_labels, predicted_labels)

        # Per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            true_labels,
            predicted_labels,
            labels=label_ids,
            average=None
        )

        # Confusion matrix (always len(class_names) x len(class_names))
        cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)

        return {
            'accuracy': accuracy,
            'per_class_metrics': {
                name: {
                    'precision': precision[i],
                    'recall': recall[i],
                    'f1_score': f1[i],
                    'support': support[i]
                }
                for i, name in enumerate(class_names)
            },
            'confusion_matrix': cm
        }

    @staticmethod
    def _calculate_iou(box1, box2):
        """Return Intersection over Union of two ``(x1, y1, x2, y2)`` boxes."""
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        intersection = max(0, x2 - x1) * max(0, y2 - y1)
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0

    @staticmethod
    def _average_precision(precisions, recalls):
        """Return the area under the interpolated precision-recall curve.

        Uses all-point interpolation: the curve is extended to recall 0 and 1
        and precision is replaced by its running maximum from the right, so a
        single perfect detection yields AP 1.0 rather than a zero-width area.
        """
        if not precisions:
            return 0.0

        padded_recall = np.concatenate(([0.0], recalls, [1.0]))
        padded_precision = np.concatenate(([0.0], precisions, [0.0]))
        # Monotonically non-increasing precision envelope.
        envelope = np.maximum.accumulate(padded_precision[::-1])[::-1]

        return float(np.sum(np.diff(padded_recall) * envelope[1:]))

    def _class_average_precision(self, class_predictions, class_gt,
                                 iou_threshold):
        """Return AP for one class; predictions must be sorted by confidence."""
        total_gt = len(class_gt)
        if total_gt == 0 or not class_predictions:
            # No ground truth or no detections: nothing can be recalled.
            return 0.0

        matched = [False] * total_gt
        tp = 0
        fp = 0
        precisions = []
        recalls = []

        for pred in class_predictions:
            # Find the best still-unmatched ground-truth box.
            best_iou = 0.0
            best_gt_idx = -1

            for idx, gt in enumerate(class_gt):
                if matched[idx]:
                    continue
                # When both sides carry an 'image_id', never match across
                # images; entries without the key behave as before.
                if ('image_id' in pred and 'image_id' in gt
                        and pred['image_id'] != gt['image_id']):
                    continue
                iou = self._calculate_iou(pred['box'], gt['box'])
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_gt_idx >= 0 and best_iou >= iou_threshold:
                tp += 1
                matched[best_gt_idx] = True  # each GT box matches only once
            else:
                fp += 1

            precisions.append(tp / (tp + fp))
            # Recall is relative to ALL ground-truth boxes of the class, not
            # to the ones still unmatched.
            recalls.append(tp / total_gt)

        return self._average_precision(precisions, recalls)

    def evaluate_object_detection(self, test_images, ground_truth_boxes,
                                  iou_threshold=0.5):
        """Evaluate object detection with mAP.

        Args:
            test_images: passed unchanged to ``model.detect``.
            ground_truth_boxes: iterable of dicts with ``'class_id'`` and
                ``'box'`` (``x1, y1, x2, y2``). An optional ``'image_id'``
                restricts matching to predictions with the same id.
            iou_threshold: minimum IoU for a prediction to count as a true
                positive.

        ``model.detect`` must return dicts with ``'class_id'``,
        ``'confidence'`` and ``'box'``.

        Returns:
            dict with ``'mAP'`` and ``'per_class_AP'`` (keyed by class name).
            Classes without ground truth or without predictions get AP 0.0.
        """
        predictions = self.model.detect(test_images)
        class_names = self.model.class_names

        aps = []
        for class_id in range(len(class_names)):
            # Predictions for this class, highest confidence first.
            class_predictions = sorted(
                (p for p in predictions if p['class_id'] == class_id),
                key=lambda p: p['confidence'],
                reverse=True
            )
            class_gt = [
                gt for gt in ground_truth_boxes if gt['class_id'] == class_id
            ]
            aps.append(
                self._class_average_precision(
                    class_predictions, class_gt, iou_threshold
                )
            )

        return {
            'mAP': float(np.mean(aps)) if aps else 0.0,
            'per_class_AP': dict(zip(class_names, aps))
        }

2. Dataset Validation

import cv2
from collections import Counter

class DatasetValidator:
    """Sanity checks for an image dataset.

    The wrapped ``dataset`` must expose ``labels`` (for class balance) and
    ``image_paths`` (for duplicate and quality checks).
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def check_class_balance(self, min_percentage=5, max_percentage=50):
        """Detect class imbalance.

        Args:
            min_percentage: classes below this share are flagged.
            max_percentage: classes above this share are flagged.

        Returns:
            dict keyed by label with ``'count'``, ``'percentage'`` and the
            boolean ``'imbalanced'``. Empty when the dataset has no labels.
        """
        label_counts = Counter(self.dataset.labels)
        total = len(self.dataset.labels)

        imbalance_report = {}
        for class_name, count in label_counts.items():
            percentage = (count / total) * 100
            imbalance_report[class_name] = {
                'count': count,
                'percentage': percentage,
                'imbalanced': (
                    percentage < min_percentage
                    or percentage > max_percentage
                )
            }

        return imbalance_report

    def detect_duplicate_images(self):
        """Find images whose average hash equals that of an earlier image.

        Returns:
            list of ``{'original': path, 'duplicate': path}`` dicts, where
            ``'original'`` is the first path seen with that hash.
        """
        import imagehash
        from PIL import Image

        hashes = {}
        duplicates = []

        for img_path in self.dataset.image_paths:
            # Context manager closes the file handle; otherwise large
            # datasets can exhaust file descriptors.
            with Image.open(img_path) as img:
                img_hash = imagehash.average_hash(img)

            if img_hash in hashes:
                duplicates.append({
                    'original': hashes[img_hash],
                    'duplicate': img_path
                })
            else:
                hashes[img_hash] = img_path

        return duplicates

    def analyze_image_quality(self, min_size=224,
                              brightness_range=(30, 225),
                              blur_threshold=100):
        """Check for low-quality images.

        Args:
            min_size: minimum accepted height and width in pixels.
            brightness_range: ``(low, high)`` bounds on mean gray level;
                values outside are reported as ``'poor_brightness'``.
            blur_threshold: Laplacian variance below this is reported as
                ``'blurry'``.

        Returns:
            list of issue dicts; one image may contribute several entries.
            Files that cannot be decoded are reported as ``'unreadable'``.
        """
        min_brightness, max_brightness = brightness_range
        quality_issues = []

        for img_path in self.dataset.image_paths:
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals missing/corrupt files by returning None
                # instead of raising; report it rather than crash on .shape.
                quality_issues.append({
                    'image': img_path,
                    'issue': 'unreadable'
                })
                continue

            # Check resolution
            height, width = img.shape[:2]
            if height < min_size or width < min_size:
                quality_issues.append({
                    'image': img_path,
                    'issue': 'low_resolution',
                    'resolution': f"{width}x{height}"
                })

            # Check brightness
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            brightness = np.mean(gray)
            if brightness < min_brightness or brightness > max_brightness:
                quality_issues.append({
                    'image': img_path,
                    'issue': 'poor_brightness',
                    'brightness': brightness
                })

            # Check blur: low variance of the Laplacian means few edges
            laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
            if laplacian_var < blur_threshold:
                quality_issues.append({
                    'image': img_path,
                    'issue': 'blurry',
                    'blur_score': laplacian_var
                })

        return quality_issues

3. Adversarial Testing

Testing model robustness against adversarial attacks is critical, similar to security testing approaches that validate system resilience against malicious inputs.

import tensorflow as tf

class AdversarialTester:
    """Probe a model's robustness with FGSM adversarial examples.

    ``model`` must be callable on a batched tensor (used under
    ``tf.GradientTape``) and expose ``predict``.
    """

    def __init__(self, model):
        self.model = model

    def fgsm_attack(self, image, true_label, epsilon=0.01):
        """Fast Gradient Sign Method attack on a single image.

        Args:
            image: one unbatched image array; the result is clipped to
                [0, 1], so pixel values are presumably normalised to that
                range (confirm against the model's preprocessing).
            true_label: integer class index of ``image``.
            epsilon: perturbation step size.

        Returns:
            dict with the original and adversarial labels, whether the
            predicted label differs from ``true_label``, the change in top
            confidence, and the adversarial image.
        """
        image_tensor = tf.convert_to_tensor(image[np.newaxis, ...])

        with tf.GradientTape() as tape:
            # The input is not a trainable variable, so it must be watched
            # explicitly for the tape to record gradients w.r.t. it.
            tape.watch(image_tensor)
            prediction = self.model(image_tensor)
            loss = tf.keras.losses.sparse_categorical_crossentropy(
                [true_label], prediction
            )

        gradient = tape.gradient(loss, image_tensor)
        signed_grad = tf.sign(gradient)

        # Step in the direction that increases the loss, then clip.
        adversarial_image = image + epsilon * signed_grad.numpy()[0]
        adversarial_image = np.clip(adversarial_image, 0, 1)

        # Test if attack succeeded
        adv_prediction = self.model.predict(adversarial_image[np.newaxis, ...])
        adv_label = np.argmax(adv_prediction)

        return {
            'original_label': true_label,
            'adversarial_label': adv_label,
            # Plain bool rather than numpy.bool_ so results serialise cleanly.
            'attack_succeeded': bool(adv_label != true_label),
            'confidence_change': np.max(adv_prediction) - np.max(prediction),
            'adversarial_image': adversarial_image
        }

    def test_robustness(self, test_set, epsilon_values=(0.01, 0.05, 0.1)):
        """Test robustness across attack strengths.

        Args:
            test_set: iterable of ``(image, label)`` pairs.
            epsilon_values: FGSM step sizes to evaluate (any iterable).

        Returns:
            dict mapping each epsilon to ``1 - success_rate``. When
            ``test_set`` is empty the score is undefined and reported as
            ``nan`` instead of raising ZeroDivisionError.
        """
        # Materialise once so generators can be reused for every image.
        epsilons = list(epsilon_values)
        results = {eps: {'successes': 0, 'total': 0} for eps in epsilons}

        for image, label in test_set:
            for epsilon in epsilons:
                result = self.fgsm_attack(image, label, epsilon)
                results[epsilon]['total'] += 1
                if result['attack_succeeded']:
                    results[epsilon]['successes'] += 1

        # Calculate robustness scores
        robustness_scores = {
            eps: (
                1 - (data['successes'] / data['total'])
                if data['total'] else float('nan')
            )
            for eps, data in results.items()
        }

        return robustness_scores

4. Augmentation Testing

import albumentations as A

class AugmentationTester:
    """Check how stable a model's prediction is under image augmentations.

    ``model`` must expose ``predict`` accepting a batched image array.
    """

    def __init__(self, model):
        self.model = model

    def test_with_augmentations(self, image, true_label):
        """Test model consistency under augmentations.

        Args:
            image: one unbatched image array.
            true_label: integer class index of ``image``.

        Returns:
            dict with ``'augmentation_results'`` (per augmentation: whether
            the predicted class changed, the confidence drop, and whether the
            prediction still equals ``true_label``) and ``'invariance_score'``
            (fraction of augmentations that left the predicted class
            unchanged).
        """
        augmentations = [
            ('rotation', A.Rotate(limit=15, p=1)),
            # RandomBrightness is a deprecated alias that newer albumentations
            # releases no longer ship; RandomBrightnessContrast with
            # contrast_limit=0 is the equivalent brightness-only transform.
            ('brightness', A.RandomBrightnessContrast(
                brightness_limit=0.2, contrast_limit=0, p=1
            )),
            ('blur', A.Blur(blur_limit=3, p=1)),
            ('noise', A.GaussNoise(var_limit=(10, 50), p=1)),
            ('flip', A.HorizontalFlip(p=1)),
            # NOTE(review): the crop outputs a 200x200 image and needs an
            # input at least that large - confirm the model accepts that
            # input size.
            ('crop', A.RandomCrop(height=200, width=200, p=1))
        ]

        original_prediction = self.model.predict(image[np.newaxis, ...])[0]
        original_class = np.argmax(original_prediction)
        original_confidence = np.max(original_prediction)

        results = {}

        for aug_name, augmentation in augmentations:
            augmented = augmentation(image=image)['image']
            aug_prediction = self.model.predict(augmented[np.newaxis, ...])[0]
            aug_class = np.argmax(aug_prediction)
            aug_confidence = np.max(aug_prediction)

            results[aug_name] = {
                'prediction_changed': aug_class != original_class,
                'confidence_drop': original_confidence - aug_confidence,
                'still_correct': aug_class == true_label
            }

        # Invariance score: share of augmentations with an unchanged class
        invariance_score = sum(
            1 for r in results.values() if not r['prediction_changed']
        ) / len(results)

        return {
            'augmentation_results': results,
            'invariance_score': invariance_score
        }

Performance Testing

Performance validation ensures models meet real-time requirements, applying principles from performance testing guides to measure latency, throughput, and resource usage under load.

import time

class PerformanceTester:
    """Measure inference latency, throughput and memory use of a model.

    ``model`` must expose ``predict`` accepting a batch of images.
    """

    def __init__(self, model):
        self.model = model

    def benchmark_inference(self, test_images, batch_sizes=(1, 8, 32)):
        """Benchmark inference speed.

        Args:
            test_images: sliceable, non-empty collection of images.
            batch_sizes: batch sizes to benchmark (any iterable).

        Returns:
            dict keyed ``'batch_<size>'`` with average / p95 / p99 per-image
            latency in milliseconds and throughput in images per second.

        Raises:
            ValueError: if ``test_images`` is empty.
        """
        if len(test_images) == 0:
            raise ValueError("test_images must contain at least one image")

        results = {}

        for batch_size in batch_sizes:
            latencies = []

            for i in range(0, len(test_images), batch_size):
                batch = test_images[i:i+batch_size]

                # perf_counter is monotonic and high-resolution; time.time()
                # can jump with wall-clock adjustments.
                start = time.perf_counter()
                _ = self.model.predict(batch)
                end = time.perf_counter()

                # Normalise to per-image latency so batch sizes compare.
                latency_ms = (end - start) * 1000 / len(batch)
                latencies.append(latency_ms)

            avg_latency = np.mean(latencies)
            results[f'batch_{batch_size}'] = {
                'avg_latency_ms': avg_latency,
                'p95_latency_ms': np.percentile(latencies, 95),
                'p99_latency_ms': np.percentile(latencies, 99),
                'throughput_fps': (
                    1000 / avg_latency if avg_latency > 0 else float('inf')
                )
            }

        return results

    def test_memory_usage(self, test_images=None):
        """Monitor GPU/CPU memory during inference.

        Args:
            test_images: images to run through the model; only the first 100
                are used.

        Returns:
            dict with the change in process RSS (``'cpu_memory_mb'``) and in
            used memory on the first GPU (``'gpu_memory_mb'``, 0 without GPU).

        Raises:
            ValueError: if ``test_images`` is not supplied.
        """
        if test_images is None:
            raise ValueError("test_images is required to measure memory usage")

        import psutil
        import GPUtil

        def gpu_memory_used():
            # GPUtil returns a snapshot, so it has to be queried again after
            # inference; re-reading the old objects always yields a 0 delta.
            gpus = GPUtil.getGPUs()
            return gpus[0].memoryUsed if gpus else 0

        process = psutil.Process()

        # Before inference
        cpu_before = process.memory_info().rss / 1024 / 1024  # MB
        gpu_before = gpu_memory_used()

        # Run inference
        _ = self.model.predict(test_images[:100])

        # After inference
        cpu_after = process.memory_info().rss / 1024 / 1024
        gpu_after = gpu_memory_used()

        return {
            'cpu_memory_mb': cpu_after - cpu_before,
            'gpu_memory_mb': gpu_after - gpu_before
        }

Best Practices

Practice	Description
Diverse Test Set	Include various lighting, angles, backgrounds
Edge Case Collection	Occlusions, extreme angles, poor lighting
Cross-Dataset Validation	Test on data from different sources
Adversarial Hardening	Include adversarial examples in training
Continuous Evaluation	Monitor production performance drift
Fairness Testing	Test across demographics (skin tones, ages)
Benchmark Standards	Use COCO, ImageNet benchmarks for comparison

Conclusion

Computer vision testing goes beyond accuracy metrics—requiring robustness testing, dataset validation, adversarial defenses, and fairness evaluation. As CV systems deploy in safety-critical applications, rigorous testing becomes essential. For visual regression testing of UI components, explore visual testing tools that complement computer vision validation.

Start with comprehensive accuracy evaluation, expand to adversarial robustness, validate dataset quality, and continuously monitor production performance. The goal: reliable vision systems that work across all real-world conditions.