Spaces:

neural-thinker
/

cidadao.ai-backend

Paused

anderson-ufrj commited on Sep 16

Commit

8d9d872

1 Parent(s): 48a4081

test(ml): add anomaly detection pipeline tests

- Test statistical anomaly detection (Z-score, IQR, MAD)
- Test ML-based anomaly detection methods
- Test spectral analysis implementation
- Test pattern detection algorithms
- Test ensemble anomaly detector
- Add tests for feature engineering

Files changed (1) hide show

tests/unit/test_anomaly_detection.py +438 -0

tests/unit/test_anomaly_detection.py ADDED Viewed

	@@ -0,0 +1,438 @@

+"""Unit tests for anomaly detection components."""
+import pytest
+import numpy as np
+import pandas as pd
+from datetime import datetime, timedelta
+from unittest.mock import MagicMock, patch
+import json
+from src.ml.anomaly_detector import (
+    AnomalyDetector,
+    AnomalyResult,
+    AnomalyType,
+    StatisticalAnomalyDetector,
+    MLAnomalyDetector,
+    EnsembleAnomalyDetector
+)
+from src.ml.spectral_analyzer import SpectralAnalyzer, SpectralResult
+from src.ml.pattern_analyzer import PatternAnalyzer, PatternType
+class TestAnomalyResult:
+    """Test AnomalyResult data structure."""
+    def test_anomaly_result_creation(self):
+        """Test creating anomaly result."""
+        result = AnomalyResult(
+            is_anomaly=True,
+            score=0.85,
+            type=AnomalyType.STATISTICAL,
+            description="Price significantly above average",
+            evidence={"z_score": 3.2, "mean": 100000, "value": 250000},
+            severity="high"
+        )
+        assert result.is_anomaly is True
+        assert result.score == 0.85
+        assert result.type == AnomalyType.STATISTICAL
+        assert result.severity == "high"
+        assert "z_score" in result.evidence
+    def test_anomaly_result_to_dict(self):
+        """Test converting anomaly result to dictionary."""
+        result = AnomalyResult(
+            is_anomaly=True,
+            score=0.75,
+            type=AnomalyType.PATTERN,
+            description="Unusual temporal pattern detected"
+        )
+        result_dict = result.to_dict()
+        assert isinstance(result_dict, dict)
+        assert result_dict["is_anomaly"] is True
+        assert result_dict["score"] == 0.75
+        assert result_dict["type"] == "pattern"
+class TestStatisticalAnomalyDetector:
+    """Test statistical anomaly detection methods."""
+    @pytest.fixture
+    def detector(self):
+        """Create statistical detector instance."""
+        return StatisticalAnomalyDetector(z_score_threshold=2.5)
+    def test_z_score_detection_normal(self, detector):
+        """Test Z-score detection with normal values."""
+        # Generate normal data
+        np.random.seed(42)
+        values = np.random.normal(100, 20, 100).tolist()
+        # Test with a normal value
+        result = detector.detect_z_score(values, 105)
+        assert result.is_anomaly is False
+        assert result.score < 0.5
+        assert result.type == AnomalyType.STATISTICAL
+    def test_z_score_detection_anomaly(self, detector):
+        """Test Z-score detection with anomalous value."""
+        # Generate normal data
+        np.random.seed(42)
+        values = np.random.normal(100, 20, 100).tolist()
+        # Test with an extreme value
+        result = detector.detect_z_score(values, 200)
+        assert result.is_anomaly is True
+        assert result.score > 0.7
+        assert "z_score" in result.evidence
+        assert result.evidence["z_score"] > 2.5
+    def test_iqr_detection(self, detector):
+        """Test IQR-based outlier detection."""
+        # Create data with outliers
+        values = list(range(1, 101))  # 1 to 100
+        outlier = 200
+        result = detector.detect_iqr_outlier(values, outlier)
+        assert result.is_anomaly is True
+        assert result.score > 0.8
+        assert "iqr" in result.evidence
+        assert "q1" in result.evidence
+        assert "q3" in result.evidence
+    def test_modified_z_score_detection(self, detector):
+        """Test Modified Z-score (MAD-based) detection."""
+        # Generate data with outliers
+        values = [10, 12, 13, 11, 14, 12, 11, 13, 200]  # 200 is outlier
+        result = detector.detect_modified_z_score(values[:-1], 200)
+        assert result.is_anomaly is True
+        assert result.score > 0.8
+        assert "mad_z_score" in result.evidence
+    def test_insufficient_data(self, detector):
+        """Test handling of insufficient data."""
+        # Too few values
+        values = [100, 110]
+        result = detector.detect_z_score(values, 120)
+        assert result.is_anomaly is False
+        assert "Insufficient data" in result.description
+class TestMLAnomalyDetector:
+    """Test machine learning anomaly detection."""
+    @pytest.fixture
+    def detector(self):
+        """Create ML detector instance."""
+        return MLAnomalyDetector()
+    @pytest.fixture
+    def sample_data(self):
+        """Create sample contract data."""
+        np.random.seed(42)
+        n_samples = 100
+        # Normal contracts
+        normal_data = pd.DataFrame({
+            'value': np.random.normal(100000, 20000, n_samples),
+            'duration_days': np.random.normal(180, 30, n_samples),
+            'n_items': np.random.poisson(10, n_samples),
+            'supplier_history': np.random.randint(1, 20, n_samples)
+        })
+        # Add some anomalies
+        anomalies = pd.DataFrame({
+            'value': [500000, 1000, 300000],  # Too high/low
+            'duration_days': [10, 500, 365],  # Too short/long
+            'n_items': [100, 1, 50],  # Too many/few
+            'supplier_history': [0, 0, 1]  # New suppliers
+        })
+        return pd.concat([normal_data, anomalies], ignore_index=True)
+    def test_isolation_forest_detection(self, detector, sample_data):
+        """Test Isolation Forest anomaly detection."""
+        # Train on normal data
+        normal_data = sample_data.iloc[:90]
+        detector.fit_isolation_forest(normal_data)
+        # Test on anomalies
+        anomaly_data = sample_data.iloc[-3:]
+        results = detector.detect_isolation_forest(anomaly_data)
+        assert len(results) == 3
+        assert sum(r.is_anomaly for r in results) >= 2  # At least 2 anomalies
+        assert all(r.type == AnomalyType.ML for r in results)
+    def test_clustering_anomaly_detection(self, detector, sample_data):
+        """Test clustering-based anomaly detection."""
+        # Fit clustering model
+        detector.fit_clustering(sample_data)
+        # Test on extreme outlier
+        outlier = pd.DataFrame({
+            'value': [10000000],  # 100x normal
+            'duration_days': [1],
+            'n_items': [1000],
+            'supplier_history': [0]
+        })
+        results = detector.detect_clustering_anomaly(outlier)
+        assert len(results) == 1
+        assert results[0].is_anomaly is True
+        assert results[0].score > 0.8
+    def test_autoencoder_detection(self, detector, sample_data):
+        """Test autoencoder-based anomaly detection."""
+        # Train autoencoder
+        normal_data = sample_data.iloc[:80]
+        detector.fit_autoencoder(normal_data, epochs=5)  # Few epochs for testing
+        # Test on normal and anomalous data
+        test_data = sample_data.iloc[80:]
+        results = detector.detect_autoencoder_anomaly(test_data)
+        assert len(results) == len(test_data)
+        # Should detect some anomalies
+        anomaly_count = sum(r.is_anomaly for r in results)
+        assert anomaly_count > 0
+class TestSpectralAnalyzer:
+    """Test spectral analysis for anomaly detection."""
+    @pytest.fixture
+    def analyzer(self):
+        """Create spectral analyzer instance."""
+        return SpectralAnalyzer()
+    @pytest.fixture
+    def periodic_signal(self):
+        """Create periodic signal with anomalies."""
+        # Daily data for 365 days
+        days = np.arange(365)
+        # Normal pattern: weekly and monthly cycles
+        weekly = 10 * np.sin(2 * np.pi * days / 7)
+        monthly = 20 * np.sin(2 * np.pi * days / 30)
+        noise = np.random.normal(0, 5, 365)
+        signal = 100 + weekly + monthly + noise
+        # Add anomalies (sudden spikes)
+        signal[100] += 50  # Day 100
+        signal[200] += 70  # Day 200
+        signal[300] -= 60  # Day 300
+        return days, signal
+    def test_fft_analysis(self, analyzer, periodic_signal):
+        """Test FFT-based spectral analysis."""
+        days, signal = periodic_signal
+        result = analyzer.analyze_spectrum(signal, sampling_rate=1.0)  # 1 sample/day
+        assert isinstance(result, SpectralResult)
+        assert result.dominant_frequencies is not None
+        assert len(result.dominant_frequencies) > 0
+        # Should detect weekly frequency (~0.14 Hz = 1/7 days)
+        weekly_freq = 1/7
+        assert any(abs(f - weekly_freq) < 0.01 for f in result.dominant_frequencies)
+    def test_spectral_anomaly_detection(self, analyzer, periodic_signal):
+        """Test spectral anomaly detection."""
+        days, signal = periodic_signal
+        # Analyze normal portion
+        normal_result = analyzer.analyze_spectrum(signal[:90])
+        # Analyze anomalous portion
+        anomaly_result = analyzer.analyze_spectrum(signal[95:105])
+        # Spectral entropy should be higher in anomalous region
+        assert anomaly_result.spectral_entropy > normal_result.spectral_entropy
+    def test_periodogram_analysis(self, analyzer):
+        """Test periodogram computation."""
+        # Create simple sinusoidal signal
+        t = np.linspace(0, 10, 1000)
+        frequency = 2.5  # Hz
+        signal = np.sin(2 * np.pi * frequency * t)
+        result = analyzer.compute_periodogram(signal, sampling_rate=100)
+        assert "frequencies" in result
+        assert "power" in result
+        # Peak should be at the signal frequency
+        peak_idx = np.argmax(result["power"])
+        peak_freq = result["frequencies"][peak_idx]
+        assert abs(peak_freq - frequency) < 0.1
+    def test_wavelet_analysis(self, analyzer):
+        """Test wavelet transform analysis."""
+        # Create signal with time-varying frequency
+        t = np.linspace(0, 1, 1000)
+        chirp = np.sin(2 * np.pi * (10 * t + 5 * t**2))
+        result = analyzer.wavelet_analysis(chirp)
+        assert "scales" in result
+        assert "coefficients" in result
+        assert result["coefficients"].shape[0] == len(result["scales"])
+class TestPatternAnalyzer:
+    """Test pattern analysis for anomaly detection."""
+    @pytest.fixture
+    def analyzer(self):
+        """Create pattern analyzer instance."""
+        return PatternAnalyzer()
+    @pytest.fixture
+    def time_series_data(self):
+        """Create time series data with patterns."""
+        dates = pd.date_range(start='2023-01-01', periods=365, freq='D')
+        # Base trend
+        trend = np.linspace(100, 150, 365)
+        # Seasonal pattern
+        seasonal = 20 * np.sin(2 * np.pi * np.arange(365) / 365)
+        # Weekly pattern
+        weekly = 10 * np.sin(2 * np.pi * np.arange(365) / 7)
+        # Random noise
+        noise = np.random.normal(0, 5, 365)
+        values = trend + seasonal + weekly + noise
+        return pd.DataFrame({
+            'date': dates,
+            'value': values
+        })
+    def test_temporal_pattern_detection(self, analyzer, time_series_data):
+        """Test temporal pattern detection."""
+        patterns = analyzer.detect_temporal_patterns(time_series_data)
+        assert len(patterns) > 0
+        # Should detect trend
+        trend_patterns = [p for p in patterns if p.type == PatternType.TREND]
+        assert len(trend_patterns) > 0
+        # Should detect seasonality
+        seasonal_patterns = [p for p in patterns if p.type == PatternType.SEASONAL]
+        assert len(seasonal_patterns) > 0
+    def test_clustering_pattern_detection(self, analyzer):
+        """Test clustering pattern detection."""
+        # Create data with clear clusters
+        np.random.seed(42)
+        # Three clusters
+        cluster1 = np.random.normal([0, 0], 0.5, (50, 2))
+        cluster2 = np.random.normal([5, 5], 0.5, (50, 2))
+        cluster3 = np.random.normal([10, 0], 0.5, (50, 2))
+        data = pd.DataFrame(
+            np.vstack([cluster1, cluster2, cluster3]),
+            columns=['feature1', 'feature2']
+        )
+        patterns = analyzer.detect_clustering_patterns(data)
+        assert len(patterns) > 0
+        cluster_patterns = [p for p in patterns if p.type == PatternType.CLUSTER]
+        assert len(cluster_patterns) == 3  # Three clusters
+    def test_correlation_pattern_detection(self, analyzer):
+        """Test correlation pattern detection."""
+        # Create correlated data
+        np.random.seed(42)
+        n = 100
+        x = np.random.normal(0, 1, n)
+        data = pd.DataFrame({
+            'feature1': x,
+            'feature2': 2 * x + np.random.normal(0, 0.1, n),  # Strong positive
+            'feature3': -1.5 * x + np.random.normal(0, 0.1, n),  # Strong negative
+            'feature4': np.random.normal(0, 1, n)  # No correlation
+        })
+        patterns = analyzer.detect_correlation_patterns(data)
+        correlation_patterns = [p for p in patterns if p.type == PatternType.CORRELATION]
+        assert len(correlation_patterns) >= 2  # At least 2 strong correlations
+        # Check correlation values
+        for pattern in correlation_patterns:
+            assert abs(pattern.confidence) > 0.8  # Strong correlation
+class TestEnsembleAnomalyDetector:
+    """Test ensemble anomaly detection."""
+    @pytest.fixture
+    def detector(self):
+        """Create ensemble detector instance."""
+        return EnsembleAnomalyDetector()
+    def test_ensemble_voting(self, detector):
+        """Test ensemble voting mechanism."""
+        # Create mock individual results
+        results = [
+            AnomalyResult(is_anomaly=True, score=0.8, type=AnomalyType.STATISTICAL),
+            AnomalyResult(is_anomaly=True, score=0.9, type=AnomalyType.ML),
+            AnomalyResult(is_anomaly=False, score=0.3, type=AnomalyType.PATTERN)
+        ]
+        # Test majority voting
+        ensemble_result = detector.combine_results(results, method='majority')
+        assert ensemble_result.is_anomaly is True  # 2 out of 3 say anomaly
+        assert ensemble_result.type == AnomalyType.ENSEMBLE
+    def test_ensemble_averaging(self, detector):
+        """Test ensemble score averaging."""
+        results = [
+            AnomalyResult(is_anomaly=True, score=0.8, type=AnomalyType.STATISTICAL),
+            AnomalyResult(is_anomaly=True, score=0.9, type=AnomalyType.ML),
+            AnomalyResult(is_anomaly=False, score=0.3, type=AnomalyType.PATTERN)
+        ]
+        # Test averaging
+        ensemble_result = detector.combine_results(results, method='average')
+        expected_score = (0.8 + 0.9 + 0.3) / 3
+        assert abs(ensemble_result.score - expected_score) < 0.01
+    def test_weighted_ensemble(self, detector):
+        """Test weighted ensemble combination."""
+        results = [
+            AnomalyResult(is_anomaly=True, score=0.8, type=AnomalyType.STATISTICAL),
+            AnomalyResult(is_anomaly=True, score=0.6, type=AnomalyType.ML)
+        ]
+        weights = {
+            AnomalyType.STATISTICAL: 0.7,
+            AnomalyType.ML: 0.3
+        }
+        ensemble_result = detector.combine_results(results, method='weighted', weights=weights)
+        expected_score = 0.8 * 0.7 + 0.6 * 0.3
+        assert abs(ensemble_result.score - expected_score) < 0.01