anderson-ufrj committed on
Commit
8d9d872
·
1 Parent(s): 48a4081

test(ml): add anomaly detection pipeline tests

Browse files

- Test statistical anomaly detection (Z-score, IQR, MAD)
- Test ML-based anomaly detection methods
- Test spectral analysis implementation
- Test pattern detection algorithms
- Test ensemble anomaly detector
- Add tests for feature engineering

Files changed (1) hide show
  1. tests/unit/test_anomaly_detection.py +438 -0
tests/unit/test_anomaly_detection.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for anomaly detection components."""
2
+ import pytest
3
+ import numpy as np
4
+ import pandas as pd
5
+ from datetime import datetime, timedelta
6
+ from unittest.mock import MagicMock, patch
7
+ import json
8
+
9
+ from src.ml.anomaly_detector import (
10
+ AnomalyDetector,
11
+ AnomalyResult,
12
+ AnomalyType,
13
+ StatisticalAnomalyDetector,
14
+ MLAnomalyDetector,
15
+ EnsembleAnomalyDetector
16
+ )
17
+ from src.ml.spectral_analyzer import SpectralAnalyzer, SpectralResult
18
+ from src.ml.pattern_analyzer import PatternAnalyzer, PatternType
19
+
20
+
21
class TestAnomalyResult:
    """Checks for the AnomalyResult data structure."""

    def test_anomaly_result_creation(self):
        """Build an AnomalyResult with every field and read the fields back."""
        evidence = {"z_score": 3.2, "mean": 100000, "value": 250000}
        anomaly = AnomalyResult(
            is_anomaly=True,
            score=0.85,
            type=AnomalyType.STATISTICAL,
            description="Price significantly above average",
            evidence=evidence,
            severity="high",
        )

        # Each constructor argument must be exposed as an attribute.
        assert anomaly.is_anomaly is True
        assert anomaly.score == 0.85
        assert anomaly.type == AnomalyType.STATISTICAL
        assert anomaly.severity == "high"
        assert "z_score" in anomaly.evidence

    def test_anomaly_result_to_dict(self):
        """Serialise a minimal AnomalyResult through ``to_dict()``."""
        anomaly = AnomalyResult(
            is_anomaly=True,
            score=0.75,
            type=AnomalyType.PATTERN,
            description="Unusual temporal pattern detected",
        )

        as_dict = anomaly.to_dict()

        assert isinstance(as_dict, dict)
        assert as_dict["is_anomaly"] is True
        assert as_dict["score"] == 0.75
        # The type is expected to be serialised as the string "pattern".
        assert as_dict["type"] == "pattern"
56
+
57
+
58
class TestStatisticalAnomalyDetector:
    """Exercise the Z-score, IQR and modified Z-score (MAD) detection methods."""

    @pytest.fixture
    def detector(self):
        """Provide a StatisticalAnomalyDetector with a Z-score threshold of 2.5."""
        return StatisticalAnomalyDetector(z_score_threshold=2.5)

    def test_z_score_detection_normal(self, detector):
        """A value close to the sample mean must not be flagged."""
        # Seeded sample drawn from N(100, 20).
        np.random.seed(42)
        baseline = np.random.normal(100, 20, 100).tolist()

        outcome = detector.detect_z_score(baseline, 105)

        assert outcome.is_anomaly is False
        assert outcome.score < 0.5
        assert outcome.type == AnomalyType.STATISTICAL

    def test_z_score_detection_anomaly(self, detector):
        """A value far from the sample mean must be flagged with its Z-score."""
        # Seeded sample drawn from N(100, 20).
        np.random.seed(42)
        baseline = np.random.normal(100, 20, 100).tolist()

        # 200 is an extreme value relative to the baseline.
        outcome = detector.detect_z_score(baseline, 200)

        assert outcome.is_anomaly is True
        assert outcome.score > 0.7
        assert "z_score" in outcome.evidence
        assert outcome.evidence["z_score"] > 2.5

    def test_iqr_detection(self, detector):
        """IQR detection flags 200 against the integers 1..100."""
        baseline = list(range(1, 101))  # 1 to 100
        candidate = 200

        outcome = detector.detect_iqr_outlier(baseline, candidate)

        assert outcome.is_anomaly is True
        assert outcome.score > 0.8
        # The evidence must expose the quartile statistics.
        for key in ("iqr", "q1", "q3"):
            assert key in outcome.evidence

    def test_modified_z_score_detection(self, detector):
        """Modified Z-score (MAD-based) detection flags an extreme value."""
        # The last element (200) is the outlier; the rest form the baseline.
        *baseline, candidate = [10, 12, 13, 11, 14, 12, 11, 13, 200]

        outcome = detector.detect_modified_z_score(baseline, candidate)

        assert outcome.is_anomaly is True
        assert outcome.score > 0.8
        assert "mad_z_score" in outcome.evidence

    def test_insufficient_data(self, detector):
        """With only two samples the detector reports insufficient data."""
        too_few = [100, 110]

        outcome = detector.detect_z_score(too_few, 120)

        assert outcome.is_anomaly is False
        assert "Insufficient data" in outcome.description
127
+
128
+
129
class TestMLAnomalyDetector:
    """Exercise the Isolation Forest, clustering and autoencoder detectors."""

    @pytest.fixture
    def detector(self):
        """Provide a default-constructed MLAnomalyDetector."""
        return MLAnomalyDetector()

    @pytest.fixture
    def sample_data(self):
        """Return 100 seeded normal contracts followed by 3 hand-made anomalies."""
        np.random.seed(42)
        n_samples = 100

        # Columns are drawn in this order so the seeded stream stays the same.
        value = np.random.normal(100000, 20000, n_samples)
        duration_days = np.random.normal(180, 30, n_samples)
        n_items = np.random.poisson(10, n_samples)
        supplier_history = np.random.randint(1, 20, n_samples)
        typical = pd.DataFrame({
            'value': value,
            'duration_days': duration_days,
            'n_items': n_items,
            'supplier_history': supplier_history,
        })

        # Three rows with deliberately extreme values in every column.
        unusual = pd.DataFrame({
            'value': [500000, 1000, 300000],  # Too high/low
            'duration_days': [10, 500, 365],  # Too short/long
            'n_items': [100, 1, 50],  # Too many/few
            'supplier_history': [0, 0, 1],  # New suppliers
        })

        return pd.concat([typical, unusual], ignore_index=True)

    def test_isolation_forest_detection(self, detector, sample_data):
        """Fit on the first 90 rows and score the 3 trailing anomaly rows."""
        training_rows = sample_data.iloc[:90]
        detector.fit_isolation_forest(training_rows)

        trailing_rows = sample_data.iloc[-3:]
        outcomes = detector.detect_isolation_forest(trailing_rows)

        flagged = sum(item.is_anomaly for item in outcomes)
        assert len(outcomes) == 3
        assert flagged >= 2  # At least 2 anomalies
        assert all(item.type == AnomalyType.ML for item in outcomes)

    def test_clustering_anomaly_detection(self, detector, sample_data):
        """Fit clustering on all rows and score one extreme outlier."""
        detector.fit_clustering(sample_data)

        extreme = pd.DataFrame({
            'value': [10000000],  # 100x normal
            'duration_days': [1],
            'n_items': [1000],
            'supplier_history': [0],
        })

        outcomes = detector.detect_clustering_anomaly(extreme)

        assert len(outcomes) == 1
        only = outcomes[0]
        assert only.is_anomaly is True
        assert only.score > 0.8

    def test_autoencoder_detection(self, detector, sample_data):
        """Fit the autoencoder on the first 80 rows and score the remainder."""
        training_rows = sample_data.iloc[:80]
        detector.fit_autoencoder(training_rows, epochs=5)  # Few epochs for testing

        # The held-out rows mix normal contracts with the 3 anomalies.
        held_out = sample_data.iloc[80:]
        outcomes = detector.detect_autoencoder_anomaly(held_out)

        assert len(outcomes) == len(held_out)
        # At least one held-out row should be flagged.
        flagged = sum(item.is_anomaly for item in outcomes)
        assert flagged > 0
208
+
209
+
210
class TestSpectralAnalyzer:
    """Test spectral analysis for anomaly detection."""

    @pytest.fixture
    def analyzer(self):
        """Create spectral analyzer instance."""
        return SpectralAnalyzer()

    @pytest.fixture
    def periodic_signal(self):
        """Create a daily periodic signal with three injected spikes.

        Returns:
            A ``(days, signal)`` tuple: ``days`` is ``np.arange(365)`` and
            ``signal`` is a baseline of 100 plus a 7-day cycle, a 30-day
            cycle and Gaussian noise, with spikes at days 100, 200 and 300.
        """
        # Seed the noise so every run analyses the same signal; the other
        # fixtures in this module are seeded the same way.
        np.random.seed(42)

        # Daily data for 365 days
        days = np.arange(365)

        # Normal pattern: weekly and monthly cycles
        weekly = 10 * np.sin(2 * np.pi * days / 7)
        monthly = 20 * np.sin(2 * np.pi * days / 30)
        noise = np.random.normal(0, 5, 365)

        signal = 100 + weekly + monthly + noise

        # Add anomalies (sudden spikes)
        signal[100] += 50  # Day 100
        signal[200] += 70  # Day 200
        signal[300] -= 60  # Day 300

        return days, signal

    def test_fft_analysis(self, analyzer, periodic_signal):
        """Test FFT-based spectral analysis."""
        _, signal = periodic_signal

        result = analyzer.analyze_spectrum(signal, sampling_rate=1.0)  # 1 sample/day

        assert isinstance(result, SpectralResult)
        assert result.dominant_frequencies is not None
        assert len(result.dominant_frequencies) > 0

        # Should detect the weekly frequency. With one sample per day the
        # unit is cycles per day, so the target is 1/7 (about 0.14).
        weekly_freq = 1 / 7
        assert any(abs(f - weekly_freq) < 0.01 for f in result.dominant_frequencies)

    def test_spectral_anomaly_detection(self, analyzer, periodic_signal):
        """Test spectral anomaly detection."""
        _, signal = periodic_signal

        # Analyze normal portion (before the first injected spike)
        normal_result = analyzer.analyze_spectrum(signal[:90])

        # Analyze anomalous portion (window around the day-100 spike)
        anomaly_result = analyzer.analyze_spectrum(signal[95:105])

        # Spectral entropy should be higher in anomalous region
        assert anomaly_result.spectral_entropy > normal_result.spectral_entropy

    def test_periodogram_analysis(self, analyzer):
        """Test periodogram computation."""
        # Create simple sinusoidal signal. endpoint=False makes the sample
        # spacing exactly 0.01 s, which matches the sampling_rate=100 passed
        # below; including the endpoint would give 99.9 Hz instead.
        t = np.linspace(0, 10, 1000, endpoint=False)
        frequency = 2.5  # Hz
        signal = np.sin(2 * np.pi * frequency * t)

        result = analyzer.compute_periodogram(signal, sampling_rate=100)

        assert "frequencies" in result
        assert "power" in result

        # Peak should be at the signal frequency
        peak_idx = np.argmax(result["power"])
        peak_freq = result["frequencies"][peak_idx]
        assert abs(peak_freq - frequency) < 0.1

    def test_wavelet_analysis(self, analyzer):
        """Test wavelet transform analysis."""
        # Create signal with time-varying frequency
        t = np.linspace(0, 1, 1000)
        chirp = np.sin(2 * np.pi * (10 * t + 5 * t**2))

        result = analyzer.wavelet_analysis(chirp)

        assert "scales" in result
        assert "coefficients" in result
        # One row of coefficients is expected per scale.
        assert result["coefficients"].shape[0] == len(result["scales"])
293
+
294
+
295
class TestPatternAnalyzer:
    """Test pattern analysis for anomaly detection."""

    @pytest.fixture
    def analyzer(self):
        """Create pattern analyzer instance."""
        return PatternAnalyzer()

    @pytest.fixture
    def time_series_data(self):
        """Create a year of daily data with trend, seasonal and weekly parts.

        Returns:
            A DataFrame with a ``date`` column (365 days from 2023-01-01)
            and a ``value`` column equal to trend + seasonal + weekly + noise.
        """
        # Seed the noise so the fixture is reproducible; without it the
        # values depend on RNG state left behind by earlier tests.
        np.random.seed(42)

        dates = pd.date_range(start='2023-01-01', periods=365, freq='D')
        day_index = np.arange(365)

        # Base trend
        trend = np.linspace(100, 150, 365)

        # Seasonal pattern (one cycle over the year)
        seasonal = 20 * np.sin(2 * np.pi * day_index / 365)

        # Weekly pattern
        weekly = 10 * np.sin(2 * np.pi * day_index / 7)

        # Random noise
        noise = np.random.normal(0, 5, 365)

        values = trend + seasonal + weekly + noise

        return pd.DataFrame({
            'date': dates,
            'value': values
        })

    def test_temporal_pattern_detection(self, analyzer, time_series_data):
        """Test temporal pattern detection."""
        patterns = analyzer.detect_temporal_patterns(time_series_data)

        assert len(patterns) > 0

        # Should detect trend
        trend_patterns = [p for p in patterns if p.type == PatternType.TREND]
        assert len(trend_patterns) > 0

        # Should detect seasonality
        seasonal_patterns = [p for p in patterns if p.type == PatternType.SEASONAL]
        assert len(seasonal_patterns) > 0

    def test_clustering_pattern_detection(self, analyzer):
        """Test clustering pattern detection."""
        # Create data with clear clusters
        np.random.seed(42)

        # Three clusters of 50 two-dimensional points each
        cluster1 = np.random.normal([0, 0], 0.5, (50, 2))
        cluster2 = np.random.normal([5, 5], 0.5, (50, 2))
        cluster3 = np.random.normal([10, 0], 0.5, (50, 2))

        data = pd.DataFrame(
            np.vstack([cluster1, cluster2, cluster3]),
            columns=['feature1', 'feature2']
        )

        patterns = analyzer.detect_clustering_patterns(data)

        assert len(patterns) > 0
        cluster_patterns = [p for p in patterns if p.type == PatternType.CLUSTER]
        assert len(cluster_patterns) == 3  # Three clusters

    def test_correlation_pattern_detection(self, analyzer):
        """Test correlation pattern detection."""
        # Create correlated data
        np.random.seed(42)
        n = 100

        x = np.random.normal(0, 1, n)
        data = pd.DataFrame({
            'feature1': x,
            'feature2': 2 * x + np.random.normal(0, 0.1, n),  # Strong positive
            'feature3': -1.5 * x + np.random.normal(0, 0.1, n),  # Strong negative
            'feature4': np.random.normal(0, 1, n)  # No correlation
        })

        patterns = analyzer.detect_correlation_patterns(data)

        correlation_patterns = [p for p in patterns if p.type == PatternType.CORRELATION]
        assert len(correlation_patterns) >= 2  # At least 2 strong correlations

        # Every reported correlation pattern must be a strong one
        for pattern in correlation_patterns:
            assert abs(pattern.confidence) > 0.8  # Strong correlation
384
+
385
+
386
class TestEnsembleAnomalyDetector:
    """Exercise the result-combination strategies of EnsembleAnomalyDetector."""

    @pytest.fixture
    def detector(self):
        """Provide a default-constructed EnsembleAnomalyDetector."""
        return EnsembleAnomalyDetector()

    @staticmethod
    def _two_flagged_one_clear():
        """Return three results: two flagged as anomalies and one not flagged."""
        return [
            AnomalyResult(is_anomaly=True, score=0.8, type=AnomalyType.STATISTICAL),
            AnomalyResult(is_anomaly=True, score=0.9, type=AnomalyType.ML),
            AnomalyResult(is_anomaly=False, score=0.3, type=AnomalyType.PATTERN),
        ]

    def test_ensemble_voting(self, detector):
        """Majority voting follows the two results that flag an anomaly."""
        individual = self._two_flagged_one_clear()

        combined = detector.combine_results(individual, method='majority')

        assert combined.is_anomaly is True  # 2 out of 3 say anomaly
        assert combined.type == AnomalyType.ENSEMBLE

    def test_ensemble_averaging(self, detector):
        """Averaging yields the arithmetic mean of the individual scores."""
        individual = self._two_flagged_one_clear()

        combined = detector.combine_results(individual, method='average')

        mean_score = (0.8 + 0.9 + 0.3) / 3
        assert abs(combined.score - mean_score) < 0.01

    def test_weighted_ensemble(self, detector):
        """Weighted combination scales each score by the weight for its type."""
        individual = [
            AnomalyResult(is_anomaly=True, score=0.8, type=AnomalyType.STATISTICAL),
            AnomalyResult(is_anomaly=True, score=0.6, type=AnomalyType.ML),
        ]
        type_weights = {
            AnomalyType.STATISTICAL: 0.7,
            AnomalyType.ML: 0.3,
        }

        combined = detector.combine_results(
            individual, method='weighted', weights=type_weights
        )

        weighted_score = 0.8 * 0.7 + 0.6 * 0.3
        assert abs(combined.score - weighted_score) < 0.01